├── .circleci
│   └── config.yml
├── .gitattributes
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── committee-membership-current.yaml
├── committees-current.yaml
├── committees-historical.yaml
├── congress_lookup.py
├── executive.yaml
├── legislators-current.yaml
├── legislators-district-offices.yaml
├── legislators-historical.yaml
├── legislators-social-media.yaml
├── misc
│   ├── biographical-directory-footnotes.json
│   └── new-member-template.yaml
├── scripts
│   ├── alternate_bulk_formats.py
│   ├── archive
│   │   ├── 114th_congress.py
│   │   ├── committee_membership_house.py
│   │   ├── election_results_2014.csv
│   │   ├── election_results_2018_senate.csv
│   │   ├── election_results_2020.csv
│   │   ├── election_results_2022.csv
│   │   ├── election_results_2024.csv
│   │   ├── election_results_house_2016.py
│   │   ├── election_results_senate_2016.csv
│   │   ├── election_results_senate_2016.py
│   │   ├── everypolitician.py
│   │   ├── house_history_gender.py
│   │   └── print_leadership_roles.py
│   ├── bioguide.py
│   ├── bioguide_guess_new_member_ids.py
│   ├── bioguide_xml.py
│   ├── committee_membership.py
│   ├── contact_forms.py
│   ├── cspan.py
│   ├── data
│   │   ├── social_media_blacklist.csv
│   │   └── social_media_whitelist.csv
│   ├── election_results.py
│   ├── email
│   │   └── config.yml.example
│   ├── export_csv.py
│   ├── geocode_offices.py
│   ├── historical_committees.py
│   ├── house_contacts.py
│   ├── house_history.py
│   ├── house_websites.py
│   ├── icpsr_ids.py
│   ├── influence_ids.py
│   ├── lint.py
│   ├── office_validator.py
│   ├── pictorial_ids.py
│   ├── requirements.txt
│   ├── retire.py
│   ├── run_script_to_branch
│   ├── senate_contacts.py
│   ├── social
│   │   └── twitter.py
│   ├── social_media.py
│   ├── sweep.py
│   ├── thomas_ids.py
│   ├── untire.py
│   ├── update_gh_pages.sh
│   ├── utils.py
│   ├── wikidata_update.py
│   └── wikipedia_ids.py
└── test
    ├── are_files_linted.py
    ├── test_pictorial_ids.py
    ├── validate.py
    └── workout.py

/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | jobs:
 3 |   build:
 4 |     docker:
 5 |       - image: cimg/python:3.11
 6 |     steps:
 7 |       # Set up.
 8 |       - checkout
 9 |       - run: pip install -r scripts/requirements.txt
10 | 
11 |       # Run tests.
12 |       - run: python test/workout.py
13 |       - run: pyflakes .
14 |       - run: python test/are_files_linted.py
15 |       - run: python test/validate.py
16 | 
17 |       # Update the gh-pages branch. This requires that
18 |       # CircleCI be set up with read-write permission
19 |       # on the repo, which is not CircleCI's default.
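      # A minimal local equivalent of the build job above (a sketch, not part
      # of the original config; it assumes a Python 3.11 virtualenv to match
      # the cimg/python:3.11 image):
      #
      #   pip install -r scripts/requirements.txt
      #   python test/workout.py
      #   pyflakes .
      #   python test/are_files_linted.py
      #   python test/validate.py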
20 | deploy: 21 | docker: 22 | - image: cimg/python:3.8 23 | steps: 24 | - checkout 25 | - run: pip install -r scripts/requirements.txt 26 | - run: scripts/update_gh_pages.sh 27 | 28 | workflows: 29 | version: 2 30 | build-and-deploy: 31 | jobs: 32 | - build 33 | - deploy: 34 | filters: 35 | branches: 36 | only: main 37 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | alternate_formats/* -diff 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /scripts/cache 3 | *.pickle 4 | .DS_Store 5 | /scripts/email/config.yml 6 | \.~lock* 7 | /scripts/build 8 | /domains.rb 9 | /venv/ 10 | .idea 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Public domain 2 | 3 | The project is in the public domain within the United States, and copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication][CC0]. 4 | 5 | All contributions to this project will be released under the CC0 dedication. By submitting a pull request, you are agreeing to comply with this waiver of copyright interest. 6 | 7 | [CC0]: http://creativecommons.org/publicdomain/zero/1.0/ 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 
31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. 
In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /congress_lookup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding: utf-8 3 | __author__ = 'stsmith' 4 | 5 | # congress_lookup: Look up information about congress from the congress-legislators database 6 | # See: https://github.com/unitedstates/congress-legislators and https://github.com/TheWalkers/congress-legislators 7 | 8 | # The project is in the public domain within the United States, and 9 | # copyright and related rights in the work worldwide are waived 10 | # through the CC0 1.0 Universal public domain dedication. 11 | 12 | # Author 2017 Steven T. 
Smith 13 | 14 | import argparse as ap, contextlib, fnmatch, os, sys, time, warnings, yaml 15 | 16 | # version dependent libraries 17 | # https://docs.python.org/2/library/urllib.html 18 | # https://docs.python.org/3.0/library/urllib.parse.html 19 | if (sys.version_info > (3, 0)): 20 | from urllib.request import urlopen 21 | import urllib.parse as urlparse 22 | else: 23 | from urllib2 import urlopen 24 | import urlparse 25 | 26 | class CongressLookup: 27 | '''A class used to lookup legislator properties from the github congress-legislators YAML database.''' 28 | 29 | def __init__(self): 30 | self.args = self.parseArgs() 31 | self.data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),self.args.data_dir) 32 | self.properties = dict() 33 | self.database_load() 34 | for prop in self.args.properties: self.lookup_property(prop) 35 | 36 | def parseArgs(self): 37 | parser = ap.ArgumentParser() 38 | parser.add_argument('properties', metavar='PROPS', type=str, nargs='+', 39 | help='Properties to look up') 40 | parser.add_argument('-c', '--committee', help="Committee name (wildcard)", type=str, default=None) 41 | parser.add_argument('-n', '--last-name', help="Last name of legislator (wildcard)", type=str, default=None) 42 | parser.add_argument('-d', '--data-dir', help="Database directory", type=str, default='.') 43 | parser.add_argument('-r', '--repo', help="GitHub repo URL", type=str, default='https://github.com/unitedstates/congress-legislators/') 44 | parser.add_argument('-T', '--current-term', help="Properties from only the current term", action='store_true') 45 | parser.add_argument('-D', '--download', help="Download data", action='store_true', default=False) 46 | parser.add_argument('-g', '--debug', help="Debug flag", action='store_true') 47 | return parser.parse_args() 48 | 49 | def lookup_property(self,property): 50 | if self.args.committee is not None: 51 | self.lookup_by_committee(property) 52 | if self.args.last_name is not None: 53 | self.lookup_by_lastname(property) 54 | 55 | def lookup_by_committee(self,property): 56 | for comm in (comm for comm in self.committees if self.inclusive_wildcard_match(comm['name'],self.args.committee)): 57 | if self.args.debug: print(comm) 58 | print('"{}" member properties:'.format(comm['name'].encode('utf-8'))) 59 | members = self.membership[comm['thomas_id']] if comm['thomas_id'] in self.membership else [] 60 | for member in members: self.lookup_by_member(property,member) 61 | 62 | def inclusive_wildcard_match(self,name,pat): 63 | if any(c in pat for c in '*?[]'): # a wildcard pattern 64 | # prepend or append a * for inclusiveness if not already there 65 | if pat[0] != '*': pat = '*' + pat 66 | if pat[-1] != '*': pat = pat + '*' 67 | else: # not a wildcard 68 | pat = '*' + pat + '*' 69 | return fnmatch.fnmatch(name,pat) 70 | 71 | def lookup_by_member(self,property,member): 72 | for leg in ( leg for leg in self.legislators if \ 73 | (leg['name']['official_full'] == member['name']) \ 74 | or ('bioguide' in leg['id'] and 'bioguide' in member and leg['id']['bioguide'] == member['bioguide']) \ 75 | or ('thomas' in leg['id'] and 'thomas' in member and leg['id']['thomas'] == member['thomas']) ): 76 | self.lookup_legislator_properties(property,leg) 77 | 78 | def lookup_by_lastname(self,property): 79 | for leg in (leg for leg in self.legislators if fnmatch.fnmatch(leg['name']['last'],self.args.last_name)): 80 | if self.args.debug: print(leg) 81 | self.lookup_legislator_properties(property,leg) 82 | 83 | def 
lookup_legislator_properties(self,property,legislator): 84 | self.properties[property] = set([term[property] for term in legislator['terms'] if self.lookup_filter(property,term)]) 85 | for off in self.offices: 86 | if self.args.debug: print(off) 87 | if any(off['id'][db] == legislator['id'][db] for db in off['id'] if db in off['id'] and db in legislator['id']): 88 | self.properties[property] |= set([ok[property] for ok in off['offices'] if property in ok and len(ok[property]) > 0]) 89 | break 90 | print('Property \'{}\' for {}:'.format(property,legislator['name']['official_full'].encode('utf-8'))) 91 | print('\n'.join(sorted(self.properties[property]))) 92 | 93 | def lookup_filter(self,property,term): 94 | result = property in term and len(term[property]) > 0 95 | if result and self.args.current_term: 96 | result &= 'end' in term and time.strptime(term['end'],'%Y-%m-%d') >= time.localtime() 97 | return result 98 | 99 | def database_load(self): 100 | try: 101 | with self.database_access('legislators-current.yaml') as y: 102 | self.legislators = self.yaml_load(y, Loader=yaml.CLoader) 103 | with self.database_access('legislators-district-offices.yaml') as y: 104 | self.offices = self.yaml_load(y, Loader=yaml.CLoader) 105 | if self.args.committee is not None: 106 | with self.database_access('committees-current.yaml') as y: 107 | self.committees = self.yaml_load(y, Loader=yaml.CLoader) 108 | with self.database_access('committee-membership-current.yaml') as y: 109 | self.membership = self.yaml_load(y, Loader=yaml.CLoader) 110 | else: 111 | self.committees = None 112 | except (BaseException,IOError) as e: 113 | print(e) 114 | raise Exception('Clone data from {} and copy it to {} .'.format(self.args.repo,self.data_path)) 115 | 116 | def yaml_load(self,y,Loader=yaml.loader.Loader): 117 | res = yaml.load(y, Loader=Loader) 118 | if res is None: res = [] # make it an empty iterable 119 | return res 120 | 121 | def database_access(self,filename): 122 | if self.args.download: 123 | if self.args.repo[-1] != '/': self.args.repo += '/' 124 | url_base = urlparse.urljoin(urlparse.urlunparse(urlparse.urlparse(self.args.repo)._replace(netloc='raw.githubusercontent.com')),'main/') 125 | # contextlib required for urlopen in with ... as for v < 3.3 126 | res = contextlib.closing(urlopen( urlparse.urljoin(url_base,filename) )) 127 | else: 128 | fname_fullpath = os.path.join(self.data_path,filename) 129 | if os.path.exists(fname_fullpath): 130 | res = open(fname_fullpath,'r') 131 | else: 132 | warnings.warn('File {} doesn\'t exist; clone data from {} and copy it to {} .'.format(filename,self.args.repo,self.data_path)) 133 | res = self.Emptysource() 134 | return res 135 | 136 | class Emptysource(object): 137 | def read(self, size): 138 | return '' # empty 139 | def write(self, data): 140 | pass # ignore the data 141 | def __enter__(self): return self 142 | def __exit__(*x): pass 143 | 144 | 145 | if __name__ == "__main__": 146 | res = CongressLookup() 147 | -------------------------------------------------------------------------------- /misc/new-member-template.yaml: -------------------------------------------------------------------------------- 1 | # All of the fields we can put on a legislator, 2 | # as we might add when a new legislator takes 3 | # office. 4 | # 5 | # Run the lint.py script after editing the main 6 | # YAML files. It will conveniently remove all of 7 | # the comments. 8 | # 9 | # In separate commits, run: 10 | # * house_contacts.py (which will add url, address, etc. 
fields to House members)
11 | # * senate_contacts.py (likewise for Senate members)
12 | # * committee_membership.py (updates committee membership)
13 | 
14 | - id:
15 |     bioguide: Q000000 # http://bioguide.congress.gov/
16 |     lis: S999 # not assigned until there is a Senate roll call vote
17 |     fec: # http://fec.gov/finance/disclosure/candcmte_info.shtml
18 |     - H1XX99999 # (you're looking for a Candidate ID)
19 |     govtrack: 456789 # you may assign the next available integer (try: `(echo -n "1+"; git grep -h govtrack: *.yaml | sort | tail -1 | sed "s/ *govtrack: //") | bc`)
20 |     opensecrets: N00099999 # http://www.opensecrets.org/
21 |     votesmart: 159999 # http://votesmart.org/
22 |     icpsr: 99999 # not knowable until voteview.org publishes roll call raw data
23 |     cspan: 75516 # people search at http://www.c-span.org/ (personid)
24 |     wikipedia: John Doe # https://en.wikipedia.org/wiki/Main_Page (replace _ with space!)
25 |     wikidata: Q30129999 # from the "Wikidata item" URL linked from their Wikipedia page
26 |     ballotpedia: John Doe # http://ballotpedia.org/Main_Page (replace _ with space!)
27 |     house_history: 10999 # http://history.house.gov/People/Search/
28 |     google_entity_id: kg:/g/11dddd111d # ...
29 |   name:
30 |     first: John
31 |     middle: Person # optional, can also be an initial like 'P.'
32 |     nickname: Whoami # if clearly in use
33 |     last: Doe
34 |     suffix: Jr. # optional
35 |   bio:
36 |     gender: # M or F, no quotes
37 |     birthday: '1960-06-06' # can find on Bioguide
38 |   terms:
39 | 
40 |   # for a representative
41 |   - type: rep
42 |     start: '2017-01-03' # date of swearing in
43 |     end: '2019-01-03' # always the next odd-year Jan 3, until a death/resignation
44 |     state: FL # USPS state abbreviation
45 |     district: 19 # an integer; 0 for At-Large
46 |     party: # Republican, Democrat, Independent
47 |     caucus: # for Independents only, Republican or Democrat
48 |     url: https://someone.house.gov
49 |     contact_form: https://www.house.gov/name/email.htm
50 | 
51 |   # for a senator
52 |   - type: sen
53 |     start: '2015-01-03' # date of swearing in
54 |     end: '2021-01-03' # always a future Jan 3, until a death/resignation
55 |     how: appointment # for senators appointed by the governor only
56 |     end-type: special-election # when "how: appointment" is used, prior to the special election,
57 |                                # set the "end" date to the special election date and set this flag
58 |     state: FL # USPS state abbreviation
59 |     class: 1 # copy from the senator this person is succeeding
60 |     party: # Republican, Democrat, Independent
61 |     caucus: # for Independents only, Republican or Democrat
62 |     state_rank: junior # or senior
63 |     url: https://someone.senate.gov/
64 |     contact_form: https://www.name.senate.gov/contact/
--------------------------------------------------------------------------------
/scripts/alternate_bulk_formats.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import json
 3 | import glob
 4 | import os
 5 | 
 6 | import utils
 7 | 
 8 | 
 9 | def generate_csv():
10 | 
11 |     #yaml filenames
12 |     yamls = ["legislators-current.yaml","legislators-historical.yaml"]
13 |     yaml_social = "legislators-social-media.yaml"
14 | 
15 | 
16 | 
17 |     #list of yaml field name, csv column name tuples.
Split into categories which do not reflect yaml structure (structured for logical csv column ordering) 18 | bio_fields = [ 19 | ("last", "last_name"), 20 | ("first", "first_name"), 21 | ("middle", "middle_name"), 22 | ("suffix", "suffix"), 23 | ("nickname", "nickname"), 24 | ("official_full", "full_name"), 25 | ("birthday", "birthday"), 26 | ("gender", "gender") 27 | ] 28 | 29 | #ID crosswalks, omit FEC id's, which may contain (arbitrary?) number of values 30 | crosswalk_fields = [ 31 | ("bioguide", "bioguide_id"), 32 | ("thomas", "thomas_id"), 33 | ("opensecrets", "opensecrets_id"), 34 | ("lis","lis_id"), 35 | ("fec","fec_ids"), 36 | ("cspan", "cspan_id"), 37 | ("govtrack", "govtrack_id"), 38 | ("votesmart", "votesmart_id"), 39 | ("ballotpedia", "ballotpedia_id"), 40 | ("washington_post", "washington_post_id"), 41 | ("icpsr", "icpsr_id"), 42 | ("wikipedia", "wikipedia_id") 43 | ] 44 | 45 | #separate list for children of "terms", csv only captures data for most recent term 46 | #currently excluding start/end dates - earliest start to latest end is deceptive (excludes gaps) as is start/end for most recent term 47 | term_fields = [ 48 | ("type", "type"), 49 | ("state", "state"), 50 | ("district", "district"), 51 | ("class", "senate_class"), 52 | ("party", "party"), 53 | ("url", "url"), 54 | ("address", "address"), 55 | ("phone", "phone"), 56 | ("contact_form", "contact_form"), 57 | ("rss_url", "rss_url"), 58 | ] 59 | 60 | #pulled from legislators-social-media.yaml 61 | social_media_fields = [ 62 | ("twitter", "twitter"), 63 | ("twitter_id", "twitter_id"), 64 | ("facebook", "facebook"), 65 | ("youtube", "youtube"), 66 | ("youtube_id", "youtube_id"), 67 | ("mastodon", "mastodon") 68 | ] 69 | 70 | 71 | print("Loading %s..." %yaml_social) 72 | social = utils.load_data(yaml_social) 73 | 74 | for filename in yamls: 75 | print("Converting %s to CSV..." 
% filename) 76 | 77 | legislators = utils.load_data(filename) 78 | 79 | #convert yaml to csv 80 | csv_output = csv.writer(open("../" + filename.replace(".yaml", ".csv"),"w")) 81 | 82 | head = [] 83 | for pair in bio_fields: 84 | head.append(pair[1]) 85 | for pair in term_fields: 86 | head.append(pair[1]) 87 | for pair in social_media_fields: 88 | head.append(pair[1]) 89 | for pair in crosswalk_fields: 90 | head.append(pair[1]) 91 | csv_output.writerow(head) 92 | 93 | for legislator in legislators: 94 | legislator_row = [] 95 | for pair in bio_fields: 96 | if 'name' in legislator and pair[0] in legislator['name']: 97 | legislator_row.append(legislator['name'][pair[0]]) 98 | elif 'bio' in legislator and pair[0] in legislator['bio']: 99 | legislator_row.append(legislator['bio'][pair[0]]) 100 | else: 101 | legislator_row.append(None) 102 | 103 | for pair in term_fields: 104 | latest_term = legislator['terms'][len(legislator['terms'])-1] 105 | if pair[0] in latest_term: 106 | legislator_row.append(latest_term[pair[0]]) 107 | else: 108 | legislator_row.append(None) 109 | 110 | social_match = None 111 | for social_legislator in social: 112 | if 'bioguide' in legislator['id'] and 'bioguide' in social_legislator['id'] and legislator['id']['bioguide'] == social_legislator['id']['bioguide']: 113 | social_match = social_legislator 114 | break 115 | elif 'thomas' in legislator['id'] and 'thomas' in social_legislator['id'] and legislator['id']['thomas'] == social_legislator['id']['thomas']: 116 | social_match = social_legislator 117 | break 118 | elif 'govtrack' in legislator['id'] and 'govtrack' in social_legislator['id'] and legislator['id']['govtrack'] == social_legislator['id']['govtrack']: 119 | social_match = social_legislator 120 | break 121 | for pair in social_media_fields: 122 | if social_match != None: 123 | if pair[0] in social_match['social']: 124 | legislator_row.append(social_match['social'][pair[0]]) 125 | else: 126 | legislator_row.append(None) 127 | else: 128 | legislator_row.append(None) 129 | 130 | for pair in crosswalk_fields: 131 | if pair[0] in legislator['id']: 132 | value = legislator['id'][pair[0]] 133 | if isinstance(value, list): 134 | # make FEC IDs comma-separated 135 | value = ",".join(value) 136 | legislator_row.append(value) 137 | else: 138 | legislator_row.append(None) 139 | 140 | csv_output.writerow(legislator_row) 141 | 142 | generate_district_office_csv() 143 | 144 | 145 | def generate_district_office_csv(): 146 | filename = "legislators-district-offices.yaml" 147 | print("Converting %s to CSV..." % filename) 148 | legislators_offices = utils.load_data(filename) 149 | fields = [ 150 | "bioguide", "thomas", "govtrack", "id", "address", "building", 151 | "city", "fax", "hours", "phone", "state", "suite", "zip", 152 | "latitude", "longitude"] 153 | 154 | f = open("../" + filename.replace(".yaml", ".csv"), "w") 155 | csv_output = csv.DictWriter(f, fieldnames=fields) 156 | csv_output.writeheader() 157 | 158 | for legislator_offices in legislators_offices: 159 | legislator_ids = legislator_offices['id'] 160 | for office in legislator_offices['offices']: 161 | office.update(legislator_ids) 162 | csv_output.writerow(office) 163 | 164 | 165 | def generate_json(): 166 | 167 | #yaml filenames 168 | yamls = list(map(os.path.basename, glob.glob("../*.yaml"))) 169 | 170 | for filename in yamls: 171 | print("Converting %s to JSON..." 
% filename)
172 |         data = utils.load_data(filename)
173 |         '''handle edge case of incorrect coercion for twitter ids in social media data
174 |         json/js can only handle maximum of 53-bit integers, so 64-bit integer twitter ids *must* be stringified
175 |         to consistently preserve value in json. otherwise they may be rounded and malformed
176 |         '''
177 |         if 'legislators-social-media' in filename:
178 |             for social_legislator in data:
179 |                 if 'twitter_id' in social_legislator['social']:
180 |                     social_legislator['social']['twitter_id'] = str(social_legislator['social']['twitter_id'])
181 | 
182 |         #convert yaml to json
183 |         utils.write(
184 |             json.dumps(data, default=utils.format_datetime, indent=2),
185 |             "../" + filename.replace(".yaml", ".json"))
186 | 
187 | if __name__ == '__main__':
188 |     generate_csv()
189 |     generate_json()
190 | 
191 | 
--------------------------------------------------------------------------------
/scripts/archive/114th_congress.py:
--------------------------------------------------------------------------------
 1 | # Temporary script to help us get the data in shape
 2 | # for the 114th Congress.
 3 | 
 4 | # Get: (thanks Derek!)
 5 | # https://docs.google.com/spreadsheets/d/1H8z7Ah4jSlXiuIol3oXoWBR8s6h0OtA62dNlU-kiIlU/edit#gid=1419747559
 6 | # and download as 'election_results_2014.csv'.
 7 | 
 8 | # TODO:
 9 | # * What is the expected first day of the Congress? (Closest guess of swearing-in dates.)
10 | # * Am adding "TODO: TODO" to new terms that weren't copied from older terms. Needs checking, possibly additional details like url, contact form.
11 | 
12 | from collections import OrderedDict
13 | import copy
14 | import csv
15 | 
16 | import utils
17 | 
18 | def run():
19 | 
20 |     # Which members were up for reelection, won in their office, or were
21 |     # a winner in another office?
22 |     won_row = { }
23 |     incumbents = set()
24 |     winners = set()
25 |     incumbent_winners = set()
26 |     new_members = []
27 |     for row in csv.DictReader(open("election_results_2014.csv")):
28 |         if row["new_member"] == "":
29 |             print("not decided yet...", row)
30 |             continue
31 | 
32 |         # For NC-12, Alma Adams won the vacant seat and the 114th Congress
33 |         # term. It's coded in the spreadsheet as if she's a new member, but
34 |         # since we've already added her in the 113th Congress we need to
35 |         # pretend here that she's a returning member.
36 |         if row["new_id"] == "A000370":
37 |             row["member_id"] = "A000370"
38 | 
39 |         incumbents.add(row["member_id"])
40 |         winners.add(row["new_id"])
41 |         won_row[row["new_id"]] = row
42 |         if row["member_id"] == row["new_id"]:
43 |             incumbent_winners.add(row["new_id"])
44 |         if row["new_id"] == "":
45 |             new_members.append(row)
46 | 
47 |     # Make a stub term based on a row in Derek's spreadsheet.
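    # A hedged sketch (reconstructed from build_term below, not output from a
    # real run) of the stub produced for a hypothetical row with chamber
    # "House", state_abbrev "NC", district "12", and winner_party "D":
    #
    #   OrderedDict([("type", "rep"), ("start", "2015-01-06"),
    #                ("end", "2017-01-03"), ("state", "NC"),
    #                ("district", 12), ("party", "Democrat")])
    #
    # With mark=True, a ("TODO", "TODO") entry is appended as a review flag.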
48 |     def build_term(row, mark):
49 |         if row['chamber'] == 'House':
50 |             end_date = '2017-01-03'
51 |         elif row['district'] == 'Class II':
52 |             end_date = '2021-01-03'
53 |         elif row['district'] == 'Class III':
54 |             end_date = '2017-01-03'
55 |         else:
56 |             raise ValueError()
57 | 
58 |         ret = OrderedDict([
59 |             ("type", "rep" if row['chamber'] == 'House' else 'sen'),
60 |             ("start", '2015-01-06'),
61 |             ("end", end_date),
62 |             ("state", row['state_abbrev']),
63 |         ])
64 | 
65 |         if ret["type"] == "rep":
66 |             ret["district"] = int(row['district']) if row['district'] != "AL" else 0
67 |         else:
68 |             if row["district"] == "Class II":
69 |                 ret["class"] = 2
70 |             elif row["district"] == "Class III":
71 |                 ret["class"] = 3
72 |             else:
73 |                 raise ValueError()
74 |             if mark:
75 |                 ret["state_rank"] = "junior"
76 | 
77 |         if row["winner_party"] == "D":
78 |             ret["party"] = "Democrat"
79 |         elif row["winner_party"] == "R":
80 |             ret["party"] = "Republican"
81 |         else:
82 |             raise ValueError()
83 | 
84 |         if mark:
85 |             ret["TODO"] = "TODO"
86 | 
87 |         return ret
88 | 
89 |     # Load legislators.
90 |     legislators_current = utils.load_data("legislators-current.yaml")
91 |     legislators_historical = utils.load_data("legislators-historical.yaml")
92 |     legislators_social_media = utils.load_data("legislators-social-media.yaml")
93 | 
94 |     # Sweep current members.
95 |     to_retire = []
96 |     for p in legislators_current:
97 |         id = p['id']['bioguide']
98 |         if id in incumbents:
99 |             # This legislator was up for reelection.
100 |             if id in incumbent_winners:
101 |                 # And won. Extend the term.
102 |                 t = copy.deepcopy(p['terms'][-1])
103 |                 p['terms'].append(t)
104 |                 t.update(build_term(won_row[id], False))
105 | 
106 |             elif id in winners:
107 |                 # Incumbent won something else. Start
108 |                 # a fresh term.
109 |                 p['terms'].append(build_term(won_row[id], True))
110 | 
111 |             else:
112 |                 # Incumbent lost.
113 |                 to_retire.append(p)
114 | 
115 |     # Any legislators to bring forward?
116 |     to_return = []
117 |     for p in legislators_historical:
118 |         id = p['id']['bioguide']
119 |         if id in winners:
120 |             p['terms'].append(build_term(won_row[id], True))
121 |             to_return.append(p)
122 | 
123 |     # Now that we're outside of the iterator, modify lists.
124 |     for p in to_retire:
125 |         legislators_current.remove(p)
126 |         legislators_historical.append(p)
127 |     for p in to_return:
128 |         legislators_current.append(p)
129 |         legislators_historical.remove(p)
130 | 
131 |     # Delete entries in legislators-social-media for those retiring
132 |     retiring_leg_bioguideids = [leg['id']['bioguide'] for leg in to_retire]
133 |     for p in legislators_social_media:
134 |         id = p['id']['bioguide']
135 |         if id in retiring_leg_bioguideids:
136 |             legislators_social_media.remove(p)
137 | 
138 |     # Add stubs for new members.
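    # fix_date below converts the spreadsheet's M/D/Y dates to ISO 8601;
    # e.g. (an illustrative value, not taken from the data)
    # fix_date("1/6/2015") returns "2015-01-06".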
139 |     def fix_date(date):
140 |         m, d, y = date.split("/")
141 |         return "%04d-%02d-%02d" % (int(y), int(m), int(d))
142 |     for i, row in enumerate(new_members):
143 |         p = OrderedDict([
144 |             ("id", OrderedDict([
145 |                 ("bioguide", "TODO"),
146 |                 ("thomas", "TODO"),
147 |                 ("lis", "TODO"),
148 |                 ("fec", row['new_fec_cand_id'].split(',')),
149 |                 ("govtrack", 412608+i), # assigning IDs here
150 |                 ("opensecrets", "TODO"),
151 |                 ("votesmart", "TODO"),
152 |                 ("icpsr", "TODO"),
153 |                 ("cspan", "TODO"),
154 |                 ("wikipedia", "TODO"),
155 |                 ("ballotpedia", "TODO"),
156 |                 ("house_history", "TODO"),
157 |             ])),
158 |             ("name", OrderedDict()),
159 |             ("bio", OrderedDict([
160 |                 ("gender", row["gender"]),
161 |                 ("birthday", fix_date(row["date_of_birth"]) if row["date_of_birth"] != "" else "TODO"),
162 |             ])),
163 |             ("terms", [
164 |                 build_term(row, True),
165 |             ])
166 |         ])
167 | 
168 |         if len(row["new_member"].split(" ")) == 2:
169 |             p['name']['first'] = row["new_member"].split(" ")[0]
170 |             p['name']['last'] = row["new_member"].split(" ")[1]
171 |         else:
172 |             p['name']['FULL'] = row["new_member"]
173 |             p['name']['first'] = "TODO"
174 |             p['name']['last'] = "TODO"
175 | 
176 |         legislators_current.append(p)
177 | 
178 | 
179 |     # Save.
180 |     utils.save_data(legislators_current, "legislators-current.yaml")
181 |     utils.save_data(legislators_historical, "legislators-historical.yaml")
182 |     utils.save_data(legislators_social_media, "legislators-social-media.yaml")
183 | 
184 | if __name__ == '__main__':
185 |     run()
186 | 
--------------------------------------------------------------------------------
/scripts/archive/committee_membership_house.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Use the NYTimes API to get House committee information.
 4 | # When we wrote this script we believed the House Clerk was
 5 | # not yet making this info available.
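# The response shapes assumed by the parsing below (reconstructed from this
# script's own json.loads() calls, not from NYT documentation): the committee
# list endpoint yields {"results": [{"committees": [{"id": ..., "chair_party": ...}, ...]}]}
# and the per-committee endpoint yields
# {"results": [{"current_members": [{"id": ..., "party": ..., "rank_in_party": ...}, ...]}]}.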
6 | 7 | import utils 8 | import json 9 | import copy 10 | from utils import download, load_data, save_data 11 | 12 | committee_membership = { } 13 | 14 | committees_current = load_data("committees-current.yaml") 15 | memberships_current = load_data("committee-membership-current.yaml") 16 | 17 | # default to not caching 18 | cache = utils.flags().get('cache', False) 19 | force = not cache 20 | 21 | congress = 113 22 | 23 | # map house/senate committee IDs to their dicts 24 | all_ids = [] 25 | 26 | house_ref = { } 27 | for cx in committees_current: 28 | if cx["type"] == "house": 29 | house_ref[cx["thomas_id"]] = cx 30 | all_ids.append(cx['thomas_id']) 31 | 32 | senate_ref = { } 33 | for cx in committees_current: 34 | if cx["type"] == "senate": 35 | senate_ref[cx["thomas_id"]] = cx 36 | all_ids.append(cx['thomas_id']) 37 | 38 | # map people by their bioguide ID 39 | y = load_data("legislators-current.yaml") 40 | by_bioguide = { } 41 | for m in y: 42 | bioguide = m['id']['bioguide'] 43 | by_bioguide[bioguide] = m 44 | 45 | 46 | # load in committees from the NYT Congress API (API key not kept in source control) 47 | api_key = open("cache/nyt_api_key").read() # file's whole body is the api key 48 | 49 | url = "http://api.nytimes.com/svc/politics/v3/us/legislative/congress/%i/house/committees.json?api-key=%s" % (congress, api_key) 50 | 51 | body = download(url, "committees/membership/nyt-house.json", force) 52 | committees = json.loads(body)['results'][0]['committees'] 53 | 54 | for committee in committees: 55 | committee_id = committee['id'] 56 | 57 | committee_url = "http://api.nytimes.com/svc/politics/v3/us/legislative/congress/%i/house/committees/%s.json?api-key=%s" % (congress, committee_id, api_key) 58 | 59 | # current disagreement between THOMAS and NYT (but use HSIG in URL above) 60 | if committee_id == "HSIG": 61 | committee_id = "HLIG" 62 | 63 | if committee_id not in all_ids: 64 | continue 65 | 66 | committee_party = committee['chair_party'] 67 | 68 | committee_body = download(committee_url, "committees/membership/house/%s.json" % committee_id, force) 69 | members = json.loads(committee_body)['results'][0]['current_members'] 70 | 71 | committee_membership[committee_id] = [] 72 | for member in members: 73 | bioguide_id = member['id'] 74 | 75 | print("[{}] {}".format(committee_id, bioguide_id)) 76 | 77 | if bioguide_id not in by_bioguide: 78 | continue 79 | 80 | legislator = by_bioguide[bioguide_id] 81 | # last_term = legislator['terms'][-1] 82 | 83 | if member['party'] == committee_party: 84 | party = "majority" 85 | else: 86 | party = "minority" 87 | 88 | # this really shouldn't be calculated, but for now it's what we've got 89 | rank = int(member['rank_in_party']) 90 | if rank == 1: 91 | if party == "majority": 92 | title = "Chair" 93 | else: 94 | title = "Ranking Member" 95 | else: 96 | title = None 97 | 98 | details = { 99 | 'name': legislator['name']['official_full'], 100 | 'party': party, 101 | 'rank': rank, 102 | 'bioguide': bioguide_id, 103 | 'thomas': legislator['id']['thomas'] 104 | } 105 | 106 | if title: 107 | details['title'] = title 108 | 109 | committee_membership[committee_id].append(details) 110 | 111 | # sort members to put majority party first, then order by rank 112 | # (fixing the order makes for better diffs) 113 | for c in committee_membership.values(): 114 | c.sort(key = lambda m : (m["party"]=="minority", m["rank"])) 115 | 116 | # preserve senate memberships 117 | senate_membership = {} 118 | for committee_id in memberships_current: 119 | if not 
committee_id.startswith("H"):
120 |         committee_membership[committee_id] = copy.deepcopy(memberships_current[committee_id])
121 | 
122 | print("Saving committee memberships...")
123 | save_data(committee_membership, "committee-membership-current.yaml")
124 | 
--------------------------------------------------------------------------------
/scripts/archive/election_results_2018_senate.csv:
--------------------------------------------------------------------------------
 1 | Incumbent Party,State,Senator,GovTrack ID,Bioguide ID,Party,FEC.gov ID,Wikipedia Page Name,Wikidata ID (see Wikipedia sidebar),Ballotpedia Page Name,First Name,Middle Name,Last Name,Gender (M/F),Birthday (often on Wikipedia)
 2 | D,AZ,"Sinema, Kyrsten",412509,,,,,,,,,,,
 3 | D,CA,"Feinstein, Dianne",300043,,,,,,,,,,,
 4 | D,CT,"Murphy, Christopher",412194,,,,,,,,,,,
 5 | D,DE,"Carper, Thomas R.",300019,,,,,,,,,,,
 6 | R,FL,"Scott, Rick",,S001217,Republican,S8FL00273,Rick Scott,Q439729,Rick Scott,Rick,,Scott,M,1952-12-01
 7 | D,HI,"Hirono, Mazie K.",412200,,,,,,,,,,,
 8 | R,IN,"Braun, Mike",,B001310,Republican,S8IN00171,Mike Braun,Q42804470,Mike Braun,Mike,,Braun,M,1954-03-24
 9 | D,MA,"Warren, Elizabeth",412542,,,,,,,,,,,
10 | D,MD,"Cardin, Benjamin L.",400064,,,,,,,,,,,
11 | I,ME,"King, Angus S., Jr.",412545,,,,,,,,,,,
12 | D,MI,"Stabenow, Debbie",300093,,,,,,,,,,,
13 | D,MN,"Klobuchar, Amy",412242,,,,,,,,,,,
14 | R,MO,"Hawley, Josh",,H001089,Republican,S8MO00160,Josh Hawley,Q23020745,Josh Hawley,Joshua,,Hawley,M,1979-12-31
15 | R,MS,"Wicker, Roger F.",400432,,,,,,,,,,,
16 | D,MT,"Tester, Jon",412244,,,,,,,,,,,
17 | R,ND,"Cramer, Kevin",412555,,,,,,,,,,,
18 | R,NE,"Fischer, Deb",412556,,,,,,,,,,,
19 | D,NJ,"Menendez, Robert",400272,,,,,,,,,,,
20 | D,NM,"Heinrich, Martin",412281,,,,,,,,,,,
21 | D,NV,"Rosen, Jacky",412715,,,,,,,,,,,
22 | D,NY,"Gillibrand, Kirsten E.",412223,,,,,,,,,,,
23 | D,OH,"Brown, Sherrod",400050,,,,,,,,,,,
24 | D,PA,"Casey, Robert P., Jr.",412246,,,,,,,,,,,
25 | D,RI,"Whitehouse, Sheldon",412247,,,,,,,,,,,
26 | R,TN,"Blackburn, Marsha",400032,,,,,,,,,,,
27 | R,TX,"Cruz, Ted",412573,,,,,,,,,,,
28 | R,UT,"Romney, Mitt",,R000615,Republican,S4MA00143,Mitt Romney,Q4496,Mitt Romney,Mitt,,Romney,M,1947-03-12
29 | D,VA,"Kaine, Tim",412582,,,,,,,,,,,
30 | I,VT,"Sanders, Bernard",400357,,,,,,,,,,,
31 | D,WA,"Cantwell, Maria",300018,,,,,,,,,,,
32 | D,WI,"Baldwin, Tammy",400013,,,,,,,,,,,
33 | D,WV,"Manchin, Joe, III",412391,,,,,,,,,,,
34 | R,WY,"Barrasso, John",412251,,,,,,,,,,,
35 | 
--------------------------------------------------------------------------------
/scripts/archive/election_results_2020.csv:
--------------------------------------------------------------------------------
 1 | Race,Incumbent Win? Y/N,GovTrack ID,Party,Last Name,First Name,Middle Name,Suffix,Gender (M/F),Birthday (YYYY-MM-DD),FEC.gov ID,Wikidata ID,Twitter Handle,Found photo?
2 | AK,Y,412665,R,Sullivan,,,,,,,,, 3 | AL,N,,R,Tuberville,Tommy,Hawley,,M,1954-09-18,S0AL00230,Q7819948,@TTuberville, 4 | AR,Y,412508,R,Cotton,,,,,,,,, 5 | CO,N,,D,Hickenlooper,John,Wright,Jr.,M,1952-02-07,S0CO00575,Q430518,@Hickenlooper, 6 | DE,Y,412390,D,Coons,,,,,,,,, 7 | GA,Run-off election occurs after Jan 3.,,,,,,,,,,,, 8 | IA,Y,412667,R,Ernst,,,,,,,,, 9 | ID,Y,412322,R,Risch,,,,,,,,, 10 | IL,Y,300038,D,Durbin,,,,,,,,, 11 | KS,Y,412704,R,Marshall,,,,,,,,, 12 | KY,Y,300072,R,McConnell,,,,,,,,, 13 | LA,Y,412269,R,Cassidy,,,,,,,,, 14 | MA,Y,400253,D,Markey,,,,,,,,, 15 | ME,Y,300025,R,Collins,,,,,,,,, 16 | MI,Y,412305,D,Peters,,,,,,,,, 17 | MN,Y,412742,D,Smith,,,,,,,,, 18 | MS,Y,412743,R,Hyde-Smith,,,,,,,,, 19 | MT,Y,412549,R,Daines,,,,,,,,, 20 | NC,Y,412668,R,Tillis,,,,,,,,, 21 | NE,Y,412671,R,Sasse,,,,,,,,, 22 | NH,Y,412323,D,Shaheen,,,,,,,,, 23 | NJ,Y,412598,D,Booker,,,,,,,,, 24 | NM,N,412293,D,Luján,Ben Ray,,,M,1972-06-07,S0NM00058,Q324256,@BenRayLujan, 25 | OK,Y,300055,R,Inhofe,,,,,,,,, 26 | OR,Y,412325,D,Merkley,,,,,,,,, 27 | RI,Y,300081,D,Reed,,,,,,,,, 28 | SC,Y,300047,R,Graham,,,,,,,,, 29 | SD,Y,412669,R,Rounds,,,,,,,,, 30 | TN,N,,R,Hagerty,Bill,Francis,IV,M,1959-08-14,S0TN00169,Q27734214,@BillHagertyTN, 31 | TX,Y,300027,R,Cornyn,,,,,,,,, 32 | VA,Y,412321,D,Warner,,,,,,,,, 33 | WV,Y,400061,R,Capito,,,,,,,,, 34 | WY,N,412294,R,Lummis,Cynthia,Marie,,F,1954-09-10,S0WY00137,Q456064,@CynthiaMLummis, 35 | AK0,Y,,,,,,,,,,,, 36 | AL1,N,,R,Carl,Jerry,Lee,Jr.,M,1958-06-17,H0AL01055,Q102277702,@CarlForAlabama, 37 | AL2,N,,R,Moore,Felix,Barry,,M,1966-09-26,H8AL02171,Q63198048,, 38 | AL3,Y,,,,,,,,,,,, 39 | AL4,Y,,,,,,,,,,,, 40 | AL5,Y,,,,,,,,,,,, 41 | AL6,Y,,,,,,,,,,,, 42 | AL7,Y,,,,,,,,,,,, 43 | AR1,Y,,,,,,,,,,,, 44 | AR2,Y,,,,,,,,,,,, 45 | AR3,Y,,,,,,,,,,,, 46 | AR4,Y,,,,,,,,,,,, 47 | AS0,Y,,,,,,,,,,,, 48 | AZ1,Y,,,,,,,,,,,, 49 | AZ2,Y,,,,,,,,,,,, 50 | AZ3,Y,,,,,,,,,,,, 51 | AZ4,Y,,,,,,,,,,,, 52 | AZ5,Y,,,,,,,,,,,, 53 | AZ6,Y,,,,,,,,,,,, 54 | AZ7,Y,,,,,,,,,,,, 55 | AZ8,Y,,,,,,,,,,,, 56 | AZ9,Y,,,,,,,,,,,, 57 | CA1,Y,,,,,,,,,,,, 58 | CA2,Y,,,,,,,,,,,, 59 | CA3,Y,,,,,,,,,,,, 60 | CA4,Y,,,,,,,,,,,, 61 | CA5,Y,,,,,,,,,,,, 62 | CA6,Y,,,,,,,,,,,, 63 | CA7,Y,,,,,,,,,,,, 64 | CA8,N,,R,Obernolte,Jay,Phillip,,M,1970-08-18,H0CA08135,Q16849797,@JayObernolte, 65 | CA9,Y,,,,,,,,,,,, 66 | CA10,Y,,,,,,,,,,,, 67 | CA11,Y,,,,,,,,,,,, 68 | CA12,Y,,,,,,,,,,,, 69 | CA13,Y,,,,,,,,,,,, 70 | CA14,Y,,,,,,,,,,,, 71 | CA15,Y,,,,,,,,,,,, 72 | CA16,Y,,,,,,,,,,,, 73 | CA17,Y,,,,,,,,,,,, 74 | CA18,Y,,,,,,,,,,,, 75 | CA19,Y,,,,,,,,,,,, 76 | CA20,Y,,,,,,,,,,,, 77 | CA21,N,412515,R,Valadao,David,Goncalves,,M,1977-04-14,H2CA20094,Q3528567,DGValadao, 78 | CA22,Y,,,,,,,,,,,, 79 | CA23,Y,,,,,,,,,,,, 80 | CA24,Y,,,,,,,,,,,, 81 | CA25,Y,,,,,,,,,,,, 82 | CA26,Y,,,,,,,,,,,, 83 | CA27,Y,,,,,,,,,,,, 84 | CA28,Y,,,,,,,,,,,, 85 | CA29,Y,,,,,,,,,,,, 86 | CA30,Y,,,,,,,,,,,, 87 | CA31,Y,,,,,,,,,,,, 88 | CA32,Y,,,,,,,,,,,, 89 | CA33,Y,,,,,,,,,,,, 90 | CA34,Y,,,,,,,,,,,, 91 | CA35,Y,,,,,,,,,,,, 92 | CA36,Y,,,,,,,,,,,, 93 | CA37,Y,,,,,,,,,,,, 94 | CA38,Y,,,,,,,,,,,, 95 | CA39,N,,R,Kim,Young,Oak,,F,1962-10-18,H8CA39240,Q19662859,@YoungKimCA, 96 | CA40,Y,,,,,,,,,,,, 97 | CA41,Y,,,,,,,,,,,, 98 | CA42,Y,,,,,,,,,,,, 99 | CA43,Y,,,,,,,,,,,, 100 | CA44,Y,,,,,,,,,,,, 101 | CA45,Y,,,,,,,,,,,, 102 | CA46,Y,,,,,,,,,,,, 103 | CA47,Y,,,,,,,,,,,, 104 | CA48,N,,R,Steel,Michelle,Eunjoo,,F,1955-06-21,H0CA48198,Q6837200,@MichelleSteelCA, 105 | CA49,Y,,,,,,,,,,,, 106 | CA50,N,400196,R,Issa,Darrell,Edward,,M,1953-11-01,H0CA50178,Q1166592,@DarrellIssa, 107 | 
CA51,Y,,,,,,,,,,,, 108 | CA52,Y,,,,,,,,,,,, 109 | CA53,N,,D,Jacobs,Sara,,,F,1989-02-01,H0CA53115,Q50825637,@SaraJacobsCA, 110 | CO1,Y,,,,,,,,,,,, 111 | CO2,Y,,,,,,,,,,,, 112 | CO3,N,,R,Boebert,Lauren,Opal,,F,1986-12-15,H0CO03165,Q96761544,@LaurenBoebert, 113 | CO4,Y,,,,,,,,,,,, 114 | CO5,Y,,,,,,,,,,,, 115 | CO6,Y,,,,,,,,,,,, 116 | CO7,Y,,,,,,,,,,,, 117 | CT1,Y,,,,,,,,,,,, 118 | CT2,Y,,,,,,,,,,,, 119 | CT3,Y,,,,,,,,,,,, 120 | CT4,Y,,,,,,,,,,,, 121 | CT5,Y,,,,,,,,,,,, 122 | DC0,Y,,,,,,,,,,,, 123 | DE0,Y,,,,,,,,,,,, 124 | FL1,Y,,,,,,,,,,,, 125 | FL2,Y,,,,,,,,,,,, 126 | FL3,N,,R,Cammack,Katherine,,,F,1988-02-16,H0FL03175,Q98523243,@Kat_Cammack, 127 | FL4,Y,,,,,,,,,,,, 128 | FL5,Y,,,,,,,,,,,, 129 | FL6,Y,,,,,,,,,,,, 130 | FL7,Y,,,,,,,,,,,, 131 | FL8,Y,,,,,,,,,,,, 132 | FL9,Y,,,,,,,,,,,, 133 | FL10,Y,,,,,,,,,,,, 134 | FL11,Y,,,,,,,,,,,, 135 | FL12,Y,,,,,,,,,,,, 136 | FL13,Y,,,,,,,,,,,, 137 | FL14,Y,,,,,,,,,,,, 138 | FL15,N,,R,Franklin,Scott,,,M,1964-08-23,H0FL15104,Q101198561,@ScottFranklinFL, 139 | FL16,Y,,,,,,,,,,,, 140 | FL17,Y,,,,,,,,,,,, 141 | FL18,Y,,,,,,,,,,,, 142 | FL19,N,,R,Donalds,Byron,Lowell,,M,1972-10-28,H0FL19205,Q59726216,@ByronDonalds, 143 | FL20,Y,,,,,,,,,,,, 144 | FL21,Y,,,,,,,,,,,, 145 | FL22,Y,,,,,,,,,,,, 146 | FL23,Y,,,,,,,,,,,, 147 | FL24,Y,,,,,,,,,,,, 148 | FL25,Y,,,,,,,,,,,, 149 | FL26,N,,R,Giménez,Carlos,A.,,M,1954-01-17,H0FL26036,Q5041653,currently @MayorGimenez, 150 | FL27,N,,R,Salazar,Maria,Elvira,,M,1961-11-01,H8FL27185,Q6003715,@MaElviraSalazar, 151 | GA1,Y,,,,,,,,,,,, 152 | GA2,Y,,,,,,,,,,,, 153 | GA3,Y,,,,,,,,,,,, 154 | GA4,Y,,,,,,,,,,,, 155 | GA5,N,,D,Williams,Nikema,Natassha,,F,1978-07-30,H0GA05301,Q56486570,@NikemaWilliams, 156 | GA6,Y,,,,,,,,,,,, 157 | GA7,N,,D,Bourdeaux,Carolyn,,,F,1970-06-03,H8GA07201,Q58333638,@Carolyn4GA7, 158 | GA8,Y,,,,,,,,,,,, 159 | GA9,N,,R,Clyde,Andrew,,,M,1963-11-22,H0GA09246,Q102277679,, 160 | GA10,Y,,,,,,,,,,,, 161 | GA11,Y,,,,,,,,,,,, 162 | GA12,Y,,,,,,,,,,,, 163 | GA13,Y,,,,,,,,,,,, 164 | GA14,N,,R,Greene,Marjorie,Taylor,,F,1974-05-27,H0GA06192,Q98380406,@MTGreenee, 165 | GU0,Y,,,,,,,,,,,, 166 | HI1,Y,,,,,,,,,,,, 167 | HI2,N,,D,Kahele,Kaialiʻi,,,M,1974-03-28,H0HI02155,Q28861508,@KaiKahele, 168 | IA1,N,,R,Hinson,Ashley,,,F,1983-06-27,H0IA01174,Q60713905,@HinsonAshley, 169 | IA2,N,,R,Miller-Meeks,Mariannette,Jane,,F,1955-09-06,H8IA02043,Q58495662,@MillerMeeks, 170 | IA3,Y,,,,,,,,,,,, 171 | IA4,N,,R,Feenstra,Randy,L.,,M,1969-01-14,H0IA04145,Q7292187,@RandyFeenstra, 172 | ID1,Y,,,,,,,,,,,, 173 | ID2,Y,,,,,,,,,,,, 174 | IL1,Y,,,,,,,,,,,, 175 | IL2,Y,,,,,,,,,,,, 176 | IL3,N,,D,Newman,Marie,,,F,1964-04-13,H8IL03102,Q47960940,currently @Marie4Congress, 177 | IL4,Y,,,,,,,,,,,, 178 | IL5,Y,,,,,,,,,,,, 179 | IL6,Y,,,,,,,,,,,, 180 | IL7,Y,,,,,,,,,,,, 181 | IL8,Y,,,,,,,,,,,, 182 | IL9,Y,,,,,,,,,,,, 183 | IL10,Y,,,,,,,,,,,, 184 | IL11,Y,,,,,,,,,,,, 185 | IL12,Y,,,,,,,,,,,, 186 | IL13,Y,,,,,,,,,,,, 187 | IL14,Y,,,,,,,,,,,, 188 | IL15,N,,R,Miller,Mary,,,F,1959-08-27,H0IL15129,Q101204553,@Miller_Congress, 189 | IL16,Y,,,,,,,,,,,, 190 | IL17,Y,,,,,,,,,,,, 191 | IL18,Y,,,,,,,,,,,, 192 | IN1,N,,D,Mrvan,Frank,John,,M,1969-04-16,H0IN01150,Q96077897,currently @Mrvan4Congress, 193 | IN2,Y,,,,,,,,,,,, 194 | IN3,Y,,,,,,,,,,,, 195 | IN4,Y,,,,,,,,,,,, 196 | IN5,N,,R,Spartz,Victoria,,,F,1978-10-06,H0IN05326,Q44059867,@Victoria_Spartz, 197 | IN6,Y,,,,,,,,,,,, 198 | IN7,Y,,,,,,,,,,,, 199 | IN8,Y,,,,,,,,,,,, 200 | IN9,Y,,,,,,,,,,,, 201 | KS1,N,,R,Mann,Tracey,Robert,,M,1976-12-17,H0KS01123,Q48767554,@TraceyMannKS, 202 | 
KS2,N,,R,LaTurner,Jacob,,,M,1988-02-17,H0KS02188,Q16731273,@JakeLaTurner, 203 | KS3,Y,,,,,,,,,,,, 204 | KS4,Y,,,,,,,,,,,, 205 | KY1,Y,,,,,,,,,,,, 206 | KY2,Y,,,,,,,,,,,, 207 | KY3,Y,,,,,,,,,,,, 208 | KY4,Y,,,,,,,,,,,, 209 | KY5,Y,,,,,,,,,,,, 210 | KY6,Y,,,,,,,,,,,, 211 | LA1,Y,,,,,,,,,,,, 212 | LA2,Y,,,,,,,,,,,, 213 | LA3,Y,,,,,,,,,,,, 214 | LA4,Y,,,,,,,,,,,, 215 | LA5,Rep.-elect Luke Letlow has died of COVID-19 on December 29,,,,,,,,,,,, 216 | LA6,Y,,,,,,,,,,,, 217 | MA1,Y,,,,,,,,,,,, 218 | MA2,Y,,,,,,,,,,,, 219 | MA3,Y,,,,,,,,,,,, 220 | MA4,N,,D,Auchincloss,Jake,Daniel,,M,1988-01-29,H0MA04192,Q101196632,@JakeAuch, 221 | MA5,Y,,,,,,,,,,,, 222 | MA6,Y,,,,,,,,,,,, 223 | MA7,Y,,,,,,,,,,,, 224 | MA8,Y,,,,,,,,,,,, 225 | MA9,Y,,,,,,,,,,,, 226 | MD1,Y,,,,,,,,,,,, 227 | MD2,Y,,,,,,,,,,,, 228 | MD3,Y,,,,,,,,,,,, 229 | MD4,Y,,,,,,,,,,,, 230 | MD5,Y,,,,,,,,,,,, 231 | MD6,Y,,,,,,,,,,,, 232 | MD7,Y,,,,,,,,,,,, 233 | MD8,Y,,,,,,,,,,,, 234 | ME1,Y,,,,,,,,,,,, 235 | ME2,Y,,,,,,,,,,,, 236 | MI1,Y,,,,,,,,,,,, 237 | MI2,Y,,,,,,,,,,,, 238 | MI3,N,,R,Meijer,Peter,James,,M,1988-01-10,H0MI03308,Q96419165,currently @VoteMeijer, 239 | MI4,Y,,,,,,,,,,,, 240 | MI5,Y,,,,,,,,,,,, 241 | MI6,Y,,,,,,,,,,,, 242 | MI7,Y,,,,,,,,,,,, 243 | MI8,Y,,,,,,,,,,,, 244 | MI9,Y,,,,,,,,,,,, 245 | MI10,N,,R,McClain,Lisa,,,F,1966-04-07,H0MI10287,Q102184540,currently @LisaForCongress, 246 | MI11,Y,,,,,,,,,,,, 247 | MI12,Y,,,,,,,,,,,, 248 | MI13,Y,,,,,,,,,,,, 249 | MI14,Y,,,,,,,,,,,, 250 | MN1,Y,,,,,,,,,,,, 251 | MN2,Y,,,,,,,,,,,, 252 | MN3,Y,,,,,,,,,,,, 253 | MN4,Y,,,,,,,,,,,, 254 | MN5,Y,,,,,,,,,,,, 255 | MN6,Y,,,,,,,,,,,, 256 | MN7,N,,R,Fischbach,Michelle,Louise Helene,,F,1965-11-03,H0MN07091,Q6837025,@FischbachMN7, 257 | MN8,Y,,,,,,,,,,,, 258 | MO1,N,,D,Bush,Cori,,,F,1976-07-21,H8MO01143,Q98084800,@CoriBush, 259 | MO2,Y,,,,,,,,,,,, 260 | MO3,Y,,,,,,,,,,,, 261 | MO4,Y,,,,,,,,,,,, 262 | MO5,Y,,,,,,,,,,,, 263 | MO6,Y,,,,,,,,,,,, 264 | MO7,Y,,,,,,,,,,,, 265 | MO8,Y,,,,,,,,,,,, 266 | MP0,Y,,,,,,,,,,,, 267 | MS1,Y,,,,,,,,,,,, 268 | MS2,Y,,,,,,,,,,,, 269 | MS3,Y,,,,,,,,,,,, 270 | MS4,Y,,,,,,,,,,,, 271 | MT0,N,,R,Rosendale,Matthew,Martin,Sr.,M,1960-07-07,H4MT00050,Q6791163,currently @MattForMontana, 272 | NC1,Y,,,,,,,,,,,, 273 | NC2,N,,D,Ross,Deborah,Koff,,F,1963-06-20,H0NC02125,Q5248285,@DeborahRossNC, 274 | NC3,Y,,,,,,,,,,,, 275 | NC4,Y,,,,,,,,,,,, 276 | NC5,Y,,,,,,,,,,,, 277 | NC6,N,,D,Manning,Kathy,Ellen,,F,1956-12-03,H8NC13067,Q101136890,@KathyManningNC, 278 | NC7,Y,,,,,,,,,,,, 279 | NC8,Y,,,,,,,,,,,, 280 | NC9,Y,,,,,,,,,,,, 281 | NC10,Y,,,,,,,,,,,, 282 | NC11,N,,R,Cawthorn,David,Madison,,M,1995-08-01,H0NC11233,Q96633736,currently @CawthornforNC, 283 | NC12,Y,,,,,,,,,,,, 284 | NC13,Y,,,,,,,,,,,, 285 | ND0,Y,,,,,,,,,,,, 286 | NE1,Y,,,,,,,,,,,, 287 | NE2,Y,,,,,,,,,,,, 288 | NE3,Y,,,,,,,,,,,, 289 | NH1,Y,,,,,,,,,,,, 290 | NH2,Y,,,,,,,,,,,, 291 | NJ1,Y,,,,,,,,,,,, 292 | NJ2,Y,,,,,,,,,,,, 293 | NJ3,Y,,,,,,,,,,,, 294 | NJ4,Y,,,,,,,,,,,, 295 | NJ5,Y,,,,,,,,,,,, 296 | NJ6,Y,,,,,,,,,,,, 297 | NJ7,Y,,,,,,,,,,,, 298 | NJ8,Y,,,,,,,,,,,, 299 | NJ9,Y,,,,,,,,,,,, 300 | NJ10,Y,,,,,,,,,,,, 301 | NJ11,Y,,,,,,,,,,,, 302 | NJ12,Y,,,,,,,,,,,, 303 | NM1,Y,,,,,,,,,,,, 304 | NM2,N,,R,Herrell,Stella,Yvette,,F,1964-03-16,H8NM02156,Q16225780,currently @Yvette4Congress, 305 | NM3,N,,D,Fernandez,Teresa,Leger,,F,1959-07-01,S0NJ00258,Q96054905,currently @TeresaForNM, 306 | NV1,Y,,,,,,,,,,,, 307 | NV2,Y,,,,,,,,,,,, 308 | NV3,Y,,,,,,,,,,,, 309 | NV4,Y,,,,,,,,,,,, 310 | NY1,Y,,,,,,,,,,,, 311 | NY2,N,,R,Garbarino,Andrew,,,M,1984-09-27,H0NY02234,Q21257859,currently 
@GarbarinoforNY, 312 | NY3,Y,,,,,,,,,,,, 313 | NY4,Y,,,,,,,,,,,, 314 | NY5,Y,,,,,,,,,,,, 315 | NY6,Y,,,,,,,,,,,, 316 | NY7,Y,,,,,,,,,,,, 317 | NY8,Y,,,,,,,,,,,, 318 | NY9,Y,,,,,,,,,,,, 319 | NY10,Y,,,,,,,,,,,, 320 | NY11,N,,R,Malliotakis,Nicole,,,F,1980-11-11,H0NY11078,Q7030112,@NMalliotakis, 321 | NY12,Y,,,,,,,,,,,, 322 | NY13,Y,,,,,,,,,,,, 323 | NY14,Y,,,,,,,,,,,, 324 | NY15,N,,D,Torres,Ritchie,John,,M,1988-03-12,H0NY15160,Q16205227,@RitchieTorres, 325 | NY16,N,,D,Bowman,Jamaal,,,M,1976-04-01,H0NY16143,Q96419280,@JamaalBowmanNY, 326 | NY17,N,,D,Jones,Mondaire,,,M,1987-05-18,H0NY17174,Q96781248,@MondaireJones, 327 | NY18,Y,,,,,,,,,,,, 328 | NY19,Y,,,,,,,,,,,, 329 | NY20,Y,,,,,,,,,,,, 330 | NY21,Y,,,,,,,,,,,, 331 | NY22,Race too close to call?,,,,,,,,,,,, 332 | NY23,Y,,,,,,,,,,,, 333 | NY24,Y,,,,,,,,,,,, 334 | NY25,Y,,,,,,,,,,,, 335 | NY26,Y,,,,,,,,,,,, 336 | NY27,Y,,,,,,,,,,,, 337 | OH1,Y,,,,,,,,,,,, 338 | OH2,Y,,,,,,,,,,,, 339 | OH3,Y,,,,,,,,,,,, 340 | OH4,Y,,,,,,,,,,,, 341 | OH5,Y,,,,,,,,,,,, 342 | OH6,Y,,,,,,,,,,,, 343 | OH7,Y,,,,,,,,,,,, 344 | OH8,Y,,,,,,,,,,,, 345 | OH9,Y,,,,,,,,,,,, 346 | OH10,Y,,,,,,,,,,,, 347 | OH11,Y,,,,,,,,,,,, 348 | OH12,Y,,,,,,,,,,,, 349 | OH13,Y,,,,,,,,,,,, 350 | OH14,Y,,,,,,,,,,,, 351 | OH15,Y,,,,,,,,,,,, 352 | OH16,Y,,,,,,,,,,,, 353 | OK1,Y,,,,,,,,,,,, 354 | OK2,Y,,,,,,,,,,,, 355 | OK3,Y,,,,,,,,,,,, 356 | OK4,Y,,,,,,,,,,,, 357 | OK5,N,,R,Bice,Stephanie,,,F,1973-11-11,H0OK05205,Q60190894,@StephanieBice, 358 | OR1,Y,,,,,,,,,,,, 359 | OR2,N,,R,Bentz,Cliff,Stewart,,M,1952-01-12,H0OR02127,Q5132536,@CliffBentz, 360 | OR3,Y,,,,,,,,,,,, 361 | OR4,Y,,,,,,,,,,,, 362 | OR5,Y,,,,,,,,,,,, 363 | PA1,Y,,,,,,,,,,,, 364 | PA2,Y,,,,,,,,,,,, 365 | PA3,Y,,,,,,,,,,,, 366 | PA4,Y,,,,,,,,,,,, 367 | PA5,Y,,,,,,,,,,,, 368 | PA6,Y,,,,,,,,,,,, 369 | PA7,Y,,,,,,,,,,,, 370 | PA8,Y,,,,,,,,,,,, 371 | PA9,Y,,,,,,,,,,,, 372 | PA10,Y,,,,,,,,,,,, 373 | PA11,Y,,,,,,,,,,,, 374 | PA12,Y,,,,,,,,,,,, 375 | PA13,Y,,,,,,,,,,,, 376 | PA14,Y,,,,,,,,,,,, 377 | PA15,Y,,,,,,,,,,,, 378 | PA16,Y,,,,,,,,,,,, 379 | PA17,Y,,,,,,,,,,,, 380 | PA18,Y,,,,,,,,,,,, 381 | PR0,Y,,,,,,,,,,,, 382 | RI1,Y,,,,,,,,,,,, 383 | RI2,Y,,,,,,,,,,,, 384 | SC1,N,,R,Mace,Nancy,Ruth,,F,1977-12-04,H0SC01394,Q6962831,@NancyMace, 385 | SC2,Y,,,,,,,,,,,, 386 | SC3,Y,,,,,,,,,,,, 387 | SC4,Y,,,,,,,,,,,, 388 | SC5,Y,,,,,,,,,,,, 389 | SC6,Y,,,,,,,,,,,, 390 | SC7,Y,,,,,,,,,,,, 391 | SD0,Y,,,,,,,,,,,, 392 | TN1,N,,R,Harshbarger,Diana,,,F,1960-01-01,H0TN01118,Q101197341,@DHarshbargerTN1, 393 | TN2,Y,,,,,,,,,,,, 394 | TN3,Y,,,,,,,,,,,, 395 | TN4,Y,,,,,,,,,,,, 396 | TN5,Y,,,,,,,,,,,, 397 | TN6,Y,,,,,,,,,,,, 398 | TN7,Y,,,,,,,,,,,, 399 | TN8,Y,,,,,,,,,,,, 400 | TN9,Y,,,,,,,,,,,, 401 | TX1,Y,,,,,,,,,,,, 402 | TX2,Y,,,,,,,,,,,, 403 | TX3,Y,,,,,,,,,,,, 404 | TX4,N,,R,Fallon,Patrick,Edward,,M,1967-12-19,H0TX04219,Q16196923,currently @FallonForTexas, 405 | TX5,Y,,,,,,,,,,,, 406 | TX6,Y,,,,,,,,,,,, 407 | TX7,Y,,,,,,,,,,,, 408 | TX8,Y,,,,,,,,,,,, 409 | TX9,Y,,,,,,,,,,,, 410 | TX10,Y,,,,,,,,,,,, 411 | TX11,N,,R,Pfluger,August,Lee,II,M,1978-12-28,H0TX11230,Q101196462,@AugustPfluger, 412 | TX12,Y,,,,,,,,,,,, 413 | TX13,N,,R,Jackson,Ronny,Lynn,,M,1967-05-04,H0TX13228,Q47270118,currently @RonnyJackson4TX, 414 | TX14,Y,,,,,,,,,,,, 415 | TX15,Y,,,,,,,,,,,, 416 | TX16,Y,,,,,,,,,,,, 417 | TX17,N,400367,R,Sessions,Pete,,,,,,,, 418 | TX18,Y,,,,,,,,,,,, 419 | TX19,Y,,,,,,,,,,,, 420 | TX20,Y,,,,,,,,,,,, 421 | TX21,Y,,,,,,,,,,,, 422 | TX22,N,,R,Nehls,Troy,E.,,M,1968-04-07,H0TX22302,Q96741441,@SheriffTNehls, 423 | 
TX23,N,,R,Gonzales,Ernest,Anthony,II,M,1980-10-10,H0TX35015,,currently @TonyGonzales4TX,
424 | TX24,N,,R,Van Duyne,Beth,Ann,,F,1970-11-16,H0TX24209,Q66309702,@BethVanDuyne,
425 | TX25,Y,,,,,,,,,,,,
426 | TX26,Y,,,,,,,,,,,,
427 | TX27,Y,,,,,,,,,,,,
428 | TX28,Y,,,,,,,,,,,,
429 | TX29,Y,,,,,,,,,,,,
430 | TX30,Y,,,,,,,,,,,,
431 | TX31,Y,,,,,,,,,,,,
432 | TX32,Y,,,,,,,,,,,,
433 | TX33,Y,,,,,,,,,,,,
434 | TX34,Y,,,,,,,,,,,,
435 | TX35,Y,,,,,,,,,,,,
436 | TX36,Y,,,,,,,,,,,,
437 | UT1,N,,R,Moore,Blake,David,,M,1980-06-22,H0UT01205,Q101196971,currently @ElectBlakeMoore,https://en.wikipedia.org/wiki/Blake_Moore#/media/File:Blake_Moore_117th_U.S_Congress.jpg
438 | UT2,Y,,,,,,,,,,,,
439 | UT3,Y,,,,,,,,,,,,
440 | UT4,N,,R,Owens,Clarence,Burgess,,M,1951-08-02,H0UT04076,Q4998602,@BurgessOwens,
441 | VA1,Y,,,,,,,,,,,,
442 | VA2,Y,,,,,,,,,,,,
443 | VA3,Y,,,,,,,,,,,,
444 | VA4,Y,,,,,,,,,,,,
445 | VA5,N,,R,Good,Robert,G.,,M,1965-09-11,H0VA05160,Q103850475,currently @GoodForCongress,
446 | VA6,Y,,,,,,,,,,,,
447 | VA7,Y,,,,,,,,,,,,
448 | VA8,Y,,,,,,,,,,,,
449 | VA9,Y,,,,,,,,,,,,
450 | VA10,Y,,,,,,,,,,,,
451 | VA11,Y,,,,,,,,,,,,
452 | VI0,Y,,,,,,,,,,,,
453 | VT0,Y,,,,,,,,,,,,
454 | WA1,Y,,,,,,,,,,,,
455 | WA2,Y,,,,,,,,,,,,
456 | WA3,Y,,,,,,,,,,,,
457 | WA4,Y,,,,,,,,,,,,
458 | WA5,Y,,,,,,,,,,,,
459 | WA6,Y,,,,,,,,,,,,
460 | WA7,Y,,,,,,,,,,,,
461 | WA8,Y,,,,,,,,,,,,
462 | WA9,Y,,,,,,,,,,,,
463 | WA10,N,,D,Strickland,Marilyn,,,F,1962-09-25,H0WA10034,Q1898180,currently @StricklandForWA,
464 | WI1,Y,,,,,,,,,,,,
465 | WI2,Y,,,,,,,,,,,,
466 | WI3,Y,,,,,,,,,,,,
467 | WI4,Y,,,,,,,,,,,,
468 | WI5,N,,R,Fitzgerald,Scott,L.,,M,1963-11-16,H0WI05113,Q7436650,currently @SenFitzgerald,
469 | WI6,Y,,,,,,,,,,,,
470 | WI7,Y,,,,,,,,,,,,
471 | WI8,Y,,,,,,,,,,,,
472 | WV1,Y,,,,,,,,,,,,
473 | WV2,Y,,,,,,,,,,,,
474 | WV3,Y,,,,,,,,,,,,
475 | WY0,Y,,,,,,,,,,,,
--------------------------------------------------------------------------------
/scripts/archive/election_results_house_2016.py:
--------------------------------------------------------------------------------
 1 | import collections, requests, lxml.etree
 2 | from utils import load_data, save_data
 3 | 
 4 | try:
 5 |     from yaml import CLoader
 6 |     assert CLoader #silence pyflakes
 7 | except ImportError:
 8 |     print("Warning: libyaml not found, loading will be slow...")
 9 | 
10 | # # Open existing data.
11 | historical = load_data("legislators-historical.yaml")
12 | current = load_data("legislators-current.yaml")
13 | 
14 | # # Map bioguide IDs to records.
15 | bioguide = { }
16 | for entry in historical + current:
17 |     bioguide[entry['id']['bioguide']] = entry
18 | 
19 | # # Get highest existing GovTrack ID.
20 | govtrack_id = max(p['id']['govtrack'] for p in historical+current)
21 | 
22 | # load members-elect
23 | xml = requests.get("http://clerk.house.gov/member_info/unofficial-115-member-elect-data.xml")
24 | root=lxml.etree.fromstring(xml.content)
25 | 
26 | elected = []
27 | for xml_member in root.findall('./members/member'):
28 |     mi = xml_member.find("member-info")
29 |     bioguide_id = mi.find("bioguideID").text
30 | 
31 |     #print("bioguide_id is {} for {}".format(bioguide_id, xml_member.find("statedistrict").text))
32 |     if bioguide_id is None:
33 |         print("WARN: no member found for {}".format(xml_member.find("statedistrict").text))
34 |         continue
35 | 
36 |     if bioguide_id in bioguide:
37 |         # Incumbent won or current representative has become a senator
38 |         # or historical member is returning to office.
39 | p = bioguide[bioguide_id] 40 | party = p['terms'][-1]['party'] 41 | 42 | else: 43 | # Make a new entry. 44 | govtrack_id += 1 45 | p = collections.OrderedDict([ 46 | ("id", collections.OrderedDict([ 47 | ("bioguide", bioguide_id), 48 | #("fec", [row['fec']]), 49 | ("govtrack", govtrack_id), 50 | #("opensecrets", None), # don't know yet 51 | #("votesmart", int(row['votesmart'])), 52 | #("wikipedia", row['wikipedia']), 53 | #("ballotpedia", row['ballotpedia']), 54 | ])), 55 | ("name", collections.OrderedDict([ 56 | ("first", mi.find('firstname').text), 57 | ("last", mi.find('lastname').text), 58 | #("official_full", mi.find('official_full').text), #not available yet 59 | ])), 60 | ("bio", collections.OrderedDict([ 61 | ("gender", "M" if mi.find('courtesy').text == "Mr." else "F"), 62 | #("birthday", row['birthday']), 63 | ])), 64 | ("terms", []), 65 | ]) 66 | 67 | party_char = mi.find('party').text 68 | party = 'Republican' if party_char == 'R' else 'Democrat' # valid? 69 | caucus_char = mi.find('caucus').text 70 | caucus = 'Republican' if caucus_char == 'R' else 'Democrat' # valid? 71 | 72 | district = int(xml_member.find("statedistrict").text[2:]) 73 | # Add a new term. 74 | p['terms'].append(collections.OrderedDict([ 75 | ("type", "rep"), 76 | ("start", "2017-01-03"), 77 | ("end", "2019-01-03"), 78 | ("state", mi.find('state').get('postal-code')), 79 | ("district", district), 80 | ("party", party), 81 | ("phone", mi.find("phone").text), 82 | ])) 83 | 84 | if caucus != party: 85 | p['terms'][-1]['caucus'] = caucus 86 | 87 | if len(p['terms']) > 1: 88 | # This is an incumbent. Copy some fields forward. 89 | for k in ('url', 'rss_url'): 90 | if k in p['terms'][-2]: 91 | p['terms'][-1][k] = p['terms'][-2][k] 92 | 93 | # Add to array. 94 | elected.append(p) 95 | 96 | # Move losers to the historical file. 97 | for p in list(current): 98 | if p['terms'][-1]['type'] == 'rep' and p not in elected: 99 | #print("moving {} {} {} to historical".format(p['id']['bioguide'], p['name']['first'], p['name']['last'])) 100 | current.remove(p) 101 | historical.append(p) 102 | 103 | # If they have any current leadership roles, end it. 104 | for r in p.get('leadership_roles', []): 105 | if not r.get('end'): 106 | r['end'] = "2017-01-03" 107 | 108 | # Move returning members to the current file 109 | for p in elected: 110 | if p in historical: 111 | historical.remove(p) 112 | current.append(p) 113 | 114 | # Add new members to the current file, after the returning members. 115 | for p in elected: 116 | if p not in current: 117 | current.append(p) 118 | 119 | # Save. 
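# (Hedged recap of the moves above, before both files are saved below: a
# departing member is any sitting rep whose dict was neither reused nor
# created while reading the members-elect XML, i.e.
#     [p for p in current if p['terms'][-1]['type'] == 'rep' and p not in elected]
# and each such record was moved from `current` to `historical`.)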
120 | save_data(current, "legislators-current.yaml") 121 | save_data(historical, "legislators-historical.yaml") 122 | -------------------------------------------------------------------------------- /scripts/archive/election_results_senate_2016.csv: -------------------------------------------------------------------------------- 1 | chamber,class,new,state,full name,party,bioguide,fec,votesmart,wikipedia,ballotpedia,first,middle,last,nickname,gender,birthday 2 | Senate,3,,MO,Roy Blunt,,B000575,,,,,,,,,, 3 | Senate,3,,NC,Richard Burr,,B001135,,,,,,,,,, 4 | Senate,3,,AR,John Boozman,,B001236,,,,,,,,,, 5 | Senate,3,,CO,Michael Bennet,,B001267,,,,,,,,,, 6 | Senate,3,,CT,Richard Blumenthal,,B001277,,,,,,,,,, 7 | Senate,3,,ID,Mike Crapo,,C000880,,,,,,,,,, 8 | Senate,3,Y,IL,Tammy Duckworth,,D000622,,,,,,,,,, 9 | Senate,3,,IA,Chuck Grassley,,G000386,,,,,,,,,, 10 | Senate,3,,ND,John Hoeven,,H001061,,,,,,,,,, 11 | Senate,3,,GA,Johnny Isakson,,I000055,,,,,,,,,, 12 | Senate,3,,WI,Ron Johnson,,J000293,,,,,,,,,, 13 | Senate,3,,VT,Patrick Leahy,,L000174,,,,,,,,,, 14 | Senate,3,,OK,James Lankford,,L000575,,,,,,,,,, 15 | Senate,3,,UT,Mike Lee,,L000577,,,,,,,,,, 16 | Senate,3,,AZ,John McCain,,M000303,,,,,,,,,, 17 | Senate,3,,KS,Jerry Moran,,M000934,,,,,,,,,, 18 | Senate,3,,WA,Patty Murray,,M001111,,,,,,,,,, 19 | Senate,3,,AK,Lisa Murkowski,,M001153,,,,,,,,,, 20 | Senate,3,,OH,Rob Portman,,P000449,,,,,,,,,, 21 | Senate,3,,KY,Rand Paul,,P000603,,,,,,,,,, 22 | Senate,3,,FL,Marco Rubio,,R000595,,,,,,,,,, 23 | Senate,3,,NY,Chuck Schumer,,S000148,,,,,,,,,, 24 | Senate,3,,AL,Richard Shelby,,S000320,,,,,,,,,, 25 | Senate,3,,SC,Tim Scott,,S001184,,,,,,,,,, 26 | Senate,3,,HI,Brian Schatz,,S001194,,,,,,,,,, 27 | Senate,3,,SD,John Thune,,T000250,,,,,,,,,, 28 | Senate,3,,PA,Pat Toomey,,T000461,,,,,,,,,, 29 | Senate,3,Y,MD,Chris Van Hollen,,V000128,,,,,,,,,, 30 | Senate,3,,OR,Ron Wyden,,W000779,,,,,,,,,, 31 | Senate,3,Y,IN,Todd Young,,Y000064,,,,,,,,,, 32 | Senate,3,Y,CA,Kamala Harris,Democrat,H001075,S6CA00584,120012,Kamala Harris,Kamala Harris,Kamala,,Harris,,F,1964-10-20 33 | Senate,3,Y,LA,John Neely Kennedy,Republican,K000393,S4LA00065,35496,John Neely Kennedy,John Neely Kennedy,John,Neely,Kennedy,,M,1951-11-21 34 | Senate,3,Y,NH,Maggie Hassan,Democrat,H001076,S6NH00091,42552,Maggie Hassan,Maggie Hassan,Margaret,Wood,Hassan,Maggie,F,1958-02-27 35 | Senate,3,Y,NV,Catherine Cortez Masto,Democrat,C001113,S6NV00200,69579,Catherine Cortez Masto,Catherine Cortez Masto,Catherine,,Cortez Masto,,F,1964-03-29 36 | -------------------------------------------------------------------------------- /scripts/archive/election_results_senate_2016.py: -------------------------------------------------------------------------------- 1 | import csv, collections 2 | from utils import load_data, save_data 3 | 4 | # Open existing data. 5 | historical = load_data("legislators-historical.yaml") 6 | current = load_data("legislators-current.yaml") 7 | 8 | # Map bioguide IDs to records. 9 | bioguide = { } 10 | for entry in historical + current: 11 | bioguide[entry['id']['bioguide']] = entry 12 | 13 | # Get highest existing GovTrack ID. 14 | govtrack_id = max(p['id']['govtrack'] for p in historical+current) 15 | 16 | # Process election results. 17 | elected = [] 18 | for row in csv.DictReader(open("election_results_senate_2016.csv")): 19 | if row['bioguide'] in bioguide: 20 | # Incumbent won or current representative has become a senator 21 | # or historical member is returning to office. 
22 | p = bioguide[row['bioguide']] 23 | party = p['terms'][-1]['party'] 24 | 25 | else: 26 | # Make a new entry. 27 | govtrack_id += 1 28 | p = collections.OrderedDict([ 29 | ("id", collections.OrderedDict([ 30 | ("bioguide", row['bioguide']), 31 | ("fec", [row['fec']]), 32 | ("govtrack", govtrack_id), 33 | #("opensecrets", None), # don't know yet 34 | ("votesmart", int(row['votesmart'])), 35 | ("wikipedia", row['wikipedia']), 36 | ("ballotpedia", row['ballotpedia']), 37 | ])), 38 | ("name", collections.OrderedDict([ 39 | (k, row[k]) for k in ("first", "middle", "nickname", "last") if row[k] 40 | ])), 41 | ("bio", collections.OrderedDict([ 42 | ("gender", row['gender']), 43 | ("birthday", row['birthday']), 44 | ])), 45 | ("terms", []), 46 | ]) 47 | 48 | # Add a new term. 49 | p['terms'].append(collections.OrderedDict([ 50 | ("type", "sen"), 51 | ("start", "2017-01-03"), 52 | ("end", "2023-01-03"), 53 | ("state", row['state']), 54 | ("class", 3), 55 | ])) 56 | 57 | if row['new'] == "Y": 58 | # Not an incumbent. Therefore this person becomes 59 | # the junior senator and the other (non-class-3) 60 | # senator becomes the senior senator. 61 | p['terms'][-1]['state_rank'] = "junior" 62 | p['terms'][-1]['party'] = row['party'] or p['terms'][-2]['party'] # as listed in the CSV, or from their previous term if previously served 63 | for p1 in current: 64 | if p1['terms'][-1]['type'] == 'sen' and p1['terms'][-1]['state'] == row['state'] and p1['terms'][-1]['class'] != 3: 65 | p1['terms'][-1]['state_rank'] = "senior" 66 | break 67 | else: 68 | # This is an incumbent. Copy some fields forward. 69 | for k in ('state_rank', 'party', 'caucus', 'url', 'rss_url'): 70 | if k in p['terms'][-2]: 71 | p['terms'][-1][k] = p['terms'][-2][k] 72 | 73 | # Add to array. 74 | elected.append(p) 75 | 76 | # Move losers to the historical file. 77 | for p in current: 78 | if p['terms'][-1]['type'] == 'sen' and p['terms'][-1]['class'] == 3 \ 79 | and p not in elected: 80 | current.remove(p) 81 | historical.append(p) 82 | 83 | # If they have any current leadership roles, end it. 84 | for r in p.get('leadership_roles', []): 85 | if not r.get('end'): 86 | r['end'] = "2017-01-03" 87 | 88 | # Move returning members to the current file -- actually there are no 89 | # cases of this. All of the existing non-incumbents are current reps 90 | # who became senators. 91 | for p in elected: 92 | if p in historical: 93 | historical.remove(p) 94 | current.append(p) 95 | 96 | # Add new members to the current file, after the returning members. 97 | for p in elected: 98 | if p not in current: 99 | current.append(p) 100 | 101 | # Save. 102 | save_data(historical, "legislators-historical.yaml") 103 | save_data(current, "legislators-current.yaml") 104 | -------------------------------------------------------------------------------- /scripts/archive/everypolitician.py: -------------------------------------------------------------------------------- 1 | # Converts our data into CSV files for everypolitician.org, 2 | # one file for the House and one file for the Senate. 3 | # 4 | # Usage: 5 | # python everypolitician.py outputbasename/ 6 | # 7 | # Which will write: 8 | # outputbasename/house.csv 9 | # outputbasename/senate.csv 10 | 11 | import sys, csv 12 | 13 | from utils import yaml_load, CURRENT_CONGRESS, states 14 | 15 | def run(): 16 | if len(sys.argv) < 2: 17 | print("Usage: python everypolitician.py outputbasename/") 18 | sys.exit(0) 19 | 20 | # Load current legislators. 
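# A hedged usage sketch of the invocation described in the header comment:
#
#     $ python everypolitician.py out/
#     # writes out/house.csv and out/senate.csv, one row per current member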
21 | data = yaml_load("../legislators-current.yaml") 22 | data_social_media = { } 23 | for legislator in yaml_load("../legislators-social-media.yaml"): 24 | data_social_media[legislator['id']['bioguide']] = legislator 25 | 26 | # Create output files. 27 | writers = { 28 | "rep": csv.writer(open(sys.argv[1] + "house.csv", "w")), 29 | "sen": csv.writer(open(sys.argv[1] + "senate.csv", "w")), 30 | } 31 | for w in writers.values(): 32 | w.writerow([ 33 | "id", 34 | "name", 35 | "area", 36 | "group", 37 | "term", 38 | "start_date", 39 | "end_date", 40 | "given_name", 41 | "family_name", 42 | "honorific_suffix", 43 | "sort_name", 44 | "phone", 45 | "gender", 46 | "birth_date", 47 | "image", 48 | "twitter", 49 | "facebook", 50 | "instagram", 51 | "wikipedia", 52 | "website", 53 | ]) 54 | 55 | # Write out one row per legislator for their current term. 56 | for legislator in data: 57 | term = legislator['terms'][-1] 58 | 59 | # TODO: "If someone changed party/faction affilation in the middle of the term, you should include two entries, with the relevant start/end dates set." 60 | 61 | w = writers[term['type']] 62 | w.writerow([ 63 | legislator['id']['bioguide'], 64 | build_name(legislator, term, 'full'), 65 | build_area(term), 66 | term['party'], 67 | CURRENT_CONGRESS, 68 | term['start'], 69 | term['end'], 70 | legislator['name'].get('first'), 71 | legislator['name'].get('last'), 72 | legislator['name'].get('suffix'), 73 | build_name(legislator, term, 'sort'), 74 | term.get('phone'), 75 | legislator['bio'].get('gender'), 76 | legislator['bio'].get('birthday'), 77 | "https://theunitedstates.io/images/congress/original/%s.jpg" % legislator['id']['bioguide'], 78 | data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("twitter"), 79 | data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("facebook"), 80 | data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("instagram"), 81 | legislator['id'].get('wikipedia', '').replace(" ", "_"), 82 | term['url'], 83 | ]) 84 | 85 | ordinal_strings = { 1: "st", 2: "nd", 3: "rd", 11: 'th', 12: 'th', 13: 'th' } 86 | def ordinal(num): 87 | return str(num) + ordinal_strings.get(num % 100, ordinal_strings.get(num % 10, "th")) 88 | 89 | def build_area(term): 90 | # Builds the string for the "area" column, which is a human-readable 91 | # description of the legislator's state or district. 92 | ret = states[term['state']] 93 | if term['type'] == 'rep': 94 | ret += "’s " 95 | if term['district'] == 0: 96 | ret += "At-Large" 97 | else: 98 | ret += ordinal(term['district']) 99 | ret += " Congressional District" 100 | return ret 101 | 102 | def build_name(p, t, mode): 103 | # Based on: 104 | # https://github.com/govtrack/govtrack.us-web/blob/master/person/name.py 105 | 106 | # First name. 107 | firstname = p['name']['first'] 108 | if firstname.endswith('.'): 109 | firstname = p['name']['middle'] 110 | if p['name'].get('nickname') and len(p['name']['nickname']) < len(firstname): 111 | firstname = p['name']['nickname'] 112 | 113 | # Last name. 
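# Worked examples for ordinal() above (num % 100 is checked first, so the
# teens override the 1/2/3 suffixes):
#
#     ordinal(1) == "1st";  ordinal(2) == "2nd";  ordinal(3) == "3rd"
#     ordinal(11) == "11th";  ordinal(21) == "21st";  ordinal(111) == "111th"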
114 | lastname = p['name']['last'] 115 | if p['name'].get('suffix'): 116 | lastname += ', ' + p['name']['suffix'] 117 | 118 | if mode == "full": 119 | return firstname + ' ' + lastname 120 | elif mode == "sort": 121 | return lastname + ', ' + firstname 122 | else: 123 | raise ValueError(mode) 124 | 125 | if __name__ == '__main__': 126 | run() 127 | -------------------------------------------------------------------------------- /scripts/archive/house_history_gender.py: -------------------------------------------------------------------------------- 1 | import re, urllib.request, urllib.parse 2 | from utils import yaml_load, yaml_dump 3 | 4 | def run(): 5 | 6 | # Use the House History Website's Women in Congress search results to get a list of IDs. 7 | # Because this requires a POST, our utils.download() function won't work. 8 | querystring = b"Command=Next&Term=Search&SearchIn=LastName&ShowNonMember=true&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=true&WomenInCongress=false&HispanicAmericansInCongress=false&CongressNumber=65&CongressNumber=66&CongressNumber=67&CongressNumber=68&CongressNumber=69&CongressNumber=70&CongressNumber=71&CongressNumber=72&CongressNumber=73&CongressNumber=74&CongressNumber=75&CongressNumber=76&CongressNumber=77&CongressNumber=78&CongressNumber=79&CongressNumber=80&CongressNumber=81&CongressNumber=82&CongressNumber=83&CongressNumber=84&CongressNumber=85&CongressNumber=86&CongressNumber=87&CongressNumber=88&CongressNumber=89&CongressNumber=90&CongressNumber=91&CongressNumber=92&CongressNumber=93&CongressNumber=94&CongressNumber=95&CongressNumber=96&CongressNumber=97&CongressNumber=98&CongressNumber=99&CongressNumber=100&CongressNumber=101&CongressNumber=102&CongressNumber=103&CongressNumber=104&CongressNumber=105&CongressNumber=106&CongressNumber=107&CongressNumber=108&CongressNumber=109&CongressNumber=110&CongressNumber=111&CongressNumber=112&CongressNumber=113&CongressNumber=114&CurrentPage=__PAGE__&SortOrder=LastName&ResultType=Grid&PreviousSearch=Search%2CLastName%2C%2C%2C%2C%2CFalse%2CFalse%2CTrue%2C65%2C66%2C67%2C68%2C69%2C70%2C71%2C72%2C73%2C74%2C75%2C76%2C77%2C78%2C79%2C80%2C81%2C82%2C83%2C84%2C85%2C86%2C87%2C88%2C89%2C90%2C91%2C92%2C93%2C94%2C95%2C96%2C97%2C98%2C99%2C100%2C101%2C102%2C103%2C104%2C105%2C106%2C107%2C108%2C109%2C110%2C111%2C112%2C113%2C114%2CLastName&X-Requested-With=XMLHttpRequest" 9 | women_house_history_ids = set() 10 | for pagenum in range(0, 30+1): 11 | body = urllib.request.urlopen( 12 | "http://history.house.gov/People/Search?Length=6", 13 | querystring.replace(b"__PAGE__", str(pagenum).encode("ascii")) 14 | ).read().decode("utf8") 15 | for match in re.findall(r"/People/Detail/(\d+)\?ret=True", body): 16 | women_house_history_ids.add(int(match)) 17 | 18 | # Now check and update the gender of all legislators. 19 | matched_women_house_history_ids = set() 20 | missing_ids = set() 21 | for fn in ("../legislators-current.yaml", "../legislators-historical.yaml"): 22 | legislators = yaml_load(fn) 23 | for p in legislators: 24 | house_history_id = p.get("id", {}).get("house_history") 25 | 26 | if not house_history_id: 27 | # We have all of the women, so anyone left must be a man. 
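# (Hedged summary of the flow above: the POST search is paginated via the
# CurrentPage=__PAGE__ placeholder, pages 0 through 30, and every
# /People/Detail/<id> link found is collected into women_house_history_ids;
# membership in that set is then the only gender signal used below.)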
28 | p.setdefault("bio", {})["gender"] = "M" 29 | missing_ids.add(p.get("id", {}).get("bioguide")) 30 | continue 31 | 32 | p.setdefault("bio", {})["gender"] = "F" if house_history_id in women_house_history_ids else "M" 33 | 34 | if house_history_id in women_house_history_ids: 35 | matched_women_house_history_ids.add(house_history_id) 36 | 37 | yaml_dump(legislators, fn) 38 | 39 | print("%d women in Congress reported by the House History website" % len(women_house_history_ids)) 40 | print("%d women in Congress were not found in our files." % len(women_house_history_ids-matched_women_house_history_ids)) 41 | print(" ", " ".join((str(x) for x in (women_house_history_ids-matched_women_house_history_ids)))) 42 | print("%d legislators are missing house_history IDs, set to male." % len(missing_ids)) 43 | 44 | if __name__ == '__main__': 45 | run() -------------------------------------------------------------------------------- /scripts/archive/print_leadership_roles.py: -------------------------------------------------------------------------------- 1 | #print out leadership roles for manual review 2 | 3 | import rtyaml 4 | import utils 5 | 6 | with open("legislators-current.yaml") as f: 7 | legislators = rtyaml.load(f) 8 | for legislator in legislators: 9 | if 'leadership_roles' in legislator: 10 | print("{}, {}".format(legislator["name"]["last"], legislator["name"]["first"])) 11 | for role in legislator.get("leadership_roles", []): 12 | 13 | start = utils.parse_date(role["start"]) 14 | if not "end" in role: 15 | print("{} {} started {} with no end".format(role["chamber"], role["title"], role["start"])) 16 | else: 17 | print("{} {} started {} and ended {}".format(role["chamber"], role["title"], role["start"], role["end"])) 18 | 19 | -------------------------------------------------------------------------------- /scripts/bioguide.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # gets fundamental information for every member with a bioguide ID: 4 | # first name, nickname, middle name, last name, name suffix 5 | # birthday 6 | 7 | # options: 8 | # --cache: load from cache if present on disk (default: true) 9 | # --current: do *only* current legislators (default: true) 10 | # --historical: do *only* historical legislators (default: false) 11 | # --bioguide: do *only* a single legislator 12 | # --relationships: Get familial relationships to other members of congress past and present, when applicable 13 | 14 | import lxml.html, io 15 | import datetime 16 | import re 17 | import utils 18 | from utils import download, load_data, save_data 19 | 20 | def run(): 21 | 22 | def update_birthday(bioguide, person, main): 23 | 24 | birthday = birthday_for(main) 25 | if not birthday: 26 | print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main)) 27 | warnings.append(bioguide) 28 | return 29 | if birthday == "UNKNOWN": 30 | return 31 | 32 | try: 33 | birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y") 34 | except ValueError: 35 | print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main)) 36 | warnings.append(bioguide) 37 | return 38 | 39 | birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day) 40 | person.setdefault("bio", {})["birthday"] = birthday 41 | 42 | 43 | def birthday_for(string): 44 | # exceptions for not-nicely-placed semicolons 45 | string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April") 46 | string = string.replace("FOSTER, A. 
Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
47 |         string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
48 |         string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
49 |         string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
50 |         string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")
51 | 
52 |         # look for a date
53 |         pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
54 |         match = re.search(pattern, string, re.I)
55 |         if not match or not match.group(1):
56 |             # specifically detect cases that we can't handle to avoid unnecessary warnings
57 |             if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN"
58 |             if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN"
59 |             return None
60 |         return match.group(1).strip()
61 | 
62 |     def relationships_of(string):
63 |         # relationship data is stored in a parenthetical immediately after the end of the <font> tag in the bio
64 |         # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)"
65 |         pattern = "^\((.*?)\)"
66 |         match = re.search(pattern, string, re.I)
67 | 
68 |         relationships = []
69 | 
70 |         if match and len(match.groups()) > 0:
71 |             relationship_text = match.group(1).encode("ascii", "replace").decode("ascii")
72 | 
73 |             # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar
74 |             from nltk import tree, pos_tag, RegexpParser
75 |             tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
76 |             pos = pos_tag(tokens)
77 | 
78 |             grammar = r"""
79 |                 NAME: {<NNP>+}
80 |                 NAMES: { <NAME>(?:<CC><NAME>)* }
81 |                 RELATIONSHIP: { <NN|JJ|RB|IN|PRP\$>+ }
82 |                 MATCH: { <RELATIONSHIP><NAMES> }
83 |                 """
84 |             cp = RegexpParser(grammar)
85 |             chunks = cp.parse(pos)
86 | 
87 |             # iterate through the Relationship/Names pairs
88 |             for n in chunks:
89 |                 if isinstance(n, tree.Tree) and n.node == "MATCH":
90 |                     people = []
91 |                     relationship = None
92 |                     for piece in n:
93 |                         if piece.node == "RELATIONSHIP":
94 |                             relationship = " ".join([x[0] for x in piece])
95 |                         elif piece.node == "NAMES":
96 |                             for name in [x for x in piece if isinstance(x, tree.Tree)]:
97 |                                 people.append(" ".join([x[0] for x in name]))
98 |                     for person in people:
99 |                         relationships.append({ "relation": relationship, "name": person})
100 |         return relationships
101 | 
102 |     # default to caching
103 |     cache = utils.flags().get('cache', True)
104 |     force = not cache
105 | 
106 |     # pick either current or historical
107 |     # order is important here, since current defaults to true
108 |     if utils.flags().get('historical', False):
109 |         filename = "legislators-historical.yaml"
110 |     elif utils.flags().get('current', True):
111 |         filename = "legislators-current.yaml"
112 |     else:
113 |         print("No legislators selected.")
114 |         exit(0)
115 | 
116 |     print("Loading %s..."
% filename) 117 | legislators = load_data(filename) 118 | 119 | 120 | # reoriented cache to access by bioguide ID 121 | by_bioguide = { } 122 | for m in legislators: 123 | if "bioguide" in m["id"]: 124 | by_bioguide[m["id"]["bioguide"]] = m 125 | 126 | 127 | # optionally focus on one legislator 128 | 129 | bioguide = utils.flags().get('bioguide', None) 130 | if bioguide: 131 | bioguides = [bioguide] 132 | else: 133 | bioguides = list(by_bioguide.keys()) 134 | 135 | warnings = [] 136 | missing = [] 137 | count = 0 138 | families = 0 139 | 140 | for bioguide in bioguides: 141 | # Download & parse the HTML of the bioguide page. 142 | try: 143 | dom = fetch_bioguide_page(bioguide, force) 144 | except Exception as e: 145 | print(e) 146 | missing.append(bioguide) 147 | continue 148 | 149 | # Extract the member's name and the biography paragraph (main). 150 | 151 | try: 152 | name = dom.cssselect("p font")[0] 153 | main = dom.cssselect("p")[0] 154 | except IndexError: 155 | print("[%s] Missing name or content!" % bioguide) 156 | exit(0) 157 | 158 | name = name.text_content().strip() 159 | main = main.text_content().strip().replace("\n", " ").replace("\r", " ") 160 | main = re.sub("\s+", " ", main) 161 | 162 | # Extract the member's birthday. 163 | 164 | update_birthday(bioguide, by_bioguide[bioguide], main) 165 | 166 | # Extract relationships with other Members of Congress. 167 | 168 | if utils.flags().get("relationships", False): 169 | #relationship information, if present, is in a parenthetical immediately after the name. 170 | #should always be present if we passed the IndexError catch above 171 | after_name = dom.cssselect("p font")[0].tail.strip() 172 | relationships = relationships_of(after_name) 173 | if len(relationships): 174 | families = families + 1 175 | by_bioguide[bioguide]["family"] = relationships 176 | 177 | count = count + 1 178 | 179 | 180 | print() 181 | if warnings: 182 | print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings))) 183 | 184 | if missing: 185 | print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing))) 186 | 187 | print("Saving data to %s..." % filename) 188 | save_data(legislators, filename) 189 | 190 | print("Saved %d legislators to %s" % (count, filename)) 191 | 192 | if utils.flags().get("relationships", False): 193 | print("Found family members for %d of those legislators" % families) 194 | 195 | # Some testing code to help isolate and fix issued: 196 | # f 197 | # none = "PEARSON, Joseph, a Representative from North Carolina; born in Rowan County, N.C., in 1776; completed preparatory studies; studied law; was admitted to the bar and commenced practice in Salisbury, N.C.; member of the State house of commons; elected as a Federalist to the Eleventh, Twelfth, and Thirteenth Congresses (March 4, 1809-March 3, 1815); while in Congress fought a duel with John George Jackson, of Virginia, and on the second fire wounded his opponent in the hip; died in Salisbury, N.C., October 27, 1834." 
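# A hedged extra check in the same spirit as the samples below, derived from
# the "Control" case:
# assert birthday_for("born at Richmond Hill, Yadkin County, N.C., January 26, 1852;") == "January 26, 1852"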
198 | # print "Pearson (none): %s" % birthday_for(none) 199 | 200 | # owens = "OWENS, William, a Representative from New York; born in Brooklyn, Kings County, N.Y., January, 20, 1949; B.S., Manhattan College, Riverdale, N.Y., 1971; J.D., Fordham University, New York, N.Y., 1974; United States Air Force; lawyer, private practice; faculty, State University of New York, Plattsburgh, N.Y., 1978-1986; elected as a Democrat to the One Hundred Eleventh Congress, by special election to fill the vacancy caused by the resignation of United States Representative John McHugh, and reelected to the two succeeding Congresses (November 3, 2009-present)." 201 | # print "Owens (January, 20, 1949): %s" % birthday_for(owens) 202 | 203 | # shea = "SHEA-PORTER, Carol, a Representative from New Hampshire; born in New York City, New York County, N.Y., December, 1952; graduated from Oyster River High School, Durham, N.H., 1971; B.A., University of New Hampshire, Durham, N.H., 1975; M.P.A., University of New Hampshire, Durham, N.H., 1979; social worker; professor; elected as a Democrat to the One Hundred Tenth Congress and to the succeeding Congress (January 3, 2007-January 3, 2011); unsuccessful candidate for reelection to the One Hundred Twelfth Congress in 2010; elected as a Democrat to the One Hundred Thirteenth Congress (January 3, 2013-present)." 204 | # print "Shea (none): %s" % birthday_for(shea) 205 | 206 | # control = "PEARSON, Richmond, a Representative from North Carolina; born at Richmond Hill, Yadkin County, N.C., January 26, 1852; attended Horner's School, Oxford, N.C., and was graduated from Princeton College in 1872; studied law; was admitted to the bar in 1874; in the same year was appointed United States consul to Verviers and Liege, Belgium; resigned in 1877; member of the State house of representatives 1884-1886; elected as a Republican to the Fifty-fourth and Fifty-fifth Congresses (March 4, 1895-March 3, 1899); successfully contested the election of William T. Crawford to the Fifty-sixth Congress and served from May 10, 1900, to March 3, 1901; appointed by President Theodore Roosevelt as United States consul to Genoa, Italy, December 11, 1901, as Envoy Extraordinary and Minister Plenipotentiary to Persia in 1902, and as Minister to Greece and Montenegro in 1907; resigned from the diplomatic service in 1909; died at Richmond Hill, Asheville, N.C., September 12, 1923; interment in Riverside Cemetery." 207 | # print "\nControl (January 26, 1852): %s" % birthday_for(control) 208 | 209 | def fetch_bioguide_page(bioguide, force): 210 | url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide 211 | cache = "legislators/bioguide/%s.html" % bioguide 212 | try: 213 | body = download(url, cache, force) 214 | 215 | # Fix a problem? 216 | body = body.replace("Á\xc2\x81", "Á") 217 | 218 | # Entities like ’ are in Windows-1252 encoding. Normally lxml 219 | # handles that for us, but we're also parsing HTML. The lxml.html.HTMLParser 220 | # doesn't support specifying an encoding, and the lxml.etree.HTMLParser doesn't 221 | # provide a cssselect method on element objects. So we'll just decode ourselves. 222 | body = utils.unescape(body, "Windows-1252") 223 | 224 | dom = lxml.html.parse(io.StringIO(body)).getroot() 225 | except lxml.etree.XMLSyntaxError: 226 | raise Exception("Error parsing: " + url) 227 | 228 | # Sanity check. 229 | 230 | if len(dom.cssselect("title")) == 0: 231 | raise Exception("No page for bioguide %s!" 
% bioguide) 232 | 233 | return dom 234 | 235 | if __name__ == '__main__': 236 | run() 237 | -------------------------------------------------------------------------------- /scripts/bioguide_guess_new_member_ids.py: -------------------------------------------------------------------------------- 1 | import rtyaml 2 | 3 | from bioguide import fetch_bioguide_page 4 | 5 | def run(): 6 | 7 | print("Finding highest bioguide numbers we know of...") 8 | highest_num_by_letter = { } 9 | for fn in ('legislators-current', 'legislators-historical'): 10 | P = rtyaml.load(open('../%s.yaml' % fn)) 11 | for p in P: 12 | if not p['id'].get('bioguide'): continue 13 | if p['id']['bioguide'] == "TODO": continue # 114th Congress staging 14 | letter = p['id']['bioguide'][0] 15 | num = p['id']['bioguide'][1:] 16 | highest_num_by_letter[letter] = max(highest_num_by_letter.get(letter, ''), num) 17 | 18 | print("Checking for new bioguide pages...") 19 | for letter in sorted(highest_num_by_letter): 20 | num = int(highest_num_by_letter[letter]) 21 | while True: 22 | num += 1 23 | bioguide = "%s%06d" % (letter, num) 24 | try: 25 | dom = fetch_bioguide_page(bioguide, True) 26 | except Exception: 27 | break 28 | print(bioguide, dom.cssselect("title")[0].text) 29 | 30 | if __name__ == '__main__': 31 | run() 32 | -------------------------------------------------------------------------------- /scripts/bioguide_xml.py: -------------------------------------------------------------------------------- 1 | # Update metadata fields like birthdays from 2 | # bioguide.congress.gov bulk data downloads. 3 | # 4 | # Usage: 5 | # python3 bioguide_xml.py path/to/BioguideProfiles.zip 6 | 7 | import sys 8 | import zipfile 9 | import re 10 | import json 11 | import rtyaml 12 | import datetime 13 | 14 | def run(): 15 | # Load existing legislators and map bioguide IDs 16 | # to their entries. 17 | legislator_data = { } 18 | legislators = { } 19 | for ft in ("current", "historical"): 20 | with open("../legislators-{}.yaml".format(ft)) as f: 21 | data = rtyaml.load(f) 22 | legislator_data[ft] = data 23 | for p in data: 24 | legislators[p["id"]["bioguide"]] = p 25 | 26 | def parse_birthday_from_text(text): 27 | # exceptions for not-nicely-placed semicolons 28 | text = text.replace("born in Cresskill, Bergen County, N. J.; April", "born April") 29 | text = text.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802") 30 | text = text.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967") 31 | text = text.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962") 32 | text = text.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947") 33 | text = text.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968") 34 | 35 | # look for a date 36 | pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})" 37 | match = re.search(pattern, text, re.I) 38 | if not match or not match.group(1): 39 | # specifically detect cases that we can't handle to avoid unnecessary warnings 40 | if re.search("birth dates? 
unknown|date of birth is unknown", text, re.I): return None, None
41 |             if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", text, re.I): return None, None
42 |             return None, None
43 |         original_text = match.group(1).strip()
44 | 
45 |         try:
46 |             birthday = datetime.datetime.strptime(original_text.replace(",", ""), "%B %d %Y")
47 |         except ValueError:
48 |             print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide_id, original_text))
49 |             return None, original_text
50 | 
51 |         birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
52 |         return birthday, original_text
53 | 
54 |     # Process all profile data in the bioguide ZIP file.
55 |     with zipfile.ZipFile(sys.argv[1]) as zf:
56 |         for profile_fn in zf.namelist():
57 |             bioguide_id = re.match(r"^([A-Z]\d+)\.json", profile_fn).group(1)
58 |             if bioguide_id not in legislators:
59 |                 #print("No legislator for", bioguide_id)
60 |                 continue
61 |             with zf.open(profile_fn) as zff:
62 |                 profile = json.load(zff)
63 |             if "profileText" not in profile:
64 |                 continue
65 | 
66 |             legislator = legislators[bioguide_id]
67 | 
68 |             # Get birthday from text.
69 |             birthday, original_text = parse_birthday_from_text(profile["profileText"])
70 |             if birthday:
71 | 
72 |                 # Check birthday from metadata --- not as reliable.
73 |                 # Since the metadata may only have a year, only match
74 |                 # as much of the date string as it has.
75 |                 if profile.get("birthDate") and not profile.get("birthCirca"):
76 |                     if profile["birthDate"] != birthday[0:len(profile["birthDate"])]:
77 |                         print(bioguide_id, "metadata", repr(profile["birthDate"]), "doesn't match profile text", repr(original_text))
78 |                     else:
79 |                         # They match, so update.
80 |                         legislator.setdefault("bio", {})
81 |                         legislator["bio"]["birthday"] = birthday
82 | 
83 | 
84 |     # Write out updated data files.
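# (Worked example for the prefix check above, with illustrative dates: a
# metadata value of just "1948" is compared against "1948-05-03"[0:4] and
# counts as a match, while a full "1948-05-03" must match all ten characters.
# The loop below then writes both YAML files back out.)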
85 |     for fn in legislator_data:
86 |         with open("../legislators-{}.yaml".format(fn), "w") as f:
87 |             rtyaml.dump(legislator_data[fn], f)
88 | 
89 | if __name__ == "__main__":
90 |     run()
91 | 
--------------------------------------------------------------------------------
/scripts/committee_membership.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Data Sources:
4 | #   House:
5 | #     http://clerk.house.gov/xml/lists/MemberData.xml
6 | #   Senate:
7 | #     https://www.senate.gov/general/committee_membership/committee_memberships_{thomas_id}.xml
8 | 
9 | # Data Files Updated:
10 | #   committee-membership-current.yaml:
11 | #     All entries are overwritten except for house members of joint committees
12 | #     which have to be manually entered since there is no source of this data
13 | #   committees-current.yaml:
14 | #     For House committees, updates name, address, and phone
15 | #     For Senate committees, updates name and url
16 | 
17 | 
18 | import re, lxml.html, lxml.etree
19 | from collections import OrderedDict
20 | import utils
21 | from utils import download, load_data, save_data
22 | 
23 | 
24 | def run():
25 |     committee_membership = load_data("committee-membership-current.yaml")
26 |     committees_current = load_data("committees-current.yaml")
27 | 
28 |     # default to not caching
29 |     cache = utils.flags().get('cache', False)
30 |     force = not cache
31 | 
32 | 
33 |     # map house/senate committee IDs to their dicts
34 |     house_ref = { }
35 |     for cx in committees_current:
36 |         if "house_committee_id" in cx:
37 |             house_ref[cx["house_committee_id"]] = cx
38 |     senate_ref = { }
39 |     for cx in committees_current:
40 |         if "senate_committee_id" in cx:
41 |             senate_ref[cx["senate_committee_id"]] = cx
42 | 
43 | 
44 |     # map state/district to current senators because the Senate committee
45 |     # membership data does not contain IDs for senators, and map to bioguide
46 |     # IDs so we can copy forward the official_full name for House members
47 |     legislators_current = load_data("legislators-current.yaml")
48 |     senators = { }
49 |     for moc in legislators_current:
50 |         term = moc["terms"][-1]
51 |         if term["type"] == "sen":
52 |             for n in [moc["name"]] + moc.get("other_names", []):
53 |                 senators[(term["state"], n["last"])] = moc
54 |     legislators_current = { moc["id"]["bioguide"]: moc for moc in legislators_current }
55 | 
56 | 
57 |     # Scrape clerk.house.gov...
58 |     def scrape_house():
59 |         # clear out all of the existing House members of committees (i.e. all House committee membership
60 |         # and the House part of Joint committee membership)
61 |         for committee, members in committee_membership.items():
62 |             for m in list(members): # must clone before editing list
63 |                 if committee[0] == "H" or m.get("chamber") == "house":
64 |                     members.remove(m)
65 | 
66 |         r = download("http://clerk.house.gov/xml/lists/MemberData.xml", "clerk_xml", force)
67 |         dom = lxml.etree.fromstring(r.encode("latin-1")) # must be bytes to parse if there is an encoding declaration inside the string
68 | 
69 |         # Update committee metadata.
70 |         def update_house_committee_metadata(xml_cx, cx, parentdict, is_subcommittee):
71 |             sub_prefix = "sub" if is_subcommittee else ""
72 | 
73 |             if cx is None:
74 |                 # New committee.
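# Illustrative example of the ID scheme built just below (attribute values
# assumed): a committee with type="standing" and comcode="AP00" yields
# house_committee_id "AP" and thomas_id "H" + "S" + "AP" = "HSAP", the
# House Appropriations ID used in committees-current.yaml.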
75 | if not is_subcommittee: 76 | cx = { 77 | "type": "house", 78 | "thomas_id": "H" + xml_cx.attrib["type"][0].upper() + xml_cx.attrib["comcode"][0:2], 79 | "house_committee_id": xml_cx.attrib["comcode"][0:2] 80 | } 81 | house_ref[cx["house_committee_id"]] = cx 82 | else: 83 | cx = { 84 | "name": None, # placeholder so order is right 85 | "thomas_id": xml_cx.attrib["subcomcode"][2:] 86 | } 87 | parentdict.append(cx) 88 | 89 | cx["name"] = normalize_text(xml_cx.find(sub_prefix + "committee-fullname").text) 90 | if not is_subcommittee and not cx["name"].startswith("Joint "): cx["name"] = "House " + cx["name"] 91 | 92 | building = xml_cx.attrib[sub_prefix + "com-building-code"] 93 | if building == "C": 94 | building = "CAPITOL" 95 | #address format: 1301 LHOB; Washington, DC 20515-6001 96 | cx["address"] = xml_cx.attrib[sub_prefix + "com-room"] + " " + building \ 97 | + "; Washington, DC " + xml_cx.attrib[sub_prefix + "com-zip"] \ 98 | + (("-" + xml_cx.attrib[sub_prefix + "com-zip-suffix"]) if xml_cx.attrib[sub_prefix + "com-zip-suffix"] != "0" else "") 99 | cx["phone"] = "(202) " + xml_cx.attrib[sub_prefix + "com-phone"] 100 | 101 | if not is_subcommittee: 102 | for xml_sx in xml_cx.findall("subcommittee"): 103 | sxx = [s for s in cx["subcommittees"] if s["thomas_id"] == xml_sx.attrib["subcomcode"][2:]] 104 | update_house_committee_metadata(xml_sx, sxx[0] if len(sxx) > 0 else None, cx["subcommittees"], True) 105 | 106 | committees = dom.xpath("/MemberData/committees")[0] 107 | for xml_cx in committees.findall("committee"): 108 | house_committee_id = xml_cx.attrib["comcode"][0:2] 109 | update_house_committee_metadata(xml_cx, house_ref.get(house_committee_id), committees_current, False) 110 | 111 | # Determine which party is in the majority. Only the majority 112 | # party holds chair positions. At least one should have the 113 | # position Chair. 114 | house_majority_caucus = dom.xpath("string(/MemberData/members/member[committee-assignments/committee[@leadership='Chair']]/member-info/caucus)") 115 | 116 | for xml_member in dom.xpath("/MemberData/members/member"): 117 | bioguide_id = xml_member.xpath("member-info/bioguideID")[0].text 118 | if not bioguide_id: #sometimes the xml has vacancies as blanks 119 | continue 120 | 121 | # Although there is a name in the XML data, for consistency use the one we 122 | # have in legislators-current.yaml, if one is set. 123 | try: 124 | official_name = legislators_current[bioguide_id]["name"]["official_full"] 125 | except KeyError: 126 | official_name = xml_member.xpath("member-info/official-name")[0].text 127 | 128 | #is using caucus better than using party? 
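# Illustrative: if house_majority_caucus is "R", a member whose caucus is
# "R" gets party="majority" and everyone else "minority"; the Chair vs.
# Ranking Member titles assigned below key off that label.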
129 |             caucus = xml_member.xpath("member-info/caucus")[0].text
130 |             party = "majority" if caucus == house_majority_caucus else "minority"
131 | 
132 |             #for each committee or subcommittee membership
133 |             for cm in xml_member.xpath("committee-assignments/committee|committee-assignments/subcommittee"):
134 |                 if "comcode" in cm.attrib:
135 |                     house_committee_id = cm.attrib["comcode"][:2]
136 |                     if house_committee_id == "HL": continue # this doesn't appear to be a committee and seems like a data error
137 |                     thomas_committee_id = house_ref[house_committee_id]["thomas_id"]
138 |                 elif "subcomcode" in cm.attrib:
139 |                     house_committee_id = cm.attrib["subcomcode"][:2]
140 |                     thomas_committee_id = house_ref[house_committee_id]["thomas_id"] + cm.attrib["subcomcode"][2:]
141 |                 else:
142 |                     continue # some nodes are invalid
143 | 
144 |                 membership = OrderedDict()
145 |                 membership["name"] = official_name
146 |                 membership["party"] = party
147 |                 membership["rank"] = int(cm.attrib["rank"])
148 | 
149 |                 if "leadership" in cm.attrib:
150 |                     membership["title"] = cm.attrib["leadership"] # TODO .replace("woman", "").replace("man", "")
151 |                 elif membership["rank"] == 1:
152 |                     #xml doesn't contain ranking member titles
153 |                     if membership["party"] == "majority":
154 |                         membership["title"] = "Chair"
155 |                     else:
156 |                         membership["title"] = "Ranking Member"
157 |                 membership["bioguide"] = bioguide_id
158 | 
159 |                 if house_ref[house_committee_id]["type"] == "joint":
160 |                     membership["chamber"] = "house"
161 | 
162 |                 committee_membership.setdefault(thomas_committee_id, []).append(membership)
163 | 
164 |     # Scrape senate.gov....
165 |     def scrape_senate():
166 |         url = "https://www.senate.gov/pagelayout/committees/b_three_sections_with_teasers/membership.htm"
167 |         body = download(url, "committees/membership/senate.html", force)
168 | 
169 |         for id, name in re.findall(r'<option value="/general/committee_membership/committee_memberships_(....).htm">(.*?)</option>', body, re.I | re.S):
170 |             if id not in senate_ref:
171 |                 print("Unrecognized committee:", id, name)
172 |                 continue
173 | 
174 |             cx = senate_ref[id]
175 |             is_joint = (id[0] == "J")
176 | 
177 |             # Scrape some metadata on the HTML page first.
178 | 
179 |             committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.htm" % id
180 |             print("[%s] Fetching members for %s (%s)" % (id, name, committee_url))
181 |             body2 = download(committee_url, "committees/membership/senate/%s.html" % id, force)
182 | 
183 |             if not body2:
184 |                 print("\tcommittee page not good:", committee_url)
185 |                 continue
186 | 
187 |             m = re.search(r'', body2, re.I)
188 |             if m:
189 |                 cx["url"] = m.group(1)
190 | 
191 |             # Use the XML for the rest.
192 | 193 | print("\tDownloading XML...") 194 | committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.xml" % id 195 | 196 | body3 = download(committee_url, "committees/membership/senate/%s.xml" % id, force) 197 | dom = lxml.etree.fromstring(body3.encode("utf8")) # must be bytes to parse if there is an encoding declaration inside the string 198 | 199 | cx["name"] = normalize_text(dom.xpath("committees/committee_name")[0].text) 200 | if id[0] != "J" and id[0:2] != 'SC': 201 | cx["name"] = "Senate " + cx["name"] 202 | 203 | majority_party = dom.xpath("committees/majority_party")[0].text 204 | 205 | # update full committee members 206 | scrape_senate_members( 207 | dom.xpath("committees/members/member"), 208 | committee_membership.setdefault(id, []), 209 | majority_party, is_joint) 210 | 211 | # update subcommittees 212 | for subcom in dom.xpath("committees/subcommittee"): 213 | scid = subcom.xpath("committee_code")[0].text[4:] 214 | for sx in cx.get('subcommittees', []): 215 | if sx["thomas_id"] == scid: 216 | break 217 | else: 218 | print("Subcommittee not found, creating it", scid, name) 219 | sx = OrderedDict() 220 | sx['thomas_id'] = scid 221 | cx.setdefault('subcommittees', []).append(sx) 222 | 223 | # update metadata 224 | name = subcom.xpath("subcommittee_name")[0].text 225 | sx["name"] = normalize_text(name) 226 | sx["name"] = re.sub(r"^\s*Subcommittee on\s*", "", sx["name"]) 227 | sx["name"] = re.sub(r"\s+", " ", sx["name"]) 228 | 229 | scrape_senate_members( 230 | subcom.xpath("members/member"), 231 | committee_membership.setdefault(id + scid, []), 232 | majority_party, is_joint) 233 | 234 | def scrape_senate_members(members, output_list, majority_party, is_joint): 235 | # Keep a copy of the previous membership, and then clear the Senate members 236 | # of the committee. 237 | existing_members_data = list(output_list) # clone 238 | if not is_joint: 239 | output_list.clear() 240 | else: 241 | for m in list(output_list): # must clone before editing list 242 | if m.get("chamber") == "senate": 243 | output_list.remove(m) 244 | 245 | # Update members. 246 | ids = set() 247 | count_by_party = { "majority": 0, "minority": 0 } 248 | for node in members: 249 | ids.add(scrape_senate_member(output_list, node, majority_party, is_joint, count_by_party, existing_members_data)) 250 | 251 | # Purge non-members. Ignore House members of joint committees. 
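# A hedged equivalent of the purge loop below, keeping entries that are
# either still members or House members of a joint committee:
#
#     output_list[:] = [m for m in output_list
#                       if m['bioguide'] in ids or m.get("chamber") not in (None, "senate")]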
252 | i = 0 253 | while i < len(output_list): 254 | if output_list[i]['bioguide'] not in ids and output_list[i].get("chamber") in (None, "senate"): 255 | output_list[i:i+1] = [] 256 | else: 257 | i += 1 258 | 259 | # sort by party, then by rank, since we get the nodes in the XML in a rough seniority order that ignores party 260 | output_list.sort(key = lambda e : (e["party"] != "majority", e["rank"])) 261 | 262 | def scrape_senate_member(output_list, membernode, majority_party, is_joint, count_by_party, existing_members_data): 263 | last_name = membernode.xpath("name/last")[0].text 264 | state = membernode.xpath("state")[0].text 265 | party = "majority" if membernode.xpath("party")[0].text == majority_party else "minority" 266 | title = membernode.xpath("position")[0].text 267 | if title == "Member": title = None 268 | if title == "Ranking": title = "Ranking Member" 269 | 270 | # look up senator by state and last name 271 | if (state, last_name) == ("NM", "Lujan"): last_name = "Luján" 272 | if (state, last_name) not in senators: 273 | print("\t[%s] Unknown member: %s" % (state, last_name)) 274 | return None 275 | 276 | moc = senators[(state, last_name)] 277 | 278 | entry = OrderedDict() 279 | if 'official_full' in moc['name']: 280 | entry["name"] = moc['name']['official_full'] 281 | else: 282 | print("missing name->official_full field for", moc['id']['bioguide']) 283 | entry["party"] = party 284 | count_by_party[party] += 1 285 | entry["rank"] = count_by_party[party] 286 | if title: entry["title"] = title 287 | entry.update(ids_from(moc["id"])) 288 | if is_joint: entry["chamber"] = "senate" 289 | 290 | # Look for an existing entry for this member and take 291 | # start_date and source from it, if set. 292 | for item in existing_members_data: 293 | if item["bioguide"] == entry["bioguide"]: 294 | for key in ("start_date", "source"): 295 | if key in item: 296 | entry[key] = item[key] 297 | 298 | output_list.append(entry) 299 | 300 | # Return bioguide ID of member added. 301 | return entry["bioguide"] 302 | 303 | # stick to a specific small set of official IDs to cross-link members 304 | # this limits the IDs from going out of control in this file, while 305 | # preserving us flexibility to be inclusive of IDs in the main leg files 306 | def ids_from(moc): 307 | ids = {} 308 | if "bioguide" in moc: 309 | ids["bioguide"] = moc["bioguide"] 310 | if len(ids) == 0: 311 | raise ValueError("Missing an official ID for this legislator, won't be able to link back") 312 | return ids 313 | 314 | # MAIN 315 | scrape_house() 316 | scrape_senate() 317 | 318 | # ensure each committee has members in a stable, sorted order 319 | for comm, mbrs in committee_membership.items(): 320 | # joint committees also have to sort by chamber 321 | if comm[0] == "J": 322 | mbrs.sort(key=lambda entry: (entry["party"] == "minority", entry["rank"], entry["chamber"] != "senate")) 323 | 324 | # Senate and House committees have different sort orders to match 325 | # earlier data, but there's no particular reason for this 326 | elif comm[0] == "S": 327 | mbrs.sort(key=lambda entry: (entry["party"] == "minority", entry["rank"])) 328 | else: 329 | mbrs.sort(key=lambda entry: (entry["rank"], entry["party"] == "minority")) 330 | 331 | save_data(committee_membership, "committee-membership-current.yaml") 332 | save_data(committees_current, "committees-current.yaml") 333 | 334 | 335 | def normalize_text(text): 336 | # Remove leading and trailing whitespace (coul also use .strip()). 
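# Worked example across both substitutions below:
#     "  Committee  on\n Finance " -> "Committee  on\n Finance" -> "Committee on Finance"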
337 | text = re.sub(r"^\s+|\s+$", "", text) 338 | 339 | # Remove double spaces and turn all internal whitespace into spaces. 340 | text = re.sub(r"\s+", " ", text) 341 | 342 | return text 343 | 344 | 345 | if __name__ == '__main__': 346 | run() 347 | -------------------------------------------------------------------------------- /scripts/contact_forms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | '''Gets contact webform URLs for the intersection of members with bioguide ids 4 | and with correlating contact form steps in unitedstates/contact-congress: 5 | 6 | args: 7 | 8 | A list of bioguide ids to import. 9 | 10 | options: 11 | --debug[=True] 12 | Whether or not verbose output should be printed to the command line 13 | ''' 14 | 15 | import yaml 16 | from urllib.request import urlopen 17 | 18 | import utils 19 | from utils import load_data, save_data 20 | 21 | 22 | # These members have forms in iframes, and Contact-Congress has different 23 | # needs than human users might. 24 | SKIP_BIOGUIDES = ['M000312'] 25 | 26 | 27 | def run(): 28 | options = utils.flags() 29 | debug = options.get('debug', False) 30 | 31 | filename = "legislators-current.yaml" 32 | args = utils.args() 33 | legislators = load_data(filename) 34 | 35 | if len(args) != 0: 36 | bioguides = args 37 | print("Fetching contact forms for %s..." % ', '.join(bioguides)) 38 | else: 39 | bioguides = [member['id']['bioguide'] for member in legislators] 40 | print("Fetching contact forms for all current members...") 41 | 42 | for legislator in legislators: 43 | bioguide = legislator['id']['bioguide'] 44 | if bioguide not in bioguides: continue 45 | if bioguide in SKIP_BIOGUIDES: continue 46 | 47 | if debug: print("Downloading form for %s" % bioguide, flush=True) 48 | 49 | try: 50 | steps = contact_steps_for(bioguide) 51 | except LegislatorNotFoundError as e: 52 | if debug: print("skipping, %s..." % e, flush=True) 53 | continue 54 | 55 | legislator['terms'][-1]['contact_form'] = steps['contact_form']['steps'][0]['visit'] 56 | 57 | print("Saving data to %s..." % filename) 58 | save_data(legislators, filename) 59 | 60 | 61 | def contact_steps_for(bioguide): 62 | base_url = "https://raw.githubusercontent.com/unitedstates/contact-congress/main/members/{bioguide}.yaml" 63 | response = urlopen(base_url.format(bioguide=bioguide)) 64 | if response.code == 404: 65 | raise LegislatorNotFoundError("%s not found in unitedstates/contact-congress!" % bioguide) 66 | return yaml.load(response.read()) 67 | 68 | 69 | class LegislatorNotFoundError(Exception): 70 | pass 71 | 72 | 73 | if __name__ == '__main__': 74 | run() 75 | -------------------------------------------------------------------------------- /scripts/cspan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Update current cspan IDs using NYT Congress API. 
4 | 5 | import json, urllib.request, urllib.parse, urllib.error 6 | from utils import load_data, save_data 7 | 8 | def run(): 9 | # load in current members 10 | y = load_data("legislators-current.yaml") 11 | for m in y: 12 | # retrieve C-SPAN id, if available, from ProPublica API 13 | # TODO: use utils.download here 14 | response = urllib.request.urlopen("https://projects.propublica.org/represent/api/v1/members/%s.json" % m['id']['bioguide']).read() 15 | j = json.loads(response.decode("utf8")) 16 | cspan = j['results'][0]['cspan_id'] 17 | if not cspan == '': 18 | m['id']['cspan'] = int(cspan) 19 | save_data(y, "legislators-current.yaml") 20 | 21 | if __name__ == '__main__': 22 | run() 23 | -------------------------------------------------------------------------------- /scripts/data/social_media_blacklist.csv: -------------------------------------------------------------------------------- 1 | service,pattern,comment 2 | twitter,housedemocrats,house dem caucus 3 | twitter,search\?q=,search links 4 | twitter,SmallBizGOP,some house republican caucus 5 | facebook,likebox\.php,like buttons 6 | twitter,^kellyayotte$,"CHECK LATER – no link on her homepage, twitter button is conspicuously absent" 7 | twitter,elizabethforma,campaign account 8 | twitter,heinrich4nm,campaign account 9 | twitter,gopconference,some house republican caucus 10 | twitter,^petegallego$,campaign account 11 | twitter,^share$,not the share url 12 | twitter,javascripts,probably not real 13 | twitter,user_timeline,probably not real 14 | twitter,statuses,probably not real 15 | twitter,WaysandMeansGOP,some house republican caucus 16 | twitter,congjeffmiller,abandoned 17 | twitter,Daniel_Inouye,mentioned on some people's sites 18 | twitter,^SydneyFreedberg$,appears somewhere 19 | twitter,^kyledeb$,appears somewhere 20 | twitter,nbcnews,appears somewhere 21 | twitter,^MarcoRubio$,accidentally linked to campaign account 22 | twitter,^TedCruz$,accidentally linked to campaign account 23 | twitter,^NRO$,appears somewhere 24 | twitter,20515507227,commented out embed on peter roskam's site 25 | twitter,^tammybaldwin$,campaign account 26 | facebook,media/set,media plugin 27 | facebook,photo.php,media plugin 28 | facebook,plugins/like.php,like plugin 29 | facebook,feeds/page.php,some plugin 30 | facebook,republicanconference,shared page 31 | facebook,HouseDemocrats,shared page 32 | facebook,photos.php,photo plugin 33 | facebook,album.php,album plugin 34 | facebook,^2008$,fbml year xmlns thing 35 | facebook,^plugins$,plugin plugin 36 | facebook,^feeds$,feeds plugin 37 | facebook,HouseChamber,shared page 38 | facebook,VoteMartinHeinrich,campaign account 39 | facebook,^JeffFlake1$,campaign account 40 | facebook,^media$,media plugin 41 | youtube,^embed$,embed tag 42 | youtube,^e$,embed tag 43 | youtube,^v$,embed tag 44 | youtube,^cp$,embed tag 45 | youtube,SmallBizRepublicans,shared page 46 | youtube,^watch$,embed tag 47 | youtube,housedems,embed tag 48 | youtube,republicanconference,embed tag 49 | youtube,^WSB111$,Some weird vendor thing 50 | youtube,^p$,embed tag 51 | youtube,HouseConference,shared page 52 | youtube,^rss$,rss link 53 | youtube,RepublicanLeader,shared page 54 | youtube,ElizabethForMA,campaign account 55 | youtube,^player_api$,embed tag 56 | twitter,^PatrickMurphyFL$,campaign account 57 | facebook,128261203867188,commented out thing on carolyn mccarthy's site 58 | facebook,^BrianSchatz$,campaign account 59 | facebook,^pages$,pages is never the answer 60 | youtube,^upload\)<$,weird 61 | facebook,^blumenauer$,campaign account 
62 | youtube,^HCmNo6p7XRqNs$,"auto-generated channel, not videos created by staff" 63 | youtube,^channel$,ignore detections of “/channel” 64 | facebook,^socialjobs$,some job thing on booker's site 65 | twitter,^USRepJoeWilson$,404s 66 | twitter,^FraminghamPatch$,not the right one 67 | twitter,^CandiceMiller$,"we have it already, blacklisted because it shows up elsewhere mistakenly" 68 | facebook,^billnelson$,"campaign account, even though listed on official page" 69 | instagram,republicanconference, not an individual's account 70 | instagram,housedemocrats, not an individual's account 71 | twitter,^housegop$,house gop conference 72 | twitter,^share\?,share URL with query string 73 | twitter,^sethmoulton$,seth moulton's campaign account 74 | facebook,^HouseRepublicans$,house conference account 75 | facebook,^sharer$,share URL 76 | youtube,^c$,junk 77 | twitter,^OlsonPressShop$,"not the right one (linked, but other one is used)" 78 | instagram,^johncornyn$,campaign account 79 | instagram,^housegop$,house gop conference 80 | instagram,^en_US$,junk 81 | instagram,^rep$,junk 82 | instagram,^t51$,junk 83 | twitter,^home$,junk 84 | twitter,^intent$,junk 85 | -------------------------------------------------------------------------------- /scripts/data/social_media_whitelist.csv: -------------------------------------------------------------------------------- 1 | service,account,comment 2 | twitter,CongressmanDan,"not linked, but is official" 3 | twitter,repbarbaralee,"not linked, but is official" 4 | twitter,pedropierluisi,on separate page 5 | twitter,BettyMcCollum04,"not linked, but is official" 6 | twitter,CongCulberson,"not linked, but is official" 7 | twitter,franklautenberg,"not linked, but is official" 8 | twitter,RepShelley,"not linked, but is official" 9 | twitter,DocHastings,"not linked, but is official" 10 | twitter,NydiaVelazquez,"not linked, but is official" 11 | twitter,BillPascrell,"not linked, but is official" 12 | twitter,maziehirono,"no official site yet, but is official" 13 | youtube,SenatorVitter,"official site links to individual video, but not account" 14 | youtube,CongresswomanHirono,"official twitter links to account" 15 | youtube,SenatorWhitehouse,"linked, but obfuscated by javascript" 16 | youtube,SenatorBaucus,"official site links to official videos but not account" 17 | youtube,SenatorIsakson,"official site links to official videos but not account" 18 | youtube,KirstenEGillibrand,"official site links to official videos but not account" -------------------------------------------------------------------------------- /scripts/election_results.py: -------------------------------------------------------------------------------- 1 | # Updates the data files according to the results of 2 | # a general election using a spreadsheet of election 3 | # results and prepares for a new Congress. This script 4 | # does the following: 5 | # 6 | # * Adds end dates to all current leadership roles since 7 | # leadership resets in both chambers each Congress. 8 | # * Brings senators not up for reelection, and Puerto 9 | # Rico's resident commissioner in off-years, forward 10 | # unchanged. 11 | # * Creates new legislator entries for new people in 12 | # the election results spreadsheet. The next available 13 | # GovTrack ID is assigned to each new legislator. 14 | # * Creates new terms for each election winner in the 15 | # election results spreadsheet (incumbents and new 16 | # legislators). 
17 | # * Clears the committee-membership-current.yaml file
18 | # since all House and Senate committees reset at the
19 | # start of a new Congress.
20 | # * Clears out the social media entries for legislators
21 | # no longer serving.
22 | #
23 | # Usage:
24 | # * Use the same column headers as in the last spreadsheet (see
25 | # the previous .csv file in the archive directory).
26 | # * Save the spreadsheet to archive/election_results_{year}.csv.
27 | # * Edit the ELECTION_YEAR constant below.
28 | # * Make sure the legislators-{current,historical}.yaml files are
29 | # clean -- i.e. if you've run this script, revert any changes
30 | # before running this script again with e.g.:
31 | # git checkout origin/main ../*.yaml
32 | # * Run this script.
33 | # * Make other changes manually for special elections.
34 | # * Run wikidata_update.py to fill in some other fields.
35 | # * Run `NOW=2025-01-03 test/validate.py` (the first day of the new Congress) to check for errors.
36 | 
37 | import traceback
38 | from types import SimpleNamespace as SN
39 | 
40 | import collections, csv, re
41 | from utils import load_data, save_data
42 | 
43 | ELECTION_YEAR = 2024
44 | 
45 | def run():
46 | # Compute helper constants. (E.g. ELECTION_YEAR = 2024 gives SENATE_CLASS = ((2024-2) % 6) // 2 + 1 = 1, the Class I seats that were up in 2024.)
47 | SENATE_CLASS = ((ELECTION_YEAR-2) % 6) // 2 + 1
48 | 
49 | # Open existing data.
50 | print("Opening legislator data...")
51 | legislators_historical = load_data("legislators-historical.yaml")
52 | legislators_current = load_data("legislators-current.yaml")
53 | 
54 | # New member data.
55 | party_map = { "R": "Republican", "D": "Democrat", "I": "Independent" }
56 | new_legislators = []
57 | 
58 | # Only one class of senators was up for election. Mark all other
59 | # senators as still serving. Additionally, in off years for the
60 | # four-year-termed resident commissioner of Puerto Rico, mark
61 | # that person as still serving also.
62 | current = []
63 | for p in legislators_current:
64 | if p["terms"][-1]["type"] == "sen" and p["terms"][-1]["class"] != SENATE_CLASS:
65 | current.append(p["id"]["govtrack"])
66 | if p["terms"][-1]["state"] == "PR" and (ELECTION_YEAR % 4 != 0):
67 | current.append(p["id"]["govtrack"])
68 | 
69 | # Map bioguide IDs to existing legislators to read the Bioguide ID
70 | # column of the CSV file.
71 | bioguide_id_map = { }
72 | for entry in legislators_historical + legislators_current:
73 | bioguide_id_map[entry['id']['bioguide']] = entry
74 | 
75 | # Get highest existing GovTrack ID to know where to start for assigning new IDs.
76 | # Store it in a mutable data structure so that the inner function can increment it.
77 | max_govtrack_id = SN(
78 | value=max(p['id']['govtrack'] for p in (legislators_historical+legislators_current)))
79 | 
80 | # Load the spreadsheet of election results (House and Senate races).
81 | print("Applying election results...")
82 | 
83 | def process_row(row):
84 | # Get state and district from race code. An empty
85 | # district means a senate race.
86 | state, district = re.match(r"^([A-Z]{2})(\d*)$", row["Race"]).groups()
87 | 
88 | if row['Bioguide ID'] in bioguide_id_map:
89 | # Use the Bioguide ID to get the legislator who won, which might be
90 | # the incumbent or a representative elected to the senate, or
91 | # someone who previously served in Congress, etc. The House provides
92 | # draft IDs for new members, so the ID in the spreadsheet may not
93 | # match an existing person.
94 | p = bioguide_id_map[row['Bioguide ID']]
95 | else:
96 | # Make a new legislator entry.
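# (Editorial note, hedged: the dict assembled below appears to mirror the
# field layout of misc/new-member-template.yaml in this repository; that
# template may be a useful cross-check when filling in the missing
# Bioguide ID and other fields by hand.)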
97 | max_govtrack_id.value += 1 98 | p = collections.OrderedDict([ 99 | ("id", collections.OrderedDict([ 100 | ("bioguide", row['Bioguide ID'] if row['Bioguide ID'] != "(not assigned)" else None), 101 | ("fec", [row['FEC.gov ID']]), 102 | ("govtrack", max_govtrack_id.value), 103 | #("opensecrets", None), # don't know yet 104 | #("votesmart", int(row['votesmart'])), # not doing this anymore 105 | ("wikipedia", row['Wikipedia URL'].replace("https://en.wikipedia.org/wiki/", "").replace("_", " ")), 106 | #("wikidata", row['Wikidata ID']), # will convert from wikipedia 107 | #("ballotpedia", row['Ballotpedia Page Name']), 108 | ])), 109 | ("name", collections.OrderedDict([ 110 | ("first", row['First Name']), 111 | ("middle", row['Middle Name']), 112 | ("last", row['Last Name']), 113 | ("suffix", row['Suffix']), 114 | ("official_full", row['Name']), # best guess 115 | ])), 116 | ("bio", collections.OrderedDict([ 117 | ("gender", row['Gender (M/F)']), 118 | ("birthday", row['Birthday (YYYY-MM-DD)']), 119 | ])), 120 | ("terms", []), 121 | ]) 122 | 123 | # Delete keys that were filled with Nones or empty strings 124 | # because we don't have the data yet, other than Bioguide ID 125 | # because we'll need that to be filled in manually anyway. 126 | for section in ("id", "name", "bio"): 127 | for k in list(p[section]): # clone key list before modifying dict 128 | if not p[section][k] and not (section == "id" and k == "bioguide"): 129 | del p[section][k] 130 | 131 | new_legislators.append(p) 132 | 133 | # Add to array marking this legislator as currently serving. 134 | current.append(p['id']['govtrack']) 135 | 136 | # Add a new term. 137 | if district == "": # Senate race 138 | term = collections.OrderedDict([ 139 | ("type", "sen"), 140 | ("start", "{next_year}-01-03".format(next_year=ELECTION_YEAR+1)), 141 | ("end", "{in_six_years}-01-03".format(in_six_years=ELECTION_YEAR+1+6)), 142 | ("state", state), 143 | ("class", SENATE_CLASS), 144 | ("state_rank", None), # computed later 145 | ]) 146 | else: 147 | term = collections.OrderedDict([ 148 | ("type", "rep"), 149 | ("start", "{next_year}-01-03".format(next_year=ELECTION_YEAR+1)), 150 | ("end", "{in_two_years}-01-03".format(in_two_years=ELECTION_YEAR+1+2)), 151 | ("state", state), 152 | ("district", int(district)), 153 | ]) 154 | 155 | # If party is given in the table (for some incumbents and 156 | # all new winners), use it. Otherwise just make a field so 157 | # it's in the right order. 158 | term.update(collections.OrderedDict([ 159 | ("party", party_map[row['Party (D/R/I)']] if row['Party (D/R/I)'] else None), 160 | ])) 161 | p['terms'].append(term) 162 | if term['party'] == "Independent": 163 | term["caucus"] = row['Caucus'] 164 | 165 | if len(p['terms']) > 1: 166 | # This is an incumbent or at least served previously. 167 | # Copy some fields forward that are likely to remain the same, if we 168 | # haven't already set them. 169 | for k in ('party', 'caucus'): 170 | if k in p['terms'][-2] and not term.get(k): 171 | term[k] = p['terms'][-2][k] 172 | if len(p['terms']) > 1 and p["terms"][-2]["type"] == term["type"]: 173 | # Copy some more fields if the last term was in the same chamber. 
174 | for k in ('url', 'rss_url'):
175 | if k in p['terms'][-2] and not term.get(k):
176 | term[k] = p['terms'][-2][k]
177 | 
178 | election_results = csv.DictReader(open("archive/election_results_{year}.csv".format(year=ELECTION_YEAR)))
179 | for row in election_results:
180 | if row['Race'] == "": break # end of spreadsheet -- stop reading rows but still fall through to the updates and saves below
181 | try:
182 | process_row(row)
183 | except Exception:
184 | print(row)
185 | traceback.print_exc()
186 | print()
187 | 
188 | # End any current leadership roles.
189 | for p in legislators_current:
190 | for r in p.get('leadership_roles', []):
191 | if not r.get('end'):
192 | r['end'] = "{next_year}-01-03".format(next_year=ELECTION_YEAR+1)
193 | 
194 | # Split the legislators back into the historical and current lists:
195 | 
196 | # Move previously-current legislators into the historical list
197 | # if they are no longer serving, in the order that they appear
198 | # in the current list.
199 | for p in legislators_current:
200 | if p["id"]["govtrack"] not in current:
201 | legislators_historical.append(p)
202 | legislators_current = [p for p in legislators_current if p['id']['govtrack'] in current]
203 | 
204 | # Move former legislators forward into the current list if they
205 | # are returning to Congress, in the order they appear in the
206 | # historical list.
207 | for p in legislators_historical:
208 | if p["id"]["govtrack"] in current:
209 | legislators_current.append(p)
210 | legislators_historical = [p for p in legislators_historical if p['id']['govtrack'] not in current]
211 | 
212 | # Add new legislators in the order they occur in the election
213 | # results spreadsheet.
214 | for p in new_legislators:
215 | legislators_current.append(p)
216 | 
217 | # Re-compute the state_rank junior/senior status of all senators.
218 | # We'll get this authoritatively from the Senate by senate_contacts.py
219 | # once that data is up, but we'll make an educated guess now.
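# (Added overview, hedged: the four passes below assign 'senior' at most
# once per state, in priority order. For example, a state whose sitting
# senior senator was not up this cycle keeps that senator as 'senior' in
# the first pass, so a newly elected winner in the same state falls
# through to the final pass and is marked 'junior'.)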
220 | state_rank_assignment = set() 221 | # Senior senators not up for re-election keep their status: 222 | for p in legislators_current: 223 | term = p['terms'][-1] 224 | if term['type'] == 'sen' and term['class'] != SENATE_CLASS and term['state_rank'] == 'senior': 225 | state_rank_assignment.add(p['terms'][-1]['state']) 226 | # Senior senators who won re-election pull their status forward: 227 | for p in legislators_current: 228 | term = p['terms'][-1] 229 | if term['state'] in state_rank_assignment: continue # we already assigned the senior senator 230 | if term['type'] == 'sen' and term['class'] == SENATE_CLASS and len(p['terms']) > 1 \ 231 | and p['terms'][-2]['type'] == 'sen' and p['terms'][-2]['state'] == term['state'] and p['terms'][-2]['state_rank'] == 'senior': 232 | term['state_rank'] = 'senior' 233 | state_rank_assignment.add(p['terms'][-1]['state']) 234 | # Junior senators not up for re-election become senior if we didn't see a senior senator yet: 235 | for p in legislators_current: 236 | term = p['terms'][-1] 237 | if term['state'] in state_rank_assignment: continue # we already assigned the senior senator 238 | if term['type'] == 'sen' and term['class'] != SENATE_CLASS and term['state_rank'] == 'junior': 239 | term['state_rank'] = 'senior' 240 | state_rank_assignment.add(p['terms'][-1]['state']) 241 | # Remaining senators are senior if we haven't seen a senior senator yet, else junior: 242 | for p in legislators_current: 243 | term = p['terms'][-1] 244 | if term['type'] == 'sen' and term['state_rank'] is None: 245 | if term['state'] not in state_rank_assignment: 246 | term['state_rank'] = 'senior' 247 | state_rank_assignment.add(term['state']) 248 | else: 249 | term['state_rank'] = 'junior' 250 | 251 | # Save. 252 | print("Saving legislator data...") 253 | save_data(legislators_current, "legislators-current.yaml") 254 | save_data(legislators_historical, "legislators-historical.yaml") 255 | 256 | # Run the sweep script to clear out data that needs to be cleared out 257 | # for legislators that are gone. 258 | import sweep 259 | sweep.run() 260 | 261 | # Clears committee membership. 262 | save_data({}, "committee-membership-current.yaml") 263 | 264 | if __name__ == "__main__": 265 | run() 266 | -------------------------------------------------------------------------------- /scripts/email/config.yml.example: -------------------------------------------------------------------------------- 1 | # email: 2 | # # smtp details 3 | # hostname: 4 | # port: 5 | # user_name: 6 | # password: 7 | # starttls: 8 | # # email defaults 9 | # subject: "[unitedstates/congress-legislators] Notice" 10 | # from: 11 | # from_name: "unitedstates" 12 | # to: -------------------------------------------------------------------------------- /scripts/export_csv.py: -------------------------------------------------------------------------------- 1 | # Converts the specified YAML file to an equivalent-ish CSV file 2 | # (on standard output). 3 | # 4 | # python export_csv.py ../legislators-current.yaml 5 | 6 | import sys, csv 7 | from collections import OrderedDict 8 | 9 | from utils import yaml_load 10 | 11 | def run(): 12 | 13 | if len(sys.argv) < 2: 14 | print("Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv") 15 | sys.exit(0) 16 | 17 | data = yaml_load(sys.argv[1]) 18 | 19 | ############################################### 20 | 21 | def flatten_object(obj, path, ret): 22 | """Takes an object obj and flattens it into a dictionary ret. 
23 | 
24 | For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }.
25 | """
26 | for k, v in list(obj.items()):
27 | if isinstance(v, dict):
28 | flatten_object(v, (path + "__" if path else "") + k + "__", ret)
29 | elif isinstance(v, list):
30 | # don't peek inside lists
31 | pass
32 | else:
33 | ret[path + k] = v
34 | return ret
35 | 
36 | # Scan through the records recursively to get a list of column names.
37 | # Attempt to preserve the field order as found in the YAML file. Since
38 | # any field may be absent, no one record can provide the complete field
39 | # order. Build the best field order by looking at what each field tends
40 | # to be preceded by.
41 | fields = set()
42 | preceding_keys = dict() # maps keys to a dict of *previous* keys and how often they occurred
43 | for record in data:
44 | prev_key = None
45 | for key in flatten_object(record, "", OrderedDict()):
46 | fields.add(key)
47 | 
48 | preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
49 | preceding_keys[key][prev_key] += 1
50 | prev_key = key
51 | 
52 | # Convert to relative frequencies.
53 | for k, v in list(preceding_keys.items()):
54 | s = float(sum(v.values()))
55 | for k2 in v:
56 | v[k2] /= s
57 | 
58 | # Get a good order for the fields. Greedily add keys from left to right,
59 | # maximizing the conditional probability that the preceding key would
60 | # precede the key on the right.
61 | field_order = [None]
62 | prev_key = None
63 | while len(field_order) < len(fields):
64 | # Which key is such that prev_key is its most likely predecessor?
65 | # We do it this way (and not what is prev_key's most likely follower)
66 | # because we should be using a probability (of sorts) that is
67 | # conditional on the key being present. Otherwise we would lose
68 | # infrequent keys.
69 | next_key = max([f for f in fields if f not in field_order], key =
70 | lambda k :
71 | max(preceding_keys[k].get(pk, 0) for pk in field_order))
72 | field_order.append(next_key)
73 | prev_key = next_key
74 | field_order = field_order[1:] # remove the None at the start
75 | 
76 | # Write CSV header.
77 | w = csv.writer(sys.stdout)
78 | w.writerow(field_order)
79 | 
80 | # Write the objects.
81 | for record in data:
82 | obj = flatten_object(record, "", {})
83 | w.writerow([
84 | obj.get(f, "")
85 | for f in field_order
86 | ])
87 | 
88 | if __name__ == '__main__':
89 | run()
-------------------------------------------------------------------------------- /scripts/geocode_offices.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Geocodes district office addresses using Google Maps.
4 | # Opens legislators-district-offices.yaml, finds offices
5 | # that haven't previously been geocoded and have a street
6 | # address, city, and state, then geocodes them and adds
7 | # latitude and longitude fields to the office object
8 | # and writes back to the same file.
9 | #
10 | # Assumes you have a Google Maps API key in
11 | # scripts/cache/google_maps_api_key.txt, and that
12 | # this key is enabled for the Geocoding API in the
13 | # Google APIs Console.
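# (Added setup sketch, not part of the original script -- the key value
# below is a placeholder, not a real credential:)
#
#   # run once from the scripts/ directory
#   open('cache/google_maps_api_key.txt', 'w').write('YOUR_GOOGLE_MAPS_API_KEY')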
14 | 15 | import requests 16 | import utils 17 | 18 | class GeocodeException(Exception): 19 | def __init__(self, message): 20 | super(GeocodeException, self).__init__(message) 21 | 22 | def run(legislator_ids=None): 23 | legislators = utils.load_data('legislators-district-offices.yaml') 24 | try: 25 | for l in legislators: 26 | if legislator_ids and l['id']['bioguide'] not in legislator_ids: 27 | continue 28 | geocode_offices(l) 29 | finally: 30 | # Save in-progress geocodes in case of keyboard interrupt 31 | print("Saving data...") 32 | utils.save_data(legislators, 'legislators-district-offices.yaml') 33 | 34 | def geocode_offices(l): 35 | for o in l.get('offices', []): 36 | if o.get('latitude'): 37 | continue 38 | if not o.get('address') or not o.get('city') or not o.get('state'): 39 | continue 40 | address_query = ', '.join([o['address'], o['city'], utils.states[o['state']]]) 41 | result = None 42 | try: 43 | result = geocode(address_query) 44 | _sanity_check_location(o, l['id']['bioguide'], result) 45 | except GeocodeException as e: 46 | print('Geocoding failed for %s office %s (%s): %s. Query: "%s". Result: "%s"' % ( 47 | l['id']['bioguide'], o['city'], o['address'], e, address_query, 48 | result['formatted_address'] if result else None)) 49 | continue 50 | 51 | location = result['geometry']['location'] 52 | o['latitude'] = location['lat'] 53 | o['longitude'] = location['lng'] 54 | print('Success: %s office %s, query "%s" geocoded to "%s" (%s,%s)' % ( 55 | l['id']['bioguide'], o['city'], address_query, result['formatted_address'], 56 | location['lat'], location['lng'])) 57 | 58 | def geocode(address): 59 | params = { 60 | 'address': address, 61 | 'key': _get_api_key(), 62 | } 63 | response = requests.get('https://maps.googleapis.com/maps/api/geocode/json', params=params) 64 | js = response.json() 65 | if js.get('status') != 'OK': 66 | raise GeocodeException('Non-success response from geocoder: %s' % js.get('status')) 67 | return js['results'][0] 68 | 69 | _api_key = None 70 | 71 | def _get_api_key(): 72 | global _api_key 73 | if not _api_key: 74 | _api_key = open('cache/google_maps_api_key.txt').read().strip() 75 | return _api_key 76 | 77 | def _find_address_component(geocode_result, component_type): 78 | for component in geocode_result['address_components']: 79 | if component_type in component['types']: 80 | return component 81 | return None 82 | 83 | SANITY_CHECK_EXEMPTIONS = ( 84 | # (bioguide, office_city) 85 | ('B001295', 'Mt. Vernon'), 86 | ('B001290', 'Spotsylvania'), 87 | ('B001300', 'San Pedro'), 88 | ('C000984', 'Ellicott'), 89 | ('C001038', 'Bronx'), 90 | ('C001038', 'Queens'), 91 | ('C001067', 'Brooklyn'), 92 | ('D000482', 'Penn Hills'), 93 | ('D000625', 'Brooklyn'), 94 | ('D000625', 'Staten Island'), 95 | ('D000626', 'West Chester'), 96 | ('E000179', 'Bronx'), 97 | ('E000179', 'Mt. 
Vernon'),
98 | ('H000324', 'Mangonia Park'),
99 | ('H001059', 'Campton Hills'),
100 | ('J000294', 'Brooklyn'),
101 | ('K000375', 'Hyannis'),
102 | ('M000087', 'Astoria'),
103 | ('M000087', 'Brooklyn'),
104 | ('M001137', 'Arverne'),
105 | ('M001137', 'Jamaica'),
106 | ('M001151', 'Pittsburgh'),
107 | ('M001179', 'Lake Ariel'),
108 | ('M001188', 'Flushing'),
109 | ('M001188', 'Forest Hills'),
110 | ('M001193', 'Marlton'),
111 | ('M001201', 'Shelby Township'),
112 | ('N000002', 'Brooklyn'),
113 | ('N000032', 'Fort Lauderdale'),
114 | ('P000605', 'York'),
115 | ('Q000023', 'Lakeview'),
116 | ('R000486', 'Commerce'),
117 | ('R000576', 'Timonium'),
118 | ('R000601', 'Rockwall'),
119 | ('S000248', 'Bronx'),
120 | ('S000522', 'Hamilton'),
121 | ('V000081', 'Brooklyn'),
122 | ('W000808', 'Miami Gardens'),
123 | ('W000822', 'Ewing'),
124 | ('S000522', 'Plumsted'),
125 | )
126 | 
127 | def _sanity_check_location(office, bioguide_id, geocode_result):
128 | for exemption in SANITY_CHECK_EXEMPTIONS:
129 | if bioguide_id == exemption[0] and office['city'] == exemption[1]:
130 | return
131 | 
132 | state_result_component = _find_address_component(geocode_result, 'administrative_area_level_1')
133 | if not state_result_component:
134 | raise GeocodeException('No state code found in geocode result')
135 | result_state = state_result_component['short_name']
136 | if result_state != office['state']:
137 | raise GeocodeException('Geocode result is not in the right state')
138 | 
139 | city_result_component = _find_address_component(geocode_result, 'locality')
140 | if not city_result_component:
141 | raise GeocodeException('No city found in geocode result')
142 | result_city = city_result_component['long_name']
143 | result_city_alt = city_result_component['short_name']
144 | if not (_do_city_names_match(result_city, office['city']) or _do_city_names_match(result_city_alt, office['city'])):
145 | # For big cities, Google Maps seems to consider the "city" to be e.g. Los Angeles
146 | # even though the mailing address and colloquial address may be e.g. Panorama City.
147 | # This common name is in the "neighborhood" component, so look at that too.
148 | result_subcity_component = _find_address_component(geocode_result, 'neighborhood')
149 | if result_subcity_component:
150 | result_subcity = result_subcity_component['long_name']
151 | if _do_city_names_match(result_subcity, office['city']):
152 | return
153 | raise GeocodeException('Geocode result is not in the right city')
154 | 
155 | def _do_city_names_match(name1, name2):
156 | return name1.lower().replace('.', '') == name2.lower().replace('.', '')
157 | 
158 | if __name__ == '__main__':
159 | import sys
160 | run(legislator_ids=sys.argv[1:])
161 | 
-------------------------------------------------------------------------------- /scripts/historical_committees.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Collect all committees and subcommittees that appear in the
4 | # govinfo.gov bulk bill-status data (113th Congress forward by
5 | # default) and store them in the committees-historical.yaml file.
6 | # It will include current committees as well.
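# (Added usage sketch -- the flag names are inferred from the
# utils.flags() reads below, so treat the exact spellings as assumptions:)
#
#   python historical_committees.py --cache                  # reuse cached bulk downloads
#   python historical_committees.py --cache --congress=118   # restrict to one congress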
7 | 
8 | import zipfile
9 | from collections import OrderedDict
10 | import utils
11 | from utils import load_data, save_data, CURRENT_CONGRESS, scraper
12 | import io
13 | import lxml.etree
14 | 
15 | def run():
16 | committees_historical = load_data("committees-historical.yaml")
17 | 
18 | # caching is effectively required: this script downloads large bulk data files
19 | flags = utils.flags()
20 | cache = flags.get('cache', False)
21 | 
22 | if cache:
23 | from scrapelib.cache import FileCache
24 | scraper.cache_storage = FileCache('cache')
25 | scraper.cache_write_only = False
26 | else:
27 | raise Exception("Run this script with --cache; un-cached bulk downloads are not supported.")
28 | 
29 | # map thomas_id's to their dicts
30 | committees_historical_ref = { }
31 | for cx in committees_historical:
32 | committees_historical_ref[cx["thomas_id"]] = cx
33 | 
34 | 
35 | # pick the range of committees to get
36 | single_congress = flags.get('congress', False)
37 | if single_congress:
38 | start_congress = int(single_congress)
39 | end_congress = int(single_congress) + 1
40 | else:
41 | start_congress = 113
42 | end_congress = CURRENT_CONGRESS + 1
43 | 
44 | 
45 | urls = {'senate': 'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/s/BILLSTATUS-{congress}-s.zip',
46 | 'house': 'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/hr/BILLSTATUS-{congress}-hr.zip'}
47 | 
48 | all_committees = {'house': {}, 'senate': {}}
49 | 
50 | for congress in range(start_congress, end_congress):
51 | for chamber, bill_status_url in urls.items():
52 | chamber_committees = all_committees[chamber]
53 | 
54 | url = bill_status_url.format(congress=congress)
55 | response = scraper.get(url)
56 | 
57 | with zipfile.ZipFile(io.BytesIO(response.content)) as z:
58 | for name in z.namelist():
59 | if name.startswith('BILLSTATUS'):
60 | with z.open(name) as xml_file:
61 | bill_status = lxml.etree.parse(xml_file)
62 | committees = bill_status.xpath('//billCommittees/item')
63 | for committee in committees:
64 | code = str(committee.xpath('./systemCode/text()')[0])
65 | name = str(committee.xpath('./name/text()')[0])
66 | if name.endswith(' Committee'):
67 | name = name[:-10]
68 | if code not in chamber_committees:
69 | chamber_committees[code] = {'names': {congress: name},
70 | 'subcommittees': {}}
71 | else:
72 | if congress not in chamber_committees[code]['names']: # keep the first name seen for this congress
73 | chamber_committees[code]['names'][congress] = name
74 | 
75 | subcommittees_d = chamber_committees[code]['subcommittees']
76 | for subcommittee in committee.xpath('./subcommittees/item'):
77 | code = str(subcommittee.xpath('./systemCode/text()')[0])
78 | name = str(subcommittee.xpath('./name/text()')[0])
79 | if name.endswith(' Subcommittee'):
80 | name = name[:-13]
81 | if code not in subcommittees_d:
82 | subcommittees_d[code] = {congress: name}
83 | else:
84 | if congress not in subcommittees_d[code]:
85 | subcommittees_d[code][congress] = name
86 | 
87 | import pprint
88 | pprint.pprint(chamber_committees)
89 | print(len(chamber_committees))
90 | 
91 | 
92 | for chamber, committees in all_committees.items():
93 | for code, committee in committees.items():
94 | id = str(code).upper()
95 | 
96 | id = id[:-2]
97 | 
98 | if id in committees_historical_ref:
99 | # Update existing record.
100 | cx = committees_historical_ref[id]
101 | 
102 | else:
103 | # Create a new record.
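# (Added note, hedged: the fields set below appear to follow the
# committees-historical.yaml conventions -- a chamber type, a display
# name, and the THOMAS-style id that the rest of this script keys on.)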
104 | cx = OrderedDict()
105 | committees_historical_ref[id] = cx
106 | cx['type'] = chamber.lower()
107 | if id[0] != "J": # Joint committees show their full name, otherwise they show a partial name
108 | cx['name'] = chamber.capitalize() + " Committee on " + committee['names'][min(committee['names'])] # use this committee's earliest name (the bare loop variable 'name' would be stale here)
109 | else:
110 | cx['name'] = committee['names'][min(committee['names'])]
111 | cx['thomas_id'] = id
112 | committees_historical.append(cx)
113 | 
114 | for code, subcommittee in committee['subcommittees'].items():
115 | 
116 | for sx in cx.setdefault('subcommittees', []):
117 | if sx['thomas_id'] == code[-2:]:
118 | # found existing record
119 | break
120 | else:
121 | # 'break' not executed, so create a new record
122 | sx = OrderedDict()
123 | sx['name'] = subcommittee[min(subcommittee)]
124 | sx['thomas_id'] = code[-2:]
125 | cx['subcommittees'].append(sx)
126 | 
127 | 
128 | sx.setdefault('congresses', [])
129 | sx.setdefault('names', {})
130 | 
131 | for congress, name in subcommittee.items():
132 | if congress not in sx['congresses']:
133 | sx['congresses'].append(congress)
134 | 
135 | sx['names'][congress] = name
136 | 
137 | cx.setdefault('congresses', [])
138 | cx.setdefault('names', {})
139 | 
140 | for congress, name in committee['names'].items():
141 | if congress not in cx['congresses']:
142 | cx['congresses'].append(congress)
143 | cx['names'][congress] = name
144 | 
145 | 
146 | # TODO
147 | # after checking diff on first commit, we should re-sort
148 | #committees_historical.sort(key = lambda c : c["thomas_id"])
149 | #for c in committees_historical:
150 | # c.get("subcommittees", []).sort(key = lambda s : s["thomas_id"])
151 | 
152 | save_data(committees_historical, "committees-historical.yaml")
153 | 
154 | if __name__ == '__main__':
155 | run()
156 | 
-------------------------------------------------------------------------------- /scripts/house_contacts.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Update current members' contact info from the Clerk's XML feed
4 | 
5 | import requests
6 | import lxml
7 | import re
8 | from datetime import datetime
9 | 
10 | from utils import load_data, save_data, parse_date
11 | 
12 | def run():
13 | today = datetime.now().date()
14 | 
15 | y = load_data("legislators-current.yaml")
16 | 
17 | # TODO use download util?
18 | xml = requests.get("https://clerk.house.gov/xml/lists/MemberData.xml")
19 | #xml = requests.get("https://clerk.house.gov/xml/lists/unofficial-118-member-elect-data.xml")
20 | root=lxml.etree.fromstring(xml.content)
21 | 
22 | for moc in y:
23 | try:
24 | term = moc["terms"][-1]
25 | except IndexError:
26 | print("Member has no terms", moc)
27 | continue
28 | 
29 | if term["type"] != "rep": continue
30 | 
31 | if today < parse_date(term["start"]) or today > parse_date(term["end"]):
32 | print("Member's last listed term is not current", moc, term["start"])
33 | continue
34 | 
35 | ssdd = "%s%02d" % (term["state"], term["district"])
36 | 
37 | query_str = "./members/member/[statedistrict='%s']" % ssdd
38 | 
39 | # Odd state abbreviation: the Clerk's feed uses AQ for American Samoa.
40 | query_str = query_str.replace("AS00", "AQ00")
41 | 
42 | mi = root.findall(query_str)[0].find('member-info')
43 | 
44 | # Check that the bioguide ID matches.
45 | bioguideid = mi.find('bioguideID').text
46 | if moc['id'].get('bioguide') is not None and \
47 | bioguideid != moc['id']['bioguide']:
48 | print("Warning: Bioguide ID did not match for %s%02d (%s != %s)" % (
49 | term["state"], term["district"],
50 | bioguideid, moc['id']['bioguide']))
51 | elif moc['id'].get('bioguide') is None:
52 | # At the start of a Congress, we can import the Bioguide ID from
53 | # the official data since we matched on state & district.
54 | 
55 | # To keep the field order nice, insert it at the start of the
56 | # IDs list.
57 | moc['id'] = dict([("bioguide", bioguideid)]
58 | + list(moc['id'].items()))
59 | 
60 | # for now, no automatic name updates since there is disagreement on how to handle them
61 | # firstname = mi.find('firstname').text
62 | # middlename = mi.find('middlename').text #could be empty
63 | # lastname = mi.find('lastname').text
64 | 
65 | if mi.find('official-name') is None or mi.find('official-name').text is None:
66 | print("Warning: No official-name tag for %s" % ssdd)
67 | officialname = None
68 | else:
69 | officialname = re.sub("'", "’", mi.find('official-name').text)
70 | 
71 | office_room = mi.find('office-room').text
72 | office_building = mi.find('office-building').text
73 | 
74 | office_building_full = office_building.replace("RHOB", "Rayburn House Office Building")
75 | office_building_full = office_building_full.replace("CHOB", "Cannon House Office Building")
76 | office_building_full = office_building_full.replace("LHOB", "Longworth House Office Building")
77 | 
78 | office_zip = mi.find('office-zip').text
79 | office_zip_suffix = mi.find('office-zip-suffix').text
80 | 
81 | office = "{} {}".format(office_room, office_building_full)
82 | address = "{} {} Washington DC {}-{}".format(office_room, office_building_full, office_zip, office_zip_suffix)
83 | 
84 | phone = mi.find('phone').text
85 | phone_parsed = re.sub(r"^\((\d\d\d)\) ", lambda m : m.group(1) + "-", phone) # replace (XXX) area code with XXX- for compatibility w/ existing format
86 | 
87 | # for now, no automatic name updates since there is disagreement on how to handle them
88 | # moc["name"]["first"] = firstname
89 | # if (middlename):
90 | # moc["name"]["middle"] = middlename
91 | # else:
92 | # if ("middle" in moc["name"]):
93 | # del moc["name"]["middle"]
94 | # moc["name"]["last"] = lastname
95 | 
96 | # TODO: leave if none?
97 | if (officialname):
98 | moc["name"]["official_full"] = officialname
99 | term["address"] = address
100 | term["office"] = office
101 | term["phone"] = phone_parsed
102 | 
103 | save_data(y, "legislators-current.yaml")
104 | 
105 | if __name__ == '__main__':
106 | run()
107 | 
-------------------------------------------------------------------------------- /scripts/house_history.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Stores a house_history ID for all legislators that don't yet
4 | # have one, by scraping history.house.gov.
5 | 
6 | import lxml.html, io
7 | import requests
8 | from utils import load_data, save_data
9 | import sys
10 | 
11 | def run():
12 | # load legislators YAML files
13 | yamlfiles = { }
14 | for fn in ('historical', 'current'):
15 | fn = 'legislators-%s.yaml' % fn
16 | print("Loading %s..."
% fn) 17 | yamlfiles[fn] = load_data(fn) 18 | 19 | # reoriented cache to access by bioguide ID 20 | by_bioguide = { } 21 | known_house_history_ids = set() 22 | for legislators in yamlfiles.values(): 23 | for m in legislators: 24 | if "bioguide" in m["id"]: 25 | by_bioguide[m["id"]["bioguide"]] = m 26 | if "house_history" in m["id"]: 27 | known_house_history_ids.add(m["id"]["house_history"]) 28 | count = 0 29 | 30 | # scrape history.house.gov 31 | if len(sys.argv) == 1: 32 | id_range = range(22000, 25000) 33 | else: 34 | id_range = [int(arg) for arg in sys.argv[1:]] 35 | for id in id_range: 36 | # skip known IDs 37 | if id in known_house_history_ids: 38 | continue 39 | print(id) 40 | bioguide_id = get_bioguide_for_house_history_id(id) 41 | if bioguide_id and bioguide_id in by_bioguide: 42 | print(id, bioguide_id) 43 | by_bioguide[bioguide_id]["id"]["house_history"] = id 44 | count = count + 1 45 | 46 | # write YAML files to disk 47 | for filename, legislators in yamlfiles.items(): 48 | print("Saving data to %s..." % filename) 49 | save_data(legislators, filename) 50 | 51 | # how many updates did we make? 52 | print("Saved %d legislators" % count) 53 | 54 | def get_bioguide_for_house_history_id(id): 55 | url = "http://history.house.gov/People/Detail/%s" % id 56 | r = requests.get(url, allow_redirects=False) 57 | if r.status_code == 200: 58 | dom = lxml.html.parse(io.StringIO(r.text)).getroot() 59 | try: 60 | bioguide_link = dom.cssselect("a.view-in-bioguide")[0].get('href') 61 | return bioguide_link.split('=')[1] 62 | except: 63 | return None 64 | else: 65 | return None 66 | 67 | if __name__ == '__main__': 68 | run() -------------------------------------------------------------------------------- /scripts/house_websites.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Uses https://www.house.gov/representatives/ to scrape official member websites. 4 | # Only known source. 5 | 6 | # Assumptions: 7 | # member's state and district fields are present and accurate. 8 | # member's most recent term in the terms field is their current one. 
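# (Added usage sketch -- the --cache flag is read via utils.flags() in
# run() below; treat the exact spelling as an assumption:)
#
#   python house_websites.py          # re-download the House directory
#   python house_websites.py --cache  # reuse a previously downloaded copy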
9 | 10 | import lxml.html, io, urllib.request, urllib.error, urllib.parse 11 | import re 12 | import utils 13 | from utils import load_data, save_data, states as state_names 14 | 15 | 16 | def run(): 17 | 18 | # default to not caching 19 | cache = utils.flags().get('cache', False) 20 | force = not cache 21 | 22 | 23 | states = [] 24 | current = load_data("legislators-current.yaml") 25 | by_district = { } 26 | for m in current: 27 | last_term = m['terms'][-1] 28 | if last_term['type'] != 'sen': 29 | state = last_term['state'] 30 | 31 | full_district = "%s%02d" % (state, int(last_term['district'])) 32 | by_district[full_district] = m 33 | 34 | if not state in states: 35 | states.append(state) 36 | 37 | destination = "legislators/house.html" 38 | url = "https://www.house.gov/representatives/" 39 | body = utils.download(url, destination, force) 40 | if not body: 41 | print("Couldn't download House listing!") 42 | exit(0) 43 | 44 | try: 45 | dom = lxml.html.parse(io.StringIO(body)).getroot() 46 | except lxml.etree.XMLSyntaxError: 47 | print("Error parsing House listing!") 48 | exit(0) 49 | 50 | 51 | # process: 52 | # go through every state in our records, fetching that state's table 53 | # go through every row after the first, pick the district to isolate the member 54 | # pluck out the URL, update that member's last term's URL 55 | count = 0 56 | for state in sorted(states): 57 | state_name = state_names[state].lower().replace(' ', '-') 58 | table = dom.cssselect("table.table caption#state-%s" % state_name)[0].getparent() 59 | rows = table.cssselect("tbody tr") 60 | 61 | for row in rows: 62 | cells = row.cssselect("td") 63 | if not cells: 64 | continue 65 | 66 | district = str(cells[0].text_content()).strip() 67 | if ( 68 | (district == "At Large") 69 | or (district == "Delegate") 70 | or (district == "Resident Commissioner") 71 | ): 72 | district = 0 73 | else: 74 | district = int(re.sub(r'[^\d]', '', district)) 75 | 76 | url = cells[1].cssselect("a")[0].get("href") 77 | original_url = url 78 | 79 | # The House uses subdomains now, and occasionally the directory 80 | # uses URLs with some trailing redirected-to page, like /home. 81 | # We can safely use the subdomain as the root, to be future-proof 82 | # against redirects changing mid-session. 83 | 84 | # We should still follow any redirects, and not just trust the 85 | # directory to have the current active subdomain. As an example, 86 | # the directory lists randyforbes.house.gov, which redirects to 87 | # forbes.house.gov. 88 | resp = urllib.request.urlopen(url) 89 | url = resp.geturl() 90 | 91 | # kill everything after the domain 92 | url = re.sub(".gov/.*$", ".gov", url) 93 | 94 | if state == "AQ": 95 | state = "AS" 96 | full_district = "%s%02d" % (state, int(district)) 97 | if full_district in by_district: 98 | print("[%s] %s %s" % (full_district, url, "" if url == original_url.rstrip("/") else (" <= " + original_url))) 99 | by_district[full_district]['terms'][-1]['url'] = url 100 | else: 101 | print("[%s] No current legislator" % full_district) 102 | 103 | count += 1 104 | 105 | print("Processed %i people rows on House listing." 
% count) 106 | 107 | print("Saving data...") 108 | save_data(current, "legislators-current.yaml") 109 | 110 | if __name__ == '__main__': 111 | run() 112 | -------------------------------------------------------------------------------- /scripts/icpsr_ids.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # gets ICPSR ID for every member 4 | 5 | # options: 6 | # --cache: load from cache if present on disk (default: true) 7 | # --bioguide: load only one legislator, by his/her bioguide ID 8 | # --congress: do *only* updates for legislators serving in specific congress 9 | 10 | import utils 11 | from utils import load_data, save_data, parse_date 12 | import csv 13 | from io import StringIO 14 | 15 | def run(): 16 | 17 | # default to caching 18 | cache = utils.flags().get('cache', True) 19 | force = not cache 20 | 21 | 22 | only_bioguide = utils.flags().get('bioguide', None) 23 | congress = utils.flags().get('congress',None) 24 | 25 | 26 | data_files = [] 27 | 28 | print("Loading %s..." % "legislators-current.yaml") 29 | legislators = load_data("legislators-current.yaml") 30 | data_files.append((legislators,"legislators-current.yaml")) 31 | print("Loading %s..." % "legislators-historical.yaml") 32 | legislators = load_data("legislators-historical.yaml") 33 | data_files.append((legislators,"legislators-historical.yaml")) 34 | 35 | # load member data from vote view 36 | if congress == None: 37 | raise Exception("the --congress flag is required") 38 | elif int(congress) < 10 and int(congress) > 0: 39 | url_senate = "https://voteview.com/static/data/out/members/S00%s_members.csv" % congress 40 | url_house = "https://voteview.com/static/data/out/members/H00%s_members.csv" % congress 41 | elif int(congress) < 100 and int(congress) >= 10: 42 | url_senate = "https://voteview.com/static/data/out/members/S0%s_members.csv" % congress 43 | url_house = "https://voteview.com/static/data/out/members/H0%s_members.csv" % congress 44 | elif int(congress) >= 100: 45 | url_senate = "https://voteview.com/static/data/out/members/S%s_members.csv" % congress 46 | url_house = "https://voteview.com/static/data/out/members/H%s_members.csv" % congress 47 | else: 48 | raise Exception("no data for congress " + congress) 49 | 50 | senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress 51 | senate_data = utils.download(url_senate, senate_destination, force) 52 | 53 | house_destination = "icpsr/source/house_rollcall%s.txt" % congress 54 | house_data = utils.download(url_house, house_destination, force) 55 | 56 | error_log = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w")) 57 | error_log.writerow(["error_type","matches","icpsr_name","icpsr_state","is_territory","old_id","new_id"]) 58 | 59 | 60 | 61 | read_files = [("sen",senate_data),("rep",house_data)] 62 | print("Running for congress " + congress) 63 | for read_file_chamber,read_file_content in read_files: 64 | for data_file in data_files: 65 | for legislator in data_file[0]: 66 | num_matches = 0 67 | write_id = "" 68 | # this can't run unless we've already collected a bioguide for this person 69 | bioguide = legislator["id"].get("bioguide", None) 70 | # if we've limited this to just one bioguide, skip over everyone else 71 | if only_bioguide and (bioguide != only_bioguide): 72 | continue 73 | #if not in currently read chamber, skip 74 | chamber = legislator['terms'][len(legislator['terms'])-1]['type'] 75 | if chamber != read_file_chamber: 76 | continue 77 | 78 | #only 
run for selected congress
79 | latest_congress = utils.congress_from_legislative_year(utils.legislative_year(parse_date(legislator['terms'][len(legislator['terms'])-1]['start'])))
80 | if chamber == "sen":
81 | congresses = [latest_congress,latest_congress+1,latest_congress+2]
82 | else:
83 | congresses = [latest_congress]
84 | 
85 | if int(congress) not in congresses:
86 | continue
87 | 
88 | # pull data to match from yaml
89 | 
90 | last_name = legislator['name']['last'].upper()
91 | state = utils.states[legislator['terms'][len(legislator['terms'])-1]['state']].upper()[:7].strip()
92 | 
93 | # convert read_file_content str to file object, then parse as csv file
94 | content_as_file = StringIO(read_file_content)
95 | content_parsed = csv.reader(content_as_file, delimiter=',')
96 | 
97 | # loop through congress members in read file, see if one matches the current legislator
98 | for icpsr_member in content_parsed:
99 | # ensure a unique match based on bioguide id
100 | if bioguide == icpsr_member[10]:
101 | num_matches += 1
102 | write_id = int(icpsr_member[2])
103 | 
104 | # skip if icpsr id is currently in data
105 | if "icpsr" in legislator["id"]:
106 | if write_id == legislator["id"]["icpsr"] or write_id == "":
107 | continue
108 | elif write_id != legislator["id"]["icpsr"] and write_id != "":
109 | error_log.writerow(["Incorrect_ID","NA",last_name[:8],state,"NA",legislator["id"]["icpsr"],write_id])
110 | print("ID updated for %s" % last_name)
111 | 
112 | if num_matches == 1:
113 | legislator['id']['icpsr'] = int(write_id)
114 | else:
115 | if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER" or state == "PHILIPP":
116 | print('error: non 1 match')
117 | error_log.writerow(["Non_1_match_number",str(num_matches),last_name[:8],state,"Y","NA","NA"])
118 | else:
119 | print(str(num_matches) + " matches found for "+ last_name[:8] + ", " + state + " in congress " + str(congress))
120 | error_log.writerow(["Non_1_match_number",str(num_matches),last_name,state,"N","NA","NA"])
121 | 
122 | save_data(data_file[0], data_file[1])
123 | 
124 | ## the following three lines can be run as a separate script to update icpsr id's for all historical congresses
125 | # import os
126 | 
127 | # for i in range(1,114):
128 | # os.system("python ICPSR_id.py --congress=" + str(i))
129 | 
130 | if __name__ == '__main__':
131 | run()
132 | 
-------------------------------------------------------------------------------- /scripts/influence_ids.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # gets CRP id for every member with a bioguide ID:
4 | 
5 | # options:
6 | # --cache: load from cache if present on disk (default: false)
7 | # --current: do *only* current legislators (default: true)
8 | # --historical: do *only* historical legislators (default: false)
9 | 
10 | import utils
11 | from utils import load_data, save_data
12 | import json
13 | 
14 | def run():
15 | 
16 | options = utils.flags()
17 | options['urllib'] = True # disable scrapelib for this
18 | 
19 | debug = options.get('debug', False)
20 | 
21 | # default to NOT caching
22 | cache = options.get('cache', False)
23 | force = not cache
24 | 
25 | 
26 | only_bioguide = options.get('bioguide', None)
27 | 
28 | 
29 | # pick either current or historical
30 | # order is important here, since current defaults to true
31 | if utils.flags().get('historical', False):
32 | filename = "legislators-historical.yaml"
33 | elif
utils.flags().get('current', True): 34 | filename = "legislators-current.yaml" 35 | else: 36 | print("No legislators selected.") 37 | exit(0) 38 | 39 | 40 | print("Loading %s..." % filename) 41 | legislators = load_data(filename) 42 | 43 | 44 | api_file = open('cache/sunlight_api_key.txt','r') 45 | api_key = api_file.read() 46 | 47 | 48 | for m in legislators: 49 | 50 | # this can't run unless we've already collected a bioguide for this person 51 | bioguide = m["id"].get("bioguide", None) 52 | if not bioguide: 53 | continue 54 | # if we've limited this to just one bioguide, skip over everyone else 55 | if only_bioguide and (bioguide != only_bioguide): 56 | continue 57 | 58 | url_BG = "http://transparencydata.com/api/1.0/entities/id_lookup.json?bioguide_id=" 59 | url_BG += bioguide 60 | url_BG += "&apikey="+api_key 61 | 62 | 63 | destination = "legislators/influence_explorer/lookups/%s.json" % bioguide 64 | if debug: print("[%s] Looking up ID..." % bioguide) 65 | body = utils.download(url_BG, destination, force, options) 66 | 67 | if not body: 68 | print("[%s] Bad request, skipping" % bioguide) 69 | continue 70 | 71 | jsondata = json.loads(body) 72 | if (jsondata != []): 73 | IE_ID = jsondata[0]['id'] 74 | url_CRP = "http://transparencydata.com/api/1.0/entities/" 75 | url_CRP += IE_ID 76 | url_CRP += ".json?apikey=" + api_key 77 | 78 | destination = "legislators/influence_explorer/entities/%s.json" % IE_ID 79 | body = utils.download(url_CRP, destination, force, options) 80 | 81 | jsondata = json.loads(body) 82 | 83 | opensecrets_id = None 84 | fec_ids = [] 85 | for external in jsondata['external_ids']: 86 | if external["namespace"].startswith("urn:crp"): 87 | opensecrets_id = external['id'] 88 | elif external["namespace"].startswith("urn:fec"): 89 | fec_ids.append(external['id']) 90 | 91 | if opensecrets_id: 92 | m["id"]["opensecrets"] = opensecrets_id 93 | 94 | # preserve existing FEC IDs, but don't duplicate them 95 | if len(fec_ids) > 0: 96 | if m["id"].get("fec", None) is None: m["id"]["fec"] = [] 97 | for fec_id in fec_ids: 98 | if fec_id not in m["id"]["fec"]: 99 | m["id"]["fec"].append(fec_id) 100 | 101 | print("[%s] Added opensecrets ID of %s" % (bioguide, opensecrets_id)) 102 | else: 103 | print("[%s] NO DATA" % bioguide) 104 | 105 | 106 | 107 | 108 | print("Saving data to %s..." % filename) 109 | save_data(legislators, filename) 110 | 111 | if __name__ == '__main__': 112 | run() -------------------------------------------------------------------------------- /scripts/lint.py: -------------------------------------------------------------------------------- 1 | # Just loads and saves each .yaml file to normalize serialization syntax. 2 | # 3 | # python lint.py 4 | # ... will lint every .yaml file in the data directory. 5 | # 6 | # python lint.py file1.yaml file2.yaml ... 7 | # ... will lint the specified files. 8 | 9 | import glob, sys 10 | from utils import yaml_load, yaml_dump, data_dir 11 | 12 | def run(): 13 | for fn in glob.glob(data_dir() + "/*.yaml") if len(sys.argv) == 1 else sys.argv[1:]: 14 | print(fn + "...") 15 | data = yaml_load(fn, use_cache=False) 16 | yaml_dump(data, fn) 17 | 18 | if __name__ == '__main__': 19 | run() -------------------------------------------------------------------------------- /scripts/office_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Run validation tests on district office data. 
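Usage (a sketch; --skip-warnings is the only flag, defined by the
argparse setup at the bottom of this file):

    python office_validator.py [--skip-warnings]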
4 | 5 | 6 | For each legislator: 7 | has offices 8 | 9 | For each office: 10 | Required fields: id, city, state 11 | Expected fields: address, city, state, zip, phone, latitude, longitude, id 12 | Optional fields: building, fax, hours, suite 13 | Office id: check consistent 14 | offices are in legislator's state 15 | 16 | Globally: 17 | Every legislator has offices 18 | All offices belong to current legislators 19 | 20 | """ 21 | 22 | import datetime 23 | import os.path 24 | import re 25 | from collections import OrderedDict, defaultdict 26 | from itertools import count 27 | import sys 28 | 29 | try: 30 | import rtyaml as yaml 31 | except ImportError: 32 | import yaml 33 | 34 | try: 35 | from termcolor import colored 36 | except ImportError: 37 | colored = None 38 | 39 | 40 | NONALPHA = re.compile(r"\W") 41 | PHONE = re.compile(r"^\d{3}-\d{3}-\d{4}$") 42 | FIELD_ORDER = """ 43 | 44 | id 45 | address suite building 46 | city state zip 47 | latitude longitude 48 | fax hours phone 49 | 50 | """.split() 51 | 52 | 53 | def relfile(path): 54 | return os.path.abspath(os.path.join(os.path.dirname(__file__), path)) 55 | 56 | 57 | def id_offices(bioguide_id, offices): 58 | """ 59 | Generate unique office ids using a similar algorithm to 60 | https://github.com/controlshift/congress-legislators/blob/add-ids-to-offices-script/add_ids_to_offices.rb 61 | 62 | Used for validation here, but could be used to generate ids. 63 | """ 64 | id_count = defaultdict(count) 65 | for office in offices: 66 | locality = office.get('city', 'no_city').lower() 67 | locality = NONALPHA.sub('_', locality) 68 | 69 | office_id = '-'.join([bioguide_id, locality]) 70 | 71 | city_count = next(id_count[office_id]) 72 | if city_count: 73 | office_id = '-'.join([office_id, str(city_count)]) 74 | 75 | yield office_id, office 76 | 77 | 78 | def check_legislator_offices(legislator_offices, legislator): 79 | bioguide_id = legislator_offices['id']['bioguide'] 80 | offices = legislator_offices.get('offices', []) 81 | 82 | state = None 83 | if legislator: 84 | state = legislator['terms'][-1]['state'] 85 | 86 | required = ['id', 'city', 'state'] 87 | expected = ['address', 'zip', 'phone', 'latitude', 'longitude'] 88 | optional = ['building', 'suite', 'hours', 'fax'] 89 | all_fields = set(required + expected + optional) 90 | 91 | errors = [] 92 | warnings = [] 93 | 94 | if not legislator: 95 | errors.append("Offices for inactive legislator") 96 | 97 | if not offices: 98 | errors.append("Zero offices") 99 | 100 | for office_id, office in id_offices(bioguide_id, offices): 101 | 102 | for field in required: 103 | if not office.get(field): 104 | errors.append("Office %s is missing required field '%s'" % (office_id, field)) 105 | 106 | for field in expected: 107 | if not office.get(field): 108 | warnings.append("Office %s is missing field '%s'" % (office_id, field)) 109 | 110 | for field in office: 111 | if field not in all_fields: 112 | errors.append("Office %s has unrecognized field '%s'" % (office_id, field)) 113 | if not office.get(field): 114 | warnings.append("Office %s has empty field %s" % (office_id, field)) 115 | 116 | found_id = office.get('id') 117 | if found_id and office_id != found_id: 118 | errors.append("Office %s has unexpected id '%s'" % (office_id, found_id)) 119 | 120 | office_state = office.get('state') 121 | if state and office_state and office_state != state: 122 | errors.append("Office %s is in '%s', legislator is from '%s'" % (office_id, office_state, state)) 123 | 124 | office_zip = office.get('zip') 125 | if 
office_zip is not None and not isinstance(office_zip, str): 126 | errors.append("Office %s has non-string zip: %s" % (office_id, office_zip)) 127 | 128 | phone = office.get('phone') 129 | fax = office.get('fax') 130 | 131 | if phone and not PHONE.match(phone): 132 | errors.append("Office %s phone '%s' does not match format ddd-ddd-dddd" % (office_id, phone)) 133 | 134 | if fax and not PHONE.match(fax): 135 | errors.append("Office %s fax '%s' does not match format ddd-ddd-dddd" % (office_id, fax)) 136 | 137 | if (office.get('address') and 138 | not (office.get('latitude') and office.get('longitude'))): 139 | warnings.append("Office %s missing geocode" % office_id) 140 | 141 | if not office.get('address') and not office.get('phone'): 142 | errors.append("Office %s needs at least address or phone" % office_id) 143 | 144 | fields = [f for f in office if f in FIELD_ORDER] # unknown fields checked above 145 | sorted_fields = sorted(fields, key=FIELD_ORDER.index) 146 | if fields != sorted_fields: 147 | warnings.append("Office %s fields out of order, expected %s" % (office_id, sorted_fields)) 148 | 149 | return errors, warnings 150 | 151 | 152 | def load_to_dict(path): 153 | # load to an OrderedDict keyed by bioguide id 154 | d = yaml.load(open(relfile(path))) 155 | return OrderedDict((l['id']['bioguide'], l) for l in d 156 | if 'bioguide' in l['id']) 157 | 158 | 159 | def print_issues(legislator, errors, warnings): 160 | if not (errors or warnings): 161 | return 162 | 163 | if isinstance(legislator, str): 164 | info = legislator 165 | else: 166 | term = legislator['terms'][-1] 167 | info = "{} [{} {}] {} ({})".format( 168 | legislator['id']['bioguide'], term['state'], term['type'], 169 | legislator['name'].get('official_full'), term.get('url', 'no url')) 170 | 171 | print(info) 172 | 173 | for error in errors: 174 | msg = " ERROR: {}".format(error) 175 | if colored: 176 | msg = colored(msg, "red") 177 | print(msg) 178 | for warning in warnings: 179 | msg = " WARNING: {}".format(warning) 180 | if colored: 181 | msg = colored(msg, "yellow") 182 | print(msg) 183 | print("") 184 | 185 | 186 | def run(skip_warnings=False): 187 | legislators = load_to_dict("../legislators-current.yaml") 188 | legislators_offices = load_to_dict("../legislators-district-offices.yaml") 189 | 190 | has_errors = False 191 | 192 | for bioguide_id, legislator_offices in legislators_offices.items(): 193 | legislator = legislators.get(bioguide_id) 194 | 195 | errors, warnings = check_legislator_offices(legislator_offices, legislator) 196 | 197 | if skip_warnings: 198 | warnings = [] 199 | 200 | if errors: 201 | has_errors = True 202 | 203 | print_issues(legislator or bioguide_id, errors, warnings) 204 | 205 | for bioguide_id in set(legislators) - set(legislators_offices): 206 | # Only report an error for a missing office if the 207 | # legislator has been in office for at least 60 days. 
208 | start_date = legislators[bioguide_id]['terms'][-1]['start'] 209 | if datetime.date.today() - datetime.datetime.strptime(start_date, '%Y-%m-%d').date() >= datetime.timedelta(60): 210 | has_errors = True 211 | errors, warnings = ["No offices"], [] 212 | else: 213 | errors, warnings = [], ["No offices"] 214 | print_issues(legislators[bioguide_id], errors, warnings) 215 | 216 | return has_errors 217 | 218 | if __name__ == '__main__': 219 | import argparse 220 | parser = argparse.ArgumentParser() 221 | parser.add_argument("--skip-warnings", action="store_true") 222 | args = parser.parse_args() 223 | 224 | has_errors = run(skip_warnings=args.skip_warnings) 225 | sys.exit(1 if has_errors else 0) 226 | -------------------------------------------------------------------------------- /scripts/pictorial_ids.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | import json 5 | import unicodedata 6 | import utils 7 | from utils import load_data, mkdir_p, save_data, parse_date 8 | 9 | # Update legislators current pictorial ids 10 | # https://pictorialapi.gpo.gov/index.html 11 | # 12 | # options: 13 | # --cache: load from cache if present on disk (default: false) 14 | # --bioguide: load only one legislator, by their bioguide ID 15 | # --congress: do *only* updates for legislators serving in specific congress 16 | # 17 | # example: 18 | # python pictorial_ids.py --congress=118 19 | 20 | 21 | def run(): 22 | 23 | # default to not caching 24 | cache = utils.flags().get("cache", False) 25 | force = not cache 26 | 27 | only_bioguide = utils.flags().get("bioguide", None) 28 | congress = utils.flags().get("congress", None) 29 | 30 | data_files = [] 31 | print("Loading %s..." % "legislators-current.yaml") 32 | legislators = load_data("legislators-current.yaml") 33 | data_files.append((legislators, "legislators-current.yaml")) 34 | print("Loading %s..." 
% "legislators-historical.yaml") 35 | legislators = load_data("legislators-historical.yaml") 36 | data_files.append((legislators, "legislators-historical.yaml")) 37 | 38 | if congress == None: 39 | raise Exception("the --congress flag is required") 40 | elif int(congress) >= 110: 41 | # Pictorial seems to go back to 110th Congress 42 | url = f"https://pictorialapi.gpo.gov/api/GuideMember/GetMembers/{congress}" 43 | pass 44 | else: 45 | raise Exception("no data for congress " + congress) 46 | 47 | pictorial_destination = f"pictorial/source/GetMembers/{congress}.json" 48 | pictorial_data = json.loads(utils.download(url, pictorial_destination, force)) 49 | 50 | # Filter out non-legislators and the vacant placeholders 51 | pictorial_members = [ 52 | member 53 | for member in pictorial_data["memberCollection"] 54 | if member["memberType"] in ("Senator", "Representative", "Delegate") 55 | and member["name"] != "Vacant, Vacant" 56 | ] 57 | 58 | error_filename = f"cache/errors/pictorial/mismatch_{congress}.csv" 59 | mkdir_p("cache/errors/pictorial") 60 | error_log = csv.writer(open(error_filename, "w")) 61 | error_log.writerow( 62 | [ 63 | "message", 64 | "bioguide_id", 65 | "name_first", 66 | "name_last", 67 | ] 68 | ) 69 | error_count = 0 70 | 71 | print("Running for congress " + congress) 72 | for legislators, filename in data_files: 73 | for legislator in legislators: 74 | # this can't run unless we've already collected a bioguide for this person 75 | bioguide = legislator["id"].get("bioguide", None) 76 | # if we've limited this to just one bioguide, skip over everyone else 77 | if only_bioguide and (bioguide != only_bioguide): 78 | continue 79 | 80 | # only run for selected congress 81 | latest_term = legislator["terms"][-1] 82 | latest_congress = utils.congress_from_legislative_year( 83 | utils.legislative_year(parse_date(latest_term["start"])) 84 | ) 85 | if int(congress) != latest_congress: 86 | continue 87 | 88 | # skip if we already have it 89 | if legislator["id"].get("pictorial"): 90 | continue 91 | try: 92 | pictorial_id = match_pictorial_id(legislator, pictorial_members) 93 | legislator["id"]["pictorial"] = pictorial_id 94 | except ValueError as e: 95 | error_count += 1 96 | error_log.writerow( 97 | [ 98 | e, 99 | bioguide, 100 | legislator["name"]["first"], 101 | legislator["name"]["last"], 102 | ] 103 | ) 104 | 105 | save_data(legislators, filename) 106 | 107 | if error_count: 108 | print(f"{error_count} error details written to {error_filename}") 109 | 110 | 111 | def to_ascii(s): 112 | return unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("ASCII") 113 | 114 | 115 | def reverse_name(name): 116 | """ 117 | Given a name in "Last, First" format, return "First Last" 118 | """ 119 | return " ".join(name.split(", ")[::-1]) 120 | 121 | 122 | def match_pictorial_id(legislator, pictorial_members): 123 | """ 124 | Attempt to find the corresponding pictorial id for the given member. 125 | 126 | There are many odd cases -- see tests/test_gpo_member_photos.py for 127 | examples. 
128 | """ 129 | name = legislator["name"]["official_full"] 130 | 131 | # Map common nicknames (and GPO typos) from legislators to pictorial 132 | common_nicknames = { 133 | "Nick": "Nicolas", 134 | "Daniel": "Dan", 135 | "Mike": "Michael", 136 | "Michael": "Mike", 137 | "Richard": "Rich", 138 | "Christopher": "Chris", 139 | "JOhn": "John", 140 | } 141 | 142 | matches = [] 143 | for member_pictorial in pictorial_members: 144 | # First check whether the name matches 145 | name_matches = False 146 | legislator_name_last = to_ascii(legislator["name"]["last"].replace(" ", "")) 147 | legislator_name_first = to_ascii(legislator["name"]["first"].replace(" ", "")) 148 | 149 | if legislator_name_last == member_pictorial["lastName"]: 150 | if legislator_name_first == member_pictorial["firstName"] or ( 151 | "nickname" in legislator["name"] 152 | and legislator["name"]["nickname"] == member_pictorial["firstName"] 153 | ): 154 | name_matches = True 155 | # Sometimes the nickname is encoded in the first name 156 | elif member_pictorial["firstName"] in legislator_name_first: 157 | name_matches = True 158 | # Sometimes the nickname is encoded in the middle name 159 | elif ( 160 | "middle" in legislator["name"] 161 | and member_pictorial["firstName"] in legislator["name"]["middle"] 162 | ): 163 | name_matches = True 164 | # Sometimes the nickname is not encoded 165 | elif ( 166 | member_pictorial["firstName"] in common_nicknames 167 | and common_nicknames[member_pictorial["firstName"]] 168 | == legislator_name_first 169 | ): 170 | name_matches = True 171 | 172 | # Sometimes matching the official full name is best 173 | if legislator["name"]["official_full"] == reverse_name( 174 | member_pictorial["name"] 175 | ): 176 | name_matches = True 177 | 178 | # The GPO has some first and last names swapped, so check those too 179 | if not name_matches and legislator_name_first == member_pictorial["lastName"]: 180 | if legislator_name_last == member_pictorial["firstName"] or ( 181 | "nickname" in legislator["name"] 182 | and legislator["name"]["nickname"] == member_pictorial["firstName"] 183 | ): 184 | name_matches = True 185 | 186 | # If the name matches, check the office and state 187 | # Note: Assumes we're matching against most recent term 188 | if name_matches: 189 | most_recent_term = legislator["terms"][-1] 190 | mType = "sen" if member_pictorial["memberType"] == "Senator" else "rep" 191 | if ( 192 | most_recent_term["state"] == member_pictorial["stateId"] 193 | and most_recent_term["type"] == mType 194 | ): 195 | matches.append(member_pictorial) 196 | 197 | if len(matches) == 1: 198 | return matches[0]["memberId"] 199 | else: 200 | if len(matches): 201 | raise ValueError(f"Multiple pictorial id matches found for {name}") 202 | else: 203 | raise ValueError(f"No pictorial id match found for {name}") 204 | 205 | 206 | if __name__ == "__main__": 207 | run() 208 | -------------------------------------------------------------------------------- /scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | rtyaml 3 | scrapelib==0.10.1 4 | ipython 5 | lxml>=2.2 6 | cssselect 7 | pyflakes 8 | pytz 9 | tweepy 10 | sparqlwrapper 11 | -------------------------------------------------------------------------------- /scripts/retire.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Retire a Member of Congress. 
4 | # Member's most recent term and moves him/her from the
5 | # current file to the historical file.
6 | #
7 | # python retire.py bioguideID termEndDate
8 |
9 | import sys
10 | import utils
11 | import rtyaml
12 |
13 | def run():
14 | if len(sys.argv) != 3:
15 | print("Usage:")
16 | print("python retire.py bioguideID termEndDate")
17 | sys.exit()
18 |
19 | try:
20 | utils.parse_date(sys.argv[2])
21 | except Exception:
22 | print("Invalid date: ", sys.argv[2])
23 | sys.exit()
24 |
25 | print("Loading current YAML...")
26 | y = utils.load_data("legislators-current.yaml")
27 | print("Loading historical YAML...")
28 | y1 = utils.load_data("legislators-historical.yaml")
29 |
30 | for moc in y:
31 | if moc["id"].get("bioguide", None) != sys.argv[1]: continue
32 |
33 | print("Updating:")
34 | rtyaml.pprint(moc["id"])
35 | print()
36 | rtyaml.pprint(moc["name"])
37 | print()
38 | rtyaml.pprint(moc["terms"][-1])
39 |
40 | moc["terms"][-1]["end"] = sys.argv[2]
41 |
42 | y.remove(moc)
43 | y1.append(moc)
44 |
45 | break
46 |
47 | print("Saving changes...")
48 | utils.save_data(y, "legislators-current.yaml")
49 | utils.save_data(y1, "legislators-historical.yaml")
50 |
51 | if __name__ == '__main__':
52 | run()
--------------------------------------------------------------------------------
/scripts/run_script_to_branch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # run_script_to_branch
3 | # --------------------
4 | # Creates a branch, executes a script, and optionally creates a pull request
5 | # using GitHub's hub tool (http://hub.github.com/).
6 | #
7 | # Usage:
8 | #
9 | # ./run_script_to_branch [-push] script_name.py
10 | #
11 | # Example:
12 | #
13 | # ./run_script_to_branch -push senate_contacts.py
14 |
15 |
16 | # Command-line options.
17 | PUSH=0
18 | if [[ "$1" = "-push" ]]; then
19 | if ! which hub>/dev/null; then
20 | echo "Install 'hub' from hub.github.com to automatically create a pull request."
21 | fi
22 | PUSH=1
23 | shift;
24 | fi
25 |
26 | # Check that we have an argument for which script to run.
27 | if [ -z "$1" ]; then
28 | echo "usage: $0 script_name.py";
29 | exit;
30 | fi
31 |
32 | # Check that there are no unstaged changes.
33 | # see http://stackoverflow.com/questions/5139290/how-to-check-if-theres-nothing-to-be-committed-in-the-current-branch
34 | if ! git diff-files --quiet --ignore-submodules; then
35 | echo "Cannot run this now: You have unstaged changes."
36 | exit;
37 | fi
38 |
39 | # Create a branch with the name of the script, the date, and a random string to prevent accidental collisions.
40 | BRANCH_NAME=$1_`date +%Y%m%d`_$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 6 | head -n 1)
41 | git fetch
42 | git checkout -b $BRANCH_NAME main
43 |
44 | # Run the script.
45 | echo Running $@...
46 | ./$@
47 |
48 | # If there were no changes, return to the main branch and delete our temporary branch.
49 | if git diff-files --quiet --ignore-submodules; then
50 | echo No changes.;
51 | git checkout main;
52 | git branch -d $BRANCH_NAME;
53 | exit;
54 | fi
55 |
56 | # Commit to the branch.
57 | CMD=$@
58 | git commit -am "running $CMD at `date "+%FT%T"`"
59 |
60 | if [ $PUSH -gt 0 ]; then
61 | # Push to github.
62 | if git push -u origin $BRANCH_NAME; then
63 | if hub pull-request -m "[auto] $CMD run at `date "+%FT%T"`"; then
64 | # Success, so we can delete our local copy. Use -D to force delete
65 | # even though it's not merged.
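# (The branch lives on at GitHub to back the open pull request;
# only the local copy is removed.)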
66 | git checkout main;
67 | git branch -D $BRANCH_NAME;
68 | fi
69 | fi
70 | fi
71 |
--------------------------------------------------------------------------------
/scripts/senate_contacts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Update current senators' websites and addresses from www.senate.gov.
4 |
5 | import lxml.etree, io
6 | import string, re
7 | from datetime import datetime
8 | import utils
9 | from utils import download, load_data, save_data, parse_date
10 | import urllib.request
11 |
12 | def run():
13 |
14 | today = datetime.now().date()
15 |
16 | # default to not caching
17 | cache = utils.flags().get('cache', False)
18 | force = not cache
19 |
20 | y = load_data("legislators-current.yaml")
21 |
22 | # Map bioguide IDs to dicts. Reference the same dicts
23 | # in y so we are updating y when we update bioguide.
24 | bioguide = { }
25 | by_name = { }
26 | for m in y:
27 | if "bioguide" in m["id"]:
28 | bioguide[m["id"]["bioguide"]] = m
29 | party = m["terms"][-1]["party"][0]
30 | state = m["terms"][-1]["state"]
31 | last_name = m["name"]["last"]
32 | member_full = "%s (%s-%s)" % (last_name, party, state)
33 | by_name[member_full] = m
34 |
35 |
36 | print("Fetching general Senate information from senators_cfm.xml...")
37 |
38 | url = "https://www.senate.gov/general/contact_information/senators_cfm.xml"
39 | body = download(url, "legislators/senate.xml", force, { "binary": True })
40 | dom = lxml.etree.parse(io.BytesIO(body)) # file has an <?xml declaration and is in utf-8
41 |
42 | for node in dom.xpath("member"):
43 | bioguide_id = str(node.xpath("string(bioguide_id)")).strip()
44 | member_full = str(node.xpath("string(member_full)"))
45 |
46 | if bioguide_id == "":
47 | print("Someone has an empty bioguide ID!")
48 | print(lxml.etree.tostring(node))
49 | continue
50 |
51 | print("[%s] Processing Senator %s..." % (bioguide_id, member_full))
52 |
53 | # find member record in our YAML, either by bioguide_id or member_full
54 | if bioguide_id in bioguide:
55 | member = bioguide[bioguide_id]
56 | elif member_full in by_name:
57 | member = by_name[member_full]
58 | else:
59 | print("Bioguide ID '%s' and member name '%s' not recognized." % (bioguide_id, member_full))
60 | continue
61 |
62 | try:
63 | term = member["terms"][-1]
64 | except IndexError:
65 | print("Member has no terms", bioguide_id, member_full)
66 | continue
67 |
68 | if today < parse_date(term["start"]) or today > parse_date(term["end"]):
69 | print("Member's last listed term is not current", bioguide_id, member_full, term["start"])
70 | continue
71 |
72 | if term["type"] != "sen":
73 | print("Member's last listed term is not a Senate term", bioguide_id, member_full)
74 | continue
75 |
76 |
77 | if term["state"] != str(node.xpath("string(state)")):
78 | print("Member's last listed term has the wrong state", bioguide_id, member_full)
79 | continue
80 |
81 | if "district" in term: del term["district"]
82 |
83 | full_name = str(node.xpath("string(first_name)"))
84 | suffix = None
85 | if ", " in full_name: full_name, suffix = full_name.split(", ")
86 | full_name += " " + str(node.xpath("string(last_name)"))
87 | if suffix: full_name += ", " + suffix
88 | member["name"]["official_full"] = re.sub("'", "’", full_name)
89 |
90 | member["id"]["bioguide"] = bioguide_id
91 |
92 | term["class"] = { "Class I": 1, "Class II": 2, "Class III": 3}[ node.xpath("string(class)") ]
93 | term["party"] = { "D": "Democrat", "R": "Republican", "I": "Independent", "ID": "Independent"}[ node.xpath("string(party)") ]
94 |
95 | url = str(node.xpath("string(website)")).strip()
96 | if not url.startswith("/"):
97 | # temporary home pages for new senators are relative links?
98 |
99 | # hit the URL to resolve any redirects to get the canonical URL,
100 | # since the listing sometimes gives URLs that redirect.
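# (If the request fails, the except clause below just logs the error
# and the unresolved URL is kept as-is.)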
101 | try:
102 | req = urllib.request.Request(url)
103 | req.add_header("User-Agent", "https://github.com/unitedstates/congress-legislators")
104 | resp = urllib.request.urlopen(req)
105 | url = resp.geturl()
106 | except Exception as e:
107 | print(url, e)
108 |
109 | # kill trailing slash
110 | url = re.sub("/$", "", url)
111 |
112 | term["url"] = url
113 |
114 | # Contact forms are sometimes listed as the base URL; ignore them in that case.
115 | contact_form = str(node.xpath("string(email)")).strip()
116 | if contact_form and contact_form.rstrip("/") != term['url']:
117 | term['contact_form'] = contact_form
118 |
119 | term["address"] = str(node.xpath("string(address)")).strip().replace("\n ", " ")
120 | term["office"] = string.capwords(term["address"].upper().split(" WASHINGTON ")[0])
121 |
122 | phone = str(node.xpath("string(phone)")).strip()
123 | term["phone"] = phone.replace("(", "").replace(")", "").replace(" ", "-")
124 |
125 |
126 |
127 | print("\n\nUpdating Senate stateRank and LIS ID from cvc_member_data.xml...")
128 |
129 | url = "https://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml"
130 | body = download(url, "legislators/senate_cvc.xml", force)
131 | dom = lxml.etree.parse(io.StringIO(body))
132 | for node in dom.getroot():
133 | if node.tag == "lastUpdate":
134 | date, time = node.getchildren()
135 | print("Last updated: %s, %s" % (date.text, time.text))
136 | continue
137 |
138 | bioguide_id = str(node.xpath("string(bioguideId)")).strip()
139 | if bioguide_id == "":
140 | print("Someone has an empty bioguide ID!")
141 | print(lxml.etree.tostring(node))
142 | continue
143 |
144 | last_name = node.xpath("string(name/last)")
145 | party = node.xpath("string(party)")
146 | state = node.xpath("string(state)")
147 | member_full = "%s (%s-%s)" % (last_name, party, state)
148 |
149 | print("[%s] Processing Senator %s..." % (bioguide_id, member_full))
150 |
151 | # find member record in our YAML, either by bioguide_id or member_full
152 | if bioguide_id in bioguide:
153 | member = bioguide[bioguide_id]
154 | else:
155 | if member_full in by_name:
156 | member = by_name[member_full]
157 | else:
158 | print("Bioguide ID '%s' and synthesized official name '%s' not recognized."
% (bioguide_id, member_full)) 159 | continue # exit(0) 160 | 161 | try: 162 | term = member["terms"][-1] 163 | except IndexError: 164 | print("Member has no terms", bioguide_id, member_full) 165 | continue 166 | 167 | if "id" not in member: 168 | member["id"] = {} 169 | 170 | member["id"]["lis"] = node.attrib["lis_member_id"] 171 | state_rank = node.xpath("string(stateRank)") 172 | if state_rank == '1': 173 | term["state_rank"] = "senior" 174 | elif state_rank == '2': 175 | term["state_rank"] = "junior" 176 | 177 | 178 | print("Saving data...") 179 | save_data(y, "legislators-current.yaml") 180 | 181 | if __name__ == '__main__': 182 | run() 183 | -------------------------------------------------------------------------------- /scripts/social/twitter.py: -------------------------------------------------------------------------------- 1 | # Helpful functions for accessing Twitter 2 | import tweepy 3 | TWITTER_PROFILE_BATCH_SIZE = 100 4 | from math import ceil 5 | 6 | def get_api(access_token, access_token_secret, consumer_key, consumer_secret): 7 | """ 8 | Takes care of the Twitter OAuth authentication process and 9 | creates an API-handler to execute commands on Twitter 10 | 11 | Arguments: string values 12 | 13 | Returns: 14 | A tweepy.api.API object 15 | """ 16 | # Get authentication token 17 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 18 | auth.set_access_token(access_token, access_token_secret) 19 | # create an API handler 20 | return tweepy.API(auth) 21 | 22 | def fetch_profiles(api, screen_names = [], ids = []): 23 | """ 24 | A wrapper method around tweepy.API.lookup_users that handles the batch lookup of 25 | screen_names. Assuming number of screen_names < 10000, this should not typically 26 | run afoul of API limits (i.e. it's a good enough hack for now) 27 | 28 | `api` is a tweepy.API handle 29 | `screen_names` is a list of twitter screen names 30 | 31 | Returns: a list of dicts representing Twitter profiles 32 | """ 33 | profiles = [] 34 | key, lookups = ['user_ids', ids] if ids else ['screen_names', screen_names] 35 | for batch_idx in range(ceil(len(lookups) / TWITTER_PROFILE_BATCH_SIZE)): 36 | offset = batch_idx * TWITTER_PROFILE_BATCH_SIZE 37 | # break lookups list into batches of TWITTER_PROFILE_BATCH_SIZE 38 | batch = lookups[offset:(offset + TWITTER_PROFILE_BATCH_SIZE)] 39 | try: 40 | for user in api.lookup_users(**{key: batch}): 41 | profiles.append(user._json) 42 | # catch situation in which none of the names in the batch are found 43 | # or else Tweepy will error out 44 | except tweepy.error.TweepError as e: 45 | if e.response.status_code == 404: 46 | pass 47 | else: # some other error, raise the exception 48 | raise e 49 | return profiles 50 | -------------------------------------------------------------------------------- /scripts/sweep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from utils import load_data, save_data 4 | 5 | def run(): 6 | # load in members, orient by bioguide ID 7 | print("Loading current legislators...") 8 | current = load_data("legislators-current.yaml") 9 | 10 | current_bioguide = { } 11 | for m in current: 12 | if "bioguide" in m["id"]: 13 | current_bioguide[m["id"]["bioguide"]] = m 14 | 15 | # remove out-of-office people from current committee membership 16 | print("Sweeping committee membership...") 17 | membership_current = load_data("committee-membership-current.yaml") 18 | for committee_id in list(membership_current.keys()): 19 | for member in 
list(membership_current[committee_id]): # iterate over a copy so .remove() below doesn't skip entries
20 | if member["bioguide"] not in current_bioguide:
21 | print("\t[%s] Ding ding ding! (%s)" % (member["bioguide"], member["name"]))
22 | membership_current[committee_id].remove(member)
23 | save_data(membership_current, "committee-membership-current.yaml")
24 |
25 | # remove out-of-office people from social media info
26 | print("Sweeping social media accounts...")
27 | socialmedia_current = load_data("legislators-social-media.yaml")
28 | for member in list(socialmedia_current):
29 | if member["id"]["bioguide"] not in current_bioguide:
30 | print("\t[%s] Ding ding ding! (%s)" % (member["id"]["bioguide"], member["social"]))
31 | socialmedia_current.remove(member)
32 | save_data(socialmedia_current, "legislators-social-media.yaml")
33 |
34 | # remove out-of-office people from district offices
35 | print("Sweeping district offices...")
36 | district_offices = load_data("legislators-district-offices.yaml")
37 | for member in list(district_offices):
38 | if member["id"]["bioguide"] not in current_bioguide:
39 | print("\t[%s] Ding ding ding! (%s)" % (member["id"]["bioguide"], member["offices"]))
40 | district_offices.remove(member)
41 | save_data(district_offices, "legislators-district-offices.yaml")
42 |
43 | if __name__ == '__main__':
44 | run()
45 |
--------------------------------------------------------------------------------
/scripts/thomas_ids.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Update current THOMAS IDs using beta.congress.gov. Congressmen's
4 | # IDs are updated directly. For Senators, we just print out new
5 | # IDs because name matching is hard.
6 |
7 | import lxml.html, io, urllib.request, urllib.parse, urllib.error
8 | import re
9 | import utils
10 | from utils import download, load_data, save_data
11 |
12 | def run():
13 | CONGRESS_ID = "113th Congress (2013-2014)" # the query string parameter
14 |
15 | # constants
16 | state_names = {"Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ", "Arkansas": "AR", "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR", "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT", "Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY"}
17 |
18 | # default to not caching
19 | cache = utils.flags().get('cache', False)
20 | force = not cache
21 |
22 | # load in current members
23 | y = load_data("legislators-current.yaml")
24 | by_district = { }
25 | existing_senator_ids = set()
26 | for m in y:
27 | last_term = m['terms'][-1]
28 | if last_term['type'] == 'rep':
29 | full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
30 | by_district[full_district] = m
31 | elif last_term['type'] == 'sen':
if "thomas" in m["id"]: 33 | existing_senator_ids.add(m["id"]["thomas"]) 34 | 35 | seen_ids = set() 36 | for chamber in ("House of Representatives", "Senate"): 37 | url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % ( 38 | urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber)) 39 | cache = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber) 40 | try: 41 | body = download(url, cache, force) 42 | dom = lxml.html.parse(io.StringIO(body)).getroot() 43 | except lxml.etree.XMLSyntaxError: 44 | print("Error parsing: ", url) 45 | continue 46 | 47 | for node in dom.xpath("//ul[@class='results_list']/li"): 48 | thomas_id = "%05d" % int(re.search("/member/.*/(\d+)$", node.xpath('h2/a')[0].get('href')).group(1)) 49 | 50 | # THOMAS misassigned these 'new' IDs to existing individuals. 51 | if thomas_id in ('02139', '02132'): 52 | continue 53 | 54 | name = node.xpath('h2/a')[0].text 55 | 56 | state = node.xpath('div[@class="memberProfile"]/table/tbody/tr[1]/td')[0].text.strip() 57 | state = state_names[state] 58 | 59 | if chamber == "House of Representatives": 60 | # There's enough information to easily pick out which Member this refers to, so write it 61 | # directly to the file. 62 | district = node.xpath('div[@class="memberProfile"]/table/tbody/tr[2]/td')[0].text.strip() 63 | if district == "At Large": district = 0 64 | district = "%02d" % int(district) 65 | 66 | if state + district not in by_district: 67 | print(state + district + "'s", name, "appears on Congress.gov but the office is vacant in our data.") 68 | continue 69 | 70 | if state + district in seen_ids: 71 | print("Congress.gov lists two people for %s%s!" % (state, district)) 72 | seen_ids.add(state+district) 73 | 74 | by_district[state + district]["id"]["thomas"] = thomas_id 75 | 76 | elif chamber == "Senate": 77 | # For senators we'd have to match on name or something else, so that's too difficult. 78 | # Just look for new IDs. 79 | if thomas_id not in existing_senator_ids: 80 | print("Please manually set", thomas_id, "for", name, "from", state) 81 | 82 | save_data(y, "legislators-current.yaml") 83 | 84 | if __name__ == '__main__': 85 | run() -------------------------------------------------------------------------------- /scripts/untire.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # "Un-retire" a Member of Congress: Move a Member of Congress 4 | # from the legislators-historical file to the legislators-current file 5 | # and give the Member a new term. 
6 | #
7 | # python untire.py bioguideID
8 |
9 | import sys
10 | import rtyaml
11 | import utils
12 | from collections import OrderedDict
13 |
14 | def run():
15 |
16 | if len(sys.argv) != 2:
17 | print("Usage:")
18 | print("python untire.py bioguideID")
19 | sys.exit()
20 |
21 | print("Loading current YAML...")
22 | y = utils.load_data("legislators-current.yaml")
23 | print("Loading historical YAML...")
24 | y1 = utils.load_data("legislators-historical.yaml")
25 |
26 | for moc in y1:
27 | if moc["id"].get("bioguide", None) != sys.argv[1]: continue
28 |
29 | print("Updating:")
30 | rtyaml.pprint(moc["id"])
31 | print()
32 | rtyaml.pprint(moc["name"])
33 |
34 | moc["terms"].append(OrderedDict([
35 | ("type", moc["terms"][-1]["type"]),
36 | ("start", None),
37 | ("end", None),
38 | ("state", moc["terms"][-1]["state"]),
39 | ("party", moc["terms"][-1]["party"]),
40 | ]))
41 |
42 | y1.remove(moc)
43 | y.append(moc)
44 |
45 | break
46 |
47 | print("Saving changes...")
48 | utils.save_data(y, "legislators-current.yaml")
49 | utils.save_data(y1, "legislators-historical.yaml")
50 |
51 | if __name__ == '__main__':
52 | run()
--------------------------------------------------------------------------------
/scripts/update_gh_pages.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # The source branch to publish from.
5 | SRC_BRANCH=main
6 |
7 | # Switch to gh-pages branch.
8 | git checkout gh-pages
9 |
10 | # Get the YAML and the scripts we need to generate CSV and JSON
11 | # from the source branch.
12 | git fetch origin $SRC_BRANCH
13 | HASH=$(git rev-parse FETCH_HEAD)
14 | echo "Getting latest files from $SRC_BRANCH @ $HASH."
15 | git checkout FETCH_HEAD "*.yaml" scripts/alternate_bulk_formats.py scripts/utils.py
16 |
17 | # Generate CSV and JSON.
18 | (cd scripts/; python3 alternate_bulk_formats.py;)
19 |
20 | # Commit the YAML, CSV, and JSON.
21 | # (Don't commit the other scripts files we checked out from
22 | # the source branch, which git has unhelpfully put in the
23 | # index.)
24 | export GIT_AUTHOR_NAME="the unitedstates project (CircleCI)"
25 | export GIT_AUTHOR_EMAIL=circleci@theunitedstates.io
26 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME"
27 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL"
28 | (
29 | git add *.yaml *.csv *.json \
30 | && git commit -m "update to $SRC_BRANCH @ $HASH by CircleCI" \
31 | *.yaml *.csv *.json \
32 | && git push
33 | ) || /bin/true # if there's nothing to commit, don't exit with error status
34 |
35 | # Switch back to the original branch.
36 | git checkout -f $SRC_BRANCH
37 |
--------------------------------------------------------------------------------
/scripts/wikidata_update.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import re
4 | import urllib.request
5 | import json
6 | from urllib.parse import quote, unquote
7 | from utils import load_data, save_data
8 | from SPARQLWrapper import SPARQLWrapper, JSON
9 |
10 | def get_wikidata_ids(legislators):
11 | # Look up wikidata IDs for legislators with English Wikipedia IDs.
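# Uses the MediaWiki API (action=query&prop=pageprops) to read the
# "wikibase_item" page property, which holds the Wikidata Q-number.
# Note that a page with no linked Wikidata item will raise a KeyError
# below.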
12 | for p in legislators:
13 | if not p["id"].get("wikidata") and p["id"].get("wikipedia"):
14 | w = quote(p["id"]["wikipedia"].replace(" ", "_"))
15 | query_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles={w}&format=json"
16 | response = json.load(urllib.request.urlopen(query_url))
17 | wikidata_id = list(response["query"]["pages"].values())[0]["pageprops"]["wikibase_item"]
18 | p["id"]["wikidata"] = wikidata_id
19 |
20 |
21 | def get_ids_from_wikidata(legislators):
22 | # Query to fetch information for entities that have a bioguide ID.
23 | # Selecting on bioguide ID efficiently gets wikidata entries that
24 | # we are interested in.
25 |
26 | table = run_query("""
27 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
28 | PREFIX schema: <http://schema.org/>
29 |
30 | SELECT ?subject ?bioguide ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
31 | WHERE {
32 | ?subject wdt:P1157 ?bioguide .
33 | OPTIONAL {
34 | ?subject wdt:P2671 ?google_entity_id
35 | }
36 | OPTIONAL {
37 | ?subject wdt:P2686 ?opensecrets
38 | }
39 | OPTIONAL {
40 | ?subject wdt:P3344 ?votesmart
41 | }
42 | OPTIONAL {
43 | ?subject wdt:P2390 ?ballotpedia
44 | }
45 | OPTIONAL {
46 | ?wikipedia schema:about ?subject .
47 | ?wikipedia schema:inLanguage "en" .
48 | ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
49 | }
50 | }
51 | """)
52 |
53 | # make a mapping from bioguide ID to query result
54 | mapping = { r["bioguide"]: r for r in table }
55 |
56 | # update legislators
57 | for p in legislators:
58 | if p["id"].get("bioguide") in mapping:
59 | p["id"].update(mapping[p["id"]["bioguide"]])
60 |
61 |
62 | def get_ids_from_wikidata_without_bioguide(legislators):
63 | # The SPARQL server doesn't seem to support VALUES or FILTER(?subject IN (...)),
64 | # so in order to fill in values for legislators without bioguide IDs but with
65 | # wikidata IDs, we can just query them one by one. This is probably only useful
66 | # at the start of a new Congress when bioguide IDs are not yet available.
67 | for p in legislators:
68 | if "bioguide" in p["id"] or "wikidata" not in p["id"]: continue
69 |
70 | table = run_query("""
71 | PREFIX wd: <http://www.wikidata.org/entity/>
72 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
73 | PREFIX schema: <http://schema.org/>
74 |
75 | SELECT ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
76 | WHERE {
77 | OPTIONAL {
78 | ?subject wdt:P2671 ?google_entity_id
79 | }
80 | OPTIONAL {
81 | ?subject wdt:P2686 ?opensecrets
82 | }
83 | OPTIONAL {
84 | ?subject wdt:P3344 ?votesmart
85 | }
86 | OPTIONAL {
87 | ?subject wdt:P2390 ?ballotpedia
88 | }
89 | OPTIONAL {
90 | ?wikipedia schema:about ?subject .
91 | ?wikipedia schema:inLanguage "en" .
92 | ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
93 | }
94 | }
95 | """.replace("?subject", "wd:" + p["id"]["wikidata"]))
96 |
97 |
98 | p["id"].update(table[0])
99 |
100 |
101 | def run_query(query):
102 | sparql_endpoint = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
103 | s = SPARQLWrapper(sparql_endpoint)
104 |
105 | # run the query
106 | s.setQuery(query)
107 | s.setReturnFormat(JSON)
108 | results = s.query().convert()
109 |
110 | for row in results['results']['bindings']:
111 | if "subject" in row:
112 | # replace the ?subject variable with the wikidata id
113 | row['wikidata'] = { "value": re.search(r'/(Q\d+)', row['subject']['value']).group(1) }
114 | del row["subject"]
115 |
116 | # clean up the google entity id
117 | if 'google_entity_id' in row:
118 | row['google_entity_id']["value"] = 'kg:' + row['google_entity_id']["value"]
119 |
120 | # clean up the wikipedia and ballotpedia results
121 | if "wikipedia" in row:
122 | row["wikipedia"]["value"] = \
123 | unquote(row["wikipedia"]["value"])\
124 | .replace("https://en.wikipedia.org/wiki/", "")\
125 | .strip().replace('_',' ')
126 | if "ballotpedia" in row:
127 | row["ballotpedia"]["value"] = row["ballotpedia"]["value"].strip().replace('_',' ')
128 |
129 | # clean up the votesmart id
130 | if "votesmart" in row:
131 | try:
132 | row["votesmart"]["value"] = int(row["votesmart"]["value"])
133 | except ValueError:
134 | print("invalid value", row["votesmart"]["value"])
135 | continue
136 | # return a simple list of dicts of results
137 | return [
138 | {
139 | k: row[k]['value']
140 | for k in row
141 | }
142 | for row in results['results']['bindings']
143 | ]
144 |
145 |
146 | def run():
147 | p1 = load_data("legislators-current.yaml")
148 | p2 = load_data("legislators-historical.yaml")
149 | get_wikidata_ids(p1+p2)
150 | get_ids_from_wikidata(p1+p2)
151 | get_ids_from_wikidata_without_bioguide(p1+p2)
152 | save_data(p1, "legislators-current.yaml")
153 | save_data(p2, "legislators-historical.yaml")
154 |
155 | if __name__ == '__main__':
156 | run()
157 |
158 |
159 |
--------------------------------------------------------------------------------
/scripts/wikipedia_ids.py:
--------------------------------------------------------------------------------
1 | # Scans Wikipedia for pages using the CongBio and CongLinks
2 | # templates, which have Bioguide IDs. Updates the 'wikipedia'
3 | # ID field for matching Members of Congress, and for pages
4 | # using the CongLinks template also updates a variety of
5 | # other IDs as found in the template.
6 |
7 | import lxml.etree, re, urllib.request, urllib.parse, urllib.error
8 | import utils, os.path
9 |
10 | def run():
11 |
12 | # Field mapping. And which fields should be turned into integers.
13 | # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
14 | fieldmap = {
15 | "congbio": "bioguide",
16 | #"fec": "fec", # handled specially...
17 | "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
18 | "opensecrets": "opensecrets",
19 | "votesmart": "votesmart",
20 | "cspan": "cspan",
21 | }
22 | int_fields = ("govtrack", "votesmart", "cspan")
23 |
24 | # default to not caching
25 | cache = utils.flags().get('cache', False)
26 |
27 | # Load legislator files and map bioguide IDs.
28 | y1 = utils.load_data("legislators-current.yaml")
29 | y2 = utils.load_data("legislators-historical.yaml")
30 | bioguides = { }
31 | for y in y1+y2:
32 | bioguides[y["id"]["bioguide"]] = y
33 |
34 | # Okay now the Wikipedia stuff...
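# get_matching_pages() below walks the MediaWiki embeddedin listing
# (action=query&list=embeddedin) in batches of 500 pages per template,
# following the eicontinue token until the API stops returning one.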
35 | 36 | def get_matching_pages(): 37 | # Does a Wikipedia API search for pages containing either of the 38 | # two templates. Returns the pages. 39 | 40 | page_titles = set() 41 | 42 | for template in ("CongLinks", "CongBio"): 43 | eicontinue = "" 44 | while True: 45 | # construct query URL, using the "eicontinue" of the last query to get the next batch 46 | url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template 47 | if eicontinue: url += "&eicontinue=" + eicontinue 48 | 49 | # load the XML 50 | print("Getting %s pages (%d...)" % (template, len(page_titles))) 51 | dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably 52 | 53 | for pgname in dom.xpath("query/embeddedin/ei/@title"): 54 | page_titles.add(pgname) 55 | 56 | # get the next eicontinue value and loop 57 | eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)") 58 | if not eicontinue: break 59 | 60 | return page_titles 61 | 62 | # Get the list of Wikipedia pages that use any of the templates we care about. 63 | page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles") 64 | if cache and os.path.exists(page_list_cache_file): 65 | # Load from cache. 66 | matching_pages = open(page_list_cache_file).read().split("\n") 67 | else: 68 | # Query Wikipedia API and save to cache. 69 | matching_pages = get_matching_pages() 70 | utils.write(("\n".join(matching_pages)), page_list_cache_file) 71 | 72 | # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon). 73 | matching_pages = [p for p in matching_pages if ":" not in p] 74 | 75 | # Load each page's content and parse the template. 76 | for p in sorted(matching_pages): 77 | if " campaign" in p: continue 78 | if " (surname)" in p: continue 79 | if "career of " in p: continue 80 | if "for Congress" in p: continue 81 | if p.startswith("List of "): continue 82 | if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue 83 | 84 | # Query the Wikipedia API to get the raw page content in XML, 85 | # and then use XPath to get the raw page text. 86 | url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap" 87 | cache_path = "legislators/wikipedia/pages/" + p 88 | dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache)) 89 | page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={ "mw": "http://www.mediawiki.org/xml/export-0.8/" }) 90 | 91 | # Build a dict for the IDs that we want to insert into our files. 92 | new_ids = { 93 | "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores) 94 | } 95 | 96 | if "CongLinks" in page_content: 97 | # Parse the key/val pairs in the template. 98 | m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content) 99 | if not m: continue # no template? 
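# The template body is pipe-separated key=value pairs, e.g.
# (illustrative values only):
#   {{CongLinks | congbio=X000000 | govtrack=412345 | opensecrets=N00000000}}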
100 | for arg in m.group(1).split("|"):
101 | if "=" not in arg: continue
102 | key, val = arg.split("=", 1)
103 | key = key.strip()
104 | val = val.strip()
105 | if val and key in fieldmap:
106 | try:
107 | if fieldmap[key] in int_fields: val = int(val)
108 | except ValueError:
109 | print("invalid value", key, val)
110 | continue
111 |
112 | if key == "opensecrets": val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
113 | new_ids[fieldmap[key]] = val
114 |
115 | if "bioguide" not in new_ids: continue
116 | new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
117 | bioguide = new_ids["bioguide"]
118 |
119 | else:
120 | m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
121 | if not m: continue # no template?
122 | bioguide = m.group(1).upper()
123 |
124 |
125 | if bioguide not in bioguides:
126 | print("Member not found: " + bioguide, p, "(Might have been a delegate to the Constitutional Convention.)")
127 | continue
128 |
129 | # handle FEC ids specially because they are stored in an array...
130 | fec_id = new_ids.get("fec")
131 | if fec_id: del new_ids["fec"]
132 |
133 | member = bioguides[bioguide]
134 | member["id"].update(new_ids)
135 |
136 | # ...finish the FEC id.
137 | if fec_id:
138 | if fec_id not in bioguides[bioguide]["id"].get("fec", []):
139 | bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)
140 |
141 | #print p.encode("utf8"), new_ids
142 |
143 | utils.save_data(y1, "legislators-current.yaml")
144 | utils.save_data(y2, "legislators-historical.yaml")
145 |
146 | if __name__ == '__main__':
147 | run()
148 |
--------------------------------------------------------------------------------
/test/are_files_linted.py:
--------------------------------------------------------------------------------
1 | # Check that each YAML file has been linted.
2 |
3 | import difflib
4 | import glob
5 | import io
6 | import sys
7 |
8 | import rtyaml
9 |
10 | ok = True
11 |
12 | for fn in glob.glob("*.yaml"):
13 | with open(fn) as f:
14 | body = f.read()
15 |
16 | # Round-trip the file. Because of the comment block at the top
17 | # of legislators-social-media.yaml, we need to go through file-like
18 | # streams so that rtyaml preserves it.
19 | data = rtyaml.load(io.StringIO(body))
20 |
21 | # Save it back to a buffer.
22 | buf = io.StringIO()
23 | rtyaml.dump(data, buf)
24 | buf = buf.getvalue()
25 |
26 | # Check that the file round-trips to the same bytes,
27 | # except don't worry about trailing newlines because
28 | # editors mess with the last line's line ending.
29 | if buf.rstrip() != body.rstrip():
30 | ok = False
31 | print(fn, "needs to be linted:")
32 |
33 | # Show a diff.
34 | for line in difflib.unified_diff(body.split("\n"), buf.split("\n"), fromfile='in repository', tofile='after linting', lineterm=''):
35 | print(line)
36 |
37 | sys.exit(0 if ok else 1)
38 |
--------------------------------------------------------------------------------
/test/workout.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import glob
5 | import os
6 | import importlib
7 |
8 | sys.path.append("scripts")
9 |
10 | scripts = glob.glob("scripts/*.py")
11 | scripts.sort()
12 |
13 | for script in scripts:
14 | module = os.path.basename(script).replace(".py", "")
15 | print("Importing %s..." % module)
16 |
17 | try:
18 | importlib.import_module(module)
19 | except Exception as exc:
20 | print("Error when importing %s!"
% module) 21 | print() 22 | raise exc 23 | 24 | exit(0) --------------------------------------------------------------------------------