├── .circleci
│   └── config.yml
├── .gitattributes
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── committee-membership-current.yaml
├── committees-current.yaml
├── committees-historical.yaml
├── congress_lookup.py
├── executive.yaml
├── legislators-current.yaml
├── legislators-district-offices.yaml
├── legislators-historical.yaml
├── legislators-social-media.yaml
├── misc
│   ├── biographical-directory-footnotes.json
│   └── new-member-template.yaml
├── scripts
│   ├── alternate_bulk_formats.py
│   ├── archive
│   │   ├── 114th_congress.py
│   │   ├── committee_membership_house.py
│   │   ├── election_results_2014.csv
│   │   ├── election_results_2018_senate.csv
│   │   ├── election_results_2020.csv
│   │   ├── election_results_2022.csv
│   │   ├── election_results_2024.csv
│   │   ├── election_results_house_2016.py
│   │   ├── election_results_senate_2016.csv
│   │   ├── election_results_senate_2016.py
│   │   ├── everypolitician.py
│   │   ├── house_history_gender.py
│   │   └── print_leadership_roles.py
│   ├── bioguide.py
│   ├── bioguide_guess_new_member_ids.py
│   ├── bioguide_xml.py
│   ├── committee_membership.py
│   ├── contact_forms.py
│   ├── cspan.py
│   ├── data
│   │   ├── social_media_blacklist.csv
│   │   └── social_media_whitelist.csv
│   ├── election_results.py
│   ├── email
│   │   └── config.yml.example
│   ├── export_csv.py
│   ├── geocode_offices.py
│   ├── historical_committees.py
│   ├── house_contacts.py
│   ├── house_history.py
│   ├── house_websites.py
│   ├── icpsr_ids.py
│   ├── influence_ids.py
│   ├── lint.py
│   ├── office_validator.py
│   ├── pictorial_ids.py
│   ├── requirements.txt
│   ├── retire.py
│   ├── run_script_to_branch
│   ├── senate_contacts.py
│   ├── social
│   │   └── twitter.py
│   ├── social_media.py
│   ├── sweep.py
│   ├── thomas_ids.py
│   ├── untire.py
│   ├── update_gh_pages.sh
│   ├── utils.py
│   ├── wikidata_update.py
│   └── wikipedia_ids.py
└── test
    ├── are_files_linted.py
    ├── test_pictorial_ids.py
    ├── validate.py
    └── workout.py

/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | jobs:
 3 |   build:
 4 |     docker:
 5 |       - image: cimg/python:3.11
 6 |     steps:
 7 |       # Set up.
 8 |       - checkout
 9 |       - run: pip install -r scripts/requirements.txt
10 | 
11 |       # Run tests.
12 |       - run: python test/workout.py
13 |       - run: pyflakes .
14 |       - run: python test/are_files_linted.py
15 |       - run: python test/validate.py
16 | 
17 |       # Update the gh-pages branch. This requires that
18 |       # CircleCI be set up with read-write permission
19 |       # on the repo, which is not CircleCI's default.
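      # A minimal local equivalent of the build job above (a sketch, not part
      # of the original config; it assumes a Python 3.11 virtualenv to match
      # the cimg/python:3.11 image):
      #
      #   pip install -r scripts/requirements.txt
      #   python test/workout.py
      #   pyflakes .
      #   python test/are_files_linted.py
      #   python test/validate.py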
20 | deploy: 21 | docker: 22 | - image: cimg/python:3.8 23 | steps: 24 | - checkout 25 | - run: pip install -r scripts/requirements.txt 26 | - run: scripts/update_gh_pages.sh 27 | 28 | workflows: 29 | version: 2 30 | build-and-deploy: 31 | jobs: 32 | - build 33 | - deploy: 34 | filters: 35 | branches: 36 | only: main 37 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | alternate_formats/* -diff 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /scripts/cache 3 | *.pickle 4 | .DS_Store 5 | /scripts/email/config.yml 6 | \.~lock* 7 | /scripts/build 8 | /domains.rb 9 | /venv/ 10 | .idea 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Public domain 2 | 3 | The project is in the public domain within the United States, and copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication][CC0]. 4 | 5 | All contributions to this project will be released under the CC0 dedication. By submitting a pull request, you are agreeing to comply with this waiver of copyright interest. 6 | 7 | [CC0]: http://creativecommons.org/publicdomain/zero/1.0/ 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 
31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. 
In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /congress_lookup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding: utf-8 3 | __author__ = 'stsmith' 4 | 5 | # congress_lookup: Look up information about congress from the congress-legislators database 6 | # See: https://github.com/unitedstates/congress-legislators and https://github.com/TheWalkers/congress-legislators 7 | 8 | # The project is in the public domain within the United States, and 9 | # copyright and related rights in the work worldwide are waived 10 | # through the CC0 1.0 Universal public domain dedication. 11 | 12 | # Author 2017 Steven T. 
Smith 13 | 14 | import argparse as ap, contextlib, fnmatch, os, sys, time, warnings, yaml 15 | 16 | # version dependent libraries 17 | # https://docs.python.org/2/library/urllib.html 18 | # https://docs.python.org/3.0/library/urllib.parse.html 19 | if (sys.version_info > (3, 0)): 20 | from urllib.request import urlopen 21 | import urllib.parse as urlparse 22 | else: 23 | from urllib2 import urlopen 24 | import urlparse 25 | 26 | class CongressLookup: 27 | '''A class used to lookup legislator properties from the github congress-legislators YAML database.''' 28 | 29 | def __init__(self): 30 | self.args = self.parseArgs() 31 | self.data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),self.args.data_dir) 32 | self.properties = dict() 33 | self.database_load() 34 | for prop in self.args.properties: self.lookup_property(prop) 35 | 36 | def parseArgs(self): 37 | parser = ap.ArgumentParser() 38 | parser.add_argument('properties', metavar='PROPS', type=str, nargs='+', 39 | help='Properties to look up') 40 | parser.add_argument('-c', '--committee', help="Committee name (wildcard)", type=str, default=None) 41 | parser.add_argument('-n', '--last-name', help="Last name of legislator (wildcard)", type=str, default=None) 42 | parser.add_argument('-d', '--data-dir', help="Database directory", type=str, default='.') 43 | parser.add_argument('-r', '--repo', help="GitHub repo URL", type=str, default='https://github.com/unitedstates/congress-legislators/') 44 | parser.add_argument('-T', '--current-term', help="Properties from only the current term", action='store_true') 45 | parser.add_argument('-D', '--download', help="Download data", action='store_true', default=False) 46 | parser.add_argument('-g', '--debug', help="Debug flag", action='store_true') 47 | return parser.parse_args() 48 | 49 | def lookup_property(self,property): 50 | if self.args.committee is not None: 51 | self.lookup_by_committee(property) 52 | if self.args.last_name is not None: 53 | self.lookup_by_lastname(property) 54 | 55 | def lookup_by_committee(self,property): 56 | for comm in (comm for comm in self.committees if self.inclusive_wildcard_match(comm['name'],self.args.committee)): 57 | if self.args.debug: print(comm) 58 | print('"{}" member properties:'.format(comm['name'].encode('utf-8'))) 59 | members = self.membership[comm['thomas_id']] if comm['thomas_id'] in self.membership else [] 60 | for member in members: self.lookup_by_member(property,member) 61 | 62 | def inclusive_wildcard_match(self,name,pat): 63 | if any(c in pat for c in '*?[]'): # a wildcard pattern 64 | # prepend or append a * for inclusiveness if not already there 65 | if pat[0] != '*': pat = '*' + pat 66 | if pat[-1] != '*': pat = pat + '*' 67 | else: # not a wildcard 68 | pat = '*' + pat + '*' 69 | return fnmatch.fnmatch(name,pat) 70 | 71 | def lookup_by_member(self,property,member): 72 | for leg in ( leg for leg in self.legislators if \ 73 | (leg['name']['official_full'] == member['name']) \ 74 | or ('bioguide' in leg['id'] and 'bioguide' in member and leg['id']['bioguide'] == member['bioguide']) \ 75 | or ('thomas' in leg['id'] and 'thomas' in member and leg['id']['thomas'] == member['thomas']) ): 76 | self.lookup_legislator_properties(property,leg) 77 | 78 | def lookup_by_lastname(self,property): 79 | for leg in (leg for leg in self.legislators if fnmatch.fnmatch(leg['name']['last'],self.args.last_name)): 80 | if self.args.debug: print(leg) 81 | self.lookup_legislator_properties(property,leg) 82 | 83 | def 
lookup_legislator_properties(self,property,legislator): 84 | self.properties[property] = set([term[property] for term in legislator['terms'] if self.lookup_filter(property,term)]) 85 | for off in self.offices: 86 | if self.args.debug: print(off) 87 | if any(off['id'][db] == legislator['id'][db] for db in off['id'] if db in off['id'] and db in legislator['id']): 88 | self.properties[property] |= set([ok[property] for ok in off['offices'] if property in ok and len(ok[property]) > 0]) 89 | break 90 | print('Property \'{}\' for {}:'.format(property,legislator['name']['official_full'].encode('utf-8'))) 91 | print('\n'.join(sorted(self.properties[property]))) 92 | 93 | def lookup_filter(self,property,term): 94 | result = property in term and len(term[property]) > 0 95 | if result and self.args.current_term: 96 | result &= 'end' in term and time.strptime(term['end'],'%Y-%m-%d') >= time.localtime() 97 | return result 98 | 99 | def database_load(self): 100 | try: 101 | with self.database_access('legislators-current.yaml') as y: 102 | self.legislators = self.yaml_load(y, Loader=yaml.CLoader) 103 | with self.database_access('legislators-district-offices.yaml') as y: 104 | self.offices = self.yaml_load(y, Loader=yaml.CLoader) 105 | if self.args.committee is not None: 106 | with self.database_access('committees-current.yaml') as y: 107 | self.committees = self.yaml_load(y, Loader=yaml.CLoader) 108 | with self.database_access('committee-membership-current.yaml') as y: 109 | self.membership = self.yaml_load(y, Loader=yaml.CLoader) 110 | else: 111 | self.committees = None 112 | except (BaseException,IOError) as e: 113 | print(e) 114 | raise Exception('Clone data from {} and copy it to {} .'.format(self.args.repo,self.data_path)) 115 | 116 | def yaml_load(self,y,Loader=yaml.loader.Loader): 117 | res = yaml.load(y, Loader=Loader) 118 | if res is None: res = [] # make it an empty iterable 119 | return res 120 | 121 | def database_access(self,filename): 122 | if self.args.download: 123 | if self.args.repo[-1] != '/': self.args.repo += '/' 124 | url_base = urlparse.urljoin(urlparse.urlunparse(urlparse.urlparse(self.args.repo)._replace(netloc='raw.githubusercontent.com')),'main/') 125 | # contextlib required for urlopen in with ... as for v < 3.3 126 | res = contextlib.closing(urlopen( urlparse.urljoin(url_base,filename) )) 127 | else: 128 | fname_fullpath = os.path.join(self.data_path,filename) 129 | if os.path.exists(fname_fullpath): 130 | res = open(fname_fullpath,'r') 131 | else: 132 | warnings.warn('File {} doesn\'t exist; clone data from {} and copy it to {} .'.format(filename,self.args.repo,self.data_path)) 133 | res = self.Emptysource() 134 | return res 135 | 136 | class Emptysource(object): 137 | def read(self, size): 138 | return '' # empty 139 | def write(self, data): 140 | pass # ignore the data 141 | def __enter__(self): return self 142 | def __exit__(*x): pass 143 | 144 | 145 | if __name__ == "__main__": 146 | res = CongressLookup() 147 | -------------------------------------------------------------------------------- /misc/new-member-template.yaml: -------------------------------------------------------------------------------- 1 | # All of the fields we can put on a legislator, 2 | # as we might add when a new legislator takes 3 | # office. 4 | # 5 | # Run the lint.py script after editing the main 6 | # YAML files. It will conveniently remove all of 7 | # the comments. 8 | # 9 | # In separate commits, run: 10 | # * house_contacts.py (which will add url, address, etc. 
fields to House members)
11 | # * senate_contacts.py (likewise for Senate members)
12 | # * committee_membership.py (updates committee membership)
13 | 
14 | - id:
15 |     bioguide: Q000000 # http://bioguide.congress.gov/
16 |     lis: S999 # not assigned until there is a Senate roll call vote
17 |     fec: # http://fec.gov/finance/disclosure/candcmte_info.shtml
18 |     - H1XX99999 # (you're looking for a Candidate ID)
19 |     govtrack: 456789 # you may assign the next available integer (try: `(echo -n "1+"; git grep -h govtrack: *.yaml | sort | tail -1 | sed "s/ *govtrack: //") | bc`)
20 |     opensecrets: N00099999 # http://www.opensecrets.org/
21 |     votesmart: 159999 # http://votesmart.org/
22 |     icpsr: 99999 # not knowable until voteview.org publishes roll call raw data
23 |     cspan: 75516 # people search at http://www.c-span.org/ (personid)
24 |     wikipedia: John Doe # https://en.wikipedia.org/wiki/Main_Page (replace _ with space!)
25 |     wikidata: Q30129999 # from the "Wikidata item" URL linked from their Wikipedia page
26 |     ballotpedia: John Doe # http://ballotpedia.org/Main_Page (replace _ with space!)
27 |     house_history: 10999 # http://history.house.gov/People/Search/
28 |     google_entity_id: kg:/g/11dddd111d # ...
29 |   name:
30 |     first: John
31 |     middle: Person # optional, can also be an initial like 'P.'
32 |     nickname: Whoami # if clearly in use
33 |     last: Doe
34 |     suffix: Jr. # optional
35 |   bio:
36 |     gender: # M or F, no quotes
37 |     birthday: '1960-06-06' # can find on Bioguide
38 |   terms:
39 | 
40 |   # for a representative
41 |   - type: rep
42 |     start: '2017-01-03' # date of swearing in
43 |     end: '2019-01-03' # always the next odd-year Jan 3, until a death/resignation
44 |     state: FL # USPS state abbreviation
45 |     district: 19 # an integer; 0 for At-Large
46 |     party: # Republican, Democrat, Independent
47 |     caucus: # for Independents only, Republican or Democrat
48 |     url: https://someone.house.gov
49 |     contact_form: https://www.house.gov/name/email.htm
50 | 
51 |   # for a senator
52 |   - type: sen
53 |     start: '2015-01-03' # date of swearing in
54 |     end: '2021-01-03' # always a future Jan 3, until a death/resignation
55 |     how: appointment # for senators appointed by the governor only
56 |     end-type: special-election # when "how: appointment" is used, prior to the special election,
57 |                                # set the "end" date to the special election date and set this flag
58 |     state: FL # USPS state abbreviation
59 |     class: 1 # copy from the senator this person is succeeding
60 |     party: # Republican, Democrat, Independent
61 |     caucus: # for Independents only, Republican or Democrat
62 |     state_rank: junior # or senior
63 |     url: https://someone.senate.gov/
64 |     contact_form: https://www.name.senate.gov/contact/
--------------------------------------------------------------------------------
/scripts/alternate_bulk_formats.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import json
 3 | import glob
 4 | import os
 5 | 
 6 | import utils
 7 | 
 8 | 
 9 | def generate_csv():
10 | 
11 |     #yaml filenames
12 |     yamls = ["legislators-current.yaml","legislators-historical.yaml"]
13 |     yaml_social = "legislators-social-media.yaml"
14 | 
15 | 
16 | 
17 |     #list of yaml field name, csv column name tuples.
Split into categories which do not reflect yaml structure (structured for logical csv column ordering) 18 | bio_fields = [ 19 | ("last", "last_name"), 20 | ("first", "first_name"), 21 | ("middle", "middle_name"), 22 | ("suffix", "suffix"), 23 | ("nickname", "nickname"), 24 | ("official_full", "full_name"), 25 | ("birthday", "birthday"), 26 | ("gender", "gender") 27 | ] 28 | 29 | #ID crosswalks, omit FEC id's, which may contain (arbitrary?) number of values 30 | crosswalk_fields = [ 31 | ("bioguide", "bioguide_id"), 32 | ("thomas", "thomas_id"), 33 | ("opensecrets", "opensecrets_id"), 34 | ("lis","lis_id"), 35 | ("fec","fec_ids"), 36 | ("cspan", "cspan_id"), 37 | ("govtrack", "govtrack_id"), 38 | ("votesmart", "votesmart_id"), 39 | ("ballotpedia", "ballotpedia_id"), 40 | ("washington_post", "washington_post_id"), 41 | ("icpsr", "icpsr_id"), 42 | ("wikipedia", "wikipedia_id") 43 | ] 44 | 45 | #separate list for children of "terms", csv only captures data for most recent term 46 | #currently excluding start/end dates - earliest start to latest end is deceptive (excludes gaps) as is start/end for most recent term 47 | term_fields = [ 48 | ("type", "type"), 49 | ("state", "state"), 50 | ("district", "district"), 51 | ("class", "senate_class"), 52 | ("party", "party"), 53 | ("url", "url"), 54 | ("address", "address"), 55 | ("phone", "phone"), 56 | ("contact_form", "contact_form"), 57 | ("rss_url", "rss_url"), 58 | ] 59 | 60 | #pulled from legislators-social-media.yaml 61 | social_media_fields = [ 62 | ("twitter", "twitter"), 63 | ("twitter_id", "twitter_id"), 64 | ("facebook", "facebook"), 65 | ("youtube", "youtube"), 66 | ("youtube_id", "youtube_id"), 67 | ("mastodon", "mastodon") 68 | ] 69 | 70 | 71 | print("Loading %s..." %yaml_social) 72 | social = utils.load_data(yaml_social) 73 | 74 | for filename in yamls: 75 | print("Converting %s to CSV..." 
% filename) 76 | 77 | legislators = utils.load_data(filename) 78 | 79 | #convert yaml to csv 80 | csv_output = csv.writer(open("../" + filename.replace(".yaml", ".csv"),"w")) 81 | 82 | head = [] 83 | for pair in bio_fields: 84 | head.append(pair[1]) 85 | for pair in term_fields: 86 | head.append(pair[1]) 87 | for pair in social_media_fields: 88 | head.append(pair[1]) 89 | for pair in crosswalk_fields: 90 | head.append(pair[1]) 91 | csv_output.writerow(head) 92 | 93 | for legislator in legislators: 94 | legislator_row = [] 95 | for pair in bio_fields: 96 | if 'name' in legislator and pair[0] in legislator['name']: 97 | legislator_row.append(legislator['name'][pair[0]]) 98 | elif 'bio' in legislator and pair[0] in legislator['bio']: 99 | legislator_row.append(legislator['bio'][pair[0]]) 100 | else: 101 | legislator_row.append(None) 102 | 103 | for pair in term_fields: 104 | latest_term = legislator['terms'][len(legislator['terms'])-1] 105 | if pair[0] in latest_term: 106 | legislator_row.append(latest_term[pair[0]]) 107 | else: 108 | legislator_row.append(None) 109 | 110 | social_match = None 111 | for social_legislator in social: 112 | if 'bioguide' in legislator['id'] and 'bioguide' in social_legislator['id'] and legislator['id']['bioguide'] == social_legislator['id']['bioguide']: 113 | social_match = social_legislator 114 | break 115 | elif 'thomas' in legislator['id'] and 'thomas' in social_legislator['id'] and legislator['id']['thomas'] == social_legislator['id']['thomas']: 116 | social_match = social_legislator 117 | break 118 | elif 'govtrack' in legislator['id'] and 'govtrack' in social_legislator['id'] and legislator['id']['govtrack'] == social_legislator['id']['govtrack']: 119 | social_match = social_legislator 120 | break 121 | for pair in social_media_fields: 122 | if social_match != None: 123 | if pair[0] in social_match['social']: 124 | legislator_row.append(social_match['social'][pair[0]]) 125 | else: 126 | legislator_row.append(None) 127 | else: 128 | legislator_row.append(None) 129 | 130 | for pair in crosswalk_fields: 131 | if pair[0] in legislator['id']: 132 | value = legislator['id'][pair[0]] 133 | if isinstance(value, list): 134 | # make FEC IDs comma-separated 135 | value = ",".join(value) 136 | legislator_row.append(value) 137 | else: 138 | legislator_row.append(None) 139 | 140 | csv_output.writerow(legislator_row) 141 | 142 | generate_district_office_csv() 143 | 144 | 145 | def generate_district_office_csv(): 146 | filename = "legislators-district-offices.yaml" 147 | print("Converting %s to CSV..." % filename) 148 | legislators_offices = utils.load_data(filename) 149 | fields = [ 150 | "bioguide", "thomas", "govtrack", "id", "address", "building", 151 | "city", "fax", "hours", "phone", "state", "suite", "zip", 152 | "latitude", "longitude"] 153 | 154 | f = open("../" + filename.replace(".yaml", ".csv"), "w") 155 | csv_output = csv.DictWriter(f, fieldnames=fields) 156 | csv_output.writeheader() 157 | 158 | for legislator_offices in legislators_offices: 159 | legislator_ids = legislator_offices['id'] 160 | for office in legislator_offices['offices']: 161 | office.update(legislator_ids) 162 | csv_output.writerow(office) 163 | 164 | 165 | def generate_json(): 166 | 167 | #yaml filenames 168 | yamls = list(map(os.path.basename, glob.glob("../*.yaml"))) 169 | 170 | for filename in yamls: 171 | print("Converting %s to JSON..." 
% filename)
172 |         data = utils.load_data(filename)
173 |         '''handle edge case of incorrect coercion for twitter ids in social media data
174 |         json/js can only handle maximum of 53-bit integers, so 64-bit integer twitter ids *must* be stringified
175 |         to consistently preserve value in json. otherwise they may be rounded and malformed
176 |         '''
177 |         if 'legislators-social-media' in filename:
178 |             for social_legislator in data:
179 |                 if 'twitter_id' in social_legislator['social']:
180 |                     social_legislator['social']['twitter_id'] = str(social_legislator['social']['twitter_id'])
181 | 
182 |         #convert yaml to json
183 |         utils.write(
184 |             json.dumps(data, default=utils.format_datetime, indent=2),
185 |             "../" + filename.replace(".yaml", ".json"))
186 | 
187 | if __name__ == '__main__':
188 |     generate_csv()
189 |     generate_json()
190 | 
191 | 
--------------------------------------------------------------------------------
/scripts/archive/114th_congress.py:
--------------------------------------------------------------------------------
 1 | # Temporary script to help us get the data in shape
 2 | # for the 114th Congress.
 3 | 
 4 | # Get: (thanks Derek!)
 5 | # https://docs.google.com/spreadsheets/d/1H8z7Ah4jSlXiuIol3oXoWBR8s6h0OtA62dNlU-kiIlU/edit#gid=1419747559
 6 | # and download as 'election_results_2014.csv'.
 7 | 
 8 | # TODO:
 9 | # * What is the expected first day of the Congress? (Closest guess of swearing-in dates.)
10 | # * Am adding "TODO: TODO" to new terms that weren't copied from older terms. Needs checking, possibly additional details like url, contact form.
11 | 
12 | from collections import OrderedDict
13 | import copy
14 | import csv
15 | 
16 | import utils
17 | 
18 | def run():
19 | 
20 |     # Which members were up for reelection, won in their office, or were
21 |     # a winner in another office?
22 |     won_row = { }
23 |     incumbents = set()
24 |     winners = set()
25 |     incumbent_winners = set()
26 |     new_members = []
27 |     for row in csv.DictReader(open("election_results_2014.csv")):
28 |         if row["new_member"] == "":
29 |             print("not decided yet...", row)
30 |             continue
31 | 
32 |         # For NC-12, Alma Adams won the vacant seat and the 114th Congress
33 |         # term. It's coded in the spreadsheet as if she's a new member, but
34 |         # since we've already added her in the 113th Congress we need to
35 |         # pretend here that she's a returning member.
36 |         if row["new_id"] == "A000370":
37 |             row["member_id"] = "A000370"
38 | 
39 |         incumbents.add(row["member_id"])
40 |         winners.add(row["new_id"])
41 |         won_row[row["new_id"]] = row
42 |         if row["member_id"] == row["new_id"]:
43 |             incumbent_winners.add(row["new_id"])
44 |         if row["new_id"] == "":
45 |             new_members.append(row)
46 | 
47 |     # Make a stub term based on a row in Derek's spreadsheet.
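    # A hedged sketch (reconstructed from build_term below, not output from a
    # real run) of the stub produced for a hypothetical row with chamber
    # "House", state_abbrev "NC", district "12", and winner_party "D":
    #
    #   OrderedDict([("type", "rep"), ("start", "2015-01-06"),
    #                ("end", "2017-01-03"), ("state", "NC"),
    #                ("district", 12), ("party", "Democrat")])
    #
    # With mark=True, a ("TODO", "TODO") entry is appended as a review flag.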
48 |     def build_term(row, mark):
49 |         if row['chamber'] == 'House':
50 |             end_date = '2017-01-03'
51 |         elif row['district'] == 'Class II':
52 |             end_date = '2021-01-03'
53 |         elif row['district'] == 'Class III':
54 |             end_date = '2017-01-03'
55 |         else:
56 |             raise ValueError()
57 | 
58 |         ret = OrderedDict([
59 |             ("type", "rep" if row['chamber'] == 'House' else 'sen'),
60 |             ("start", '2015-01-06'),
61 |             ("end", end_date),
62 |             ("state", row['state_abbrev']),
63 |         ])
64 | 
65 |         if ret["type"] == "rep":
66 |             ret["district"] = int(row['district']) if row['district'] != "AL" else 0
67 |         else:
68 |             if row["district"] == "Class II":
69 |                 ret["class"] = 2
70 |             elif row["district"] == "Class III":
71 |                 ret["class"] = 3
72 |             else:
73 |                 raise ValueError()
74 |             if mark:
75 |                 ret["state_rank"] = "junior"
76 | 
77 |         if row["winner_party"] == "D":
78 |             ret["party"] = "Democrat"
79 |         elif row["winner_party"] == "R":
80 |             ret["party"] = "Republican"
81 |         else:
82 |             raise ValueError()
83 | 
84 |         if mark:
85 |             ret["TODO"] = "TODO"
86 | 
87 |         return ret
88 | 
89 |     # Load legislators.
90 |     legislators_current = utils.load_data("legislators-current.yaml")
91 |     legislators_historical = utils.load_data("legislators-historical.yaml")
92 |     legislators_social_media = utils.load_data("legislators-social-media.yaml")
93 | 
94 |     # Sweep current members.
95 |     to_retire = []
96 |     for p in legislators_current:
97 |         id = p['id']['bioguide']
98 |         if id in incumbents:
99 |             # This legislator was up for reelection.
100 |             if id in incumbent_winners:
101 |                 # And won. Extend the term.
102 |                 t = copy.deepcopy(p['terms'][-1])
103 |                 p['terms'].append(t)
104 |                 t.update(build_term(won_row[id], False))
105 | 
106 |             elif id in winners:
107 |                 # Incumbent won something else. Start
108 |                 # a fresh term.
109 |                 p['terms'].append(build_term(won_row[id], True))
110 | 
111 |             else:
112 |                 # Incumbent lost.
113 |                 to_retire.append(p)
114 | 
115 |     # Any legislators to bring forward?
116 |     to_return = []
117 |     for p in legislators_historical:
118 |         id = p['id']['bioguide']
119 |         if id in winners:
120 |             p['terms'].append(build_term(won_row[id], True))
121 |             to_return.append(p)
122 | 
123 |     # Now that we're outside of the iterator, modify lists.
124 |     for p in to_retire:
125 |         legislators_current.remove(p)
126 |         legislators_historical.append(p)
127 |     for p in to_return:
128 |         legislators_current.append(p)
129 |         legislators_historical.remove(p)
130 | 
131 |     # Delete entries in legislators-social-media for those retiring
132 |     retiring_leg_bioguideids = [leg['id']['bioguide'] for leg in to_retire]
133 |     for p in legislators_social_media:
134 |         id = p['id']['bioguide']
135 |         if id in retiring_leg_bioguideids:
136 |             legislators_social_media.remove(p)
137 | 
138 |     # Add stubs for new members.
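    # fix_date below converts the spreadsheet's M/D/Y dates to ISO 8601;
    # e.g. (an illustrative value, not taken from the data)
    # fix_date("1/6/2015") returns "2015-01-06".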
139 |     def fix_date(date):
140 |         m, d, y = date.split("/")
141 |         return "%04d-%02d-%02d" % (int(y), int(m), int(d))
142 |     for i, row in enumerate(new_members):
143 |         p = OrderedDict([
144 |             ("id", OrderedDict([
145 |                 ("bioguide", "TODO"),
146 |                 ("thomas", "TODO"),
147 |                 ("lis", "TODO"),
148 |                 ("fec", row['new_fec_cand_id'].split(',')),
149 |                 ("govtrack", 412608+i), # assigning IDs here
150 |                 ("opensecrets", "TODO"),
151 |                 ("votesmart", "TODO"),
152 |                 ("icpsr", "TODO"),
153 |                 ("cspan", "TODO"),
154 |                 ("wikipedia", "TODO"),
155 |                 ("ballotpedia", "TODO"),
156 |                 ("house_history", "TODO"),
157 |             ])),
158 |             ("name", OrderedDict()),
159 |             ("bio", OrderedDict([
160 |                 ("gender", row["gender"]),
161 |                 ("birthday", fix_date(row["date_of_birth"]) if row["date_of_birth"] != "" else "TODO"),
162 |             ])),
163 |             ("terms", [
164 |                 build_term(row, True),
165 |             ])
166 |         ])
167 | 
168 |         if len(row["new_member"].split(" ")) == 2:
169 |             p['name']['first'] = row["new_member"].split(" ")[0]
170 |             p['name']['last'] = row["new_member"].split(" ")[1]
171 |         else:
172 |             p['name']['FULL'] = row["new_member"]
173 |             p['name']['first'] = "TODO"
174 |             p['name']['last'] = "TODO"
175 | 
176 |         legislators_current.append(p)
177 | 
178 | 
179 |     # Save.
180 |     utils.save_data(legislators_current, "legislators-current.yaml")
181 |     utils.save_data(legislators_historical, "legislators-historical.yaml")
182 |     utils.save_data(legislators_social_media, "legislators-social-media.yaml")
183 | 
184 | if __name__ == '__main__':
185 |     run()
186 | 
--------------------------------------------------------------------------------
/scripts/archive/committee_membership_house.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Use the NYTimes API to get House committee information.
 4 | # When we wrote this script we believed the House Clerk was
 5 | # not yet making this info available.
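# The response shapes assumed by the parsing below (reconstructed from this
# script's own json.loads() calls, not from NYT documentation): the committee
# list endpoint yields {"results": [{"committees": [{"id": ..., "chair_party": ...}, ...]}]}
# and the per-committee endpoint yields
# {"results": [{"current_members": [{"id": ..., "party": ..., "rank_in_party": ...}, ...]}]}.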
6 | 7 | import utils 8 | import json 9 | import copy 10 | from utils import download, load_data, save_data 11 | 12 | committee_membership = { } 13 | 14 | committees_current = load_data("committees-current.yaml") 15 | memberships_current = load_data("committee-membership-current.yaml") 16 | 17 | # default to not caching 18 | cache = utils.flags().get('cache', False) 19 | force = not cache 20 | 21 | congress = 113 22 | 23 | # map house/senate committee IDs to their dicts 24 | all_ids = [] 25 | 26 | house_ref = { } 27 | for cx in committees_current: 28 | if cx["type"] == "house": 29 | house_ref[cx["thomas_id"]] = cx 30 | all_ids.append(cx['thomas_id']) 31 | 32 | senate_ref = { } 33 | for cx in committees_current: 34 | if cx["type"] == "senate": 35 | senate_ref[cx["thomas_id"]] = cx 36 | all_ids.append(cx['thomas_id']) 37 | 38 | # map people by their bioguide ID 39 | y = load_data("legislators-current.yaml") 40 | by_bioguide = { } 41 | for m in y: 42 | bioguide = m['id']['bioguide'] 43 | by_bioguide[bioguide] = m 44 | 45 | 46 | # load in committees from the NYT Congress API (API key not kept in source control) 47 | api_key = open("cache/nyt_api_key").read() # file's whole body is the api key 48 | 49 | url = "http://api.nytimes.com/svc/politics/v3/us/legislative/congress/%i/house/committees.json?api-key=%s" % (congress, api_key) 50 | 51 | body = download(url, "committees/membership/nyt-house.json", force) 52 | committees = json.loads(body)['results'][0]['committees'] 53 | 54 | for committee in committees: 55 | committee_id = committee['id'] 56 | 57 | committee_url = "http://api.nytimes.com/svc/politics/v3/us/legislative/congress/%i/house/committees/%s.json?api-key=%s" % (congress, committee_id, api_key) 58 | 59 | # current disagreement between THOMAS and NYT (but use HSIG in URL above) 60 | if committee_id == "HSIG": 61 | committee_id = "HLIG" 62 | 63 | if committee_id not in all_ids: 64 | continue 65 | 66 | committee_party = committee['chair_party'] 67 | 68 | committee_body = download(committee_url, "committees/membership/house/%s.json" % committee_id, force) 69 | members = json.loads(committee_body)['results'][0]['current_members'] 70 | 71 | committee_membership[committee_id] = [] 72 | for member in members: 73 | bioguide_id = member['id'] 74 | 75 | print("[{}] {}".format(committee_id, bioguide_id)) 76 | 77 | if bioguide_id not in by_bioguide: 78 | continue 79 | 80 | legislator = by_bioguide[bioguide_id] 81 | # last_term = legislator['terms'][-1] 82 | 83 | if member['party'] == committee_party: 84 | party = "majority" 85 | else: 86 | party = "minority" 87 | 88 | # this really shouldn't be calculated, but for now it's what we've got 89 | rank = int(member['rank_in_party']) 90 | if rank == 1: 91 | if party == "majority": 92 | title = "Chair" 93 | else: 94 | title = "Ranking Member" 95 | else: 96 | title = None 97 | 98 | details = { 99 | 'name': legislator['name']['official_full'], 100 | 'party': party, 101 | 'rank': rank, 102 | 'bioguide': bioguide_id, 103 | 'thomas': legislator['id']['thomas'] 104 | } 105 | 106 | if title: 107 | details['title'] = title 108 | 109 | committee_membership[committee_id].append(details) 110 | 111 | # sort members to put majority party first, then order by rank 112 | # (fixing the order makes for better diffs) 113 | for c in committee_membership.values(): 114 | c.sort(key = lambda m : (m["party"]=="minority", m["rank"])) 115 | 116 | # preserve senate memberships 117 | senate_membership = {} 118 | for committee_id in memberships_current: 119 | if not 
committee_id.startswith("H"):
120 |         committee_membership[committee_id] = copy.deepcopy(memberships_current[committee_id])
121 | 
122 | print("Saving committee memberships...")
123 | save_data(committee_membership, "committee-membership-current.yaml")
124 | 
--------------------------------------------------------------------------------
/scripts/archive/election_results_2018_senate.csv:
--------------------------------------------------------------------------------
 1 | Incumbent Party,State,Senator,GovTrack ID,Bioguide ID,Party,FEC.gov ID,Wikipedia Page Name,Wikidata ID (see Wikipedia sidebar),Ballotpedia Page Name,First Name,Middle Name,Last Name,Gender (M/F),Birthday (often on Wikipedia)
 2 | D,AZ,"Sinema, Kyrsten",412509,,,,,,,,,,,
 3 | D,CA,"Feinstein, Dianne",300043,,,,,,,,,,,
 4 | D,CT,"Murphy, Christopher",412194,,,,,,,,,,,
 5 | D,DE,"Carper, Thomas R.",300019,,,,,,,,,,,
 6 | R,FL,"Scott, Rick",,S001217,Republican,S8FL00273,Rick Scott,Q439729,Rick Scott,Rick,,Scott,M,1952-12-01
 7 | D,HI,"Hirono, Mazie K.",412200,,,,,,,,,,,
 8 | R,IN,"Braun, Mike",,B001310,Republican,S8IN00171,Mike Braun,Q42804470,Mike Braun,Mike,,Braun,M,1954-03-24
 9 | D,MA,"Warren, Elizabeth",412542,,,,,,,,,,,
10 | D,MD,"Cardin, Benjamin L.",400064,,,,,,,,,,,
11 | I,ME,"King, Angus S., Jr.",412545,,,,,,,,,,,
12 | D,MI,"Stabenow, Debbie",300093,,,,,,,,,,,
13 | D,MN,"Klobuchar, Amy",412242,,,,,,,,,,,
14 | R,MO,"Hawley, Josh",,H001089,Republican,S8MO00160,Josh Hawley,Q23020745,Josh Hawley,Joshua,,Hawley,M,1979-12-31
15 | R,MS,"Wicker, Roger F.",400432,,,,,,,,,,,
16 | D,MT,"Tester, Jon",412244,,,,,,,,,,,
17 | R,ND,"Cramer, Kevin",412555,,,,,,,,,,,
18 | R,NE,"Fischer, Deb",412556,,,,,,,,,,,
19 | D,NJ,"Menendez, Robert",400272,,,,,,,,,,,
20 | D,NM,"Heinrich, Martin",412281,,,,,,,,,,,
21 | D,NV,"Rosen, Jacky",412715,,,,,,,,,,,
22 | D,NY,"Gillibrand, Kirsten E.",412223,,,,,,,,,,,
23 | D,OH,"Brown, Sherrod",400050,,,,,,,,,,,
24 | D,PA,"Casey, Robert P., Jr.",412246,,,,,,,,,,,
25 | D,RI,"Whitehouse, Sheldon",412247,,,,,,,,,,,
26 | R,TN,"Blackburn, Marsha",400032,,,,,,,,,,,
27 | R,TX,"Cruz, Ted",412573,,,,,,,,,,,
28 | R,UT,"Romney, Mitt",,R000615,Republican,S4MA00143,Mitt Romney,Q4496,Mitt Romney,Mitt,,Romney,M,1947-03-12
29 | D,VA,"Kaine, Tim",412582,,,,,,,,,,,
30 | I,VT,"Sanders, Bernard",400357,,,,,,,,,,,
31 | D,WA,"Cantwell, Maria",300018,,,,,,,,,,,
32 | D,WI,"Baldwin, Tammy",400013,,,,,,,,,,,
33 | D,WV,"Manchin, Joe, III",412391,,,,,,,,,,,
34 | R,WY,"Barrasso, John",412251,,,,,,,,,,,
35 | 
--------------------------------------------------------------------------------
/scripts/archive/election_results_2020.csv:
--------------------------------------------------------------------------------
 1 | Race,Incumbent Win? Y/N,GovTrack ID,Party,Last Name,First Name,Middle Name,Suffix,Gender (M/F),Birthday (YYYY-MM-DD),FEC.gov ID,Wikidata ID,Twitter Handle,Found photo?
2 | AK,Y,412665,R,Sullivan,,,,,,,,, 3 | AL,N,,R,Tuberville,Tommy,Hawley,,M,1954-09-18,S0AL00230,Q7819948,@TTuberville, 4 | AR,Y,412508,R,Cotton,,,,,,,,, 5 | CO,N,,D,Hickenlooper,John,Wright,Jr.,M,1952-02-07,S0CO00575,Q430518,@Hickenlooper, 6 | DE,Y,412390,D,Coons,,,,,,,,, 7 | GA,Run-off election occurs after Jan 3.,,,,,,,,,,,, 8 | IA,Y,412667,R,Ernst,,,,,,,,, 9 | ID,Y,412322,R,Risch,,,,,,,,, 10 | IL,Y,300038,D,Durbin,,,,,,,,, 11 | KS,Y,412704,R,Marshall,,,,,,,,, 12 | KY,Y,300072,R,McConnell,,,,,,,,, 13 | LA,Y,412269,R,Cassidy,,,,,,,,, 14 | MA,Y,400253,D,Markey,,,,,,,,, 15 | ME,Y,300025,R,Collins,,,,,,,,, 16 | MI,Y,412305,D,Peters,,,,,,,,, 17 | MN,Y,412742,D,Smith,,,,,,,,, 18 | MS,Y,412743,R,Hyde-Smith,,,,,,,,, 19 | MT,Y,412549,R,Daines,,,,,,,,, 20 | NC,Y,412668,R,Tillis,,,,,,,,, 21 | NE,Y,412671,R,Sasse,,,,,,,,, 22 | NH,Y,412323,D,Shaheen,,,,,,,,, 23 | NJ,Y,412598,D,Booker,,,,,,,,, 24 | NM,N,412293,D,Luján,Ben Ray,,,M,1972-06-07,S0NM00058,Q324256,@BenRayLujan, 25 | OK,Y,300055,R,Inhofe,,,,,,,,, 26 | OR,Y,412325,D,Merkley,,,,,,,,, 27 | RI,Y,300081,D,Reed,,,,,,,,, 28 | SC,Y,300047,R,Graham,,,,,,,,, 29 | SD,Y,412669,R,Rounds,,,,,,,,, 30 | TN,N,,R,Hagerty,Bill,Francis,IV,M,1959-08-14,S0TN00169,Q27734214,@BillHagertyTN, 31 | TX,Y,300027,R,Cornyn,,,,,,,,, 32 | VA,Y,412321,D,Warner,,,,,,,,, 33 | WV,Y,400061,R,Capito,,,,,,,,, 34 | WY,N,412294,R,Lummis,Cynthia,Marie,,F,1954-09-10,S0WY00137,Q456064,@CynthiaMLummis, 35 | AK0,Y,,,,,,,,,,,, 36 | AL1,N,,R,Carl,Jerry,Lee,Jr.,M,1958-06-17,H0AL01055,Q102277702,@CarlForAlabama, 37 | AL2,N,,R,Moore,Felix,Barry,,M,1966-09-26,H8AL02171,Q63198048,, 38 | AL3,Y,,,,,,,,,,,, 39 | AL4,Y,,,,,,,,,,,, 40 | AL5,Y,,,,,,,,,,,, 41 | AL6,Y,,,,,,,,,,,, 42 | AL7,Y,,,,,,,,,,,, 43 | AR1,Y,,,,,,,,,,,, 44 | AR2,Y,,,,,,,,,,,, 45 | AR3,Y,,,,,,,,,,,, 46 | AR4,Y,,,,,,,,,,,, 47 | AS0,Y,,,,,,,,,,,, 48 | AZ1,Y,,,,,,,,,,,, 49 | AZ2,Y,,,,,,,,,,,, 50 | AZ3,Y,,,,,,,,,,,, 51 | AZ4,Y,,,,,,,,,,,, 52 | AZ5,Y,,,,,,,,,,,, 53 | AZ6,Y,,,,,,,,,,,, 54 | AZ7,Y,,,,,,,,,,,, 55 | AZ8,Y,,,,,,,,,,,, 56 | AZ9,Y,,,,,,,,,,,, 57 | CA1,Y,,,,,,,,,,,, 58 | CA2,Y,,,,,,,,,,,, 59 | CA3,Y,,,,,,,,,,,, 60 | CA4,Y,,,,,,,,,,,, 61 | CA5,Y,,,,,,,,,,,, 62 | CA6,Y,,,,,,,,,,,, 63 | CA7,Y,,,,,,,,,,,, 64 | CA8,N,,R,Obernolte,Jay,Phillip,,M,1970-08-18,H0CA08135,Q16849797,@JayObernolte, 65 | CA9,Y,,,,,,,,,,,, 66 | CA10,Y,,,,,,,,,,,, 67 | CA11,Y,,,,,,,,,,,, 68 | CA12,Y,,,,,,,,,,,, 69 | CA13,Y,,,,,,,,,,,, 70 | CA14,Y,,,,,,,,,,,, 71 | CA15,Y,,,,,,,,,,,, 72 | CA16,Y,,,,,,,,,,,, 73 | CA17,Y,,,,,,,,,,,, 74 | CA18,Y,,,,,,,,,,,, 75 | CA19,Y,,,,,,,,,,,, 76 | CA20,Y,,,,,,,,,,,, 77 | CA21,N,412515,R,Valadao,David,Goncalves,,M,1977-04-14,H2CA20094,Q3528567,DGValadao, 78 | CA22,Y,,,,,,,,,,,, 79 | CA23,Y,,,,,,,,,,,, 80 | CA24,Y,,,,,,,,,,,, 81 | CA25,Y,,,,,,,,,,,, 82 | CA26,Y,,,,,,,,,,,, 83 | CA27,Y,,,,,,,,,,,, 84 | CA28,Y,,,,,,,,,,,, 85 | CA29,Y,,,,,,,,,,,, 86 | CA30,Y,,,,,,,,,,,, 87 | CA31,Y,,,,,,,,,,,, 88 | CA32,Y,,,,,,,,,,,, 89 | CA33,Y,,,,,,,,,,,, 90 | CA34,Y,,,,,,,,,,,, 91 | CA35,Y,,,,,,,,,,,, 92 | CA36,Y,,,,,,,,,,,, 93 | CA37,Y,,,,,,,,,,,, 94 | CA38,Y,,,,,,,,,,,, 95 | CA39,N,,R,Kim,Young,Oak,,F,1962-10-18,H8CA39240,Q19662859,@YoungKimCA, 96 | CA40,Y,,,,,,,,,,,, 97 | CA41,Y,,,,,,,,,,,, 98 | CA42,Y,,,,,,,,,,,, 99 | CA43,Y,,,,,,,,,,,, 100 | CA44,Y,,,,,,,,,,,, 101 | CA45,Y,,,,,,,,,,,, 102 | CA46,Y,,,,,,,,,,,, 103 | CA47,Y,,,,,,,,,,,, 104 | CA48,N,,R,Steel,Michelle,Eunjoo,,F,1955-06-21,H0CA48198,Q6837200,@MichelleSteelCA, 105 | CA49,Y,,,,,,,,,,,, 106 | CA50,N,400196,R,Issa,Darrell,Edward,,M,1953-11-01,H0CA50178,Q1166592,@DarrellIssa, 107 | 
CA51,Y,,,,,,,,,,,, 108 | CA52,Y,,,,,,,,,,,, 109 | CA53,N,,D,Jacobs,Sara,,,F,1989-02-01,H0CA53115,Q50825637,@SaraJacobsCA, 110 | CO1,Y,,,,,,,,,,,, 111 | CO2,Y,,,,,,,,,,,, 112 | CO3,N,,R,Boebert,Lauren,Opal,,F,1986-12-15,H0CO03165,Q96761544,@LaurenBoebert, 113 | CO4,Y,,,,,,,,,,,, 114 | CO5,Y,,,,,,,,,,,, 115 | CO6,Y,,,,,,,,,,,, 116 | CO7,Y,,,,,,,,,,,, 117 | CT1,Y,,,,,,,,,,,, 118 | CT2,Y,,,,,,,,,,,, 119 | CT3,Y,,,,,,,,,,,, 120 | CT4,Y,,,,,,,,,,,, 121 | CT5,Y,,,,,,,,,,,, 122 | DC0,Y,,,,,,,,,,,, 123 | DE0,Y,,,,,,,,,,,, 124 | FL1,Y,,,,,,,,,,,, 125 | FL2,Y,,,,,,,,,,,, 126 | FL3,N,,R,Cammack,Katherine,,,F,1988-02-16,H0FL03175,Q98523243,@Kat_Cammack, 127 | FL4,Y,,,,,,,,,,,, 128 | FL5,Y,,,,,,,,,,,, 129 | FL6,Y,,,,,,,,,,,, 130 | FL7,Y,,,,,,,,,,,, 131 | FL8,Y,,,,,,,,,,,, 132 | FL9,Y,,,,,,,,,,,, 133 | FL10,Y,,,,,,,,,,,, 134 | FL11,Y,,,,,,,,,,,, 135 | FL12,Y,,,,,,,,,,,, 136 | FL13,Y,,,,,,,,,,,, 137 | FL14,Y,,,,,,,,,,,, 138 | FL15,N,,R,Franklin,Scott,,,M,1964-08-23,H0FL15104,Q101198561,@ScottFranklinFL, 139 | FL16,Y,,,,,,,,,,,, 140 | FL17,Y,,,,,,,,,,,, 141 | FL18,Y,,,,,,,,,,,, 142 | FL19,N,,R,Donalds,Byron,Lowell,,M,1972-10-28,H0FL19205,Q59726216,@ByronDonalds, 143 | FL20,Y,,,,,,,,,,,, 144 | FL21,Y,,,,,,,,,,,, 145 | FL22,Y,,,,,,,,,,,, 146 | FL23,Y,,,,,,,,,,,, 147 | FL24,Y,,,,,,,,,,,, 148 | FL25,Y,,,,,,,,,,,, 149 | FL26,N,,R,Giménez,Carlos,A.,,M,1954-01-17,H0FL26036,Q5041653,currently @MayorGimenez, 150 | FL27,N,,R,Salazar,Maria,Elvira,,M,1961-11-01,H8FL27185,Q6003715,@MaElviraSalazar, 151 | GA1,Y,,,,,,,,,,,, 152 | GA2,Y,,,,,,,,,,,, 153 | GA3,Y,,,,,,,,,,,, 154 | GA4,Y,,,,,,,,,,,, 155 | GA5,N,,D,Williams,Nikema,Natassha,,F,1978-07-30,H0GA05301,Q56486570,@NikemaWilliams, 156 | GA6,Y,,,,,,,,,,,, 157 | GA7,N,,D,Bourdeaux,Carolyn,,,F,1970-06-03,H8GA07201,Q58333638,@Carolyn4GA7, 158 | GA8,Y,,,,,,,,,,,, 159 | GA9,N,,R,Clyde,Andrew,,,M,1963-11-22,H0GA09246,Q102277679,, 160 | GA10,Y,,,,,,,,,,,, 161 | GA11,Y,,,,,,,,,,,, 162 | GA12,Y,,,,,,,,,,,, 163 | GA13,Y,,,,,,,,,,,, 164 | GA14,N,,R,Greene,Marjorie,Taylor,,F,1974-05-27,H0GA06192,Q98380406,@MTGreenee, 165 | GU0,Y,,,,,,,,,,,, 166 | HI1,Y,,,,,,,,,,,, 167 | HI2,N,,D,Kahele,Kaialiʻi,,,M,1974-03-28,H0HI02155,Q28861508,@KaiKahele, 168 | IA1,N,,R,Hinson,Ashley,,,F,1983-06-27,H0IA01174,Q60713905,@HinsonAshley, 169 | IA2,N,,R,Miller-Meeks,Mariannette,Jane,,F,1955-09-06,H8IA02043,Q58495662,@MillerMeeks, 170 | IA3,Y,,,,,,,,,,,, 171 | IA4,N,,R,Feenstra,Randy,L.,,M,1969-01-14,H0IA04145,Q7292187,@RandyFeenstra, 172 | ID1,Y,,,,,,,,,,,, 173 | ID2,Y,,,,,,,,,,,, 174 | IL1,Y,,,,,,,,,,,, 175 | IL2,Y,,,,,,,,,,,, 176 | IL3,N,,D,Newman,Marie,,,F,1964-04-13,H8IL03102,Q47960940,currently @Marie4Congress, 177 | IL4,Y,,,,,,,,,,,, 178 | IL5,Y,,,,,,,,,,,, 179 | IL6,Y,,,,,,,,,,,, 180 | IL7,Y,,,,,,,,,,,, 181 | IL8,Y,,,,,,,,,,,, 182 | IL9,Y,,,,,,,,,,,, 183 | IL10,Y,,,,,,,,,,,, 184 | IL11,Y,,,,,,,,,,,, 185 | IL12,Y,,,,,,,,,,,, 186 | IL13,Y,,,,,,,,,,,, 187 | IL14,Y,,,,,,,,,,,, 188 | IL15,N,,R,Miller,Mary,,,F,1959-08-27,H0IL15129,Q101204553,@Miller_Congress, 189 | IL16,Y,,,,,,,,,,,, 190 | IL17,Y,,,,,,,,,,,, 191 | IL18,Y,,,,,,,,,,,, 192 | IN1,N,,D,Mrvan,Frank,John,,M,1969-04-16,H0IN01150,Q96077897,currently @Mrvan4Congress, 193 | IN2,Y,,,,,,,,,,,, 194 | IN3,Y,,,,,,,,,,,, 195 | IN4,Y,,,,,,,,,,,, 196 | IN5,N,,R,Spartz,Victoria,,,F,1978-10-06,H0IN05326,Q44059867,@Victoria_Spartz, 197 | IN6,Y,,,,,,,,,,,, 198 | IN7,Y,,,,,,,,,,,, 199 | IN8,Y,,,,,,,,,,,, 200 | IN9,Y,,,,,,,,,,,, 201 | KS1,N,,R,Mann,Tracey,Robert,,M,1976-12-17,H0KS01123,Q48767554,@TraceyMannKS, 202 | 
KS2,N,,R,LaTurner,Jacob,,,M,1988-02-17,H0KS02188,Q16731273,@JakeLaTurner, 203 | KS3,Y,,,,,,,,,,,, 204 | KS4,Y,,,,,,,,,,,, 205 | KY1,Y,,,,,,,,,,,, 206 | KY2,Y,,,,,,,,,,,, 207 | KY3,Y,,,,,,,,,,,, 208 | KY4,Y,,,,,,,,,,,, 209 | KY5,Y,,,,,,,,,,,, 210 | KY6,Y,,,,,,,,,,,, 211 | LA1,Y,,,,,,,,,,,, 212 | LA2,Y,,,,,,,,,,,, 213 | LA3,Y,,,,,,,,,,,, 214 | LA4,Y,,,,,,,,,,,, 215 | LA5,Rep.-elect Luke Letlow has died of COVID-19 on December 29,,,,,,,,,,,, 216 | LA6,Y,,,,,,,,,,,, 217 | MA1,Y,,,,,,,,,,,, 218 | MA2,Y,,,,,,,,,,,, 219 | MA3,Y,,,,,,,,,,,, 220 | MA4,N,,D,Auchincloss,Jake,Daniel,,M,1988-01-29,H0MA04192,Q101196632,@JakeAuch, 221 | MA5,Y,,,,,,,,,,,, 222 | MA6,Y,,,,,,,,,,,, 223 | MA7,Y,,,,,,,,,,,, 224 | MA8,Y,,,,,,,,,,,, 225 | MA9,Y,,,,,,,,,,,, 226 | MD1,Y,,,,,,,,,,,, 227 | MD2,Y,,,,,,,,,,,, 228 | MD3,Y,,,,,,,,,,,, 229 | MD4,Y,,,,,,,,,,,, 230 | MD5,Y,,,,,,,,,,,, 231 | MD6,Y,,,,,,,,,,,, 232 | MD7,Y,,,,,,,,,,,, 233 | MD8,Y,,,,,,,,,,,, 234 | ME1,Y,,,,,,,,,,,, 235 | ME2,Y,,,,,,,,,,,, 236 | MI1,Y,,,,,,,,,,,, 237 | MI2,Y,,,,,,,,,,,, 238 | MI3,N,,R,Meijer,Peter,James,,M,1988-01-10,H0MI03308,Q96419165,currently @VoteMeijer, 239 | MI4,Y,,,,,,,,,,,, 240 | MI5,Y,,,,,,,,,,,, 241 | MI6,Y,,,,,,,,,,,, 242 | MI7,Y,,,,,,,,,,,, 243 | MI8,Y,,,,,,,,,,,, 244 | MI9,Y,,,,,,,,,,,, 245 | MI10,N,,R,McClain,Lisa,,,F,1966-04-07,H0MI10287,Q102184540,currently @LisaForCongress, 246 | MI11,Y,,,,,,,,,,,, 247 | MI12,Y,,,,,,,,,,,, 248 | MI13,Y,,,,,,,,,,,, 249 | MI14,Y,,,,,,,,,,,, 250 | MN1,Y,,,,,,,,,,,, 251 | MN2,Y,,,,,,,,,,,, 252 | MN3,Y,,,,,,,,,,,, 253 | MN4,Y,,,,,,,,,,,, 254 | MN5,Y,,,,,,,,,,,, 255 | MN6,Y,,,,,,,,,,,, 256 | MN7,N,,R,Fischbach,Michelle,Louise Helene,,F,1965-11-03,H0MN07091,Q6837025,@FischbachMN7, 257 | MN8,Y,,,,,,,,,,,, 258 | MO1,N,,D,Bush,Cori,,,F,1976-07-21,H8MO01143,Q98084800,@CoriBush, 259 | MO2,Y,,,,,,,,,,,, 260 | MO3,Y,,,,,,,,,,,, 261 | MO4,Y,,,,,,,,,,,, 262 | MO5,Y,,,,,,,,,,,, 263 | MO6,Y,,,,,,,,,,,, 264 | MO7,Y,,,,,,,,,,,, 265 | MO8,Y,,,,,,,,,,,, 266 | MP0,Y,,,,,,,,,,,, 267 | MS1,Y,,,,,,,,,,,, 268 | MS2,Y,,,,,,,,,,,, 269 | MS3,Y,,,,,,,,,,,, 270 | MS4,Y,,,,,,,,,,,, 271 | MT0,N,,R,Rosendale,Matthew,Martin,Sr.,M,1960-07-07,H4MT00050,Q6791163,currently @MattForMontana, 272 | NC1,Y,,,,,,,,,,,, 273 | NC2,N,,D,Ross,Deborah,Koff,,F,1963-06-20,H0NC02125,Q5248285,@DeborahRossNC, 274 | NC3,Y,,,,,,,,,,,, 275 | NC4,Y,,,,,,,,,,,, 276 | NC5,Y,,,,,,,,,,,, 277 | NC6,N,,D,Manning,Kathy,Ellen,,F,1956-12-03,H8NC13067,Q101136890,@KathyManningNC, 278 | NC7,Y,,,,,,,,,,,, 279 | NC8,Y,,,,,,,,,,,, 280 | NC9,Y,,,,,,,,,,,, 281 | NC10,Y,,,,,,,,,,,, 282 | NC11,N,,R,Cawthorn,David,Madison,,M,1995-08-01,H0NC11233,Q96633736,currently @CawthornforNC, 283 | NC12,Y,,,,,,,,,,,, 284 | NC13,Y,,,,,,,,,,,, 285 | ND0,Y,,,,,,,,,,,, 286 | NE1,Y,,,,,,,,,,,, 287 | NE2,Y,,,,,,,,,,,, 288 | NE3,Y,,,,,,,,,,,, 289 | NH1,Y,,,,,,,,,,,, 290 | NH2,Y,,,,,,,,,,,, 291 | NJ1,Y,,,,,,,,,,,, 292 | NJ2,Y,,,,,,,,,,,, 293 | NJ3,Y,,,,,,,,,,,, 294 | NJ4,Y,,,,,,,,,,,, 295 | NJ5,Y,,,,,,,,,,,, 296 | NJ6,Y,,,,,,,,,,,, 297 | NJ7,Y,,,,,,,,,,,, 298 | NJ8,Y,,,,,,,,,,,, 299 | NJ9,Y,,,,,,,,,,,, 300 | NJ10,Y,,,,,,,,,,,, 301 | NJ11,Y,,,,,,,,,,,, 302 | NJ12,Y,,,,,,,,,,,, 303 | NM1,Y,,,,,,,,,,,, 304 | NM2,N,,R,Herrell,Stella,Yvette,,F,1964-03-16,H8NM02156,Q16225780,currently @Yvette4Congress, 305 | NM3,N,,D,Fernandez,Teresa,Leger,,F,1959-07-01,S0NJ00258,Q96054905,currently @TeresaForNM, 306 | NV1,Y,,,,,,,,,,,, 307 | NV2,Y,,,,,,,,,,,, 308 | NV3,Y,,,,,,,,,,,, 309 | NV4,Y,,,,,,,,,,,, 310 | NY1,Y,,,,,,,,,,,, 311 | NY2,N,,R,Garbarino,Andrew,,,M,1984-09-27,H0NY02234,Q21257859,currently 
@GarbarinoforNY, 312 | NY3,Y,,,,,,,,,,,, 313 | NY4,Y,,,,,,,,,,,, 314 | NY5,Y,,,,,,,,,,,, 315 | NY6,Y,,,,,,,,,,,, 316 | NY7,Y,,,,,,,,,,,, 317 | NY8,Y,,,,,,,,,,,, 318 | NY9,Y,,,,,,,,,,,, 319 | NY10,Y,,,,,,,,,,,, 320 | NY11,N,,R,Malliotakis,Nicole,,,F,1980-11-11,H0NY11078,Q7030112,@NMalliotakis, 321 | NY12,Y,,,,,,,,,,,, 322 | NY13,Y,,,,,,,,,,,, 323 | NY14,Y,,,,,,,,,,,, 324 | NY15,N,,D,Torres,Ritchie,John,,M,1988-03-12,H0NY15160,Q16205227,@RitchieTorres, 325 | NY16,N,,D,Bowman,Jamaal,,,M,1976-04-01,H0NY16143,Q96419280,@JamaalBowmanNY, 326 | NY17,N,,D,Jones,Mondaire,,,M,1987-05-18,H0NY17174,Q96781248,@MondaireJones, 327 | NY18,Y,,,,,,,,,,,, 328 | NY19,Y,,,,,,,,,,,, 329 | NY20,Y,,,,,,,,,,,, 330 | NY21,Y,,,,,,,,,,,, 331 | NY22,Race too close to call?,,,,,,,,,,,, 332 | NY23,Y,,,,,,,,,,,, 333 | NY24,Y,,,,,,,,,,,, 334 | NY25,Y,,,,,,,,,,,, 335 | NY26,Y,,,,,,,,,,,, 336 | NY27,Y,,,,,,,,,,,, 337 | OH1,Y,,,,,,,,,,,, 338 | OH2,Y,,,,,,,,,,,, 339 | OH3,Y,,,,,,,,,,,, 340 | OH4,Y,,,,,,,,,,,, 341 | OH5,Y,,,,,,,,,,,, 342 | OH6,Y,,,,,,,,,,,, 343 | OH7,Y,,,,,,,,,,,, 344 | OH8,Y,,,,,,,,,,,, 345 | OH9,Y,,,,,,,,,,,, 346 | OH10,Y,,,,,,,,,,,, 347 | OH11,Y,,,,,,,,,,,, 348 | OH12,Y,,,,,,,,,,,, 349 | OH13,Y,,,,,,,,,,,, 350 | OH14,Y,,,,,,,,,,,, 351 | OH15,Y,,,,,,,,,,,, 352 | OH16,Y,,,,,,,,,,,, 353 | OK1,Y,,,,,,,,,,,, 354 | OK2,Y,,,,,,,,,,,, 355 | OK3,Y,,,,,,,,,,,, 356 | OK4,Y,,,,,,,,,,,, 357 | OK5,N,,R,Bice,Stephanie,,,F,1973-11-11,H0OK05205,Q60190894,@StephanieBice, 358 | OR1,Y,,,,,,,,,,,, 359 | OR2,N,,R,Bentz,Cliff,Stewart,,M,1952-01-12,H0OR02127,Q5132536,@CliffBentz, 360 | OR3,Y,,,,,,,,,,,, 361 | OR4,Y,,,,,,,,,,,, 362 | OR5,Y,,,,,,,,,,,, 363 | PA1,Y,,,,,,,,,,,, 364 | PA2,Y,,,,,,,,,,,, 365 | PA3,Y,,,,,,,,,,,, 366 | PA4,Y,,,,,,,,,,,, 367 | PA5,Y,,,,,,,,,,,, 368 | PA6,Y,,,,,,,,,,,, 369 | PA7,Y,,,,,,,,,,,, 370 | PA8,Y,,,,,,,,,,,, 371 | PA9,Y,,,,,,,,,,,, 372 | PA10,Y,,,,,,,,,,,, 373 | PA11,Y,,,,,,,,,,,, 374 | PA12,Y,,,,,,,,,,,, 375 | PA13,Y,,,,,,,,,,,, 376 | PA14,Y,,,,,,,,,,,, 377 | PA15,Y,,,,,,,,,,,, 378 | PA16,Y,,,,,,,,,,,, 379 | PA17,Y,,,,,,,,,,,, 380 | PA18,Y,,,,,,,,,,,, 381 | PR0,Y,,,,,,,,,,,, 382 | RI1,Y,,,,,,,,,,,, 383 | RI2,Y,,,,,,,,,,,, 384 | SC1,N,,R,Mace,Nancy,Ruth,,F,1977-12-04,H0SC01394,Q6962831,@NancyMace, 385 | SC2,Y,,,,,,,,,,,, 386 | SC3,Y,,,,,,,,,,,, 387 | SC4,Y,,,,,,,,,,,, 388 | SC5,Y,,,,,,,,,,,, 389 | SC6,Y,,,,,,,,,,,, 390 | SC7,Y,,,,,,,,,,,, 391 | SD0,Y,,,,,,,,,,,, 392 | TN1,N,,R,Harshbarger,Diana,,,F,1960-01-01,H0TN01118,Q101197341,@DHarshbargerTN1, 393 | TN2,Y,,,,,,,,,,,, 394 | TN3,Y,,,,,,,,,,,, 395 | TN4,Y,,,,,,,,,,,, 396 | TN5,Y,,,,,,,,,,,, 397 | TN6,Y,,,,,,,,,,,, 398 | TN7,Y,,,,,,,,,,,, 399 | TN8,Y,,,,,,,,,,,, 400 | TN9,Y,,,,,,,,,,,, 401 | TX1,Y,,,,,,,,,,,, 402 | TX2,Y,,,,,,,,,,,, 403 | TX3,Y,,,,,,,,,,,, 404 | TX4,N,,R,Fallon,Patrick,Edward,,M,1967-12-19,H0TX04219,Q16196923,currently @FallonForTexas, 405 | TX5,Y,,,,,,,,,,,, 406 | TX6,Y,,,,,,,,,,,, 407 | TX7,Y,,,,,,,,,,,, 408 | TX8,Y,,,,,,,,,,,, 409 | TX9,Y,,,,,,,,,,,, 410 | TX10,Y,,,,,,,,,,,, 411 | TX11,N,,R,Pfluger,August,Lee,II,M,1978-12-28,H0TX11230,Q101196462,@AugustPfluger, 412 | TX12,Y,,,,,,,,,,,, 413 | TX13,N,,R,Jackson,Ronny,Lynn,,M,1967-05-04,H0TX13228,Q47270118,currently @RonnyJackson4TX, 414 | TX14,Y,,,,,,,,,,,, 415 | TX15,Y,,,,,,,,,,,, 416 | TX16,Y,,,,,,,,,,,, 417 | TX17,N,400367,R,Sessions,Pete,,,,,,,, 418 | TX18,Y,,,,,,,,,,,, 419 | TX19,Y,,,,,,,,,,,, 420 | TX20,Y,,,,,,,,,,,, 421 | TX21,Y,,,,,,,,,,,, 422 | TX22,N,,R,Nehls,Troy,E.,,M,1968-04-07,H0TX22302,Q96741441,@SheriffTNehls, 423 | 
TX23,N,,R,Gonzales,Ernest,Anthony,II,M,1980-10-10,H0TX35015,,currently @TonyGonzales4TX,
424 | TX24,N,,R,Van Duyne,Beth,Ann,,F,1970-11-16,H0TX24209,Q66309702,@BethVanDuyne,
425 | TX25,Y,,,,,,,,,,,,
426 | TX26,Y,,,,,,,,,,,,
427 | TX27,Y,,,,,,,,,,,,
428 | TX28,Y,,,,,,,,,,,,
429 | TX29,Y,,,,,,,,,,,,
430 | TX30,Y,,,,,,,,,,,,
431 | TX31,Y,,,,,,,,,,,,
432 | TX32,Y,,,,,,,,,,,,
433 | TX33,Y,,,,,,,,,,,,
434 | TX34,Y,,,,,,,,,,,,
435 | TX35,Y,,,,,,,,,,,,
436 | TX36,Y,,,,,,,,,,,,
437 | UT1,N,,R,Moore,Blake,David,,M,1980-06-22,H0UT01205,Q101196971,currently @ElectBlakeMoore,https://en.wikipedia.org/wiki/Blake_Moore#/media/File:Blake_Moore_117th_U.S_Congress.jpg
438 | UT2,Y,,,,,,,,,,,,
439 | UT3,Y,,,,,,,,,,,,
440 | UT4,N,,R,Owens,Clarence,Burgess,,M,1951-08-02,H0UT04076,Q4998602,@BurgessOwens,
441 | VA1,Y,,,,,,,,,,,,
442 | VA2,Y,,,,,,,,,,,,
443 | VA3,Y,,,,,,,,,,,,
444 | VA4,Y,,,,,,,,,,,,
445 | VA5,N,,R,Good,Robert,G.,,M,1965-09-11,H0VA05160,Q103850475,currently @GoodForCongress,
446 | VA6,Y,,,,,,,,,,,,
447 | VA7,Y,,,,,,,,,,,,
448 | VA8,Y,,,,,,,,,,,,
449 | VA9,Y,,,,,,,,,,,,
450 | VA10,Y,,,,,,,,,,,,
451 | VA11,Y,,,,,,,,,,,,
452 | VI0,Y,,,,,,,,,,,,
453 | VT0,Y,,,,,,,,,,,,
454 | WA1,Y,,,,,,,,,,,,
455 | WA2,Y,,,,,,,,,,,,
456 | WA3,Y,,,,,,,,,,,,
457 | WA4,Y,,,,,,,,,,,,
458 | WA5,Y,,,,,,,,,,,,
459 | WA6,Y,,,,,,,,,,,,
460 | WA7,Y,,,,,,,,,,,,
461 | WA8,Y,,,,,,,,,,,,
462 | WA9,Y,,,,,,,,,,,,
463 | WA10,N,,D,Strickland,Marilyn,,,F,1962-09-25,H0WA10034,Q1898180,currently @StricklandForWA,
464 | WI1,Y,,,,,,,,,,,,
465 | WI2,Y,,,,,,,,,,,,
466 | WI3,Y,,,,,,,,,,,,
467 | WI4,Y,,,,,,,,,,,,
468 | WI5,N,,R,Fitzgerald,Scott,L.,,M,1963-11-16,H0WI05113,Q7436650,currently @SenFitzgerald,
469 | WI6,Y,,,,,,,,,,,,
470 | WI7,Y,,,,,,,,,,,,
471 | WI8,Y,,,,,,,,,,,,
472 | WV1,Y,,,,,,,,,,,,
473 | WV2,Y,,,,,,,,,,,,
474 | WV3,Y,,,,,,,,,,,,
475 | WY0,Y,,,,,,,,,,,,
--------------------------------------------------------------------------------
/scripts/archive/election_results_house_2016.py:
--------------------------------------------------------------------------------
 1 | import collections, requests, lxml.etree
 2 | from utils import load_data, save_data
 3 | 
 4 | try:
 5 |     from yaml import CLoader
 6 |     assert CLoader #silence pyflakes
 7 | except ImportError:
 8 |     print("Warning: libyaml not found, loading will be slow...")
 9 | 
10 | # # Open existing data.
11 | historical = load_data("legislators-historical.yaml")
12 | current = load_data("legislators-current.yaml")
13 | 
14 | # # Map bioguide IDs to records.
15 | bioguide = { }
16 | for entry in historical + current:
17 |     bioguide[entry['id']['bioguide']] = entry
18 | 
19 | # # Get highest existing GovTrack ID.
20 | govtrack_id = max(p['id']['govtrack'] for p in historical+current)
21 | 
22 | # load members-elect
23 | xml = requests.get("http://clerk.house.gov/member_info/unofficial-115-member-elect-data.xml")
24 | root=lxml.etree.fromstring(xml.content)
25 | 
26 | elected = []
27 | for xml_member in root.findall('./members/member'):
28 |     mi = xml_member.find("member-info")
29 |     bioguide_id = mi.find("bioguideID").text
30 | 
31 |     #print("bioguide_id is {} for {}".format(bioguide_id, xml_member.find("statedistrict").text))
32 |     if bioguide_id is None:
33 |         print("WARN: no member found for {}".format(xml_member.find("statedistrict").text))
34 |         continue
35 | 
36 |     if bioguide_id in bioguide:
37 |         # Incumbent won or current representative has become a senator
38 |         # or historical member is returning to office.
39 | p = bioguide[bioguide_id] 40 | party = p['terms'][-1]['party'] 41 | 42 | else: 43 | # Make a new entry. 44 | govtrack_id += 1 45 | p = collections.OrderedDict([ 46 | ("id", collections.OrderedDict([ 47 | ("bioguide", bioguide_id), 48 | #("fec", [row['fec']]), 49 | ("govtrack", govtrack_id), 50 | #("opensecrets", None), # don't know yet 51 | #("votesmart", int(row['votesmart'])), 52 | #("wikipedia", row['wikipedia']), 53 | #("ballotpedia", row['ballotpedia']), 54 | ])), 55 | ("name", collections.OrderedDict([ 56 | ("first", mi.find('firstname').text), 57 | ("last", mi.find('lastname').text), 58 | #("official_full", mi.find('official_full').text), #not available yet 59 | ])), 60 | ("bio", collections.OrderedDict([ 61 | ("gender", "M" if mi.find('courtesy').text == "Mr." else "F"), 62 | #("birthday", row['birthday']), 63 | ])), 64 | ("terms", []), 65 | ]) 66 | 67 | party_char = mi.find('party').text 68 | party = 'Republican' if party_char == 'R' else 'Democrat' # valid? 69 | caucus_char = mi.find('caucus').text 70 | caucus = 'Republican' if caucus_char == 'R' else 'Democrat' # valid? 71 | 72 | district = int(xml_member.find("statedistrict").text[2:]) 73 | # Add a new term. 74 | p['terms'].append(collections.OrderedDict([ 75 | ("type", "rep"), 76 | ("start", "2017-01-03"), 77 | ("end", "2019-01-03"), 78 | ("state", mi.find('state').get('postal-code')), 79 | ("district", district), 80 | ("party", party), 81 | ("phone", mi.find("phone").text), 82 | ])) 83 | 84 | if caucus != party: 85 | p['terms'][-1]['caucus'] = caucus 86 | 87 | if len(p['terms']) > 1: 88 | # This is an incumbent. Copy some fields forward. 89 | for k in ('url', 'rss_url'): 90 | if k in p['terms'][-2]: 91 | p['terms'][-1][k] = p['terms'][-2][k] 92 | 93 | # Add to array. 94 | elected.append(p) 95 | 96 | # Move losers to the historical file. 97 | for p in list(current): 98 | if p['terms'][-1]['type'] == 'rep' and p not in elected: 99 | #print("moving {} {} {} to historical".format(p['id']['bioguide'], p['name']['first'], p['name']['last'])) 100 | current.remove(p) 101 | historical.append(p) 102 | 103 | # If they have any current leadership roles, end it. 104 | for r in p.get('leadership_roles', []): 105 | if not r.get('end'): 106 | r['end'] = "2017-01-03" 107 | 108 | # Move returning members to the current file 109 | for p in elected: 110 | if p in historical: 111 | historical.remove(p) 112 | current.append(p) 113 | 114 | # Add new members to the current file, after the returning members. 115 | for p in elected: 116 | if p not in current: 117 | current.append(p) 118 | 119 | # Save. 
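# (Hedged recap of the moves above, before both files are saved below: a
# departing member is any sitting rep whose dict was neither reused nor
# created while reading the members-elect XML, i.e.
#     [p for p in current if p['terms'][-1]['type'] == 'rep' and p not in elected]
# and each such record was moved from `current` to `historical`.)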
120 | save_data(current, "legislators-current.yaml") 121 | save_data(historical, "legislators-historical.yaml") 122 | -------------------------------------------------------------------------------- /scripts/archive/election_results_senate_2016.csv: -------------------------------------------------------------------------------- 1 | chamber,class,new,state,full name,party,bioguide,fec,votesmart,wikipedia,ballotpedia,first,middle,last,nickname,gender,birthday 2 | Senate,3,,MO,Roy Blunt,,B000575,,,,,,,,,, 3 | Senate,3,,NC,Richard Burr,,B001135,,,,,,,,,, 4 | Senate,3,,AR,John Boozman,,B001236,,,,,,,,,, 5 | Senate,3,,CO,Michael Bennet,,B001267,,,,,,,,,, 6 | Senate,3,,CT,Richard Blumenthal,,B001277,,,,,,,,,, 7 | Senate,3,,ID,Mike Crapo,,C000880,,,,,,,,,, 8 | Senate,3,Y,IL,Tammy Duckworth,,D000622,,,,,,,,,, 9 | Senate,3,,IA,Chuck Grassley,,G000386,,,,,,,,,, 10 | Senate,3,,ND,John Hoeven,,H001061,,,,,,,,,, 11 | Senate,3,,GA,Johnny Isakson,,I000055,,,,,,,,,, 12 | Senate,3,,WI,Ron Johnson,,J000293,,,,,,,,,, 13 | Senate,3,,VT,Patrick Leahy,,L000174,,,,,,,,,, 14 | Senate,3,,OK,James Lankford,,L000575,,,,,,,,,, 15 | Senate,3,,UT,Mike Lee,,L000577,,,,,,,,,, 16 | Senate,3,,AZ,John McCain,,M000303,,,,,,,,,, 17 | Senate,3,,KS,Jerry Moran,,M000934,,,,,,,,,, 18 | Senate,3,,WA,Patty Murray,,M001111,,,,,,,,,, 19 | Senate,3,,AK,Lisa Murkowski,,M001153,,,,,,,,,, 20 | Senate,3,,OH,Rob Portman,,P000449,,,,,,,,,, 21 | Senate,3,,KY,Rand Paul,,P000603,,,,,,,,,, 22 | Senate,3,,FL,Marco Rubio,,R000595,,,,,,,,,, 23 | Senate,3,,NY,Chuck Schumer,,S000148,,,,,,,,,, 24 | Senate,3,,AL,Richard Shelby,,S000320,,,,,,,,,, 25 | Senate,3,,SC,Tim Scott,,S001184,,,,,,,,,, 26 | Senate,3,,HI,Brian Schatz,,S001194,,,,,,,,,, 27 | Senate,3,,SD,John Thune,,T000250,,,,,,,,,, 28 | Senate,3,,PA,Pat Toomey,,T000461,,,,,,,,,, 29 | Senate,3,Y,MD,Chris Van Hollen,,V000128,,,,,,,,,, 30 | Senate,3,,OR,Ron Wyden,,W000779,,,,,,,,,, 31 | Senate,3,Y,IN,Todd Young,,Y000064,,,,,,,,,, 32 | Senate,3,Y,CA,Kamala Harris,Democrat,H001075,S6CA00584,120012,Kamala Harris,Kamala Harris,Kamala,,Harris,,F,1964-10-20 33 | Senate,3,Y,LA,John Neely Kennedy,Republican,K000393,S4LA00065,35496,John Neely Kennedy,John Neely Kennedy,John,Neely,Kennedy,,M,1951-11-21 34 | Senate,3,Y,NH,Maggie Hassan,Democrat,H001076,S6NH00091,42552,Maggie Hassan,Maggie Hassan,Margaret,Wood,Hassan,Maggie,F,1958-02-27 35 | Senate,3,Y,NV,Catherine Cortez Masto,Democrat,C001113,S6NV00200,69579,Catherine Cortez Masto,Catherine Cortez Masto,Catherine,,Cortez Masto,,F,1964-03-29 36 | -------------------------------------------------------------------------------- /scripts/archive/election_results_senate_2016.py: -------------------------------------------------------------------------------- 1 | import csv, collections 2 | from utils import load_data, save_data 3 | 4 | # Open existing data. 5 | historical = load_data("legislators-historical.yaml") 6 | current = load_data("legislators-current.yaml") 7 | 8 | # Map bioguide IDs to records. 9 | bioguide = { } 10 | for entry in historical + current: 11 | bioguide[entry['id']['bioguide']] = entry 12 | 13 | # Get highest existing GovTrack ID. 14 | govtrack_id = max(p['id']['govtrack'] for p in historical+current) 15 | 16 | # Process election results. 17 | elected = [] 18 | for row in csv.DictReader(open("election_results_senate_2016.csv")): 19 | if row['bioguide'] in bioguide: 20 | # Incumbent won or current representative has become a senator 21 | # or historical member is returning to office. 
22 | p = bioguide[row['bioguide']] 23 | party = p['terms'][-1]['party'] 24 | 25 | else: 26 | # Make a new entry. 27 | govtrack_id += 1 28 | p = collections.OrderedDict([ 29 | ("id", collections.OrderedDict([ 30 | ("bioguide", row['bioguide']), 31 | ("fec", [row['fec']]), 32 | ("govtrack", govtrack_id), 33 | #("opensecrets", None), # don't know yet 34 | ("votesmart", int(row['votesmart'])), 35 | ("wikipedia", row['wikipedia']), 36 | ("ballotpedia", row['ballotpedia']), 37 | ])), 38 | ("name", collections.OrderedDict([ 39 | (k, row[k]) for k in ("first", "middle", "nickname", "last") if row[k] 40 | ])), 41 | ("bio", collections.OrderedDict([ 42 | ("gender", row['gender']), 43 | ("birthday", row['birthday']), 44 | ])), 45 | ("terms", []), 46 | ]) 47 | 48 | # Add a new term. 49 | p['terms'].append(collections.OrderedDict([ 50 | ("type", "sen"), 51 | ("start", "2017-01-03"), 52 | ("end", "2023-01-03"), 53 | ("state", row['state']), 54 | ("class", 3), 55 | ])) 56 | 57 | if row['new'] == "Y": 58 | # Not an incumbent. Therefore this person becomes 59 | # the junior senator and the other (non-class-3) 60 | # senator becomes the senior senator. 61 | p['terms'][-1]['state_rank'] = "junior" 62 | p['terms'][-1]['party'] = row['party'] or p['terms'][-2]['party'] # as listed in the CSV, or from their previous term if previously served 63 | for p1 in current: 64 | if p1['terms'][-1]['type'] == 'sen' and p1['terms'][-1]['state'] == row['state'] and p1['terms'][-1]['class'] != 3: 65 | p1['terms'][-1]['state_rank'] = "senior" 66 | break 67 | else: 68 | # This is an incumbent. Copy some fields forward. 69 | for k in ('state_rank', 'party', 'caucus', 'url', 'rss_url'): 70 | if k in p['terms'][-2]: 71 | p['terms'][-1][k] = p['terms'][-2][k] 72 | 73 | # Add to array. 74 | elected.append(p) 75 | 76 | # Move losers to the historical file. 77 | for p in current: 78 | if p['terms'][-1]['type'] == 'sen' and p['terms'][-1]['class'] == 3 \ 79 | and p not in elected: 80 | current.remove(p) 81 | historical.append(p) 82 | 83 | # If they have any current leadership roles, end it. 84 | for r in p.get('leadership_roles', []): 85 | if not r.get('end'): 86 | r['end'] = "2017-01-03" 87 | 88 | # Move returning members to the current file -- actually there are no 89 | # cases of this. All of the existing non-incumbents are current reps 90 | # who became senators. 91 | for p in elected: 92 | if p in historical: 93 | historical.remove(p) 94 | current.append(p) 95 | 96 | # Add new members to the current file, after the returning members. 97 | for p in elected: 98 | if p not in current: 99 | current.append(p) 100 | 101 | # Save. 102 | save_data(historical, "legislators-historical.yaml") 103 | save_data(current, "legislators-current.yaml") 104 | -------------------------------------------------------------------------------- /scripts/archive/everypolitician.py: -------------------------------------------------------------------------------- 1 | # Converts our data into CSV files for everypolitician.org, 2 | # one file for the House and one file for the Senate. 3 | # 4 | # Usage: 5 | # python everypolitician.py outputbasename/ 6 | # 7 | # Which will write: 8 | # outputbasename/house.csv 9 | # outputbasename/senate.csv 10 | 11 | import sys, csv 12 | 13 | from utils import yaml_load, CURRENT_CONGRESS, states 14 | 15 | def run(): 16 | if len(sys.argv) < 2: 17 | print("Usage: python everypolitician.py outputbasename/") 18 | sys.exit(0) 19 | 20 | # Load current legislators. 
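# A hedged usage sketch of the invocation described in the header comment:
#
#     $ python everypolitician.py out/
#     # writes out/house.csv and out/senate.csv, one row per current member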
21 | data = yaml_load("../legislators-current.yaml") 22 | data_social_media = { } 23 | for legislator in yaml_load("../legislators-social-media.yaml"): 24 | data_social_media[legislator['id']['bioguide']] = legislator 25 | 26 | # Create output files. 27 | writers = { 28 | "rep": csv.writer(open(sys.argv[1] + "house.csv", "w")), 29 | "sen": csv.writer(open(sys.argv[1] + "senate.csv", "w")), 30 | } 31 | for w in writers.values(): 32 | w.writerow([ 33 | "id", 34 | "name", 35 | "area", 36 | "group", 37 | "term", 38 | "start_date", 39 | "end_date", 40 | "given_name", 41 | "family_name", 42 | "honorific_suffix", 43 | "sort_name", 44 | "phone", 45 | "gender", 46 | "birth_date", 47 | "image", 48 | "twitter", 49 | "facebook", 50 | "instagram", 51 | "wikipedia", 52 | "website", 53 | ]) 54 | 55 | # Write out one row per legislator for their current term. 56 | for legislator in data: 57 | term = legislator['terms'][-1] 58 | 59 | # TODO: "If someone changed party/faction affilation in the middle of the term, you should include two entries, with the relevant start/end dates set." 60 | 61 | w = writers[term['type']] 62 | w.writerow([ 63 | legislator['id']['bioguide'], 64 | build_name(legislator, term, 'full'), 65 | build_area(term), 66 | term['party'], 67 | CURRENT_CONGRESS, 68 | term['start'], 69 | term['end'], 70 | legislator['name'].get('first'), 71 | legislator['name'].get('last'), 72 | legislator['name'].get('suffix'), 73 | build_name(legislator, term, 'sort'), 74 | term.get('phone'), 75 | legislator['bio'].get('gender'), 76 | legislator['bio'].get('birthday'), 77 | "https://theunitedstates.io/images/congress/original/%s.jpg" % legislator['id']['bioguide'], 78 | data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("twitter"), 79 | data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("facebook"), 80 | data_social_media.get(legislator['id']['bioguide'], {}).get("social", {}).get("instagram"), 81 | legislator['id'].get('wikipedia', '').replace(" ", "_"), 82 | term['url'], 83 | ]) 84 | 85 | ordinal_strings = { 1: "st", 2: "nd", 3: "rd", 11: 'th', 12: 'th', 13: 'th' } 86 | def ordinal(num): 87 | return str(num) + ordinal_strings.get(num % 100, ordinal_strings.get(num % 10, "th")) 88 | 89 | def build_area(term): 90 | # Builds the string for the "area" column, which is a human-readable 91 | # description of the legislator's state or district. 92 | ret = states[term['state']] 93 | if term['type'] == 'rep': 94 | ret += "’s " 95 | if term['district'] == 0: 96 | ret += "At-Large" 97 | else: 98 | ret += ordinal(term['district']) 99 | ret += " Congressional District" 100 | return ret 101 | 102 | def build_name(p, t, mode): 103 | # Based on: 104 | # https://github.com/govtrack/govtrack.us-web/blob/master/person/name.py 105 | 106 | # First name. 107 | firstname = p['name']['first'] 108 | if firstname.endswith('.'): 109 | firstname = p['name']['middle'] 110 | if p['name'].get('nickname') and len(p['name']['nickname']) < len(firstname): 111 | firstname = p['name']['nickname'] 112 | 113 | # Last name. 
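# Worked examples for ordinal() above (num % 100 is checked first, so the
# teens override the 1/2/3 suffixes):
#
#     ordinal(1) == "1st";  ordinal(2) == "2nd";  ordinal(3) == "3rd"
#     ordinal(11) == "11th";  ordinal(21) == "21st";  ordinal(111) == "111th"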
114 | lastname = p['name']['last'] 115 | if p['name'].get('suffix'): 116 | lastname += ', ' + p['name']['suffix'] 117 | 118 | if mode == "full": 119 | return firstname + ' ' + lastname 120 | elif mode == "sort": 121 | return lastname + ', ' + firstname 122 | else: 123 | raise ValueError(mode) 124 | 125 | if __name__ == '__main__': 126 | run() 127 | -------------------------------------------------------------------------------- /scripts/archive/house_history_gender.py: -------------------------------------------------------------------------------- 1 | import re, urllib.request, urllib.parse 2 | from utils import yaml_load, yaml_dump 3 | 4 | def run(): 5 | 6 | # Use the House History Website's Women in Congress search results to get a list of IDs. 7 | # Because this requires a POST, our utils.download() function won't work. 8 | querystring = b"Command=Next&Term=Search&SearchIn=LastName&ShowNonMember=true&ShowNonMember=false&Office=&Leadership=&State=&Party=&ContinentalCongress=false&BlackAmericansInCongress=false&WomenInCongress=true&WomenInCongress=false&HispanicAmericansInCongress=false&CongressNumber=65&CongressNumber=66&CongressNumber=67&CongressNumber=68&CongressNumber=69&CongressNumber=70&CongressNumber=71&CongressNumber=72&CongressNumber=73&CongressNumber=74&CongressNumber=75&CongressNumber=76&CongressNumber=77&CongressNumber=78&CongressNumber=79&CongressNumber=80&CongressNumber=81&CongressNumber=82&CongressNumber=83&CongressNumber=84&CongressNumber=85&CongressNumber=86&CongressNumber=87&CongressNumber=88&CongressNumber=89&CongressNumber=90&CongressNumber=91&CongressNumber=92&CongressNumber=93&CongressNumber=94&CongressNumber=95&CongressNumber=96&CongressNumber=97&CongressNumber=98&CongressNumber=99&CongressNumber=100&CongressNumber=101&CongressNumber=102&CongressNumber=103&CongressNumber=104&CongressNumber=105&CongressNumber=106&CongressNumber=107&CongressNumber=108&CongressNumber=109&CongressNumber=110&CongressNumber=111&CongressNumber=112&CongressNumber=113&CongressNumber=114&CurrentPage=__PAGE__&SortOrder=LastName&ResultType=Grid&PreviousSearch=Search%2CLastName%2C%2C%2C%2C%2CFalse%2CFalse%2CTrue%2C65%2C66%2C67%2C68%2C69%2C70%2C71%2C72%2C73%2C74%2C75%2C76%2C77%2C78%2C79%2C80%2C81%2C82%2C83%2C84%2C85%2C86%2C87%2C88%2C89%2C90%2C91%2C92%2C93%2C94%2C95%2C96%2C97%2C98%2C99%2C100%2C101%2C102%2C103%2C104%2C105%2C106%2C107%2C108%2C109%2C110%2C111%2C112%2C113%2C114%2CLastName&X-Requested-With=XMLHttpRequest" 9 | women_house_history_ids = set() 10 | for pagenum in range(0, 30+1): 11 | body = urllib.request.urlopen( 12 | "http://history.house.gov/People/Search?Length=6", 13 | querystring.replace(b"__PAGE__", str(pagenum).encode("ascii")) 14 | ).read().decode("utf8") 15 | for match in re.findall(r"/People/Detail/(\d+)\?ret=True", body): 16 | women_house_history_ids.add(int(match)) 17 | 18 | # Now check and update the gender of all legislators. 19 | matched_women_house_history_ids = set() 20 | missing_ids = set() 21 | for fn in ("../legislators-current.yaml", "../legislators-historical.yaml"): 22 | legislators = yaml_load(fn) 23 | for p in legislators: 24 | house_history_id = p.get("id", {}).get("house_history") 25 | 26 | if not house_history_id: 27 | # We have all of the women, so anyone left must be a man. 
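# (Hedged summary of the flow above: the POST search is paginated via the
# CurrentPage=__PAGE__ placeholder, pages 0 through 30, and every
# /People/Detail/<id> link found is collected into women_house_history_ids;
# membership in that set is then the only gender signal used below.)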
28 | p.setdefault("bio", {})["gender"] = "M" 29 | missing_ids.add(p.get("id", {}).get("bioguide")) 30 | continue 31 | 32 | p.setdefault("bio", {})["gender"] = "F" if house_history_id in women_house_history_ids else "M" 33 | 34 | if house_history_id in women_house_history_ids: 35 | matched_women_house_history_ids.add(house_history_id) 36 | 37 | yaml_dump(legislators, fn) 38 | 39 | print("%d women in Congress reported by the House History website" % len(women_house_history_ids)) 40 | print("%d women in Congress were not found in our files." % len(women_house_history_ids-matched_women_house_history_ids)) 41 | print(" ", " ".join((str(x) for x in (women_house_history_ids-matched_women_house_history_ids)))) 42 | print("%d legislators are missing house_history IDs, set to male." % len(missing_ids)) 43 | 44 | if __name__ == '__main__': 45 | run() -------------------------------------------------------------------------------- /scripts/archive/print_leadership_roles.py: -------------------------------------------------------------------------------- 1 | #print out leadership roles for manual review 2 | 3 | import rtyaml 4 | import utils 5 | 6 | with open("legislators-current.yaml") as f: 7 | legislators = rtyaml.load(f) 8 | for legislator in legislators: 9 | if 'leadership_roles' in legislator: 10 | print("{}, {}".format(legislator["name"]["last"], legislator["name"]["first"])) 11 | for role in legislator.get("leadership_roles", []): 12 | 13 | start = utils.parse_date(role["start"]) 14 | if not "end" in role: 15 | print("{} {} started {} with no end".format(role["chamber"], role["title"], role["start"])) 16 | else: 17 | print("{} {} started {} and ended {}".format(role["chamber"], role["title"], role["start"], role["end"])) 18 | 19 | -------------------------------------------------------------------------------- /scripts/bioguide.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # gets fundamental information for every member with a bioguide ID: 4 | # first name, nickname, middle name, last name, name suffix 5 | # birthday 6 | 7 | # options: 8 | # --cache: load from cache if present on disk (default: true) 9 | # --current: do *only* current legislators (default: true) 10 | # --historical: do *only* historical legislators (default: false) 11 | # --bioguide: do *only* a single legislator 12 | # --relationships: Get familial relationships to other members of congress past and present, when applicable 13 | 14 | import lxml.html, io 15 | import datetime 16 | import re 17 | import utils 18 | from utils import download, load_data, save_data 19 | 20 | def run(): 21 | 22 | def update_birthday(bioguide, person, main): 23 | 24 | birthday = birthday_for(main) 25 | if not birthday: 26 | print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main)) 27 | warnings.append(bioguide) 28 | return 29 | if birthday == "UNKNOWN": 30 | return 31 | 32 | try: 33 | birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y") 34 | except ValueError: 35 | print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main)) 36 | warnings.append(bioguide) 37 | return 38 | 39 | birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day) 40 | person.setdefault("bio", {})["birthday"] = birthday 41 | 42 | 43 | def birthday_for(string): 44 | # exceptions for not-nicely-placed semicolons 45 | string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April") 46 | string = string.replace("FOSTER, A. 
Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
47 |         string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
48 |         string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
49 |         string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
50 |         string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")
51 | 
52 |         # look for a date
53 |         pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
54 |         match = re.search(pattern, string, re.I)
55 |         if not match or not match.group(1):
56 |             # specifically detect cases that we can't handle to avoid unnecessary warnings
57 |             if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN"
58 |             if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN"
59 |             return None
60 |         return match.group(1).strip()
61 | 
62 |     def relationships_of(string):
63 |         # relationship data is stored in a parenthetical immediately after the end of the <font> tag in the bio
64 |         # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)"
65 |         pattern = "^\((.*?)\)"
66 |         match = re.search(pattern, string, re.I)
67 | 
68 |         relationships = []
69 | 
70 |         if match and len(match.groups()) > 0:
71 |             relationship_text = match.group(1).encode("ascii", "replace").decode("ascii")
72 | 
73 |             # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar
74 |             from nltk import tree, pos_tag, RegexpParser
75 |             tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
76 |             pos = pos_tag(tokens)
77 | 
78 |             grammar = r"""
79 |                 NAME: {<NNP>+}
80 |                 NAMES: { <NAME>(?:<CC><NAME>)* }
81 |                 RELATIONSHIP: { <NN|JJ|RB|IN|PRP\$>+ }
82 |                 MATCH: { <RELATIONSHIP><NAMES> }
83 |                 """
84 |             cp = RegexpParser(grammar)
85 |             chunks = cp.parse(pos)
86 | 
87 |             # iterate through the Relationship/Names pairs
88 |             for n in chunks:
89 |                 if isinstance(n, tree.Tree) and n.node == "MATCH":
90 |                     people = []
91 |                     relationship = None
92 |                     for piece in n:
93 |                         if piece.node == "RELATIONSHIP":
94 |                             relationship = " ".join([x[0] for x in piece])
95 |                         elif piece.node == "NAMES":
96 |                             for name in [x for x in piece if isinstance(x, tree.Tree)]:
97 |                                 people.append(" ".join([x[0] for x in name]))
98 |                     for person in people:
99 |                         relationships.append({ "relation": relationship, "name": person})
100 |         return relationships
101 | 
102 |     # default to caching
103 |     cache = utils.flags().get('cache', True)
104 |     force = not cache
105 | 
106 |     # pick either current or historical
107 |     # order is important here, since current defaults to true
108 |     if utils.flags().get('historical', False):
109 |         filename = "legislators-historical.yaml"
110 |     elif utils.flags().get('current', True):
111 |         filename = "legislators-current.yaml"
112 |     else:
113 |         print("No legislators selected.")
114 |         exit(0)
115 | 
116 |     print("Loading %s..."
% filename) 117 | legislators = load_data(filename) 118 | 119 | 120 | # reoriented cache to access by bioguide ID 121 | by_bioguide = { } 122 | for m in legislators: 123 | if "bioguide" in m["id"]: 124 | by_bioguide[m["id"]["bioguide"]] = m 125 | 126 | 127 | # optionally focus on one legislator 128 | 129 | bioguide = utils.flags().get('bioguide', None) 130 | if bioguide: 131 | bioguides = [bioguide] 132 | else: 133 | bioguides = list(by_bioguide.keys()) 134 | 135 | warnings = [] 136 | missing = [] 137 | count = 0 138 | families = 0 139 | 140 | for bioguide in bioguides: 141 | # Download & parse the HTML of the bioguide page. 142 | try: 143 | dom = fetch_bioguide_page(bioguide, force) 144 | except Exception as e: 145 | print(e) 146 | missing.append(bioguide) 147 | continue 148 | 149 | # Extract the member's name and the biography paragraph (main). 150 | 151 | try: 152 | name = dom.cssselect("p font")[0] 153 | main = dom.cssselect("p")[0] 154 | except IndexError: 155 | print("[%s] Missing name or content!" % bioguide) 156 | exit(0) 157 | 158 | name = name.text_content().strip() 159 | main = main.text_content().strip().replace("\n", " ").replace("\r", " ") 160 | main = re.sub("\s+", " ", main) 161 | 162 | # Extract the member's birthday. 163 | 164 | update_birthday(bioguide, by_bioguide[bioguide], main) 165 | 166 | # Extract relationships with other Members of Congress. 167 | 168 | if utils.flags().get("relationships", False): 169 | #relationship information, if present, is in a parenthetical immediately after the name. 170 | #should always be present if we passed the IndexError catch above 171 | after_name = dom.cssselect("p font")[0].tail.strip() 172 | relationships = relationships_of(after_name) 173 | if len(relationships): 174 | families = families + 1 175 | by_bioguide[bioguide]["family"] = relationships 176 | 177 | count = count + 1 178 | 179 | 180 | print() 181 | if warnings: 182 | print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings))) 183 | 184 | if missing: 185 | print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing))) 186 | 187 | print("Saving data to %s..." % filename) 188 | save_data(legislators, filename) 189 | 190 | print("Saved %d legislators to %s" % (count, filename)) 191 | 192 | if utils.flags().get("relationships", False): 193 | print("Found family members for %d of those legislators" % families) 194 | 195 | # Some testing code to help isolate and fix issued: 196 | # f 197 | # none = "PEARSON, Joseph, a Representative from North Carolina; born in Rowan County, N.C., in 1776; completed preparatory studies; studied law; was admitted to the bar and commenced practice in Salisbury, N.C.; member of the State house of commons; elected as a Federalist to the Eleventh, Twelfth, and Thirteenth Congresses (March 4, 1809-March 3, 1815); while in Congress fought a duel with John George Jackson, of Virginia, and on the second fire wounded his opponent in the hip; died in Salisbury, N.C., October 27, 1834." 
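# A hedged extra check in the same spirit as the samples below, derived from
# the "Control" case:
# assert birthday_for("born at Richmond Hill, Yadkin County, N.C., January 26, 1852;") == "January 26, 1852"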
198 | # print "Pearson (none): %s" % birthday_for(none) 199 | 200 | # owens = "OWENS, William, a Representative from New York; born in Brooklyn, Kings County, N.Y., January, 20, 1949; B.S., Manhattan College, Riverdale, N.Y., 1971; J.D., Fordham University, New York, N.Y., 1974; United States Air Force; lawyer, private practice; faculty, State University of New York, Plattsburgh, N.Y., 1978-1986; elected as a Democrat to the One Hundred Eleventh Congress, by special election to fill the vacancy caused by the resignation of United States Representative John McHugh, and reelected to the two succeeding Congresses (November 3, 2009-present)." 201 | # print "Owens (January, 20, 1949): %s" % birthday_for(owens) 202 | 203 | # shea = "SHEA-PORTER, Carol, a Representative from New Hampshire; born in New York City, New York County, N.Y., December, 1952; graduated from Oyster River High School, Durham, N.H., 1971; B.A., University of New Hampshire, Durham, N.H., 1975; M.P.A., University of New Hampshire, Durham, N.H., 1979; social worker; professor; elected as a Democrat to the One Hundred Tenth Congress and to the succeeding Congress (January 3, 2007-January 3, 2011); unsuccessful candidate for reelection to the One Hundred Twelfth Congress in 2010; elected as a Democrat to the One Hundred Thirteenth Congress (January 3, 2013-present)." 204 | # print "Shea (none): %s" % birthday_for(shea) 205 | 206 | # control = "PEARSON, Richmond, a Representative from North Carolina; born at Richmond Hill, Yadkin County, N.C., January 26, 1852; attended Horner's School, Oxford, N.C., and was graduated from Princeton College in 1872; studied law; was admitted to the bar in 1874; in the same year was appointed United States consul to Verviers and Liege, Belgium; resigned in 1877; member of the State house of representatives 1884-1886; elected as a Republican to the Fifty-fourth and Fifty-fifth Congresses (March 4, 1895-March 3, 1899); successfully contested the election of William T. Crawford to the Fifty-sixth Congress and served from May 10, 1900, to March 3, 1901; appointed by President Theodore Roosevelt as United States consul to Genoa, Italy, December 11, 1901, as Envoy Extraordinary and Minister Plenipotentiary to Persia in 1902, and as Minister to Greece and Montenegro in 1907; resigned from the diplomatic service in 1909; died at Richmond Hill, Asheville, N.C., September 12, 1923; interment in Riverside Cemetery." 207 | # print "\nControl (January 26, 1852): %s" % birthday_for(control) 208 | 209 | def fetch_bioguide_page(bioguide, force): 210 | url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide 211 | cache = "legislators/bioguide/%s.html" % bioguide 212 | try: 213 | body = download(url, cache, force) 214 | 215 | # Fix a problem? 216 | body = body.replace("Á\xc2\x81", "Á") 217 | 218 | # Entities like ’ are in Windows-1252 encoding. Normally lxml 219 | # handles that for us, but we're also parsing HTML. The lxml.html.HTMLParser 220 | # doesn't support specifying an encoding, and the lxml.etree.HTMLParser doesn't 221 | # provide a cssselect method on element objects. So we'll just decode ourselves. 222 | body = utils.unescape(body, "Windows-1252") 223 | 224 | dom = lxml.html.parse(io.StringIO(body)).getroot() 225 | except lxml.etree.XMLSyntaxError: 226 | raise Exception("Error parsing: " + url) 227 | 228 | # Sanity check. 229 | 230 | if len(dom.cssselect("title")) == 0: 231 | raise Exception("No page for bioguide %s!" 
% bioguide) 232 | 233 | return dom 234 | 235 | if __name__ == '__main__': 236 | run() 237 | -------------------------------------------------------------------------------- /scripts/bioguide_guess_new_member_ids.py: -------------------------------------------------------------------------------- 1 | import rtyaml 2 | 3 | from bioguide import fetch_bioguide_page 4 | 5 | def run(): 6 | 7 | print("Finding highest bioguide numbers we know of...") 8 | highest_num_by_letter = { } 9 | for fn in ('legislators-current', 'legislators-historical'): 10 | P = rtyaml.load(open('../%s.yaml' % fn)) 11 | for p in P: 12 | if not p['id'].get('bioguide'): continue 13 | if p['id']['bioguide'] == "TODO": continue # 114th Congress staging 14 | letter = p['id']['bioguide'][0] 15 | num = p['id']['bioguide'][1:] 16 | highest_num_by_letter[letter] = max(highest_num_by_letter.get(letter, ''), num) 17 | 18 | print("Checking for new bioguide pages...") 19 | for letter in sorted(highest_num_by_letter): 20 | num = int(highest_num_by_letter[letter]) 21 | while True: 22 | num += 1 23 | bioguide = "%s%06d" % (letter, num) 24 | try: 25 | dom = fetch_bioguide_page(bioguide, True) 26 | except Exception: 27 | break 28 | print(bioguide, dom.cssselect("title")[0].text) 29 | 30 | if __name__ == '__main__': 31 | run() 32 | -------------------------------------------------------------------------------- /scripts/bioguide_xml.py: -------------------------------------------------------------------------------- 1 | # Update metadata fields like birthdays from 2 | # bioguide.congress.gov bulk data downloads. 3 | # 4 | # Usage: 5 | # python3 bioguide_xml.py path/to/BioguideProfiles.zip 6 | 7 | import sys 8 | import zipfile 9 | import re 10 | import json 11 | import rtyaml 12 | import datetime 13 | 14 | def run(): 15 | # Load existing legislators and map bioguide IDs 16 | # to their entries. 17 | legislator_data = { } 18 | legislators = { } 19 | for ft in ("current", "historical"): 20 | with open("../legislators-{}.yaml".format(ft)) as f: 21 | data = rtyaml.load(f) 22 | legislator_data[ft] = data 23 | for p in data: 24 | legislators[p["id"]["bioguide"]] = p 25 | 26 | def parse_birthday_from_text(text): 27 | # exceptions for not-nicely-placed semicolons 28 | text = text.replace("born in Cresskill, Bergen County, N. J.; April", "born April") 29 | text = text.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802") 30 | text = text.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967") 31 | text = text.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962") 32 | text = text.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947") 33 | text = text.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968") 34 | 35 | # look for a date 36 | pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})" 37 | match = re.search(pattern, text, re.I) 38 | if not match or not match.group(1): 39 | # specifically detect cases that we can't handle to avoid unnecessary warnings 40 | if re.search("birth dates? 
unknown|date of birth is unknown", text, re.I): return None, None
41 |             if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", text, re.I): return None, None
42 |             return None, None
43 |         original_text = match.group(1).strip()
44 | 
45 |         try:
46 |             birthday = datetime.datetime.strptime(original_text.replace(",", ""), "%B %d %Y")
47 |         except ValueError:
48 |             print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide_id, original_text))
49 |             return None, original_text
50 | 
51 |         birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
52 |         return birthday, original_text
53 | 
54 |     # Process all profile data in the bioguide ZIP file.
55 |     with zipfile.ZipFile(sys.argv[1]) as zf:
56 |         for profile_fn in zf.namelist():
57 |             bioguide_id = re.match(r"^([A-Z]\d+)\.json", profile_fn).group(1)
58 |             if bioguide_id not in legislators:
59 |                 #print("No legislator for", bioguide_id)
60 |                 continue
61 |             with zf.open(profile_fn) as zff:
62 |                 profile = json.load(zff)
63 |             if "profileText" not in profile:
64 |                 continue
65 | 
66 |             legislator = legislators[bioguide_id]
67 | 
68 |             # Get birthday from text.
69 |             birthday, original_text = parse_birthday_from_text(profile["profileText"])
70 |             if birthday:
71 | 
72 |                 # Check birthday from metadata --- not as reliable.
73 |                 # Since the metadata may only have a year, only match
74 |                 # as much of the date string as it has.
75 |                 if profile.get("birthDate") and not profile.get("birthCirca"):
76 |                     if profile["birthDate"] != birthday[0:len(profile["birthDate"])]:
77 |                         print(bioguide_id, "metadata", repr(profile["birthDate"]), "doesn't match profile text", repr(original_text))
78 |                     else:
79 |                         # They match, so update.
80 |                         legislator.setdefault("bio", {})
81 |                         legislator["bio"]["birthday"] = birthday
82 | 
83 | 
84 |     # Write out updated data files.
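# (Worked example for the prefix check above, with illustrative dates: a
# metadata value of just "1948" is compared against "1948-05-03"[0:4] and
# counts as a match, while a full "1948-05-03" must match all ten characters.
# The loop below then writes both YAML files back out.)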
85 |     for fn in legislator_data:
86 |         with open("../legislators-{}.yaml".format(fn), "w") as f:
87 |             rtyaml.dump(legislator_data[fn], f)
88 | 
89 | if __name__ == "__main__":
90 |     run()
91 | 
--------------------------------------------------------------------------------
/scripts/committee_membership.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Data Sources:
4 | #   House:
5 | #     http://clerk.house.gov/xml/lists/MemberData.xml
6 | #   Senate:
7 | #     https://www.senate.gov/general/committee_membership/committee_memberships_{thomas_id}.xml
8 | 
9 | # Data Files Updated:
10 | #   committee-membership-current.yaml:
11 | #     All entries are overwritten except for house members of joint committees
12 | #     which have to be manually entered since there is no source of this data
13 | #   committees-current.yaml:
14 | #     For House committees, updates name, address, and phone
15 | #     For Senate committees, updates name and url
16 | 
17 | 
18 | import re, lxml.html, lxml.etree
19 | from collections import OrderedDict
20 | import utils
21 | from utils import download, load_data, save_data
22 | 
23 | 
24 | def run():
25 |     committee_membership = load_data("committee-membership-current.yaml")
26 |     committees_current = load_data("committees-current.yaml")
27 | 
28 |     # default to not caching
29 |     cache = utils.flags().get('cache', False)
30 |     force = not cache
31 | 
32 | 
33 |     # map house/senate committee IDs to their dicts
34 |     house_ref = { }
35 |     for cx in committees_current:
36 |         if "house_committee_id" in cx:
37 |             house_ref[cx["house_committee_id"]] = cx
38 |     senate_ref = { }
39 |     for cx in committees_current:
40 |         if "senate_committee_id" in cx:
41 |             senate_ref[cx["senate_committee_id"]] = cx
42 | 
43 | 
44 |     # map state/district to current senators because the Senate committee
45 |     # membership data does not contain IDs for senators, and map to bioguide
46 |     # IDs so we can copy forward the official_full name for House members
47 |     legislators_current = load_data("legislators-current.yaml")
48 |     senators = { }
49 |     for moc in legislators_current:
50 |         term = moc["terms"][-1]
51 |         if term["type"] == "sen":
52 |             for n in [moc["name"]] + moc.get("other_names", []):
53 |                 senators[(term["state"], n["last"])] = moc
54 |     legislators_current = { moc["id"]["bioguide"]: moc for moc in legislators_current }
55 | 
56 | 
57 |     # Scrape clerk.house.gov...
58 |     def scrape_house():
59 |         # clear out all of the existing House members of committees (i.e. all House committee membership
60 |         # and the House part of Joint committee membership)
61 |         for committee, members in committee_membership.items():
62 |             for m in list(members): # must clone before editing list
63 |                 if committee[0] == "H" or m.get("chamber") == "house":
64 |                     members.remove(m)
65 | 
66 |         r = download("http://clerk.house.gov/xml/lists/MemberData.xml", "clerk_xml", force)
67 |         dom = lxml.etree.fromstring(r.encode("latin-1")) # must be bytes to parse if there is an encoding declaration inside the string
68 | 
69 |         # Update committee metadata.
70 |         def update_house_committee_metadata(xml_cx, cx, parentdict, is_subcommittee):
71 |             sub_prefix = "sub" if is_subcommittee else ""
72 | 
73 |             if cx is None:
74 |                 # New committee.
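# Illustrative example of the ID scheme built just below (attribute values
# assumed): a committee with type="standing" and comcode="AP00" yields
# house_committee_id "AP" and thomas_id "H" + "S" + "AP" = "HSAP", the
# House Appropriations ID used in committees-current.yaml.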
75 | if not is_subcommittee: 76 | cx = { 77 | "type": "house", 78 | "thomas_id": "H" + xml_cx.attrib["type"][0].upper() + xml_cx.attrib["comcode"][0:2], 79 | "house_committee_id": xml_cx.attrib["comcode"][0:2] 80 | } 81 | house_ref[cx["house_committee_id"]] = cx 82 | else: 83 | cx = { 84 | "name": None, # placeholder so order is right 85 | "thomas_id": xml_cx.attrib["subcomcode"][2:] 86 | } 87 | parentdict.append(cx) 88 | 89 | cx["name"] = normalize_text(xml_cx.find(sub_prefix + "committee-fullname").text) 90 | if not is_subcommittee and not cx["name"].startswith("Joint "): cx["name"] = "House " + cx["name"] 91 | 92 | building = xml_cx.attrib[sub_prefix + "com-building-code"] 93 | if building == "C": 94 | building = "CAPITOL" 95 | #address format: 1301 LHOB; Washington, DC 20515-6001 96 | cx["address"] = xml_cx.attrib[sub_prefix + "com-room"] + " " + building \ 97 | + "; Washington, DC " + xml_cx.attrib[sub_prefix + "com-zip"] \ 98 | + (("-" + xml_cx.attrib[sub_prefix + "com-zip-suffix"]) if xml_cx.attrib[sub_prefix + "com-zip-suffix"] != "0" else "") 99 | cx["phone"] = "(202) " + xml_cx.attrib[sub_prefix + "com-phone"] 100 | 101 | if not is_subcommittee: 102 | for xml_sx in xml_cx.findall("subcommittee"): 103 | sxx = [s for s in cx["subcommittees"] if s["thomas_id"] == xml_sx.attrib["subcomcode"][2:]] 104 | update_house_committee_metadata(xml_sx, sxx[0] if len(sxx) > 0 else None, cx["subcommittees"], True) 105 | 106 | committees = dom.xpath("/MemberData/committees")[0] 107 | for xml_cx in committees.findall("committee"): 108 | house_committee_id = xml_cx.attrib["comcode"][0:2] 109 | update_house_committee_metadata(xml_cx, house_ref.get(house_committee_id), committees_current, False) 110 | 111 | # Determine which party is in the majority. Only the majority 112 | # party holds chair positions. At least one should have the 113 | # position Chair. 114 | house_majority_caucus = dom.xpath("string(/MemberData/members/member[committee-assignments/committee[@leadership='Chair']]/member-info/caucus)") 115 | 116 | for xml_member in dom.xpath("/MemberData/members/member"): 117 | bioguide_id = xml_member.xpath("member-info/bioguideID")[0].text 118 | if not bioguide_id: #sometimes the xml has vacancies as blanks 119 | continue 120 | 121 | # Although there is a name in the XML data, for consistency use the one we 122 | # have in legislators-current.yaml, if one is set. 123 | try: 124 | official_name = legislators_current[bioguide_id]["name"]["official_full"] 125 | except KeyError: 126 | official_name = xml_member.xpath("member-info/official-name")[0].text 127 | 128 | #is using caucus better than using party? 
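# Illustrative: if house_majority_caucus is "R", a member whose caucus is
# "R" gets party="majority" and everyone else "minority"; the Chair vs.
# Ranking Member titles assigned below key off that label.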
129 |             caucus = xml_member.xpath("member-info/caucus")[0].text
130 |             party = "majority" if caucus == house_majority_caucus else "minority"
131 | 
132 |             #for each committee or subcommittee membership
133 |             for cm in xml_member.xpath("committee-assignments/committee|committee-assignments/subcommittee"):
134 |                 if "comcode" in cm.attrib:
135 |                     house_committee_id = cm.attrib["comcode"][:2]
136 |                     if house_committee_id == "HL": continue # this doesn't appear to be a committee and seems like a data error
137 |                     thomas_committee_id = house_ref[house_committee_id]["thomas_id"]
138 |                 elif "subcomcode" in cm.attrib:
139 |                     house_committee_id = cm.attrib["subcomcode"][:2]
140 |                     thomas_committee_id = house_ref[house_committee_id]["thomas_id"] + cm.attrib["subcomcode"][2:]
141 |                 else:
142 |                     continue # some nodes are invalid
143 | 
144 |                 membership = OrderedDict()
145 |                 membership["name"] = official_name
146 |                 membership["party"] = party
147 |                 membership["rank"] = int(cm.attrib["rank"])
148 | 
149 |                 if "leadership" in cm.attrib:
150 |                     membership["title"] = cm.attrib["leadership"] # TODO .replace("woman", "").replace("man", "")
151 |                 elif membership["rank"] == 1:
152 |                     #xml doesn't contain ranking member titles
153 |                     if membership["party"] == "majority":
154 |                         membership["title"] = "Chair"
155 |                     else:
156 |                         membership["title"] = "Ranking Member"
157 |                 membership["bioguide"] = bioguide_id
158 | 
159 |                 if house_ref[house_committee_id]["type"] == "joint":
160 |                     membership["chamber"] = "house"
161 | 
162 |                 committee_membership.setdefault(thomas_committee_id, []).append(membership)
163 | 
164 |     # Scrape senate.gov....
165 |     def scrape_senate():
166 |         url = "https://www.senate.gov/pagelayout/committees/b_three_sections_with_teasers/membership.htm"
167 |         body = download(url, "committees/membership/senate.html", force)
168 | 
169 |         for id, name in re.findall(r'<option value="/general/committee_membership/committee_memberships_(....).htm">(.*?)</option>', body, re.I | re.S):
170 |             if id not in senate_ref:
171 |                 print("Unrecognized committee:", id, name)
172 |                 continue
173 | 
174 |             cx = senate_ref[id]
175 |             is_joint = (id[0] == "J")
176 | 
177 |             # Scrape some metadata on the HTML page first.
178 | 
179 |             committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.htm" % id
180 |             print("[%s] Fetching members for %s (%s)" % (id, name, committee_url))
181 |             body2 = download(committee_url, "committees/membership/senate/%s.html" % id, force)
182 | 
183 |             if not body2:
184 |                 print("\tcommittee page not good:", committee_url)
185 |                 continue
186 | 
187 |             m = re.search(r'', body2, re.I)
188 |             if m:
189 |                 cx["url"] = m.group(1)
190 | 
191 |             # Use the XML for the rest.
192 | 193 | print("\tDownloading XML...") 194 | committee_url = "https://www.senate.gov/general/committee_membership/committee_memberships_%s.xml" % id 195 | 196 | body3 = download(committee_url, "committees/membership/senate/%s.xml" % id, force) 197 | dom = lxml.etree.fromstring(body3.encode("utf8")) # must be bytes to parse if there is an encoding declaration inside the string 198 | 199 | cx["name"] = normalize_text(dom.xpath("committees/committee_name")[0].text) 200 | if id[0] != "J" and id[0:2] != 'SC': 201 | cx["name"] = "Senate " + cx["name"] 202 | 203 | majority_party = dom.xpath("committees/majority_party")[0].text 204 | 205 | # update full committee members 206 | scrape_senate_members( 207 | dom.xpath("committees/members/member"), 208 | committee_membership.setdefault(id, []), 209 | majority_party, is_joint) 210 | 211 | # update subcommittees 212 | for subcom in dom.xpath("committees/subcommittee"): 213 | scid = subcom.xpath("committee_code")[0].text[4:] 214 | for sx in cx.get('subcommittees', []): 215 | if sx["thomas_id"] == scid: 216 | break 217 | else: 218 | print("Subcommittee not found, creating it", scid, name) 219 | sx = OrderedDict() 220 | sx['thomas_id'] = scid 221 | cx.setdefault('subcommittees', []).append(sx) 222 | 223 | # update metadata 224 | name = subcom.xpath("subcommittee_name")[0].text 225 | sx["name"] = normalize_text(name) 226 | sx["name"] = re.sub(r"^\s*Subcommittee on\s*", "", sx["name"]) 227 | sx["name"] = re.sub(r"\s+", " ", sx["name"]) 228 | 229 | scrape_senate_members( 230 | subcom.xpath("members/member"), 231 | committee_membership.setdefault(id + scid, []), 232 | majority_party, is_joint) 233 | 234 | def scrape_senate_members(members, output_list, majority_party, is_joint): 235 | # Keep a copy of the previous membership, and then clear the Senate members 236 | # of the committee. 237 | existing_members_data = list(output_list) # clone 238 | if not is_joint: 239 | output_list.clear() 240 | else: 241 | for m in list(output_list): # must clone before editing list 242 | if m.get("chamber") == "senate": 243 | output_list.remove(m) 244 | 245 | # Update members. 246 | ids = set() 247 | count_by_party = { "majority": 0, "minority": 0 } 248 | for node in members: 249 | ids.add(scrape_senate_member(output_list, node, majority_party, is_joint, count_by_party, existing_members_data)) 250 | 251 | # Purge non-members. Ignore House members of joint committees. 
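# A hedged equivalent of the purge loop below, keeping entries that are
# either still members or House members of a joint committee:
#
#     output_list[:] = [m for m in output_list
#                       if m['bioguide'] in ids or m.get("chamber") not in (None, "senate")]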
252 | i = 0 253 | while i < len(output_list): 254 | if output_list[i]['bioguide'] not in ids and output_list[i].get("chamber") in (None, "senate"): 255 | output_list[i:i+1] = [] 256 | else: 257 | i += 1 258 | 259 | # sort by party, then by rank, since we get the nodes in the XML in a rough seniority order that ignores party 260 | output_list.sort(key = lambda e : (e["party"] != "majority", e["rank"])) 261 | 262 | def scrape_senate_member(output_list, membernode, majority_party, is_joint, count_by_party, existing_members_data): 263 | last_name = membernode.xpath("name/last")[0].text 264 | state = membernode.xpath("state")[0].text 265 | party = "majority" if membernode.xpath("party")[0].text == majority_party else "minority" 266 | title = membernode.xpath("position")[0].text 267 | if title == "Member": title = None 268 | if title == "Ranking": title = "Ranking Member" 269 | 270 | # look up senator by state and last name 271 | if (state, last_name) == ("NM", "Lujan"): last_name = "Luján" 272 | if (state, last_name) not in senators: 273 | print("\t[%s] Unknown member: %s" % (state, last_name)) 274 | return None 275 | 276 | moc = senators[(state, last_name)] 277 | 278 | entry = OrderedDict() 279 | if 'official_full' in moc['name']: 280 | entry["name"] = moc['name']['official_full'] 281 | else: 282 | print("missing name->official_full field for", moc['id']['bioguide']) 283 | entry["party"] = party 284 | count_by_party[party] += 1 285 | entry["rank"] = count_by_party[party] 286 | if title: entry["title"] = title 287 | entry.update(ids_from(moc["id"])) 288 | if is_joint: entry["chamber"] = "senate" 289 | 290 | # Look for an existing entry for this member and take 291 | # start_date and source from it, if set. 292 | for item in existing_members_data: 293 | if item["bioguide"] == entry["bioguide"]: 294 | for key in ("start_date", "source"): 295 | if key in item: 296 | entry[key] = item[key] 297 | 298 | output_list.append(entry) 299 | 300 | # Return bioguide ID of member added. 301 | return entry["bioguide"] 302 | 303 | # stick to a specific small set of official IDs to cross-link members 304 | # this limits the IDs from going out of control in this file, while 305 | # preserving us flexibility to be inclusive of IDs in the main leg files 306 | def ids_from(moc): 307 | ids = {} 308 | if "bioguide" in moc: 309 | ids["bioguide"] = moc["bioguide"] 310 | if len(ids) == 0: 311 | raise ValueError("Missing an official ID for this legislator, won't be able to link back") 312 | return ids 313 | 314 | # MAIN 315 | scrape_house() 316 | scrape_senate() 317 | 318 | # ensure each committee has members in a stable, sorted order 319 | for comm, mbrs in committee_membership.items(): 320 | # joint committees also have to sort by chamber 321 | if comm[0] == "J": 322 | mbrs.sort(key=lambda entry: (entry["party"] == "minority", entry["rank"], entry["chamber"] != "senate")) 323 | 324 | # Senate and House committees have different sort orders to match 325 | # earlier data, but there's no particular reason for this 326 | elif comm[0] == "S": 327 | mbrs.sort(key=lambda entry: (entry["party"] == "minority", entry["rank"])) 328 | else: 329 | mbrs.sort(key=lambda entry: (entry["rank"], entry["party"] == "minority")) 330 | 331 | save_data(committee_membership, "committee-membership-current.yaml") 332 | save_data(committees_current, "committees-current.yaml") 333 | 334 | 335 | def normalize_text(text): 336 | # Remove leading and trailing whitespace (coul also use .strip()). 
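# Worked example across both substitutions below:
#     "  Committee  on\n Finance " -> "Committee  on\n Finance" -> "Committee on Finance"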
337 | text = re.sub(r"^\s+|\s+$", "", text) 338 | 339 | # Remove double spaces and turn all internal whitespace into spaces. 340 | text = re.sub(r"\s+", " ", text) 341 | 342 | return text 343 | 344 | 345 | if __name__ == '__main__': 346 | run() 347 | -------------------------------------------------------------------------------- /scripts/contact_forms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | '''Gets contact webform URLs for the intersection of members with bioguide ids 4 | and with correlating contact form steps in unitedstates/contact-congress: 5 | 6 | args: 7 | 8 | A list of bioguide ids to import. 9 | 10 | options: 11 | --debug[=True] 12 | Whether or not verbose output should be printed to the command line 13 | ''' 14 | 15 | import yaml 16 | from urllib.request import urlopen 17 | 18 | import utils 19 | from utils import load_data, save_data 20 | 21 | 22 | # These members have forms in iframes, and Contact-Congress has different 23 | # needs than human users might. 24 | SKIP_BIOGUIDES = ['M000312'] 25 | 26 | 27 | def run(): 28 | options = utils.flags() 29 | debug = options.get('debug', False) 30 | 31 | filename = "legislators-current.yaml" 32 | args = utils.args() 33 | legislators = load_data(filename) 34 | 35 | if len(args) != 0: 36 | bioguides = args 37 | print("Fetching contact forms for %s..." % ', '.join(bioguides)) 38 | else: 39 | bioguides = [member['id']['bioguide'] for member in legislators] 40 | print("Fetching contact forms for all current members...") 41 | 42 | for legislator in legislators: 43 | bioguide = legislator['id']['bioguide'] 44 | if bioguide not in bioguides: continue 45 | if bioguide in SKIP_BIOGUIDES: continue 46 | 47 | if debug: print("Downloading form for %s" % bioguide, flush=True) 48 | 49 | try: 50 | steps = contact_steps_for(bioguide) 51 | except LegislatorNotFoundError as e: 52 | if debug: print("skipping, %s..." % e, flush=True) 53 | continue 54 | 55 | legislator['terms'][-1]['contact_form'] = steps['contact_form']['steps'][0]['visit'] 56 | 57 | print("Saving data to %s..." % filename) 58 | save_data(legislators, filename) 59 | 60 | 61 | def contact_steps_for(bioguide): 62 | base_url = "https://raw.githubusercontent.com/unitedstates/contact-congress/main/members/{bioguide}.yaml" 63 | response = urlopen(base_url.format(bioguide=bioguide)) 64 | if response.code == 404: 65 | raise LegislatorNotFoundError("%s not found in unitedstates/contact-congress!" % bioguide) 66 | return yaml.load(response.read()) 67 | 68 | 69 | class LegislatorNotFoundError(Exception): 70 | pass 71 | 72 | 73 | if __name__ == '__main__': 74 | run() 75 | -------------------------------------------------------------------------------- /scripts/cspan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Update current cspan IDs using NYT Congress API. 
4 | 5 | import json, urllib.request, urllib.parse, urllib.error 6 | from utils import load_data, save_data 7 | 8 | def run(): 9 | # load in current members 10 | y = load_data("legislators-current.yaml") 11 | for m in y: 12 | # retrieve C-SPAN id, if available, from ProPublica API 13 | # TODO: use utils.download here 14 | response = urllib.request.urlopen("https://projects.propublica.org/represent/api/v1/members/%s.json" % m['id']['bioguide']).read() 15 | j = json.loads(response.decode("utf8")) 16 | cspan = j['results'][0]['cspan_id'] 17 | if not cspan == '': 18 | m['id']['cspan'] = int(cspan) 19 | save_data(y, "legislators-current.yaml") 20 | 21 | if __name__ == '__main__': 22 | run() 23 | -------------------------------------------------------------------------------- /scripts/data/social_media_blacklist.csv: -------------------------------------------------------------------------------- 1 | service,pattern,comment 2 | twitter,housedemocrats,house dem caucus 3 | twitter,search\?q=,search links 4 | twitter,SmallBizGOP,some house republican caucus 5 | facebook,likebox\.php,like buttons 6 | twitter,^kellyayotte$,"CHECK LATER – no link on her homepage, twitter button is conspicuously absent" 7 | twitter,elizabethforma,campaign account 8 | twitter,heinrich4nm,campaign account 9 | twitter,gopconference,some house republican caucus 10 | twitter,^petegallego$,campaign account 11 | twitter,^share$,not the share url 12 | twitter,javascripts,probably not real 13 | twitter,user_timeline,probably not real 14 | twitter,statuses,probably not real 15 | twitter,WaysandMeansGOP,some house republican caucus 16 | twitter,congjeffmiller,abandoned 17 | twitter,Daniel_Inouye,mentioned on some people's sites 18 | twitter,^SydneyFreedberg$,appears somewhere 19 | twitter,^kyledeb$,appears somewhere 20 | twitter,nbcnews,appears somewhere 21 | twitter,^MarcoRubio$,accidentally linked to campaign account 22 | twitter,^TedCruz$,accidentally linked to campaign account 23 | twitter,^NRO$,appears somewhere 24 | twitter,20515507227,commented out embed on peter roskam's site 25 | twitter,^tammybaldwin$,campaign account 26 | facebook,media/set,media plugin 27 | facebook,photo.php,media plugin 28 | facebook,plugins/like.php,like plugin 29 | facebook,feeds/page.php,some plugin 30 | facebook,republicanconference,shared page 31 | facebook,HouseDemocrats,shared page 32 | facebook,photos.php,photo plugin 33 | facebook,album.php,album plugin 34 | facebook,^2008$,fbml year xmlns thing 35 | facebook,^plugins$,plugin plugin 36 | facebook,^feeds$,feeds plugin 37 | facebook,HouseChamber,shared page 38 | facebook,VoteMartinHeinrich,campaign account 39 | facebook,^JeffFlake1$,campaign account 40 | facebook,^media$,media plugin 41 | youtube,^embed$,embed tag 42 | youtube,^e$,embed tag 43 | youtube,^v$,embed tag 44 | youtube,^cp$,embed tag 45 | youtube,SmallBizRepublicans,shared page 46 | youtube,^watch$,embed tag 47 | youtube,housedems,embed tag 48 | youtube,republicanconference,embed tag 49 | youtube,^WSB111$,Some weird vendor thing 50 | youtube,^p$,embed tag 51 | youtube,HouseConference,shared page 52 | youtube,^rss$,rss link 53 | youtube,RepublicanLeader,shared page 54 | youtube,ElizabethForMA,campaign account 55 | youtube,^player_api$,embed tag 56 | twitter,^PatrickMurphyFL$,campaign account 57 | facebook,128261203867188,commented out thing on carolyn mccarthy's site 58 | facebook,^BrianSchatz$,campaign account 59 | facebook,^pages$,pages is never the answer 60 | youtube,^upload\)<$,weird 61 | facebook,^blumenauer$,campaign account 
62 | youtube,^HCmNo6p7XRqNs$,"auto-generated channel, not videos created by staff" 63 | youtube,^channel$,ignore detections of “/channel” 64 | facebook,^socialjobs$,some job thing on booker's site 65 | twitter,^USRepJoeWilson$,404s 66 | twitter,^FraminghamPatch$,not the right one 67 | twitter,^CandiceMiller$,"we have it already, blacklisted because it shows up elsewhere mistakenly" 68 | facebook,^billnelson$,"campaign account, even though listed on official page" 69 | instagram,republicanconference, not an individual's account 70 | instagram,housedemocrats, not an individual's account 71 | twitter,^housegop$,house gop conference 72 | twitter,^share\?,share URL with query string 73 | twitter,^sethmoulton$,seth moulton's campaign account 74 | facebook,^HouseRepublicans$,house conference account 75 | facebook,^sharer$,share URL 76 | youtube,^c$,junk 77 | twitter,^OlsonPressShop$,"not the right one (linked, but other one is used)" 78 | instagram,^johncornyn$,campaign account 79 | instagram,^housegop$,house gop conference 80 | instagram,^en_US$,junk 81 | instagram,^rep$,junk 82 | instagram,^t51$,junk 83 | twitter,^home$,junk 84 | twitter,^intent$,junk 85 | -------------------------------------------------------------------------------- /scripts/data/social_media_whitelist.csv: -------------------------------------------------------------------------------- 1 | service,account,comment 2 | twitter,CongressmanDan,"not linked, but is official" 3 | twitter,repbarbaralee,"not linked, but is official" 4 | twitter,pedropierluisi,on separate page 5 | twitter,BettyMcCollum04,"not linked, but is official" 6 | twitter,CongCulberson,"not linked, but is official" 7 | twitter,franklautenberg,"not linked, but is official" 8 | twitter,RepShelley,"not linked, but is official" 9 | twitter,DocHastings,"not linked, but is official" 10 | twitter,NydiaVelazquez,"not linked, but is official" 11 | twitter,BillPascrell,"not linked, but is official" 12 | twitter,maziehirono,"no official site yet, but is official" 13 | youtube,SenatorVitter,"official site links to individual video, but not account" 14 | youtube,CongresswomanHirono,"official twitter links to account" 15 | youtube,SenatorWhitehouse,"linked, but obfuscated by javascript" 16 | youtube,SenatorBaucus,"official site links to official videos but not account" 17 | youtube,SenatorIsakson,"official site links to official videos but not account" 18 | youtube,KirstenEGillibrand,"official site links to official videos but not account" -------------------------------------------------------------------------------- /scripts/election_results.py: -------------------------------------------------------------------------------- 1 | # Updates the data files according to the results of 2 | # a general election using a spreadsheet of election 3 | # results and prepares for a new Congress. This script 4 | # does the following: 5 | # 6 | # * Adds end dates to all current leadership roles since 7 | # leadership resets in both chambers each Congress. 8 | # * Brings senators not up for reelection, and Puerto 9 | # Rico's resident commissioner in off-years, forward 10 | # unchanged. 11 | # * Creates new legislator entries for new people in 12 | # the election results spreadsheet. The next available 13 | # GovTrack ID is assigned to each new legislator. 14 | # * Creates new terms for each election winner in the 15 | # election results spreadsheet (incumbents and new 16 | # legislators). 
17 | # * Clears the committee-membership-current.yaml file
18 | # since all House and Senate committees reset at the
19 | # start of a new Congress.
20 | # * Clears out the social media entries for legislators
21 | # no longer serving.
22 | #
23 | # Usage:
24 | # * Use the same column headers as in the last spreadsheet (see
25 | # the previous .csv file in the archive directory).
26 | # * Save the spreadsheet to archive/election_results_{year}.csv.
27 | # * Edit the ELECTION_YEAR constant below.
28 | # * Make sure the legislators-{current,historical}.yaml files are
29 | # clean -- i.e. if you've run this script, revert any changes
30 | # before running this script again with e.g.:
31 | # git checkout origin/main ../*.yaml
32 | # * Run this script.
33 | # * Make other changes manually for special elections.
34 | # * Run wikidata_update.py to fill in some other fields.
35 | # * Run `NOW=2025-01-03 test/validate.py` (the first day of the new Congress) to check for errors.
36 | 
37 | import traceback
38 | from types import SimpleNamespace as SN
39 | 
40 | import collections, csv, re
41 | from utils import load_data, save_data
42 | 
43 | ELECTION_YEAR = 2024
44 | 
45 | def run():
46 | # Compute helper constants. (E.g. ELECTION_YEAR = 2024 gives SENATE_CLASS = ((2024-2) % 6) // 2 + 1 = 1, the Class I seats that were up in 2024.)
47 | SENATE_CLASS = ((ELECTION_YEAR-2) % 6) // 2 + 1
48 | 
49 | # Open existing data.
50 | print("Opening legislator data...")
51 | legislators_historical = load_data("legislators-historical.yaml")
52 | legislators_current = load_data("legislators-current.yaml")
53 | 
54 | # New member data.
55 | party_map = { "R": "Republican", "D": "Democrat", "I": "Independent" }
56 | new_legislators = []
57 | 
58 | # Only one class of senators was up for election. Mark all other
59 | # senators as still serving. Additionally, in off years for the
60 | # four-year-termed resident commissioner of Puerto Rico, mark
61 | # that person as still serving also.
62 | current = []
63 | for p in legislators_current:
64 | if p["terms"][-1]["type"] == "sen" and p["terms"][-1]["class"] != SENATE_CLASS:
65 | current.append(p["id"]["govtrack"])
66 | if p["terms"][-1]["state"] == "PR" and (ELECTION_YEAR % 4 != 0):
67 | current.append(p["id"]["govtrack"])
68 | 
69 | # Map bioguide IDs to existing legislators to read the Bioguide ID
70 | # column of the CSV file.
71 | bioguide_id_map = { }
72 | for entry in legislators_historical + legislators_current:
73 | bioguide_id_map[entry['id']['bioguide']] = entry
74 | 
75 | # Get highest existing GovTrack ID to know where to start for assigning new IDs.
76 | # Store it in a mutable data structure so that the inner function can increment it.
77 | max_govtrack_id = SN(
78 | value=max(p['id']['govtrack'] for p in (legislators_historical+legislators_current)))
79 | 
80 | # Load the spreadsheet of election results (House and Senate races).
81 | print("Applying election results...")
82 | 
83 | def process_row(row):
84 | # Get state and district from race code. An empty
85 | # district means a senate race.
86 | state, district = re.match(r"^([A-Z]{2})(\d*)$", row["Race"]).groups()
87 | 
88 | if row['Bioguide ID'] in bioguide_id_map:
89 | # Use the Bioguide ID to get the legislator who won, which might be
90 | # the incumbent or a representative elected to the senate, or
91 | # someone who previously served in Congress, etc. The House provides
92 | # draft IDs for new members, so the ID in the spreadsheet may not
93 | # match an existing person.
94 | p = bioguide_id_map[row['Bioguide ID']]
95 | else:
96 | # Make a new legislator entry.
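# (Editorial note, hedged: the dict assembled below appears to mirror the
# field layout of misc/new-member-template.yaml in this repository; that
# template may be a useful cross-check when filling in the missing
# Bioguide ID and other fields by hand.)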
97 | max_govtrack_id.value += 1 98 | p = collections.OrderedDict([ 99 | ("id", collections.OrderedDict([ 100 | ("bioguide", row['Bioguide ID'] if row['Bioguide ID'] != "(not assigned)" else None), 101 | ("fec", [row['FEC.gov ID']]), 102 | ("govtrack", max_govtrack_id.value), 103 | #("opensecrets", None), # don't know yet 104 | #("votesmart", int(row['votesmart'])), # not doing this anymore 105 | ("wikipedia", row['Wikipedia URL'].replace("https://en.wikipedia.org/wiki/", "").replace("_", " ")), 106 | #("wikidata", row['Wikidata ID']), # will convert from wikipedia 107 | #("ballotpedia", row['Ballotpedia Page Name']), 108 | ])), 109 | ("name", collections.OrderedDict([ 110 | ("first", row['First Name']), 111 | ("middle", row['Middle Name']), 112 | ("last", row['Last Name']), 113 | ("suffix", row['Suffix']), 114 | ("official_full", row['Name']), # best guess 115 | ])), 116 | ("bio", collections.OrderedDict([ 117 | ("gender", row['Gender (M/F)']), 118 | ("birthday", row['Birthday (YYYY-MM-DD)']), 119 | ])), 120 | ("terms", []), 121 | ]) 122 | 123 | # Delete keys that were filled with Nones or empty strings 124 | # because we don't have the data yet, other than Bioguide ID 125 | # because we'll need that to be filled in manually anyway. 126 | for section in ("id", "name", "bio"): 127 | for k in list(p[section]): # clone key list before modifying dict 128 | if not p[section][k] and not (section == "id" and k == "bioguide"): 129 | del p[section][k] 130 | 131 | new_legislators.append(p) 132 | 133 | # Add to array marking this legislator as currently serving. 134 | current.append(p['id']['govtrack']) 135 | 136 | # Add a new term. 137 | if district == "": # Senate race 138 | term = collections.OrderedDict([ 139 | ("type", "sen"), 140 | ("start", "{next_year}-01-03".format(next_year=ELECTION_YEAR+1)), 141 | ("end", "{in_six_years}-01-03".format(in_six_years=ELECTION_YEAR+1+6)), 142 | ("state", state), 143 | ("class", SENATE_CLASS), 144 | ("state_rank", None), # computed later 145 | ]) 146 | else: 147 | term = collections.OrderedDict([ 148 | ("type", "rep"), 149 | ("start", "{next_year}-01-03".format(next_year=ELECTION_YEAR+1)), 150 | ("end", "{in_two_years}-01-03".format(in_two_years=ELECTION_YEAR+1+2)), 151 | ("state", state), 152 | ("district", int(district)), 153 | ]) 154 | 155 | # If party is given in the table (for some incumbents and 156 | # all new winners), use it. Otherwise just make a field so 157 | # it's in the right order. 158 | term.update(collections.OrderedDict([ 159 | ("party", party_map[row['Party (D/R/I)']] if row['Party (D/R/I)'] else None), 160 | ])) 161 | p['terms'].append(term) 162 | if term['party'] == "Independent": 163 | term["caucus"] = row['Caucus'] 164 | 165 | if len(p['terms']) > 1: 166 | # This is an incumbent or at least served previously. 167 | # Copy some fields forward that are likely to remain the same, if we 168 | # haven't already set them. 169 | for k in ('party', 'caucus'): 170 | if k in p['terms'][-2] and not term.get(k): 171 | term[k] = p['terms'][-2][k] 172 | if len(p['terms']) > 1 and p["terms"][-2]["type"] == term["type"]: 173 | # Copy some more fields if the last term was in the same chamber. 
174 | for k in ('url', 'rss_url'):
175 | if k in p['terms'][-2] and not term.get(k):
176 | term[k] = p['terms'][-2][k]
177 | 
178 | election_results = csv.DictReader(open("archive/election_results_{year}.csv".format(year=ELECTION_YEAR)))
179 | for row in election_results:
180 | if row['Race'] == "": break # end of spreadsheet -- stop reading rows but still fall through to the updates and saves below
181 | try:
182 | process_row(row)
183 | except Exception:
184 | print(row)
185 | traceback.print_exc()
186 | print()
187 | 
188 | # End any current leadership roles.
189 | for p in legislators_current:
190 | for r in p.get('leadership_roles', []):
191 | if not r.get('end'):
192 | r['end'] = "{next_year}-01-03".format(next_year=ELECTION_YEAR+1)
193 | 
194 | # Split the legislators back into the historical and current lists:
195 | 
196 | # Move previously-current legislators into the historical list
197 | # if they are no longer serving, in the order that they appear
198 | # in the current list.
199 | for p in legislators_current:
200 | if p["id"]["govtrack"] not in current:
201 | legislators_historical.append(p)
202 | legislators_current = [p for p in legislators_current if p['id']['govtrack'] in current]
203 | 
204 | # Move former legislators forward into the current list if they
205 | # are returning to Congress, in the order they appear in the
206 | # historical list.
207 | for p in legislators_historical:
208 | if p["id"]["govtrack"] in current:
209 | legislators_current.append(p)
210 | legislators_historical = [p for p in legislators_historical if p['id']['govtrack'] not in current]
211 | 
212 | # Add new legislators in the order they occur in the election
213 | # results spreadsheet.
214 | for p in new_legislators:
215 | legislators_current.append(p)
216 | 
217 | # Re-compute the state_rank junior/senior status of all senators.
218 | # We'll get this authoritatively from the Senate by senate_contacts.py
219 | # once that data is up, but we'll make an educated guess now.
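# (Added overview, hedged: the four passes below assign 'senior' at most
# once per state, in priority order. For example, a state whose sitting
# senior senator was not up this cycle keeps that senator as 'senior' in
# the first pass, so a newly elected winner in the same state falls
# through to the final pass and is marked 'junior'.)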
220 | state_rank_assignment = set() 221 | # Senior senators not up for re-election keep their status: 222 | for p in legislators_current: 223 | term = p['terms'][-1] 224 | if term['type'] == 'sen' and term['class'] != SENATE_CLASS and term['state_rank'] == 'senior': 225 | state_rank_assignment.add(p['terms'][-1]['state']) 226 | # Senior senators who won re-election pull their status forward: 227 | for p in legislators_current: 228 | term = p['terms'][-1] 229 | if term['state'] in state_rank_assignment: continue # we already assigned the senior senator 230 | if term['type'] == 'sen' and term['class'] == SENATE_CLASS and len(p['terms']) > 1 \ 231 | and p['terms'][-2]['type'] == 'sen' and p['terms'][-2]['state'] == term['state'] and p['terms'][-2]['state_rank'] == 'senior': 232 | term['state_rank'] = 'senior' 233 | state_rank_assignment.add(p['terms'][-1]['state']) 234 | # Junior senators not up for re-election become senior if we didn't see a senior senator yet: 235 | for p in legislators_current: 236 | term = p['terms'][-1] 237 | if term['state'] in state_rank_assignment: continue # we already assigned the senior senator 238 | if term['type'] == 'sen' and term['class'] != SENATE_CLASS and term['state_rank'] == 'junior': 239 | term['state_rank'] = 'senior' 240 | state_rank_assignment.add(p['terms'][-1]['state']) 241 | # Remaining senators are senior if we haven't seen a senior senator yet, else junior: 242 | for p in legislators_current: 243 | term = p['terms'][-1] 244 | if term['type'] == 'sen' and term['state_rank'] is None: 245 | if term['state'] not in state_rank_assignment: 246 | term['state_rank'] = 'senior' 247 | state_rank_assignment.add(term['state']) 248 | else: 249 | term['state_rank'] = 'junior' 250 | 251 | # Save. 252 | print("Saving legislator data...") 253 | save_data(legislators_current, "legislators-current.yaml") 254 | save_data(legislators_historical, "legislators-historical.yaml") 255 | 256 | # Run the sweep script to clear out data that needs to be cleared out 257 | # for legislators that are gone. 258 | import sweep 259 | sweep.run() 260 | 261 | # Clears committee membership. 262 | save_data({}, "committee-membership-current.yaml") 263 | 264 | if __name__ == "__main__": 265 | run() 266 | -------------------------------------------------------------------------------- /scripts/email/config.yml.example: -------------------------------------------------------------------------------- 1 | # email: 2 | # # smtp details 3 | # hostname: 4 | # port: 5 | # user_name: 6 | # password: 7 | # starttls: 8 | # # email defaults 9 | # subject: "[unitedstates/congress-legislators] Notice" 10 | # from: 11 | # from_name: "unitedstates" 12 | # to: -------------------------------------------------------------------------------- /scripts/export_csv.py: -------------------------------------------------------------------------------- 1 | # Converts the specified YAML file to an equivalent-ish CSV file 2 | # (on standard output). 3 | # 4 | # python export_csv.py ../legislators-current.yaml 5 | 6 | import sys, csv 7 | from collections import OrderedDict 8 | 9 | from utils import yaml_load 10 | 11 | def run(): 12 | 13 | if len(sys.argv) < 2: 14 | print("Usage: python export_csv.py ../legislators-current.yaml > legislators-current.csv") 15 | sys.exit(0) 16 | 17 | data = yaml_load(sys.argv[1]) 18 | 19 | ############################################### 20 | 21 | def flatten_object(obj, path, ret): 22 | """Takes an object obj and flattens it into a dictionary ret. 
23 | 
24 | For instance { "x": { "y": 123 } } is turned into { "x__y": 123 }.
25 | """
26 | for k, v in list(obj.items()):
27 | if isinstance(v, dict):
28 | flatten_object(v, (path + "__" if path else "") + k + "__", ret)
29 | elif isinstance(v, list):
30 | # don't peek inside lists
31 | pass
32 | else:
33 | ret[path + k] = v
34 | return ret
35 | 
36 | # Scan through the records recursively to get a list of column names.
37 | # Attempt to preserve the field order as found in the YAML file. Since
38 | # any field may be absent, no one record can provide the complete field
39 | # order. Build the best field order by looking at what each field tends
40 | # to be preceded by.
41 | fields = set()
42 | preceding_keys = dict() # maps keys to a dict of *previous* keys and how often they occurred
43 | for record in data:
44 | prev_key = None
45 | for key in flatten_object(record, "", OrderedDict()):
46 | fields.add(key)
47 | 
48 | preceding_keys.setdefault(key, {}).setdefault(prev_key, 0)
49 | preceding_keys[key][prev_key] += 1
50 | prev_key = key
51 | 
52 | # Convert to relative frequencies.
53 | for k, v in list(preceding_keys.items()):
54 | s = float(sum(v.values()))
55 | for k2 in v:
56 | v[k2] /= s
57 | 
58 | # Get a good order for the fields. Greedily add keys from left to right,
59 | # maximizing the conditional probability that the preceding key would
60 | # precede the key on the right.
61 | field_order = [None]
62 | prev_key = None
63 | while len(field_order) < len(fields):
64 | # Which key is such that prev_key is its most likely predecessor?
65 | # We do it this way (and not what is prev_key's most likely follower)
66 | # because we should be using a probability (of sorts) that is
67 | # conditional on the key being present. Otherwise we would lose
68 | # infrequent keys.
69 | next_key = max([f for f in fields if f not in field_order], key =
70 | lambda k :
71 | max(preceding_keys[k].get(pk, 0) for pk in field_order))
72 | field_order.append(next_key)
73 | prev_key = next_key
74 | field_order = field_order[1:] # remove the None at the start
75 | 
76 | # Write CSV header.
77 | w = csv.writer(sys.stdout)
78 | w.writerow(field_order)
79 | 
80 | # Write the objects.
81 | for record in data:
82 | obj = flatten_object(record, "", {})
83 | w.writerow([
84 | obj.get(f, "")
85 | for f in field_order
86 | ])
87 | 
88 | if __name__ == '__main__':
89 | run()
-------------------------------------------------------------------------------- /scripts/geocode_offices.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Geocodes district office addresses using Google Maps.
4 | # Opens legislators-district-offices.yaml, finds offices
5 | # that haven't previously been geocoded and have a street
6 | # address, city, and state, then geocodes them and adds
7 | # latitude and longitude fields to the office object
8 | # and writes back to the same file.
9 | #
10 | # Assumes you have a Google Maps API key in
11 | # scripts/cache/google_maps_api_key.txt, and that
12 | # this key is enabled for the Geocoding API in the
13 | # Google APIs Console.
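# (Added setup sketch, not part of the original script -- the key value
# below is a placeholder, not a real credential:)
#
#   # run once from the scripts/ directory
#   open('cache/google_maps_api_key.txt', 'w').write('YOUR_GOOGLE_MAPS_API_KEY')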
14 | 15 | import requests 16 | import utils 17 | 18 | class GeocodeException(Exception): 19 | def __init__(self, message): 20 | super(GeocodeException, self).__init__(message) 21 | 22 | def run(legislator_ids=None): 23 | legislators = utils.load_data('legislators-district-offices.yaml') 24 | try: 25 | for l in legislators: 26 | if legislator_ids and l['id']['bioguide'] not in legislator_ids: 27 | continue 28 | geocode_offices(l) 29 | finally: 30 | # Save in-progress geocodes in case of keyboard interrupt 31 | print("Saving data...") 32 | utils.save_data(legislators, 'legislators-district-offices.yaml') 33 | 34 | def geocode_offices(l): 35 | for o in l.get('offices', []): 36 | if o.get('latitude'): 37 | continue 38 | if not o.get('address') or not o.get('city') or not o.get('state'): 39 | continue 40 | address_query = ', '.join([o['address'], o['city'], utils.states[o['state']]]) 41 | result = None 42 | try: 43 | result = geocode(address_query) 44 | _sanity_check_location(o, l['id']['bioguide'], result) 45 | except GeocodeException as e: 46 | print('Geocoding failed for %s office %s (%s): %s. Query: "%s". Result: "%s"' % ( 47 | l['id']['bioguide'], o['city'], o['address'], e, address_query, 48 | result['formatted_address'] if result else None)) 49 | continue 50 | 51 | location = result['geometry']['location'] 52 | o['latitude'] = location['lat'] 53 | o['longitude'] = location['lng'] 54 | print('Success: %s office %s, query "%s" geocoded to "%s" (%s,%s)' % ( 55 | l['id']['bioguide'], o['city'], address_query, result['formatted_address'], 56 | location['lat'], location['lng'])) 57 | 58 | def geocode(address): 59 | params = { 60 | 'address': address, 61 | 'key': _get_api_key(), 62 | } 63 | response = requests.get('https://maps.googleapis.com/maps/api/geocode/json', params=params) 64 | js = response.json() 65 | if js.get('status') != 'OK': 66 | raise GeocodeException('Non-success response from geocoder: %s' % js.get('status')) 67 | return js['results'][0] 68 | 69 | _api_key = None 70 | 71 | def _get_api_key(): 72 | global _api_key 73 | if not _api_key: 74 | _api_key = open('cache/google_maps_api_key.txt').read().strip() 75 | return _api_key 76 | 77 | def _find_address_component(geocode_result, component_type): 78 | for component in geocode_result['address_components']: 79 | if component_type in component['types']: 80 | return component 81 | return None 82 | 83 | SANITY_CHECK_EXEMPTIONS = ( 84 | # (bioguide, office_city) 85 | ('B001295', 'Mt. Vernon'), 86 | ('B001290', 'Spotsylvania'), 87 | ('B001300', 'San Pedro'), 88 | ('C000984', 'Ellicott'), 89 | ('C001038', 'Bronx'), 90 | ('C001038', 'Queens'), 91 | ('C001067', 'Brooklyn'), 92 | ('D000482', 'Penn Hills'), 93 | ('D000625', 'Brooklyn'), 94 | ('D000625', 'Staten Island'), 95 | ('D000626', 'West Chester'), 96 | ('E000179', 'Bronx'), 97 | ('E000179', 'Mt. 
Vernon'),
98 | ('H000324', 'Mangonia Park'),
99 | ('H001059', 'Campton Hills'),
100 | ('J000294', 'Brooklyn'),
101 | ('K000375', 'Hyannis'),
102 | ('M000087', 'Astoria'),
103 | ('M000087', 'Brooklyn'),
104 | ('M001137', 'Arverne'),
105 | ('M001137', 'Jamaica'),
106 | ('M001151', 'Pittsburgh'),
107 | ('M001179', 'Lake Ariel'),
108 | ('M001188', 'Flushing'),
109 | ('M001188', 'Forest Hills'),
110 | ('M001193', 'Marlton'),
111 | ('M001201', 'Shelby Township'),
112 | ('N000002', 'Brooklyn'),
113 | ('N000032', 'Fort Lauderdale'),
114 | ('P000605', 'York'),
115 | ('Q000023', 'Lakeview'),
116 | ('R000486', 'Commerce'),
117 | ('R000576', 'Timonium'),
118 | ('R000601', 'Rockwall'),
119 | ('S000248', 'Bronx'),
120 | ('S000522', 'Hamilton'),
121 | ('V000081', 'Brooklyn'),
122 | ('W000808', 'Miami Gardens'),
123 | ('W000822', 'Ewing'),
124 | ('S000522', 'Plumsted'),
125 | )
126 | 
127 | def _sanity_check_location(office, bioguide_id, geocode_result):
128 | for exemption in SANITY_CHECK_EXEMPTIONS:
129 | if bioguide_id == exemption[0] and office['city'] == exemption[1]:
130 | return
131 | 
132 | state_result_component = _find_address_component(geocode_result, 'administrative_area_level_1')
133 | if not state_result_component:
134 | raise GeocodeException('No state code found in geocode result')
135 | result_state = state_result_component['short_name']
136 | if result_state != office['state']:
137 | raise GeocodeException('Geocode result is not in the right state')
138 | 
139 | city_result_component = _find_address_component(geocode_result, 'locality')
140 | if not city_result_component:
141 | raise GeocodeException('No city found in geocode result')
142 | result_city = city_result_component['long_name']
143 | result_city_alt = city_result_component['short_name']
144 | if not (_do_city_names_match(result_city, office['city']) or _do_city_names_match(result_city_alt, office['city'])):
145 | # For big cities, Google Maps seems to consider the "city" to be e.g. Los Angeles
146 | # even though the mailing address and colloquial address may be e.g. Panorama City.
147 | # This common name is in the "neighborhood" component, so look at that too.
148 | result_subcity_component = _find_address_component(geocode_result, 'neighborhood')
149 | if result_subcity_component:
150 | result_subcity = result_subcity_component['long_name']
151 | if _do_city_names_match(result_subcity, office['city']):
152 | return
153 | raise GeocodeException('Geocode result is not in the right city')
154 | 
155 | def _do_city_names_match(name1, name2):
156 | return name1.lower().replace('.', '') == name2.lower().replace('.', '')
157 | 
158 | if __name__ == '__main__':
159 | import sys
160 | run(legislator_ids=sys.argv[1:])
161 | 
-------------------------------------------------------------------------------- /scripts/historical_committees.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Collect all committees and subcommittees that appear in the
4 | # govinfo.gov bulk bill-status data (113th Congress forward by
5 | # default) and store them in the committees-historical.yaml file.
6 | # It will include current committees as well.
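# (Added usage sketch -- the flag names are inferred from the
# utils.flags() reads below, so treat the exact spellings as assumptions:)
#
#   python historical_committees.py --cache                  # reuse cached bulk downloads
#   python historical_committees.py --cache --congress=118   # restrict to one congress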
7 | 
8 | import zipfile
9 | from collections import OrderedDict
10 | import utils
11 | from utils import load_data, save_data, CURRENT_CONGRESS, scraper
12 | import io
13 | import lxml.etree
14 | 
15 | def run():
16 | committees_historical = load_data("committees-historical.yaml")
17 | 
18 | # caching is effectively required: this script downloads large bulk data files
19 | flags = utils.flags()
20 | cache = flags.get('cache', False)
21 | 
22 | if cache:
23 | from scrapelib.cache import FileCache
24 | scraper.cache_storage = FileCache('cache')
25 | scraper.cache_write_only = False
26 | else:
27 | raise Exception("Run this script with --cache; un-cached bulk downloads are not supported.")
28 | 
29 | # map thomas_id's to their dicts
30 | committees_historical_ref = { }
31 | for cx in committees_historical:
32 | committees_historical_ref[cx["thomas_id"]] = cx
33 | 
34 | 
35 | # pick the range of committees to get
36 | single_congress = flags.get('congress', False)
37 | if single_congress:
38 | start_congress = int(single_congress)
39 | end_congress = int(single_congress) + 1
40 | else:
41 | start_congress = 113
42 | end_congress = CURRENT_CONGRESS + 1
43 | 
44 | 
45 | urls = {'senate': 'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/s/BILLSTATUS-{congress}-s.zip',
46 | 'house': 'https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/hr/BILLSTATUS-{congress}-hr.zip'}
47 | 
48 | all_committees = {'house': {}, 'senate': {}}
49 | 
50 | for congress in range(start_congress, end_congress):
51 | for chamber, bill_status_url in urls.items():
52 | chamber_committees = all_committees[chamber]
53 | 
54 | url = bill_status_url.format(congress=congress)
55 | response = scraper.get(url)
56 | 
57 | with zipfile.ZipFile(io.BytesIO(response.content)) as z:
58 | for name in z.namelist():
59 | if name.startswith('BILLSTATUS'):
60 | with z.open(name) as xml_file:
61 | bill_status = lxml.etree.parse(xml_file)
62 | committees = bill_status.xpath('//billCommittees/item')
63 | for committee in committees:
64 | code = str(committee.xpath('./systemCode/text()')[0])
65 | name = str(committee.xpath('./name/text()')[0])
66 | if name.endswith(' Committee'):
67 | name = name[:-10]
68 | if code not in chamber_committees:
69 | chamber_committees[code] = {'names': {congress: name},
70 | 'subcommittees': {}}
71 | else:
72 | if congress not in chamber_committees[code]['names']: # keep the first name seen for this congress
73 | chamber_committees[code]['names'][congress] = name
74 | 
75 | subcommittees_d = chamber_committees[code]['subcommittees']
76 | for subcommittee in committee.xpath('./subcommittees/item'):
77 | code = str(subcommittee.xpath('./systemCode/text()')[0])
78 | name = str(subcommittee.xpath('./name/text()')[0])
79 | if name.endswith(' Subcommittee'):
80 | name = name[:-13]
81 | if code not in subcommittees_d:
82 | subcommittees_d[code] = {congress: name}
83 | else:
84 | if congress not in subcommittees_d[code]:
85 | subcommittees_d[code][congress] = name
86 | 
87 | import pprint
88 | pprint.pprint(chamber_committees)
89 | print(len(chamber_committees))
90 | 
91 | 
92 | for chamber, committees in all_committees.items():
93 | for code, committee in committees.items():
94 | id = str(code).upper()
95 | 
96 | id = id[:-2]
97 | 
98 | if id in committees_historical_ref:
99 | # Update existing record.
100 | cx = committees_historical_ref[id]
101 | 
102 | else:
103 | # Create a new record.
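# (Added note, hedged: the fields set below appear to follow the
# committees-historical.yaml conventions -- a chamber type, a display
# name, and the THOMAS-style id that the rest of this script keys on.)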
104 | cx = OrderedDict()
105 | committees_historical_ref[id] = cx
106 | cx['type'] = chamber.lower()
107 | if id[0] != "J": # Joint committees show their full name, otherwise they show a partial name
108 | cx['name'] = chamber.capitalize() + " Committee on " + committee['names'][min(committee['names'])] # use this committee's earliest name (the bare loop variable 'name' would be stale here)
109 | else:
110 | cx['name'] = committee['names'][min(committee['names'])]
111 | cx['thomas_id'] = id
112 | committees_historical.append(cx)
113 | 
114 | for code, subcommittee in committee['subcommittees'].items():
115 | 
116 | for sx in cx.setdefault('subcommittees', []):
117 | if sx['thomas_id'] == code[-2:]:
118 | # found existing record
119 | break
120 | else:
121 | # 'break' not executed, so create a new record
122 | sx = OrderedDict()
123 | sx['name'] = subcommittee[min(subcommittee)]
124 | sx['thomas_id'] = code[-2:]
125 | cx['subcommittees'].append(sx)
126 | 
127 | 
128 | sx.setdefault('congresses', [])
129 | sx.setdefault('names', {})
130 | 
131 | for congress, name in subcommittee.items():
132 | if congress not in sx['congresses']:
133 | sx['congresses'].append(congress)
134 | 
135 | sx['names'][congress] = name
136 | 
137 | cx.setdefault('congresses', [])
138 | cx.setdefault('names', {})
139 | 
140 | for congress, name in committee['names'].items():
141 | if congress not in cx['congresses']:
142 | cx['congresses'].append(congress)
143 | cx['names'][congress] = name
144 | 
145 | 
146 | # TODO
147 | # after checking diff on first commit, we should re-sort
148 | #committees_historical.sort(key = lambda c : c["thomas_id"])
149 | #for c in committees_historical:
150 | # c.get("subcommittees", []).sort(key = lambda s : s["thomas_id"])
151 | 
152 | save_data(committees_historical, "committees-historical.yaml")
153 | 
154 | if __name__ == '__main__':
155 | run()
156 | 
-------------------------------------------------------------------------------- /scripts/house_contacts.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Update current members' contact info from the Clerk's XML feed
4 | 
5 | import requests
6 | import lxml
7 | import re
8 | from datetime import datetime
9 | 
10 | from utils import load_data, save_data, parse_date
11 | 
12 | def run():
13 | today = datetime.now().date()
14 | 
15 | y = load_data("legislators-current.yaml")
16 | 
17 | # TODO use download util?
18 | xml = requests.get("https://clerk.house.gov/xml/lists/MemberData.xml")
19 | #xml = requests.get("https://clerk.house.gov/xml/lists/unofficial-118-member-elect-data.xml")
20 | root=lxml.etree.fromstring(xml.content)
21 | 
22 | for moc in y:
23 | try:
24 | term = moc["terms"][-1]
25 | except IndexError:
26 | print("Member has no terms", moc)
27 | continue
28 | 
29 | if term["type"] != "rep": continue
30 | 
31 | if today < parse_date(term["start"]) or today > parse_date(term["end"]):
32 | print("Member's last listed term is not current", moc, term["start"])
33 | continue
34 | 
35 | ssdd = "%s%02d" % (term["state"], term["district"])
36 | 
37 | query_str = "./members/member/[statedistrict='%s']" % ssdd
38 | 
39 | # Odd state abbreviation: the Clerk's feed uses AQ for American Samoa.
40 | query_str = query_str.replace("AS00", "AQ00")
41 | 
42 | mi = root.findall(query_str)[0].find('member-info')
43 | 
44 | # Check that the bioguide ID matches.
45 | bioguideid = mi.find('bioguideID').text
46 | if moc['id'].get('bioguide') is not None and \
47 | bioguideid != moc['id']['bioguide']:
48 | print("Warning: Bioguide ID did not match for %s%02d (%s != %s)" % (
49 | term["state"], term["district"],
50 | bioguideid, moc['id']['bioguide']))
51 | elif moc['id'].get('bioguide') is None:
52 | # At the start of a Congress, we can import the Bioguide ID from
53 | # the official data since we matched on state & district.
54 | 
55 | # To keep the field order nice, insert it at the start of the
56 | # IDs list.
57 | moc['id'] = dict([("bioguide", bioguideid)]
58 | + list(moc['id'].items()))
59 | 
60 | # for now, no automatic name updates since there is disagreement on how to handle them
61 | # firstname = mi.find('firstname').text
62 | # middlename = mi.find('middlename').text #could be empty
63 | # lastname = mi.find('lastname').text
64 | 
65 | if mi.find('official-name') is None or mi.find('official-name').text is None:
66 | print("Warning: No official-name tag for %s" % ssdd)
67 | officialname = None
68 | else:
69 | officialname = re.sub("'", "’", mi.find('official-name').text)
70 | 
71 | office_room = mi.find('office-room').text
72 | office_building = mi.find('office-building').text
73 | 
74 | office_building_full = office_building.replace("RHOB", "Rayburn House Office Building")
75 | office_building_full = office_building_full.replace("CHOB", "Cannon House Office Building")
76 | office_building_full = office_building_full.replace("LHOB", "Longworth House Office Building")
77 | 
78 | office_zip = mi.find('office-zip').text
79 | office_zip_suffix = mi.find('office-zip-suffix').text
80 | 
81 | office = "{} {}".format(office_room, office_building_full)
82 | address = "{} {} Washington DC {}-{}".format(office_room, office_building_full, office_zip, office_zip_suffix)
83 | 
84 | phone = mi.find('phone').text
85 | phone_parsed = re.sub(r"^\((\d\d\d)\) ", lambda m : m.group(1) + "-", phone) # replace (XXX) area code with XXX- for compatibility w/ existing format
86 | 
87 | # for now, no automatic name updates since there is disagreement on how to handle them
88 | # moc["name"]["first"] = firstname
89 | # if (middlename):
90 | # moc["name"]["middle"] = middlename
91 | # else:
92 | # if ("middle" in moc["name"]):
93 | # del moc["name"]["middle"]
94 | # moc["name"]["last"] = lastname
95 | 
96 | # TODO: leave if none?
97 | if (officialname):
98 | moc["name"]["official_full"] = officialname
99 | term["address"] = address
100 | term["office"] = office
101 | term["phone"] = phone_parsed
102 | 
103 | save_data(y, "legislators-current.yaml")
104 | 
105 | if __name__ == '__main__':
106 | run()
107 | 
-------------------------------------------------------------------------------- /scripts/house_history.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Stores a house_history ID for all legislators that don't yet
4 | # have one, by scraping history.house.gov.
5 | 
6 | import lxml.html, io
7 | import requests
8 | from utils import load_data, save_data
9 | import sys
10 | 
11 | def run():
12 | # load legislators YAML files
13 | yamlfiles = { }
14 | for fn in ('historical', 'current'):
15 | fn = 'legislators-%s.yaml' % fn
16 | print("Loading %s..."
% fn) 17 | yamlfiles[fn] = load_data(fn) 18 | 19 | # reoriented cache to access by bioguide ID 20 | by_bioguide = { } 21 | known_house_history_ids = set() 22 | for legislators in yamlfiles.values(): 23 | for m in legislators: 24 | if "bioguide" in m["id"]: 25 | by_bioguide[m["id"]["bioguide"]] = m 26 | if "house_history" in m["id"]: 27 | known_house_history_ids.add(m["id"]["house_history"]) 28 | count = 0 29 | 30 | # scrape history.house.gov 31 | if len(sys.argv) == 1: 32 | id_range = range(22000, 25000) 33 | else: 34 | id_range = [int(arg) for arg in sys.argv[1:]] 35 | for id in id_range: 36 | # skip known IDs 37 | if id in known_house_history_ids: 38 | continue 39 | print(id) 40 | bioguide_id = get_bioguide_for_house_history_id(id) 41 | if bioguide_id and bioguide_id in by_bioguide: 42 | print(id, bioguide_id) 43 | by_bioguide[bioguide_id]["id"]["house_history"] = id 44 | count = count + 1 45 | 46 | # write YAML files to disk 47 | for filename, legislators in yamlfiles.items(): 48 | print("Saving data to %s..." % filename) 49 | save_data(legislators, filename) 50 | 51 | # how many updates did we make? 52 | print("Saved %d legislators" % count) 53 | 54 | def get_bioguide_for_house_history_id(id): 55 | url = "http://history.house.gov/People/Detail/%s" % id 56 | r = requests.get(url, allow_redirects=False) 57 | if r.status_code == 200: 58 | dom = lxml.html.parse(io.StringIO(r.text)).getroot() 59 | try: 60 | bioguide_link = dom.cssselect("a.view-in-bioguide")[0].get('href') 61 | return bioguide_link.split('=')[1] 62 | except: 63 | return None 64 | else: 65 | return None 66 | 67 | if __name__ == '__main__': 68 | run() -------------------------------------------------------------------------------- /scripts/house_websites.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Uses https://www.house.gov/representatives/ to scrape official member websites. 4 | # Only known source. 5 | 6 | # Assumptions: 7 | # member's state and district fields are present and accurate. 8 | # member's most recent term in the terms field is their current one. 
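# (Added usage sketch -- the --cache flag is read via utils.flags() in
# run() below; treat the exact spelling as an assumption:)
#
#   python house_websites.py          # re-download the House directory
#   python house_websites.py --cache  # reuse a previously downloaded copy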
9 | 10 | import lxml.html, io, urllib.request, urllib.error, urllib.parse 11 | import re 12 | import utils 13 | from utils import load_data, save_data, states as state_names 14 | 15 | 16 | def run(): 17 | 18 | # default to not caching 19 | cache = utils.flags().get('cache', False) 20 | force = not cache 21 | 22 | 23 | states = [] 24 | current = load_data("legislators-current.yaml") 25 | by_district = { } 26 | for m in current: 27 | last_term = m['terms'][-1] 28 | if last_term['type'] != 'sen': 29 | state = last_term['state'] 30 | 31 | full_district = "%s%02d" % (state, int(last_term['district'])) 32 | by_district[full_district] = m 33 | 34 | if not state in states: 35 | states.append(state) 36 | 37 | destination = "legislators/house.html" 38 | url = "https://www.house.gov/representatives/" 39 | body = utils.download(url, destination, force) 40 | if not body: 41 | print("Couldn't download House listing!") 42 | exit(0) 43 | 44 | try: 45 | dom = lxml.html.parse(io.StringIO(body)).getroot() 46 | except lxml.etree.XMLSyntaxError: 47 | print("Error parsing House listing!") 48 | exit(0) 49 | 50 | 51 | # process: 52 | # go through every state in our records, fetching that state's table 53 | # go through every row after the first, pick the district to isolate the member 54 | # pluck out the URL, update that member's last term's URL 55 | count = 0 56 | for state in sorted(states): 57 | state_name = state_names[state].lower().replace(' ', '-') 58 | table = dom.cssselect("table.table caption#state-%s" % state_name)[0].getparent() 59 | rows = table.cssselect("tbody tr") 60 | 61 | for row in rows: 62 | cells = row.cssselect("td") 63 | if not cells: 64 | continue 65 | 66 | district = str(cells[0].text_content()).strip() 67 | if ( 68 | (district == "At Large") 69 | or (district == "Delegate") 70 | or (district == "Resident Commissioner") 71 | ): 72 | district = 0 73 | else: 74 | district = int(re.sub(r'[^\d]', '', district)) 75 | 76 | url = cells[1].cssselect("a")[0].get("href") 77 | original_url = url 78 | 79 | # The House uses subdomains now, and occasionally the directory 80 | # uses URLs with some trailing redirected-to page, like /home. 81 | # We can safely use the subdomain as the root, to be future-proof 82 | # against redirects changing mid-session. 83 | 84 | # We should still follow any redirects, and not just trust the 85 | # directory to have the current active subdomain. As an example, 86 | # the directory lists randyforbes.house.gov, which redirects to 87 | # forbes.house.gov. 88 | resp = urllib.request.urlopen(url) 89 | url = resp.geturl() 90 | 91 | # kill everything after the domain 92 | url = re.sub(".gov/.*$", ".gov", url) 93 | 94 | if state == "AQ": 95 | state = "AS" 96 | full_district = "%s%02d" % (state, int(district)) 97 | if full_district in by_district: 98 | print("[%s] %s %s" % (full_district, url, "" if url == original_url.rstrip("/") else (" <= " + original_url))) 99 | by_district[full_district]['terms'][-1]['url'] = url 100 | else: 101 | print("[%s] No current legislator" % full_district) 102 | 103 | count += 1 104 | 105 | print("Processed %i people rows on House listing." 
% count) 106 | 107 | print("Saving data...") 108 | save_data(current, "legislators-current.yaml") 109 | 110 | if __name__ == '__main__': 111 | run() 112 | -------------------------------------------------------------------------------- /scripts/icpsr_ids.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # gets ICPSR ID for every member 4 | 5 | # options: 6 | # --cache: load from cache if present on disk (default: true) 7 | # --bioguide: load only one legislator, by his/her bioguide ID 8 | # --congress: do *only* updates for legislators serving in specific congress 9 | 10 | import utils 11 | from utils import load_data, save_data, parse_date 12 | import csv 13 | from io import StringIO 14 | 15 | def run(): 16 | 17 | # default to caching 18 | cache = utils.flags().get('cache', True) 19 | force = not cache 20 | 21 | 22 | only_bioguide = utils.flags().get('bioguide', None) 23 | congress = utils.flags().get('congress',None) 24 | 25 | 26 | data_files = [] 27 | 28 | print("Loading %s..." % "legislators-current.yaml") 29 | legislators = load_data("legislators-current.yaml") 30 | data_files.append((legislators,"legislators-current.yaml")) 31 | print("Loading %s..." % "legislators-historical.yaml") 32 | legislators = load_data("legislators-historical.yaml") 33 | data_files.append((legislators,"legislators-historical.yaml")) 34 | 35 | # load member data from vote view 36 | if congress == None: 37 | raise Exception("the --congress flag is required") 38 | elif int(congress) < 10 and int(congress) > 0: 39 | url_senate = "https://voteview.com/static/data/out/members/S00%s_members.csv" % congress 40 | url_house = "https://voteview.com/static/data/out/members/H00%s_members.csv" % congress 41 | elif int(congress) < 100 and int(congress) >= 10: 42 | url_senate = "https://voteview.com/static/data/out/members/S0%s_members.csv" % congress 43 | url_house = "https://voteview.com/static/data/out/members/H0%s_members.csv" % congress 44 | elif int(congress) >= 100: 45 | url_senate = "https://voteview.com/static/data/out/members/S%s_members.csv" % congress 46 | url_house = "https://voteview.com/static/data/out/members/H%s_members.csv" % congress 47 | else: 48 | raise Exception("no data for congress " + congress) 49 | 50 | senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress 51 | senate_data = utils.download(url_senate, senate_destination, force) 52 | 53 | house_destination = "icpsr/source/house_rollcall%s.txt" % congress 54 | house_data = utils.download(url_house, house_destination, force) 55 | 56 | error_log = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w")) 57 | error_log.writerow(["error_type","matches","icpsr_name","icpsr_state","is_territory","old_id","new_id"]) 58 | 59 | 60 | 61 | read_files = [("sen",senate_data),("rep",house_data)] 62 | print("Running for congress " + congress) 63 | for read_file_chamber,read_file_content in read_files: 64 | for data_file in data_files: 65 | for legislator in data_file[0]: 66 | num_matches = 0 67 | write_id = "" 68 | # this can't run unless we've already collected a bioguide for this person 69 | bioguide = legislator["id"].get("bioguide", None) 70 | # if we've limited this to just one bioguide, skip over everyone else 71 | if only_bioguide and (bioguide != only_bioguide): 72 | continue 73 | #if not in currently read chamber, skip 74 | chamber = legislator['terms'][len(legislator['terms'])-1]['type'] 75 | if chamber != read_file_chamber: 76 | continue 77 | 78 | #only 
run for selected congress
79 | latest_congress = utils.congress_from_legislative_year(utils.legislative_year(parse_date(legislator['terms'][len(legislator['terms'])-1]['start'])))
80 | if chamber == "sen":
81 | congresses = [latest_congress,latest_congress+1,latest_congress+2]
82 | else:
83 | congresses = [latest_congress]
84 | 
85 | if int(congress) not in congresses:
86 | continue
87 | 
88 | # pull data to match from yaml
89 | 
90 | last_name = legislator['name']['last'].upper()
91 | state = utils.states[legislator['terms'][len(legislator['terms'])-1]['state']].upper()[:7].strip()
92 | 
93 | # convert read_file_content str to file object, then parse as csv file
94 | content_as_file = StringIO(read_file_content)
95 | content_parsed = csv.reader(content_as_file, delimiter=',')
96 | 
97 | # loop through congress members in read file, see if one matches the current legislator
98 | for icpsr_member in content_parsed:
99 | # ensure a unique match based on bioguide id
100 | if bioguide == icpsr_member[10]:
101 | num_matches += 1
102 | write_id = int(icpsr_member[2])
103 | 
104 | # skip if icpsr id is currently in data
105 | if "icpsr" in legislator["id"]:
106 | if write_id == legislator["id"]["icpsr"] or write_id == "":
107 | continue
108 | elif write_id != legislator["id"]["icpsr"] and write_id != "":
109 | error_log.writerow(["Incorrect_ID","NA",last_name[:8],state,"NA",legislator["id"]["icpsr"],write_id])
110 | print("ID updated for %s" % last_name)
111 | 
112 | if num_matches == 1:
113 | legislator['id']['icpsr'] = int(write_id)
114 | else:
115 | if state == 'GUAM' or state == 'PUERTO' or state == "VIRGIN" or state == "DISTRIC" or state == "AMERICA" or state == "NORTHER" or state == "PHILIPP":
116 | print('error: non 1 match')
117 | error_log.writerow(["Non_1_match_number",str(num_matches),last_name[:8],state,"Y","NA","NA"])
118 | else:
119 | print(str(num_matches) + " matches found for "+ last_name[:8] + ", " + state + " in congress " + str(congress))
120 | error_log.writerow(["Non_1_match_number",str(num_matches),last_name,state,"N","NA","NA"])
121 | 
122 | save_data(data_file[0], data_file[1])
123 | 
124 | ## the following three lines can be run as a separate script to update icpsr id's for all historical congresses
125 | # import os
126 | 
127 | # for i in range(1,114):
128 | # os.system("python ICPSR_id.py --congress=" + str(i))
129 | 
130 | if __name__ == '__main__':
131 | run()
132 | 
-------------------------------------------------------------------------------- /scripts/influence_ids.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # gets CRP id for every member with a bioguide ID:
4 | 
5 | # options:
6 | # --cache: load from cache if present on disk (default: false)
7 | # --current: do *only* current legislators (default: true)
8 | # --historical: do *only* historical legislators (default: false)
9 | 
10 | import utils
11 | from utils import load_data, save_data
12 | import json
13 | 
14 | def run():
15 | 
16 | options = utils.flags()
17 | options['urllib'] = True # disable scrapelib for this
18 | 
19 | debug = options.get('debug', False)
20 | 
21 | # default to NOT caching
22 | cache = options.get('cache', False)
23 | force = not cache
24 | 
25 | 
26 | only_bioguide = options.get('bioguide', None)
27 | 
28 | 
29 | # pick either current or historical
30 | # order is important here, since current defaults to true
31 | if utils.flags().get('historical', False):
32 | filename = "legislators-historical.yaml"
33 | elif
utils.flags().get('current', True): 34 | filename = "legislators-current.yaml" 35 | else: 36 | print("No legislators selected.") 37 | exit(0) 38 | 39 | 40 | print("Loading %s..." % filename) 41 | legislators = load_data(filename) 42 | 43 | 44 | api_file = open('cache/sunlight_api_key.txt','r') 45 | api_key = api_file.read() 46 | 47 | 48 | for m in legislators: 49 | 50 | # this can't run unless we've already collected a bioguide for this person 51 | bioguide = m["id"].get("bioguide", None) 52 | if not bioguide: 53 | continue 54 | # if we've limited this to just one bioguide, skip over everyone else 55 | if only_bioguide and (bioguide != only_bioguide): 56 | continue 57 | 58 | url_BG = "http://transparencydata.com/api/1.0/entities/id_lookup.json?bioguide_id=" 59 | url_BG += bioguide 60 | url_BG += "&apikey="+api_key 61 | 62 | 63 | destination = "legislators/influence_explorer/lookups/%s.json" % bioguide 64 | if debug: print("[%s] Looking up ID..." % bioguide) 65 | body = utils.download(url_BG, destination, force, options) 66 | 67 | if not body: 68 | print("[%s] Bad request, skipping" % bioguide) 69 | continue 70 | 71 | jsondata = json.loads(body) 72 | if (jsondata != []): 73 | IE_ID = jsondata[0]['id'] 74 | url_CRP = "http://transparencydata.com/api/1.0/entities/" 75 | url_CRP += IE_ID 76 | url_CRP += ".json?apikey=" + api_key 77 | 78 | destination = "legislators/influence_explorer/entities/%s.json" % IE_ID 79 | body = utils.download(url_CRP, destination, force, options) 80 | 81 | jsondata = json.loads(body) 82 | 83 | opensecrets_id = None 84 | fec_ids = [] 85 | for external in jsondata['external_ids']: 86 | if external["namespace"].startswith("urn:crp"): 87 | opensecrets_id = external['id'] 88 | elif external["namespace"].startswith("urn:fec"): 89 | fec_ids.append(external['id']) 90 | 91 | if opensecrets_id: 92 | m["id"]["opensecrets"] = opensecrets_id 93 | 94 | # preserve existing FEC IDs, but don't duplicate them 95 | if len(fec_ids) > 0: 96 | if m["id"].get("fec", None) is None: m["id"]["fec"] = [] 97 | for fec_id in fec_ids: 98 | if fec_id not in m["id"]["fec"]: 99 | m["id"]["fec"].append(fec_id) 100 | 101 | print("[%s] Added opensecrets ID of %s" % (bioguide, opensecrets_id)) 102 | else: 103 | print("[%s] NO DATA" % bioguide) 104 | 105 | 106 | 107 | 108 | print("Saving data to %s..." % filename) 109 | save_data(legislators, filename) 110 | 111 | if __name__ == '__main__': 112 | run() -------------------------------------------------------------------------------- /scripts/lint.py: -------------------------------------------------------------------------------- 1 | # Just loads and saves each .yaml file to normalize serialization syntax. 2 | # 3 | # python lint.py 4 | # ... will lint every .yaml file in the data directory. 5 | # 6 | # python lint.py file1.yaml file2.yaml ... 7 | # ... will lint the specified files. 8 | 9 | import glob, sys 10 | from utils import yaml_load, yaml_dump, data_dir 11 | 12 | def run(): 13 | for fn in glob.glob(data_dir() + "/*.yaml") if len(sys.argv) == 1 else sys.argv[1:]: 14 | print(fn + "...") 15 | data = yaml_load(fn, use_cache=False) 16 | yaml_dump(data, fn) 17 | 18 | if __name__ == '__main__': 19 | run() -------------------------------------------------------------------------------- /scripts/office_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Run validation tests on district office data. 
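Usage (a sketch; --skip-warnings is the only flag, defined by the
argparse setup at the bottom of this file):

    python office_validator.py [--skip-warnings]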
4 | 5 | 6 | For each legislator: 7 | has offices 8 | 9 | For each office: 10 | Required fields: id, city, state 11 | Expected fields: address, city, state, zip, phone, latitude, longitude, id 12 | Optional fields: building, fax, hours, suite 13 | Office id: check consistent 14 | offices are in legislator's state 15 | 16 | Globally: 17 | Every legislator has offices 18 | All offices belong to current legislators 19 | 20 | """ 21 | 22 | import datetime 23 | import os.path 24 | import re 25 | from collections import OrderedDict, defaultdict 26 | from itertools import count 27 | import sys 28 | 29 | try: 30 | import rtyaml as yaml 31 | except ImportError: 32 | import yaml 33 | 34 | try: 35 | from termcolor import colored 36 | except ImportError: 37 | colored = None 38 | 39 | 40 | NONALPHA = re.compile(r"\W") 41 | PHONE = re.compile(r"^\d{3}-\d{3}-\d{4}$") 42 | FIELD_ORDER = """ 43 | 44 | id 45 | address suite building 46 | city state zip 47 | latitude longitude 48 | fax hours phone 49 | 50 | """.split() 51 | 52 | 53 | def relfile(path): 54 | return os.path.abspath(os.path.join(os.path.dirname(__file__), path)) 55 | 56 | 57 | def id_offices(bioguide_id, offices): 58 | """ 59 | Generate unique office ids using a similar algorithm to 60 | https://github.com/controlshift/congress-legislators/blob/add-ids-to-offices-script/add_ids_to_offices.rb 61 | 62 | Used for validation here, but could be used to generate ids. 63 | """ 64 | id_count = defaultdict(count) 65 | for office in offices: 66 | locality = office.get('city', 'no_city').lower() 67 | locality = NONALPHA.sub('_', locality) 68 | 69 | office_id = '-'.join([bioguide_id, locality]) 70 | 71 | city_count = next(id_count[office_id]) 72 | if city_count: 73 | office_id = '-'.join([office_id, str(city_count)]) 74 | 75 | yield office_id, office 76 | 77 | 78 | def check_legislator_offices(legislator_offices, legislator): 79 | bioguide_id = legislator_offices['id']['bioguide'] 80 | offices = legislator_offices.get('offices', []) 81 | 82 | state = None 83 | if legislator: 84 | state = legislator['terms'][-1]['state'] 85 | 86 | required = ['id', 'city', 'state'] 87 | expected = ['address', 'zip', 'phone', 'latitude', 'longitude'] 88 | optional = ['building', 'suite', 'hours', 'fax'] 89 | all_fields = set(required + expected + optional) 90 | 91 | errors = [] 92 | warnings = [] 93 | 94 | if not legislator: 95 | errors.append("Offices for inactive legislator") 96 | 97 | if not offices: 98 | errors.append("Zero offices") 99 | 100 | for office_id, office in id_offices(bioguide_id, offices): 101 | 102 | for field in required: 103 | if not office.get(field): 104 | errors.append("Office %s is missing required field '%s'" % (office_id, field)) 105 | 106 | for field in expected: 107 | if not office.get(field): 108 | warnings.append("Office %s is missing field '%s'" % (office_id, field)) 109 | 110 | for field in office: 111 | if field not in all_fields: 112 | errors.append("Office %s has unrecognized field '%s'" % (office_id, field)) 113 | if not office.get(field): 114 | warnings.append("Office %s has empty field %s" % (office_id, field)) 115 | 116 | found_id = office.get('id') 117 | if found_id and office_id != found_id: 118 | errors.append("Office %s has unexpected id '%s'" % (office_id, found_id)) 119 | 120 | office_state = office.get('state') 121 | if state and office_state and office_state != state: 122 | errors.append("Office %s is in '%s', legislator is from '%s'" % (office_id, office_state, state)) 123 | 124 | office_zip = office.get('zip') 125 | if 
office_zip is not None and not isinstance(office_zip, str): 126 | errors.append("Office %s has non-string zip: %s" % (office_id, office_zip)) 127 | 128 | phone = office.get('phone') 129 | fax = office.get('fax') 130 | 131 | if phone and not PHONE.match(phone): 132 | errors.append("Office %s phone '%s' does not match format ddd-ddd-dddd" % (office_id, phone)) 133 | 134 | if fax and not PHONE.match(fax): 135 | errors.append("Office %s fax '%s' does not match format ddd-ddd-dddd" % (office_id, fax)) 136 | 137 | if (office.get('address') and 138 | not (office.get('latitude') and office.get('longitude'))): 139 | warnings.append("Office %s missing geocode" % office_id) 140 | 141 | if not office.get('address') and not office.get('phone'): 142 | errors.append("Office %s needs at least address or phone" % office_id) 143 | 144 | fields = [f for f in office if f in FIELD_ORDER] # unknown fields checked above 145 | sorted_fields = sorted(fields, key=FIELD_ORDER.index) 146 | if fields != sorted_fields: 147 | warnings.append("Office %s fields out of order, expected %s" % (office_id, sorted_fields)) 148 | 149 | return errors, warnings 150 | 151 | 152 | def load_to_dict(path): 153 | # load to an OrderedDict keyed by bioguide id 154 | d = yaml.load(open(relfile(path))) 155 | return OrderedDict((l['id']['bioguide'], l) for l in d 156 | if 'bioguide' in l['id']) 157 | 158 | 159 | def print_issues(legislator, errors, warnings): 160 | if not (errors or warnings): 161 | return 162 | 163 | if isinstance(legislator, str): 164 | info = legislator 165 | else: 166 | term = legislator['terms'][-1] 167 | info = "{} [{} {}] {} ({})".format( 168 | legislator['id']['bioguide'], term['state'], term['type'], 169 | legislator['name'].get('official_full'), term.get('url', 'no url')) 170 | 171 | print(info) 172 | 173 | for error in errors: 174 | msg = " ERROR: {}".format(error) 175 | if colored: 176 | msg = colored(msg, "red") 177 | print(msg) 178 | for warning in warnings: 179 | msg = " WARNING: {}".format(warning) 180 | if colored: 181 | msg = colored(msg, "yellow") 182 | print(msg) 183 | print("") 184 | 185 | 186 | def run(skip_warnings=False): 187 | legislators = load_to_dict("../legislators-current.yaml") 188 | legislators_offices = load_to_dict("../legislators-district-offices.yaml") 189 | 190 | has_errors = False 191 | 192 | for bioguide_id, legislator_offices in legislators_offices.items(): 193 | legislator = legislators.get(bioguide_id) 194 | 195 | errors, warnings = check_legislator_offices(legislator_offices, legislator) 196 | 197 | if skip_warnings: 198 | warnings = [] 199 | 200 | if errors: 201 | has_errors = True 202 | 203 | print_issues(legislator or bioguide_id, errors, warnings) 204 | 205 | for bioguide_id in set(legislators) - set(legislators_offices): 206 | # Only report an error for a missing office if the 207 | # legislator has been in office for at least 60 days. 
208 | start_date = legislators[bioguide_id]['terms'][-1]['start'] 209 | if datetime.date.today() - datetime.datetime.strptime(start_date, '%Y-%m-%d').date() >= datetime.timedelta(60): 210 | has_errors = True 211 | errors, warnings = ["No offices"], [] 212 | else: 213 | errors, warnings = [], ["No offices"] 214 | print_issues(legislators[bioguide_id], errors, warnings) 215 | 216 | return has_errors 217 | 218 | if __name__ == '__main__': 219 | import argparse 220 | parser = argparse.ArgumentParser() 221 | parser.add_argument("--skip-warnings", action="store_true") 222 | args = parser.parse_args() 223 | 224 | has_errors = run(skip_warnings=args.skip_warnings) 225 | sys.exit(1 if has_errors else 0) 226 | -------------------------------------------------------------------------------- /scripts/pictorial_ids.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | import json 5 | import unicodedata 6 | import utils 7 | from utils import load_data, mkdir_p, save_data, parse_date 8 | 9 | # Update legislators current pictorial ids 10 | # https://pictorialapi.gpo.gov/index.html 11 | # 12 | # options: 13 | # --cache: load from cache if present on disk (default: false) 14 | # --bioguide: load only one legislator, by their bioguide ID 15 | # --congress: do *only* updates for legislators serving in specific congress 16 | # 17 | # example: 18 | # python pictorial_ids.py --congress=118 19 | 20 | 21 | def run(): 22 | 23 | # default to not caching 24 | cache = utils.flags().get("cache", False) 25 | force = not cache 26 | 27 | only_bioguide = utils.flags().get("bioguide", None) 28 | congress = utils.flags().get("congress", None) 29 | 30 | data_files = [] 31 | print("Loading %s..." % "legislators-current.yaml") 32 | legislators = load_data("legislators-current.yaml") 33 | data_files.append((legislators, "legislators-current.yaml")) 34 | print("Loading %s..." 
% "legislators-historical.yaml") 35 | legislators = load_data("legislators-historical.yaml") 36 | data_files.append((legislators, "legislators-historical.yaml")) 37 | 38 | if congress == None: 39 | raise Exception("the --congress flag is required") 40 | elif int(congress) >= 110: 41 | # Pictorial seems to go back to 110th Congress 42 | url = f"https://pictorialapi.gpo.gov/api/GuideMember/GetMembers/{congress}" 43 | pass 44 | else: 45 | raise Exception("no data for congress " + congress) 46 | 47 | pictorial_destination = f"pictorial/source/GetMembers/{congress}.json" 48 | pictorial_data = json.loads(utils.download(url, pictorial_destination, force)) 49 | 50 | # Filter out non-legislators and the vacant placeholders 51 | pictorial_members = [ 52 | member 53 | for member in pictorial_data["memberCollection"] 54 | if member["memberType"] in ("Senator", "Representative", "Delegate") 55 | and member["name"] != "Vacant, Vacant" 56 | ] 57 | 58 | error_filename = f"cache/errors/pictorial/mismatch_{congress}.csv" 59 | mkdir_p("cache/errors/pictorial") 60 | error_log = csv.writer(open(error_filename, "w")) 61 | error_log.writerow( 62 | [ 63 | "message", 64 | "bioguide_id", 65 | "name_first", 66 | "name_last", 67 | ] 68 | ) 69 | error_count = 0 70 | 71 | print("Running for congress " + congress) 72 | for legislators, filename in data_files: 73 | for legislator in legislators: 74 | # this can't run unless we've already collected a bioguide for this person 75 | bioguide = legislator["id"].get("bioguide", None) 76 | # if we've limited this to just one bioguide, skip over everyone else 77 | if only_bioguide and (bioguide != only_bioguide): 78 | continue 79 | 80 | # only run for selected congress 81 | latest_term = legislator["terms"][-1] 82 | latest_congress = utils.congress_from_legislative_year( 83 | utils.legislative_year(parse_date(latest_term["start"])) 84 | ) 85 | if int(congress) != latest_congress: 86 | continue 87 | 88 | # skip if we already have it 89 | if legislator["id"].get("pictorial"): 90 | continue 91 | try: 92 | pictorial_id = match_pictorial_id(legislator, pictorial_members) 93 | legislator["id"]["pictorial"] = pictorial_id 94 | except ValueError as e: 95 | error_count += 1 96 | error_log.writerow( 97 | [ 98 | e, 99 | bioguide, 100 | legislator["name"]["first"], 101 | legislator["name"]["last"], 102 | ] 103 | ) 104 | 105 | save_data(legislators, filename) 106 | 107 | if error_count: 108 | print(f"{error_count} error details written to {error_filename}") 109 | 110 | 111 | def to_ascii(s): 112 | return unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("ASCII") 113 | 114 | 115 | def reverse_name(name): 116 | """ 117 | Given a name in "Last, First" format, return "First Last" 118 | """ 119 | return " ".join(name.split(", ")[::-1]) 120 | 121 | 122 | def match_pictorial_id(legislator, pictorial_members): 123 | """ 124 | Attempt to find the corresponding pictorial id for the given member. 125 | 126 | There are many odd cases -- see tests/test_gpo_member_photos.py for 127 | examples. 
128 | """ 129 | name = legislator["name"]["official_full"] 130 | 131 | # Map common nicknames (and GPO typos) from legislators to pictorial 132 | common_nicknames = { 133 | "Nick": "Nicolas", 134 | "Daniel": "Dan", 135 | "Mike": "Michael", 136 | "Michael": "Mike", 137 | "Richard": "Rich", 138 | "Christopher": "Chris", 139 | "JOhn": "John", 140 | } 141 | 142 | matches = [] 143 | for member_pictorial in pictorial_members: 144 | # First check whether the name matches 145 | name_matches = False 146 | legislator_name_last = to_ascii(legislator["name"]["last"].replace(" ", "")) 147 | legislator_name_first = to_ascii(legislator["name"]["first"].replace(" ", "")) 148 | 149 | if legislator_name_last == member_pictorial["lastName"]: 150 | if legislator_name_first == member_pictorial["firstName"] or ( 151 | "nickname" in legislator["name"] 152 | and legislator["name"]["nickname"] == member_pictorial["firstName"] 153 | ): 154 | name_matches = True 155 | # Sometimes the nickname is encoded in the first name 156 | elif member_pictorial["firstName"] in legislator_name_first: 157 | name_matches = True 158 | # Sometimes the nickname is encoded in the middle name 159 | elif ( 160 | "middle" in legislator["name"] 161 | and member_pictorial["firstName"] in legislator["name"]["middle"] 162 | ): 163 | name_matches = True 164 | # Sometimes the nickname is not encoded 165 | elif ( 166 | member_pictorial["firstName"] in common_nicknames 167 | and common_nicknames[member_pictorial["firstName"]] 168 | == legislator_name_first 169 | ): 170 | name_matches = True 171 | 172 | # Sometimes matching the official full name is best 173 | if legislator["name"]["official_full"] == reverse_name( 174 | member_pictorial["name"] 175 | ): 176 | name_matches = True 177 | 178 | # The GPO has some first and last names swapped, so check those too 179 | if not name_matches and legislator_name_first == member_pictorial["lastName"]: 180 | if legislator_name_last == member_pictorial["firstName"] or ( 181 | "nickname" in legislator["name"] 182 | and legislator["name"]["nickname"] == member_pictorial["firstName"] 183 | ): 184 | name_matches = True 185 | 186 | # If the name matches, check the office and state 187 | # Note: Assumes we're matching against most recent term 188 | if name_matches: 189 | most_recent_term = legislator["terms"][-1] 190 | mType = "sen" if member_pictorial["memberType"] == "Senator" else "rep" 191 | if ( 192 | most_recent_term["state"] == member_pictorial["stateId"] 193 | and most_recent_term["type"] == mType 194 | ): 195 | matches.append(member_pictorial) 196 | 197 | if len(matches) == 1: 198 | return matches[0]["memberId"] 199 | else: 200 | if len(matches): 201 | raise ValueError(f"Multiple pictorial id matches found for {name}") 202 | else: 203 | raise ValueError(f"No pictorial id match found for {name}") 204 | 205 | 206 | if __name__ == "__main__": 207 | run() 208 | -------------------------------------------------------------------------------- /scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | rtyaml 3 | scrapelib==0.10.1 4 | ipython 5 | lxml>=2.2 6 | cssselect 7 | pyflakes 8 | pytz 9 | tweepy 10 | sparqlwrapper 11 | -------------------------------------------------------------------------------- /scripts/retire.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Retire a Member of Congress. 
4 | # Member's most recent term and moves him/her from the
5 | # current file to the historical file.
6 | #
7 | # python retire.py bioguideID termEndDate
8 |
9 | import sys
10 | import utils
11 | import rtyaml
12 |
13 | def run():
14 | if len(sys.argv) != 3:
15 | print("Usage:")
16 | print("python retire.py bioguideID termEndDate")
17 | sys.exit()
18 |
19 | try:
20 | utils.parse_date(sys.argv[2])
21 | except Exception:
22 | print("Invalid date: ", sys.argv[2])
23 | sys.exit()
24 |
25 | print("Loading current YAML...")
26 | y = utils.load_data("legislators-current.yaml")
27 | print("Loading historical YAML...")
28 | y1 = utils.load_data("legislators-historical.yaml")
29 |
30 | for moc in y:
31 | if moc["id"].get("bioguide", None) != sys.argv[1]: continue
32 |
33 | print("Updating:")
34 | rtyaml.pprint(moc["id"])
35 | print()
36 | rtyaml.pprint(moc["name"])
37 | print()
38 | rtyaml.pprint(moc["terms"][-1])
39 |
40 | moc["terms"][-1]["end"] = sys.argv[2]
41 |
42 | y.remove(moc)
43 | y1.append(moc)
44 |
45 | break
46 |
47 | print("Saving changes...")
48 | utils.save_data(y, "legislators-current.yaml")
49 | utils.save_data(y1, "legislators-historical.yaml")
50 |
51 | if __name__ == '__main__':
52 | run()
--------------------------------------------------------------------------------
/scripts/run_script_to_branch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # run_script_to_branch
3 | # --------------------
4 | # Creates a branch, executes a script, and optionally creates a pull request
5 | # using GitHub's hub tool (http://hub.github.com/).
6 | #
7 | # Usage:
8 | #
9 | # ./run_script_to_branch [-push] script_name.py
10 | #
11 | # Example:
12 | #
13 | # ./run_script_to_branch -push senate_contacts.py
14 |
15 |
16 | # Command-line options.
17 | PUSH=0
18 | if [[ "$1" = "-push" ]]; then
19 | if ! which hub>/dev/null; then
20 | echo "Install 'hub' from hub.github.com to automatically create a pull request."
21 | fi
22 | PUSH=1
23 | shift;
24 | fi
25 |
26 | # Check that we have an argument for which script to run.
27 | if [ -z "$1" ]; then
28 | echo "usage: $0 script_name.py";
29 | exit;
30 | fi
31 |
32 | # Check that there are no unstaged changes.
33 | # see http://stackoverflow.com/questions/5139290/how-to-check-if-theres-nothing-to-be-committed-in-the-current-branch
34 | if ! git diff-files --quiet --ignore-submodules; then
35 | echo "Cannot run this now: You have unstaged changes."
36 | exit;
37 | fi
38 |
39 | # Create a branch with the name of the script, the date, and a random string to prevent accidental collisions.
40 | BRANCH_NAME=$1_`date +%Y%m%d`_$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 6 | head -n 1)
41 | git fetch
42 | git checkout -b $BRANCH_NAME main
43 |
44 | # Run the script.
45 | echo Running $@...
46 | ./$@
47 |
48 | # If there were no changes, return to the main branch and delete our temporary branch.
49 | if git diff-files --quiet --ignore-submodules; then
50 | echo No changes.;
51 | git checkout main;
52 | git branch -d $BRANCH_NAME;
53 | exit;
54 | fi
55 |
56 | # Commit to the branch.
57 | CMD=$@
58 | git commit -am "running $CMD at `date "+%FT%T"`"
59 |
60 | if [ $PUSH -gt 0 ]; then
61 | # Push to github.
62 | if git push -u origin $BRANCH_NAME; then
63 | if hub pull-request -m "[auto] $CMD run at `date "+%FT%T"`"; then
64 | # Success, so we can delete our local copy. Use -D to force delete
65 | # even though it's not merged.
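# (The branch lives on at GitHub to back the open pull request;
# only the local copy is removed.)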
66 | git checkout main;
67 | git branch -D $BRANCH_NAME;
68 | fi
69 | fi
70 | fi
71 |
--------------------------------------------------------------------------------
/scripts/senate_contacts.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Update current senators' websites and addresses from www.senate.gov.
4 |
5 | import lxml.etree, io
6 | import string, re
7 | from datetime import datetime
8 | import utils
9 | from utils import download, load_data, save_data, parse_date
10 | import urllib.request
11 |
12 | def run():
13 |
14 | today = datetime.now().date()
15 |
16 | # default to not caching
17 | cache = utils.flags().get('cache', False)
18 | force = not cache
19 |
20 | y = load_data("legislators-current.yaml")
21 |
22 | # Map bioguide IDs to dicts. Reference the same dicts
23 | # in y so we are updating y when we update bioguide.
24 | bioguide = { }
25 | by_name = { }
26 | for m in y:
27 | if "bioguide" in m["id"]:
28 | bioguide[m["id"]["bioguide"]] = m
29 | party = m["terms"][-1]["party"][0]
30 | state = m["terms"][-1]["state"]
31 | last_name = m["name"]["last"]
32 | member_full = "%s (%s-%s)" % (last_name, party, state)
33 | by_name[member_full] = m
34 |
35 |
36 | print("Fetching general Senate information from senators_cfm.xml...")
37 |
38 | url = "https://www.senate.gov/general/contact_information/senators_cfm.xml"
39 | body = download(url, "legislators/senate.xml", force, { "binary": True })
40 | dom = lxml.etree.parse(io.BytesIO(body)) # file has an <?xml declaration and is in utf-8
41 |
42 | for node in dom.xpath("member"):
43 | bioguide_id = str(node.xpath("string(bioguide_id)")).strip()
44 | member_full = str(node.xpath("string(member_full)"))
45 |
46 | if bioguide_id == "":
47 | print("Someone has an empty bioguide ID!")
48 | print(lxml.etree.tostring(node))
49 | continue
50 |
51 | print("[%s] Processing Senator %s..." % (bioguide_id, member_full))
52 |
53 | # find member record in our YAML, either by bioguide_id or member_full
54 | if bioguide_id in bioguide:
55 | member = bioguide[bioguide_id]
56 | elif member_full in by_name:
57 | member = by_name[member_full]
58 | else:
59 | print("Bioguide ID '%s' and member name '%s' not recognized." % (bioguide_id, member_full))
60 | continue
61 |
62 | try:
63 | term = member["terms"][-1]
64 | except IndexError:
65 | print("Member has no terms", bioguide_id, member_full)
66 | continue
67 |
68 | if today < parse_date(term["start"]) or today > parse_date(term["end"]):
69 | print("Member's last listed term is not current", bioguide_id, member_full, term["start"])
70 | continue
71 |
72 | if term["type"] != "sen":
73 | print("Member's last listed term is not a Senate term", bioguide_id, member_full)
74 | continue
75 |
76 |
77 | if term["state"] != str(node.xpath("string(state)")):
78 | print("Member's last listed term has the wrong state", bioguide_id, member_full)
79 | continue
80 |
81 | if "district" in term: del term["district"]
82 |
83 | full_name = str(node.xpath("string(first_name)"))
84 | suffix = None
85 | if ", " in full_name: full_name, suffix = full_name.split(", ")
86 | full_name += " " + str(node.xpath("string(last_name)"))
87 | if suffix: full_name += ", " + suffix
88 | member["name"]["official_full"] = re.sub("'", "’", full_name)
89 |
90 | member["id"]["bioguide"] = bioguide_id
91 |
92 | term["class"] = { "Class I": 1, "Class II": 2, "Class III": 3}[ node.xpath("string(class)") ]
93 | term["party"] = { "D": "Democrat", "R": "Republican", "I": "Independent", "ID": "Independent"}[ node.xpath("string(party)") ]
94 |
95 | url = str(node.xpath("string(website)")).strip()
96 | if not url.startswith("/"):
97 | # temporary home pages for new senators are relative links?
98 |
99 | # hit the URL to resolve any redirects to get the canonical URL,
100 | # since the listing sometimes gives URLs that redirect.
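# (If the request fails, the except clause below just logs the error
# and the unresolved URL is kept as-is.)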
101 | try:
102 | req = urllib.request.Request(url)
103 | req.add_header("User-Agent", "https://github.com/unitedstates/congress-legislators")
104 | resp = urllib.request.urlopen(req)
105 | url = resp.geturl()
106 | except Exception as e:
107 | print(url, e)
108 |
109 | # kill trailing slash
110 | url = re.sub("/$", "", url)
111 |
112 | term["url"] = url
113 |
114 | # Contact forms are sometimes listed as the base URL; ignore them in that case.
115 | contact_form = str(node.xpath("string(email)")).strip()
116 | if contact_form and contact_form.rstrip("/") != term['url']:
117 | term['contact_form'] = contact_form
118 |
119 | term["address"] = str(node.xpath("string(address)")).strip().replace("\n ", " ")
120 | term["office"] = string.capwords(term["address"].upper().split(" WASHINGTON ")[0])
121 |
122 | phone = str(node.xpath("string(phone)")).strip()
123 | term["phone"] = phone.replace("(", "").replace(")", "").replace(" ", "-")
124 |
125 |
126 |
127 | print("\n\nUpdating Senate stateRank and LIS ID from cvc_member_data.xml...")
128 |
129 | url = "https://www.senate.gov/legislative/LIS_MEMBER/cvc_member_data.xml"
130 | body = download(url, "legislators/senate_cvc.xml", force)
131 | dom = lxml.etree.parse(io.StringIO(body))
132 | for node in dom.getroot():
133 | if node.tag == "lastUpdate":
134 | date, time = node.getchildren()
135 | print("Last updated: %s, %s" % (date.text, time.text))
136 | continue
137 |
138 | bioguide_id = str(node.xpath("string(bioguideId)")).strip()
139 | if bioguide_id == "":
140 | print("Someone has an empty bioguide ID!")
141 | print(lxml.etree.tostring(node))
142 | continue
143 |
144 | last_name = node.xpath("string(name/last)")
145 | party = node.xpath("string(party)")
146 | state = node.xpath("string(state)")
147 | member_full = "%s (%s-%s)" % (last_name, party, state)
148 |
149 | print("[%s] Processing Senator %s..." % (bioguide_id, member_full))
150 |
151 | # find member record in our YAML, either by bioguide_id or member_full
152 | if bioguide_id in bioguide:
153 | member = bioguide[bioguide_id]
154 | else:
155 | if member_full in by_name:
156 | member = by_name[member_full]
157 | else:
158 | print("Bioguide ID '%s' and synthesized official name '%s' not recognized."
% (bioguide_id, member_full)) 159 | continue # exit(0) 160 | 161 | try: 162 | term = member["terms"][-1] 163 | except IndexError: 164 | print("Member has no terms", bioguide_id, member_full) 165 | continue 166 | 167 | if "id" not in member: 168 | member["id"] = {} 169 | 170 | member["id"]["lis"] = node.attrib["lis_member_id"] 171 | state_rank = node.xpath("string(stateRank)") 172 | if state_rank == '1': 173 | term["state_rank"] = "senior" 174 | elif state_rank == '2': 175 | term["state_rank"] = "junior" 176 | 177 | 178 | print("Saving data...") 179 | save_data(y, "legislators-current.yaml") 180 | 181 | if __name__ == '__main__': 182 | run() 183 | -------------------------------------------------------------------------------- /scripts/social/twitter.py: -------------------------------------------------------------------------------- 1 | # Helpful functions for accessing Twitter 2 | import tweepy 3 | TWITTER_PROFILE_BATCH_SIZE = 100 4 | from math import ceil 5 | 6 | def get_api(access_token, access_token_secret, consumer_key, consumer_secret): 7 | """ 8 | Takes care of the Twitter OAuth authentication process and 9 | creates an API-handler to execute commands on Twitter 10 | 11 | Arguments: string values 12 | 13 | Returns: 14 | A tweepy.api.API object 15 | """ 16 | # Get authentication token 17 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 18 | auth.set_access_token(access_token, access_token_secret) 19 | # create an API handler 20 | return tweepy.API(auth) 21 | 22 | def fetch_profiles(api, screen_names = [], ids = []): 23 | """ 24 | A wrapper method around tweepy.API.lookup_users that handles the batch lookup of 25 | screen_names. Assuming number of screen_names < 10000, this should not typically 26 | run afoul of API limits (i.e. it's a good enough hack for now) 27 | 28 | `api` is a tweepy.API handle 29 | `screen_names` is a list of twitter screen names 30 | 31 | Returns: a list of dicts representing Twitter profiles 32 | """ 33 | profiles = [] 34 | key, lookups = ['user_ids', ids] if ids else ['screen_names', screen_names] 35 | for batch_idx in range(ceil(len(lookups) / TWITTER_PROFILE_BATCH_SIZE)): 36 | offset = batch_idx * TWITTER_PROFILE_BATCH_SIZE 37 | # break lookups list into batches of TWITTER_PROFILE_BATCH_SIZE 38 | batch = lookups[offset:(offset + TWITTER_PROFILE_BATCH_SIZE)] 39 | try: 40 | for user in api.lookup_users(**{key: batch}): 41 | profiles.append(user._json) 42 | # catch situation in which none of the names in the batch are found 43 | # or else Tweepy will error out 44 | except tweepy.error.TweepError as e: 45 | if e.response.status_code == 404: 46 | pass 47 | else: # some other error, raise the exception 48 | raise e 49 | return profiles 50 | -------------------------------------------------------------------------------- /scripts/sweep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from utils import load_data, save_data 4 | 5 | def run(): 6 | # load in members, orient by bioguide ID 7 | print("Loading current legislators...") 8 | current = load_data("legislators-current.yaml") 9 | 10 | current_bioguide = { } 11 | for m in current: 12 | if "bioguide" in m["id"]: 13 | current_bioguide[m["id"]["bioguide"]] = m 14 | 15 | # remove out-of-office people from current committee membership 16 | print("Sweeping committee membership...") 17 | membership_current = load_data("committee-membership-current.yaml") 18 | for committee_id in list(membership_current.keys()): 19 | for member in 
list(membership_current[committee_id]): # iterate over a copy so .remove() below doesn't skip entries
20 | if member["bioguide"] not in current_bioguide:
21 | print("\t[%s] Ding ding ding! (%s)" % (member["bioguide"], member["name"]))
22 | membership_current[committee_id].remove(member)
23 | save_data(membership_current, "committee-membership-current.yaml")
24 |
25 | # remove out-of-office people from social media info
26 | print("Sweeping social media accounts...")
27 | socialmedia_current = load_data("legislators-social-media.yaml")
28 | for member in list(socialmedia_current):
29 | if member["id"]["bioguide"] not in current_bioguide:
30 | print("\t[%s] Ding ding ding! (%s)" % (member["id"]["bioguide"], member["social"]))
31 | socialmedia_current.remove(member)
32 | save_data(socialmedia_current, "legislators-social-media.yaml")
33 |
34 | # remove out-of-office people from district offices
35 | print("Sweeping district offices...")
36 | district_offices = load_data("legislators-district-offices.yaml")
37 | for member in list(district_offices):
38 | if member["id"]["bioguide"] not in current_bioguide:
39 | print("\t[%s] Ding ding ding! (%s)" % (member["id"]["bioguide"], member["offices"]))
40 | district_offices.remove(member)
41 | save_data(district_offices, "legislators-district-offices.yaml")
42 |
43 | if __name__ == '__main__':
44 | run()
45 |
--------------------------------------------------------------------------------
/scripts/thomas_ids.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Update current THOMAS IDs using beta.congress.gov. Congressmen's
4 | # IDs are updated directly. For Senators, we just print out new
5 | # IDs because name matching is hard.
6 |
7 | import lxml.html, io, urllib.request, urllib.parse, urllib.error
8 | import re
9 | import utils
10 | from utils import download, load_data, save_data
11 |
12 | def run():
13 | CONGRESS_ID = "113th Congress (2013-2014)" # the query string parameter
14 |
15 | # constants
16 | state_names = {"Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ", "Arkansas": "AR", "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE", "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR", "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT", "Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY"}
17 |
18 | # default to not caching
19 | cache = utils.flags().get('cache', False)
20 | force = not cache
21 |
22 | # load in current members
23 | y = load_data("legislators-current.yaml")
24 | by_district = { }
25 | existing_senator_ids = set()
26 | for m in y:
27 | last_term = m['terms'][-1]
28 | if last_term['type'] == 'rep':
29 | full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
30 | by_district[full_district] = m
31 | elif last_term['type'] == 'sen':
if "thomas" in m["id"]: 33 | existing_senator_ids.add(m["id"]["thomas"]) 34 | 35 | seen_ids = set() 36 | for chamber in ("House of Representatives", "Senate"): 37 | url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % ( 38 | urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber)) 39 | cache = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber) 40 | try: 41 | body = download(url, cache, force) 42 | dom = lxml.html.parse(io.StringIO(body)).getroot() 43 | except lxml.etree.XMLSyntaxError: 44 | print("Error parsing: ", url) 45 | continue 46 | 47 | for node in dom.xpath("//ul[@class='results_list']/li"): 48 | thomas_id = "%05d" % int(re.search("/member/.*/(\d+)$", node.xpath('h2/a')[0].get('href')).group(1)) 49 | 50 | # THOMAS misassigned these 'new' IDs to existing individuals. 51 | if thomas_id in ('02139', '02132'): 52 | continue 53 | 54 | name = node.xpath('h2/a')[0].text 55 | 56 | state = node.xpath('div[@class="memberProfile"]/table/tbody/tr[1]/td')[0].text.strip() 57 | state = state_names[state] 58 | 59 | if chamber == "House of Representatives": 60 | # There's enough information to easily pick out which Member this refers to, so write it 61 | # directly to the file. 62 | district = node.xpath('div[@class="memberProfile"]/table/tbody/tr[2]/td')[0].text.strip() 63 | if district == "At Large": district = 0 64 | district = "%02d" % int(district) 65 | 66 | if state + district not in by_district: 67 | print(state + district + "'s", name, "appears on Congress.gov but the office is vacant in our data.") 68 | continue 69 | 70 | if state + district in seen_ids: 71 | print("Congress.gov lists two people for %s%s!" % (state, district)) 72 | seen_ids.add(state+district) 73 | 74 | by_district[state + district]["id"]["thomas"] = thomas_id 75 | 76 | elif chamber == "Senate": 77 | # For senators we'd have to match on name or something else, so that's too difficult. 78 | # Just look for new IDs. 79 | if thomas_id not in existing_senator_ids: 80 | print("Please manually set", thomas_id, "for", name, "from", state) 81 | 82 | save_data(y, "legislators-current.yaml") 83 | 84 | if __name__ == '__main__': 85 | run() -------------------------------------------------------------------------------- /scripts/untire.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # "Un-retire" a Member of Congress: Move a Member of Congress 4 | # from the legislators-historical file to the legislators-current file 5 | # and give the Member a new term. 
6 | #
7 | # python untire.py bioguideID
8 |
9 | import sys
10 | import rtyaml
11 | import utils
12 | from collections import OrderedDict
13 |
14 | def run():
15 |
16 | if len(sys.argv) != 2:
17 | print("Usage:")
18 | print("python untire.py bioguideID")
19 | sys.exit()
20 |
21 | print("Loading current YAML...")
22 | y = utils.load_data("legislators-current.yaml")
23 | print("Loading historical YAML...")
24 | y1 = utils.load_data("legislators-historical.yaml")
25 |
26 | for moc in y1:
27 | if moc["id"].get("bioguide", None) != sys.argv[1]: continue
28 |
29 | print("Updating:")
30 | rtyaml.pprint(moc["id"])
31 | print()
32 | rtyaml.pprint(moc["name"])
33 |
34 | moc["terms"].append(OrderedDict([
35 | ("type", moc["terms"][-1]["type"]),
36 | ("start", None),
37 | ("end", None),
38 | ("state", moc["terms"][-1]["state"]),
39 | ("party", moc["terms"][-1]["party"]),
40 | ]))
41 |
42 | y1.remove(moc)
43 | y.append(moc)
44 |
45 | break
46 |
47 | print("Saving changes...")
48 | utils.save_data(y, "legislators-current.yaml")
49 | utils.save_data(y1, "legislators-historical.yaml")
50 |
51 | if __name__ == '__main__':
52 | run()
--------------------------------------------------------------------------------
/scripts/update_gh_pages.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | # The source branch to publish from.
5 | SRC_BRANCH=main
6 |
7 | # Switch to gh-pages branch.
8 | git checkout gh-pages
9 |
10 | # Get the YAML and the scripts we need to generate CSV and JSON
11 | # from the source branch.
12 | git fetch origin $SRC_BRANCH
13 | HASH=$(git rev-parse FETCH_HEAD)
14 | echo "Getting latest files from $SRC_BRANCH @ $HASH."
15 | git checkout FETCH_HEAD "*.yaml" scripts/alternate_bulk_formats.py scripts/utils.py
16 |
17 | # Generate CSV and JSON.
18 | (cd scripts/; python3 alternate_bulk_formats.py;)
19 |
20 | # Commit the YAML, CSV, and JSON.
21 | # (Don't commit the other scripts files we checked out from
22 | # the source branch, which git has unhelpfully put in the
23 | # index.)
24 | export GIT_AUTHOR_NAME="the unitedstates project (CircleCI)"
25 | export GIT_AUTHOR_EMAIL=circleci@theunitedstates.io
26 | export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME"
27 | export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL"
28 | (
29 | git add *.yaml *.csv *.json \
30 | && git commit -m "update to $SRC_BRANCH @ $HASH by CircleCI" \
31 | *.yaml *.csv *.json \
32 | && git push
33 | ) || /bin/true # if there's nothing to commit, don't exit with error status
34 |
35 | # Switch back to the original branch.
36 | git checkout -f $SRC_BRANCH
37 |
--------------------------------------------------------------------------------
/scripts/wikidata_update.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import re
4 | import urllib.request
5 | import json
6 | from urllib.parse import quote, unquote
7 | from utils import load_data, save_data
8 | from SPARQLWrapper import SPARQLWrapper, JSON
9 |
10 | def get_wikidata_ids(legislators):
11 | # Look up wikidata IDs for legislators with English Wikipedia IDs.
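# Uses the MediaWiki API (action=query&prop=pageprops) to read the
# "wikibase_item" page property, which holds the Wikidata Q-number.
# Note that a page with no linked Wikidata item will raise a KeyError
# below.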
12 | for p in legislators:
13 | if not p["id"].get("wikidata") and p["id"].get("wikipedia"):
14 | w = quote(p["id"]["wikipedia"].replace(" ", "_"))
15 | query_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles={w}&format=json"
16 | response = json.load(urllib.request.urlopen(query_url))
17 | wikidata_id = list(response["query"]["pages"].values())[0]["pageprops"]["wikibase_item"]
18 | p["id"]["wikidata"] = wikidata_id
19 |
20 |
21 | def get_ids_from_wikidata(legislators):
22 | # Query to fetch information for entities that have a bioguide ID.
23 | # Selecting on bioguide ID efficiently gets wikidata entries that
24 | # we are interested in.
25 |
26 | table = run_query("""
27 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
28 | PREFIX schema: <http://schema.org/>
29 |
30 | SELECT ?subject ?bioguide ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
31 | WHERE {
32 | ?subject wdt:P1157 ?bioguide .
33 | OPTIONAL {
34 | ?subject wdt:P2671 ?google_entity_id
35 | }
36 | OPTIONAL {
37 | ?subject wdt:P2686 ?opensecrets
38 | }
39 | OPTIONAL {
40 | ?subject wdt:P3344 ?votesmart
41 | }
42 | OPTIONAL {
43 | ?subject wdt:P2390 ?ballotpedia
44 | }
45 | OPTIONAL {
46 | ?wikipedia schema:about ?subject .
47 | ?wikipedia schema:inLanguage "en" .
48 | ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
49 | }
50 | }
51 | """)
52 |
53 | # make a mapping from bioguide ID to query result
54 | mapping = { r["bioguide"]: r for r in table }
55 |
56 | # update legislators
57 | for p in legislators:
58 | if p["id"].get("bioguide") in mapping:
59 | p["id"].update(mapping[p["id"]["bioguide"]])
60 |
61 |
62 | def get_ids_from_wikidata_without_bioguide(legislators):
63 | # The SPARQL server doesn't seem to support VALUES or FILTER(?subject IN (...)),
64 | # so in order to fill in values for legislators without bioguide IDs but with
65 | # wikidata IDs, we can just query them one by one. This is probably only useful
66 | # at the start of a new Congress when bioguide IDs are not yet available.
67 | for p in legislators:
68 | if "bioguide" in p["id"] or "wikidata" not in p["id"]: continue
69 |
70 | table = run_query("""
71 | PREFIX wd: <http://www.wikidata.org/entity/>
72 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
73 | PREFIX schema: <http://schema.org/>
74 |
75 | SELECT ?wikipedia ?google_entity_id ?opensecrets ?votesmart ?ballotpedia
76 | WHERE {
77 | OPTIONAL {
78 | ?subject wdt:P2671 ?google_entity_id
79 | }
80 | OPTIONAL {
81 | ?subject wdt:P2686 ?opensecrets
82 | }
83 | OPTIONAL {
84 | ?subject wdt:P3344 ?votesmart
85 | }
86 | OPTIONAL {
87 | ?subject wdt:P2390 ?ballotpedia
88 | }
89 | OPTIONAL {
90 | ?wikipedia schema:about ?subject .
91 | ?wikipedia schema:inLanguage "en" .
92 | ?wikipedia schema:isPartOf <https://en.wikipedia.org/> .
93 | }
94 | }
95 | """.replace("?subject", "wd:" + p["id"]["wikidata"]))
96 |
97 |
98 | p["id"].update(table[0])
99 |
100 |
101 | def run_query(query):
102 | sparql_endpoint = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
103 | s = SPARQLWrapper(sparql_endpoint)
104 |
105 | # run the query
106 | s.setQuery(query)
107 | s.setReturnFormat(JSON)
108 | results = s.query().convert()
109 |
110 | for row in results['results']['bindings']:
111 | if "subject" in row:
112 | # replace the ?subject variable with the wikidata id
113 | row['wikidata'] = { "value": re.search(r'/(Q\d+)', row['subject']['value']).group(1) }
114 | del row["subject"]
115 |
116 | # clean up the google entity id
117 | if 'google_entity_id' in row:
118 | row['google_entity_id']["value"] = 'kg:' + row['google_entity_id']["value"]
119 |
120 | # clean up the wikipedia and ballotpedia results
121 | if "wikipedia" in row:
122 | row["wikipedia"]["value"] = \
123 | unquote(row["wikipedia"]["value"])\
124 | .replace("https://en.wikipedia.org/wiki/", "")\
125 | .strip().replace('_',' ')
126 | if "ballotpedia" in row:
127 | row["ballotpedia"]["value"] = row["ballotpedia"]["value"].strip().replace('_',' ')
128 |
129 | # clean up the votesmart id
130 | if "votesmart" in row:
131 | try:
132 | row["votesmart"]["value"] = int(row["votesmart"]["value"])
133 | except ValueError:
134 | print("invalid value", row["votesmart"]["value"])
135 | continue
136 | # return a simple list of dicts of results
137 | return [
138 | {
139 | k: row[k]['value']
140 | for k in row
141 | }
142 | for row in results['results']['bindings']
143 | ]
144 |
145 |
146 | def run():
147 | p1 = load_data("legislators-current.yaml")
148 | p2 = load_data("legislators-historical.yaml")
149 | get_wikidata_ids(p1+p2)
150 | get_ids_from_wikidata(p1+p2)
151 | get_ids_from_wikidata_without_bioguide(p1+p2)
152 | save_data(p1, "legislators-current.yaml")
153 | save_data(p2, "legislators-historical.yaml")
154 |
155 | if __name__ == '__main__':
156 | run()
157 |
158 |
159 |
--------------------------------------------------------------------------------
/scripts/wikipedia_ids.py:
--------------------------------------------------------------------------------
1 | # Scans Wikipedia for pages using the CongBio and CongLinks
2 | # templates, which have Bioguide IDs. Updates the 'wikipedia'
3 | # ID field for matching Members of Congress, and for pages
4 | # using the CongLinks template also updates a variety of
5 | # other IDs as found in the template.
6 |
7 | import lxml.etree, re, urllib.request, urllib.parse, urllib.error
8 | import utils, os.path
9 |
10 | def run():
11 |
12 | # Field mapping. And which fields should be turned into integers.
13 | # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
14 | fieldmap = {
15 | "congbio": "bioguide",
16 | #"fec": "fec", # handled specially...
17 | "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
18 | "opensecrets": "opensecrets",
19 | "votesmart": "votesmart",
20 | "cspan": "cspan",
21 | }
22 | int_fields = ("govtrack", "votesmart", "cspan")
23 |
24 | # default to not caching
25 | cache = utils.flags().get('cache', False)
26 |
27 | # Load legislator files and map bioguide IDs.
28 | y1 = utils.load_data("legislators-current.yaml")
29 | y2 = utils.load_data("legislators-historical.yaml")
30 | bioguides = { }
31 | for y in y1+y2:
32 | bioguides[y["id"]["bioguide"]] = y
33 |
34 | # Okay now the Wikipedia stuff...
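# get_matching_pages() below walks the MediaWiki embeddedin listing
# (action=query&list=embeddedin) in batches of 500 pages per template,
# following the eicontinue token until the API stops returning one.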
35 | 36 | def get_matching_pages(): 37 | # Does a Wikipedia API search for pages containing either of the 38 | # two templates. Returns the pages. 39 | 40 | page_titles = set() 41 | 42 | for template in ("CongLinks", "CongBio"): 43 | eicontinue = "" 44 | while True: 45 | # construct query URL, using the "eicontinue" of the last query to get the next batch 46 | url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template 47 | if eicontinue: url += "&eicontinue=" + eicontinue 48 | 49 | # load the XML 50 | print("Getting %s pages (%d...)" % (template, len(page_titles))) 51 | dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably 52 | 53 | for pgname in dom.xpath("query/embeddedin/ei/@title"): 54 | page_titles.add(pgname) 55 | 56 | # get the next eicontinue value and loop 57 | eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)") 58 | if not eicontinue: break 59 | 60 | return page_titles 61 | 62 | # Get the list of Wikipedia pages that use any of the templates we care about. 63 | page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles") 64 | if cache and os.path.exists(page_list_cache_file): 65 | # Load from cache. 66 | matching_pages = open(page_list_cache_file).read().split("\n") 67 | else: 68 | # Query Wikipedia API and save to cache. 69 | matching_pages = get_matching_pages() 70 | utils.write(("\n".join(matching_pages)), page_list_cache_file) 71 | 72 | # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon). 73 | matching_pages = [p for p in matching_pages if ":" not in p] 74 | 75 | # Load each page's content and parse the template. 76 | for p in sorted(matching_pages): 77 | if " campaign" in p: continue 78 | if " (surname)" in p: continue 79 | if "career of " in p: continue 80 | if "for Congress" in p: continue 81 | if p.startswith("List of "): continue 82 | if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue 83 | 84 | # Query the Wikipedia API to get the raw page content in XML, 85 | # and then use XPath to get the raw page text. 86 | url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap" 87 | cache_path = "legislators/wikipedia/pages/" + p 88 | dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache)) 89 | page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={ "mw": "http://www.mediawiki.org/xml/export-0.8/" }) 90 | 91 | # Build a dict for the IDs that we want to insert into our files. 92 | new_ids = { 93 | "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores) 94 | } 95 | 96 | if "CongLinks" in page_content: 97 | # Parse the key/val pairs in the template. 98 | m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content) 99 | if not m: continue # no template? 
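# The template body is pipe-separated key=value pairs, e.g.
# (illustrative values only):
#   {{CongLinks | congbio=X000000 | govtrack=412345 | opensecrets=N00000000}}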
100 | for arg in m.group(1).split("|"):
101 | if "=" not in arg: continue
102 | key, val = arg.split("=", 1)
103 | key = key.strip()
104 | val = val.strip()
105 | if val and key in fieldmap:
106 | try:
107 | if fieldmap[key] in int_fields: val = int(val)
108 | except ValueError:
109 | print("invalid value", key, val)
110 | continue
111 |
112 | if key == "opensecrets": val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
113 | new_ids[fieldmap[key]] = val
114 |
115 | if "bioguide" not in new_ids: continue
116 | new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
117 | bioguide = new_ids["bioguide"]
118 |
119 | else:
120 | m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
121 | if not m: continue # no template?
122 | bioguide = m.group(1).upper()
123 |
124 |
125 | if bioguide not in bioguides:
126 | print("Member not found: " + bioguide, p, "(Might have been a delegate to the Constitutional Convention.)")
127 | continue
128 |
129 | # handle FEC ids specially because they are stored in an array...
130 | fec_id = new_ids.get("fec")
131 | if fec_id: del new_ids["fec"]
132 |
133 | member = bioguides[bioguide]
134 | member["id"].update(new_ids)
135 |
136 | # ...finish the FEC id.
137 | if fec_id:
138 | if fec_id not in bioguides[bioguide]["id"].get("fec", []):
139 | bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)
140 |
141 | #print p.encode("utf8"), new_ids
142 |
143 | utils.save_data(y1, "legislators-current.yaml")
144 | utils.save_data(y2, "legislators-historical.yaml")
145 |
146 | if __name__ == '__main__':
147 | run()
148 |
--------------------------------------------------------------------------------
/test/are_files_linted.py:
--------------------------------------------------------------------------------
1 | # Check that each YAML file has been linted.
2 |
3 | import difflib
4 | import glob
5 | import io
6 | import sys
7 |
8 | import rtyaml
9 |
10 | ok = True
11 |
12 | for fn in glob.glob("*.yaml"):
13 | with open(fn) as f:
14 | body = f.read()
15 |
16 | # Round-trip the file. Because of the comment block at the top
17 | # of legislators-social-media.yaml, we need to go through file-like
18 | # streams so that rtyaml preserves it.
19 | data = rtyaml.load(io.StringIO(body))
20 |
21 | # Save it back to a buffer.
22 | buf = io.StringIO()
23 | rtyaml.dump(data, buf)
24 | buf = buf.getvalue()
25 |
26 | # Check that the file round-trips to the same bytes,
27 | # except don't worry about trailing newlines because
28 | # editors mess with the last line's line ending.
29 | if buf.rstrip() != body.rstrip():
30 | ok = False
31 | print(fn, "needs to be linted:")
32 |
33 | # Show a diff.
34 | for line in difflib.unified_diff(body.split("\n"), buf.split("\n"), fromfile='in repository', tofile='after linting', lineterm=''):
35 | print(line)
36 |
37 | sys.exit(0 if ok else 1)
38 |
--------------------------------------------------------------------------------
/test/workout.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import glob
5 | import os
6 | import importlib
7 |
8 | sys.path.append("scripts")
9 |
10 | scripts = glob.glob("scripts/*.py")
11 | scripts.sort()
12 |
13 | for script in scripts:
14 | module = os.path.basename(script).replace(".py", "")
15 | print("Importing %s..." % module)
16 |
17 | try:
18 | importlib.import_module(module)
19 | except Exception as exc:
20 | print("Error when importing %s!"
% module) 21 | print() 22 | raise exc 23 | 24 | exit(0) --------------------------------------------------------------------------------