├── tests ├── __init__.py ├── test_config.py ├── test_build.py └── docker-compose.yml ├── docker-entrypoint.sh ├── archive ├── rialto │ ├── mitmdump.out │ ├── bills.py │ ├── __init__.py │ └── people.py ├── README.md ├── cary │ ├── __init__.py │ └── events.py ├── santa_fe │ ├── __init__.py │ └── events.py ├── maricopa │ ├── __init__.py │ ├── bills.py │ └── people.py ├── philadelphia │ ├── __init__.py │ └── events.py ├── roswell │ ├── __init__.py │ └── events.py ├── longbeach │ ├── bills.py │ ├── __init__.py │ └── people.py ├── albuquerque │ ├── bills.py │ ├── __init__.py │ └── people.py ├── arlington_va │ ├── __init__.py │ ├── people.py │ └── events.py ├── columbus │ ├── __init__.py │ ├── people.py │ └── events.py ├── jonesboro │ └── __init__.py ├── cleveland │ ├── __init__.py │ ├── events.py │ └── people.py ├── monterey │ └── __init__.py ├── statecollegepa │ └── __init__.py ├── wellesley │ ├── __init__.py │ └── people.py ├── denver │ ├── __init__.py │ ├── bills.py │ ├── people.py │ └── utils.py ├── temecula │ ├── __init__.py │ ├── people.py │ ├── events.py │ └── utils.py ├── boise │ ├── __init__.py │ ├── people.py │ ├── events.py │ ├── bills.py │ └── utils.py ├── boston │ ├── __init__.py │ ├── events.py │ ├── vote.py │ └── people.py ├── sanfrancisco.py ├── holyoke │ ├── __init__.py │ └── people.py ├── madison.py └── lametro │ └── __init__.py ├── sacramento ├── bills.py ├── events.py ├── vote_events.py ├── __init__.py └── people.py ├── AUTHORS ├── setup.py ├── requirements.txt ├── Dockerfile ├── .gitignore ├── docker-compose.yml ├── LICENSE ├── .github └── workflows │ └── main.yml ├── cookcounty ├── __init__.py └── people.py ├── ferguson ├── __init__.py └── people.py ├── st_louis ├── utils.py ├── __init__.py ├── people.py └── bills.py ├── nyc ├── __init__.py ├── events.py └── people.py ├── README.md ├── chicago ├── base.py ├── __init__.py ├── legistar.py ├── events.py └── people.py ├── miamidade ├── __init__.py ├── events.py └── people.py └── pittsburgh ├── __init__.py ├── people.py └── events.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_build.py: -------------------------------------------------------------------------------- 1 | def test_truth(): 2 | assert True is True 3 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | pupa dbinit us 5 | 6 | exec "$@" 7 | -------------------------------------------------------------------------------- /archive/rialto/mitmdump.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opencivicdata/scrapers-us-municipal/HEAD/archive/rialto/mitmdump.out -------------------------------------------------------------------------------- /tests/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.4' 2 | 3 | services: 4 | scrapers: 5 | restart: "no" 6 | command: pytest -sxv 7 | -------------------------------------------------------------------------------- /archive/README.md: 
-------------------------------------------------------------------------------- 1 | # Archived Scrapers 2 | 3 | These are scrapers that were not finished or were not updated to use the current version of our scraper infrastructure. Resurrection welcome. 4 | 5 | -------------------------------------------------------------------------------- /sacramento/bills.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Bill 3 | 4 | 5 | class SacramentoBillScraper(Scraper): 6 | 7 | def scrape(self): 8 | # needs to be implemented 9 | pass 10 | -------------------------------------------------------------------------------- /sacramento/events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Event 3 | 4 | 5 | class SacramentoEventScraper(Scraper): 6 | 7 | def scrape(self): 8 | # needs to be implemented 9 | pass 10 | -------------------------------------------------------------------------------- /sacramento/vote_events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import VoteEvent 3 | 4 | 5 | class SacramentoVoteEventScraper(Scraper): 6 | 7 | def scrape(self): 8 | # needs to be implemented 9 | pass 10 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | James Turk 2 | Thom Neale 3 | Paul Tagliamonte 4 | 5 | Specific Scrapers 6 | ----------------- 7 | Arlington, Va - Tom Lee 8 | 9 | Pittsburgh, Pa - James O'Toole 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='muniscrapers', 5 | version='1.0.0', 6 | url='https://github.com/opencivicdata/scrapers-us-municipal', 7 | packages=find_packages(), 8 | install_requires=['pupa', 'scraper-legistar'], 9 | ) 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | https://github.com/opencivicdata/python-opencivicdata-django/zipball/master 2 | pupa==0.10.1 3 | https://github.com/opencivicdata/python-legistar-scraper/zipball/master 4 | lxml 5 | sh 6 | pytest==6.2.5 7 | pytest-mock==3.12.0 8 | requests-mock==1.11.0 9 | https://github.com/jamesturk/scrapelib/archive/refs/heads/main.zip 10 | -------------------------------------------------------------------------------- /archive/cary/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .events import CaryEventsScraper 4 | 5 | 6 | class Cary(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nc/place:cary/council' 8 | name = 'Cary Town Council' 9 | url = 'http://www.townofcary.org/town_council/cary_town_council.htm' 10 | 11 | scrapers = {'events': CaryEventsScraper} 12 | -------------------------------------------------------------------------------- /archive/santa_fe/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from .events import SantaFeEventsScraper 3 | 4 | 5 | class 
SantaFe(Jurisdiction): 6 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nm/place:santa_fe/council' 7 | name = 'Santa Fe City Council' 8 | url = 'http://www.santafenm.gov/index.aspx?nid=72' 9 | 10 | scrapers = { 11 | "events": SantaFeEventsScraper 12 | } 13 | -------------------------------------------------------------------------------- /archive/maricopa/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .bills import BillScraper 4 | 5 | 6 | class Maricopa(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:az/place:maricopa' 8 | name = 'Maricopa City Council' 9 | url = 'http://www.maricopa-az.gov/web/' 10 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 11 | scrapers = {'bills': BillScraper} 12 | -------------------------------------------------------------------------------- /archive/philadelphia/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from .events import PhillyEventsScraper 3 | 4 | 5 | class Philadelphia(Jurisdiction): 6 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:pa/place:philadelphia/council' 7 | 8 | name = 'Philadelphia City Council' 9 | url = 'http://philadelphiacitycouncil.net/' 10 | 11 | scrapers = { 12 | "events": PhillyEventsScraper 13 | } 14 | -------------------------------------------------------------------------------- /archive/roswell/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from .events import RoswellEventsScraper 3 | 4 | 5 | class Roswell(Jurisdiction): 6 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nm/place:roswell/council' 7 | name = 'Roswell City Council' 8 | url = 'http://www.roswell-nm.gov/staticpages/index.php/cc1-citycouncil' 9 | 10 | scrapers = { 11 | "events": RoswellEventsScraper 12 | } 13 | -------------------------------------------------------------------------------- /archive/rialto/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | url = 'https://rialto.legistar.com/Legislation.aspx' 13 | columns = ( 14 | 'bill_id', 'type', 'status', 15 | 'created', 'action', 'title') 16 | -------------------------------------------------------------------------------- /archive/longbeach/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | url = 'https://longbeach.legistar.com/Calendar.aspx' 13 | columns = ( 14 | 'bill_id', 'type', 'status', 15 | 'created', 'action', 'title') 16 | -------------------------------------------------------------------------------- /archive/maricopa/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from
pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | url = 'https://maricopa.legistar.com/Legislation.aspx' 13 | columns = ( 14 | 'bill_id', 'type', 'status', 15 | 'created', 'action', 'title') 16 | -------------------------------------------------------------------------------- /archive/longbeach/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .bills import BillScraper 4 | 5 | 6 | class LongBeach(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:longbeach' 8 | name = 'Long Beach City Council' 9 | url = 'http://www.longbeach.gov/cityclerk/council_online.asp' 10 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 11 | scrapers = {'bills': BillScraper} 12 | -------------------------------------------------------------------------------- /archive/albuquerque/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | 13 | url = 'https://cabq.legistar.com/Legislation.aspx' 14 | columns = ( 15 | 'bill_id', 'enactment_id', 'type', 'status', 16 | 'created', 'action', 'title') 17 | -------------------------------------------------------------------------------- /archive/arlington_va/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .events import EventScraper 5 | 6 | 7 | class Arlington(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:va/place:arlington/council' 9 | name = 'Arlington County Board' 10 | url = 'http://www.arlingtonva.us/Departments/CountyBoard/CountyBoardMain.aspx' 11 | 12 | scrapers = {'people': PersonScraper, 'events': EventScraper} 13 | -------------------------------------------------------------------------------- /archive/rialto/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .bills import BillScraper 5 | 6 | 7 | class Rialto(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:rialto/council' 9 | name = 'Rialto City Council' 10 | url = 'http://www.ci.rialto.ca.us/' 11 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 12 | scrapers = {'bills': BillScraper, 'people': PersonScraper} 13 | -------------------------------------------------------------------------------- /archive/albuquerque/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .bills import BillScraper 5 | 6 | 7 | class Albuquerque(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nm/place:albuquerque/council' 9 | name = 'Albuquerque City Council' 10 | url = 'http://www.cabq.gov/council/' 11 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 12 | scrapers = {'people': PersonScraper, 'bills': BillScraper} 13 | --------------------------------------------------------------------------------
/archive/columbus/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import ColumbusPersonScraper 4 | from .events import ColumbusEventScraper 5 | 6 | 7 | class Columbus(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:oh/place:columbus/council' 9 | 10 | name = 'Columbus City Council' 11 | url = 'http://council.columbus.gov/' 12 | 13 | scrapers = { 14 | "people": ColumbusPersonScraper, 15 | "events": ColumbusEventScraper, 16 | } 17 | -------------------------------------------------------------------------------- /archive/jonesboro/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from legistar.ext.pupa import LegistarPeopleScraper 3 | 4 | 5 | class Jonesboro(Jurisdiction): 6 | division_id = 'ocd-division/country:us/state:ar/place:jonesboro' 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ar/place:jonesboro/government' 8 | 9 | name = 'Jonesboro City Council' 10 | url = 'http://jonesboro.legistar.com/' 11 | 12 | scrapers = { 13 | "people": LegistarPeopleScraper, 14 | } 15 | -------------------------------------------------------------------------------- /archive/cleveland/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import ClevelandPersonScraper 4 | from .events import ClevelandEventScraper 5 | 6 | 7 | class Cleveland(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:oh/place:cleveland/council' 9 | 10 | name = 'Cleveland City Council' 11 | url = 'http://www.clevelandcitycouncil.org/' 12 | 13 | scrapers = { 14 | "people": ClevelandPersonScraper, 15 | "events": ClevelandEventScraper 16 | } 17 | -------------------------------------------------------------------------------- /archive/monterey/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from legistar.ext.pupa import LegistarPeopleScraper 3 | 4 | 5 | class Monterey(Jurisdiction): 6 | division_id = 'ocd-division/country:us/state:ca/place:monterey' 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:monterey/government' 8 | 9 | name = 'City of Monterey Board of Supervisors' 10 | url = 'https://monterey.legistar.com/People.aspx' 11 | 12 | scrapers = { 13 | "people": LegistarPeopleScraper, 14 | } 15 | -------------------------------------------------------------------------------- /archive/statecollegepa/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from granicus.pupa.events import make_event_scraper 4 | 5 | 6 | class StateCollege(Jurisdiction): 7 | division_id = 'ocd-division/country:us/state:pa/place:state_college' 8 | name = 'State College' 9 | url = 'http://www.statecollegepa.us/' 10 | classification = "government" 11 | 12 | scrapers = { 13 | # XXX: The server is giving us 500 errors...
14 | # "events": make_event_scraper("statecollegepa"), 15 | } 16 | -------------------------------------------------------------------------------- /archive/wellesley/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import WellesleyPersonScraper 4 | 5 | 6 | class Wellesley(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ma/place:wellesley/council' 8 | name = 'Wellesley Board of Selectmen' 9 | url = 'http://www.wellesleyma.gov/Pages/WellesleyMA_Selectmen/index' 10 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 11 | 12 | scrapers = { 13 | "people": WellesleyPersonScraper, 14 | } 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | LABEL maintainer "DataMade " 3 | 4 | ENV PYTHONUNBUFFERED=1 5 | 6 | RUN apt-get update && \ 7 | apt-get install -y libxml2-dev gdal-bin && \ 8 | apt-get clean && \ 9 | rm -rf /var/cache/apt/* /var/lib/apt/lists/* 10 | 11 | RUN mkdir /src 12 | WORKDIR /src 13 | 14 | COPY ./requirements.txt /src/requirements.txt 15 | RUN pip install --upgrade pip && \ 16 | pip install --no-cache-dir -r requirements.txt 17 | 18 | COPY . /src 19 | 20 | ENTRYPOINT ["/src/docker-entrypoint.sh"] 21 | -------------------------------------------------------------------------------- /archive/denver/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | # from .events import BoiseEventScraper 4 | from .people import PersonScraper 5 | from .bills import BillScraper 6 | 7 | 8 | class Denver(Jurisdiction): 9 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:co/place:denver/council' 10 | 11 | name = 'Denver City Council' 12 | url = 'https://www.denvergov.org/citycouncil' 13 | parties = [{'name': 'Democratic' }, {'name': 'Republican' }, ] 14 | 15 | scrapers = {'people': PersonScraper, 'bills': BillScraper} 16 | -------------------------------------------------------------------------------- /archive/temecula/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .events import TemeculaEventScraper 5 | 6 | 7 | class Temecula(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:temecula/council' 9 | 10 | name = 'Temecula City Council' 11 | url = 'http://www.cityoftemecula.org/Temecula/Government/CouncilCommissions/CityCouncil/' 12 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 13 | scrapers = {'people': PersonScraper, 'events': TemeculaEventScraper} 14 | -------------------------------------------------------------------------------- /archive/boise/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .events import BoiseEventScraper 4 | from .people import PersonScraper 5 | from .bills import BillScraper 6 | 7 | 8 | class Boise(Jurisdiction): 9 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:id/place:boise_city/council' 10 | name = 'Boise City Council' 11 | url = 'http://mayor.cityofboise.org/city-council/' 12 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 13 | 14 | scrapers = {'people': PersonScraper, 'bills': 
BillScraper, 'events': BoiseEventScraper} 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .#* 2 | *# 3 | nyc/secrets.py 4 | lametro/secrets.py 5 | pupa_settings.py 6 | 7 | # Pupa bits. 8 | _data 9 | _cache 10 | 11 | # scraper bits 12 | HearingSchedule.pdf 13 | HearingSchedule.txt 14 | 15 | # vim 16 | *swp 17 | 18 | # Python objects 19 | *.py[cod] 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Packages 25 | *.egg 26 | *.egg-info 27 | dist 28 | build 29 | eggs 30 | parts 31 | bin 32 | var 33 | sdist 34 | develop-eggs 35 | .installed.cfg 36 | lib 37 | lib64 38 | 39 | # Installer logs 40 | pip-log.txt 41 | 42 | # Unit test / coverage reports 43 | .coverage 44 | .tox 45 | nosetests.xml 46 | 47 | # Translations 48 | *.mo 49 | 50 | # Mr Developer 51 | .mr.developer.cfg 52 | .project 53 | .pydevproject 54 | 55 | __pycache__ 56 | 57 | venv 58 | -------------------------------------------------------------------------------- /archive/rialto/people.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | from pupa.scrape import Scraper, Legislator 4 | from pupa.scrape import Person, Organization 5 | 6 | 7 | class PersonScraper(Scraper): 8 | 9 | url = 'http://www.ci.rialto.ca.us/citycouncil_council-members.php' 10 | def get_people(self): 11 | 12 | html = self.urlopen(self.url) 13 | doc = lxml.html.fromstring(html) 14 | 15 | title_xpath = '//div[contains(@class, "biotitle")]' 16 | name_xpath = '//div[contains(@class, "bioname")]' 17 | for title, name in zip(doc.xpath(title_xpath), doc.xpath(name_xpath)): 18 | name = name.text_content().strip() 19 | title = title.text_content().strip() 20 | p = Legislator(name=name, district=title) 21 | p.add_source(self.url) 22 | yield p 23 | -------------------------------------------------------------------------------- /archive/longbeach/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape.helpers import Legislator, Organization 3 | 4 | 5 | class PersonScraper(Scraper): 6 | 7 | def get_people(self): 8 | # committee 9 | tech = Organization('Technology', classification='committee') 10 | tech.add_post('Chairman', 'chairman') 11 | tech.add_source('https://example.com') 12 | yield tech 13 | 14 | # subcommittee 15 | ecom = Organization('Subcommittee on E-Commerce', 16 | parent=tech, 17 | classification='committee') 18 | ecom.add_source('https://example.com') 19 | yield ecom 20 | 21 | p = Legislator('Paul Tagliamonte', district='6', chamber='upper') 22 | p.add_committee_membership(tech, role='chairman') 23 | p.add_source('https://example.com') 24 | yield p 25 | -------------------------------------------------------------------------------- /archive/maricopa/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape.helpers import Legislator, Organization 3 | 4 | 5 | class PersonScraper(Scraper): 6 | 7 | def get_people(self): 8 | # committee 9 | tech = Organization('Technology', classification='committee') 10 | tech.add_post('Chairman', 'chairman') 11 | tech.add_source('https://example.com') 12 | yield tech 13 | 14 | # subcommittee 15 | ecom = Organization('Subcommittee on E-Commerce', 16 | parent=tech, 17 | classification='committee') 18 | ecom.add_source('https://example.com') 19 | yield ecom 20 |
p = Legislator('Paul Tagliamonte', district='6', chamber='upper') 22 | p.add_committee_membership(tech, role='chairman') 23 | p.add_source('https://example.com') 24 | yield p 25 | -------------------------------------------------------------------------------- /archive/boston/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | 3 | from .events import BostonEventsScraper 4 | from .people import BostonPersonScraper 5 | from .vote import BostonVoteScraper 6 | 7 | 8 | class Boston(Jurisdiction): 9 | division_id = 'ocd-division/country:us/state:ma/place:boston' 10 | classification = 'council' 11 | 12 | name = 'Boston City Council' 13 | url = 'http://www.cityofboston.gov/citycouncil/' 14 | extras = { 15 | "social_media": { 16 | "twitter": "https://twitter.com/BOSCityCouncil", 17 | "facebook": "https://www.facebook.com/pages/Boston-City-Council/106846899335407", 18 | } 19 | } 20 | 21 | scrapers = { 22 | "people": BostonPersonScraper, 23 | "events": BostonEventsScraper, 24 | "votes": BostonVoteScraper, 25 | } 26 | 27 | def get_organizations(self): 28 | org = Organization(name="Boston City Council", classification="legislature") 29 | yield org 30 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.4' 2 | 3 | services: 4 | scrapers: 5 | image: scrapers-us-municipal 6 | container_name: scrapers-us-municipal 7 | build: . 8 | stdin_open: true 9 | tty: true 10 | depends_on: 11 | postgres: 12 | condition: service_healthy 13 | volumes: 14 | - .:/src 15 | environment: 16 | DATABASE_URL: postgres://postgres:postgres@postgres/opencivicdata 17 | DJANGO_SETTINGS_MODULE: pupa.settings 18 | command: pupa update lametro 19 | 20 | postgres: 21 | container_name: scrapers-us-municipal-postgres 22 | image: postgis/postgis:13-3.4 23 | healthcheck: 24 | test: ["CMD-SHELL", "pg_isready -U postgres"] 25 | interval: 10s 26 | timeout: 5s 27 | retries: 5 28 | environment: 29 | POSTGRES_DB: opencivicdata 30 | POSTGRES_PASSWORD: postgres 31 | volumes: 32 | - scrapers-us-municipal-db-data:/var/lib/postgresql/data 33 | ports: 34 | - 32001:5432 35 | 36 | volumes: 37 | scrapers-us-municipal-db-data: 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 DataMade LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | name: Run tests 14 | runs-on: ubuntu-latest 15 | services: 16 | postgres: 17 | image: postgis/postgis:13-3.4 18 | env: 19 | POSTGRES_DB: lametro 20 | POSTGRES_PASSWORD: postgres 21 | options: >- 22 | --health-cmd pg_isready 23 | --health-interval 10s 24 | --health-timeout 5s 25 | --health-retries 5 26 | ports: 27 | - 5432:5432 28 | steps: 29 | - name: Install system dependencies 30 | run: | 31 | sudo apt-get update 32 | sudo apt-get install gdal-bin libxml2-dev 33 | - uses: actions/checkout@v4 34 | - name: Set up Python 3.9 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: '3.9' 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install "setuptools-scm<7.0" 42 | pip install -r requirements.txt 43 | - name: Test with pytest 44 | run: | 45 | pytest -sv 46 | -------------------------------------------------------------------------------- /cookcounty/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .events import CookcountyEventScraper 4 | from .bills import CookcountyBillScraper 5 | from .people import CookcountyPersonScraper 6 | 7 | 8 | class Cookcounty(Jurisdiction): 9 | division_id = "ocd-division/country:us/state:il/county:cook" 10 | classification = "legislature" 11 | name = "Cook County" 12 | url = "http://www.cookcountyil.gov/board-of-commissioners/" 13 | scrapers = { 14 | #"events": CookcountyEventScraper, 15 | #"bills": CookcountyBillScraper, 16 | "people": CookcountyPersonScraper, 17 | } 18 | 19 | def get_organizations(self): 20 | org = Organization(name="Cook County Board of Commissioners", classification="legislature") 21 | 22 | for x in range(1, 18): 23 | org.add_post( 24 | "District {}".format(x), 25 | "Commissioner", 26 | division_id='ocd-division/country:us/state:il/county:cook/council_district:{}'.format(x)) 27 | 28 | org.add_post( 29 | "Board President", 30 | "Board President", 31 | division_id='ocd-division/country:us/state:il/county:cook') 32 | 33 | yield org 34 | -------------------------------------------------------------------------------- /archive/sanfrancisco.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | from legistar.people import LegistarPersonScraper 3 | 4 | class SFPersonScraper(LegistarPersonScraper): 5 | EXTRA_FIELDS = ('notes',) 6 | 7 | #TODO: add district? 
8 | 9 | class SanFrancisco(Jurisdiction): 10 | name = 'San Francisco' 11 | classification = 'government' 12 | division_id = 'ocd-division/country:us/state:ca/place:san_francisco' 13 | timezone = 'America/Los_Angeles' 14 | url = 'http://sfgov.org' 15 | 16 | LEGISTAR_ROOT_URL = 'https://sfgov.legistar.com' 17 | scrapers = {'people': SFPersonScraper} 18 | 19 | def get_organizations(self): 20 | council = Organization('San Francisco Board of Supervisors', classification='legislature') 21 | for x in range(1,12): 22 | council.add_post(str(x), role='Supervisor') 23 | yield council 24 | 25 | 26 | #TOPLEVEL_ORG_MEMBERSHIP_TITLE = 'Supervisor' 27 | #TOPLEVEL_ORG_MEMBERSHIP_NAME = 'Board of Supervisors' 28 | #EVT_SEARCH_TABLE_TEXT_AUDIO = 'Audio' # sfgov has this 29 | #EVT_SEARCH_TIME_PERIOD = 'This Year' 30 | #BILL_SEARCH_TABLE_TEXT_INTRO_DATE = 'Introduced' 31 | 32 | #def get_district(self, data): 33 | # return self.DEFAULT_AT_LARGE_STRING 34 | -------------------------------------------------------------------------------- /ferguson/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | 3 | from .people import FergusonPersonScraper 4 | 5 | 6 | class Ferguson(Jurisdiction): 7 | division_id = 'ocd-division/country:us/state:mo/place:ferguson' 8 | classification = 'council' 9 | name = 'Ferguson City Council' 10 | url = 'http://www.fergusoncity.com/56/Government' 11 | parties = [] 12 | 13 | scrapers = { 14 | "people": FergusonPersonScraper, 15 | } 16 | 17 | def get_organizations(self): 18 | org = Organization(name="Ferguson City Council", 19 | classification="legislature") 20 | 21 | org.add_contact_detail( 22 | type='email', 23 | value='citycouncil@fergusoncity.com' 24 | ) 25 | 26 | org.add_post( 27 | label="Mayor", 28 | role="Mayor", 29 | division_id=self.division_id 30 | ) 31 | 32 | WARDS = 3 33 | for ward in range(1, WARDS + 1): 34 | org.add_post( 35 | label="Council Member Ward {}".format(ward), 36 | role="Council Member Ward {}".format(ward), 37 | division_id=self.division_id, 38 | # num_seats=2, 39 | ) 40 | 41 | yield org 42 | -------------------------------------------------------------------------------- /archive/boston/events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Event 3 | 4 | import datetime as dt 5 | import lxml.html 6 | 7 | 8 | class BostonEventsScraper(Scraper): 9 | 10 | def lxmlize(self, url): 11 | entry = self.urlopen(url) 12 | page = lxml.html.fromstring(entry) 13 | page.make_links_absolute(url) 14 | return page 15 | 16 | def scrape(self): 17 | url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx" 18 | 19 | page = self.lxmlize(url) 20 | for entry in page.xpath( 21 | "//tr[@style='font-family: Verdana; font-size: 12px;']"): 22 | name, when, links = entry.xpath(".//td") 23 | name = name.text.strip().replace(u"\xc2\xa0", "") 24 | when = when.text.strip().replace(u"\xc2\xa0", "") 25 | when = dt.datetime.strptime(when, "%m/%d/%Y") 26 | links = links.xpath(".//a") 27 | links = {x.text: x.attrib['href'] for x in links} 28 | e = Event(name=name, 29 | when=when, 30 | location='unknown') 31 | 32 | e.add_source(url) 33 | for note, url in links.items(): 34 | e.add_link(note=note, url=url) 35 | 36 | yield e 37 | -------------------------------------------------------------------------------- /sacramento/__init__.py: 
-------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .bills import SacramentoBillScraper 4 | from .vote_events import SacramentoVoteEventScraper 5 | from .events import SacramentoEventScraper 6 | from .people import SacramentoPersonScraper 7 | 8 | 9 | class Sacramento(Jurisdiction): 10 | division_id = "ocd-division/country:us/state:ca/place:sacramento" 11 | classification = "legislature" 12 | name = "Sacramento City Council" 13 | url = "http://www.cityofsacramento.org/" 14 | scrapers = { 15 | # "bills": SacramentoBillScraper, 16 | # "vote_events": SacramentoVoteEventScraper, 17 | # "events": SacramentoEventScraper, 18 | "people": SacramentoPersonScraper, 19 | } 20 | 21 | def get_organizations(self): 22 | 23 | org = Organization(name="Sacramento City Council", classification="legislature") 24 | 25 | org.add_post(label='Mayor of the City of Sacramento', 26 | role='Mayor', 27 | division_id='ocd-division/country:us/state:ca/place:sacramento') 28 | 29 | for district in range(1, 9): 30 | org.add_post(label='Sacramento City Council Member, District {}'.format(district), 31 | role='Member', 32 | division_id='ocd-division/country:us/state:ca/place:sacramento/council_district:{}'.format(district)) 33 | 34 | yield org 35 | -------------------------------------------------------------------------------- /archive/temecula/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Person, Organization 3 | 4 | from .utils import Urls 5 | 6 | legislators_url = ( 7 | 'http://www.cityoftemecula.org/Temecula/Government/' 8 | 'CouncilCommissions/CityCouncil/') 9 | 10 | 11 | class PersonScraper(Scraper): 12 | 13 | def scrape(self): 14 | urls = Urls(dict(list=legislators_url), self) 15 | 16 | council = Organization( 17 | 'Temecula City Council', 18 | classification='legislature') 19 | council.add_source(urls.list.url) 20 | yield council 21 | 22 | for tr in urls.list.xpath('//table[2]//tr')[1:]: 23 | 24 | # Parse some attributes. 25 | name, role = tr.xpath('td/p[1]//font/text()') 26 | image = tr.xpath('td/img/@src').pop() 27 | 28 | # Create legislator. 29 | person = Person(name, image=image) 30 | 31 | # Add membership on council. 32 | memb = person.add_membership(council, role=role) 33 | 34 | # Add email address. 35 | email, detail_url = tr.xpath('td//a/@href') 36 | email = email[7:] 37 | memb.contact_details.append( 38 | dict(type='email', value=email, note='work')) 39 | 40 | # Add sources. 
41 | person.add_source(urls.list.url) 42 | person.add_source(detail_url) 43 | 44 | yield person 45 | -------------------------------------------------------------------------------- /archive/columbus/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper, Legislator, Committee 2 | 3 | from collections import defaultdict 4 | import lxml.html 5 | 6 | HOMEPAGE = "http://council.columbus.gov/" 7 | 8 | class ColumbusPersonScraper(Scraper): 9 | 10 | def lxmlize(self, url): 11 | entry = self.urlopen(url) 12 | page = lxml.html.fromstring(entry) 13 | page.make_links_absolute(url) 14 | return page 15 | 16 | def scrape_homepage(self, folk): 17 | url = folk.attrib['href'] 18 | page = self.lxmlize(url) 19 | image = page.xpath( 20 | "//img[contains(@src, 'uploadedImages/City_Council/Members/')]" 21 | )[0].attrib['src'] 22 | 23 | name = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3") 24 | name, = name 25 | 26 | bio = "\n\n".join([x.text_content() for x in page.xpath( 27 | "//div[@id='ctl00_ctl00_Body_body_cntCommon']/p" 28 | )]) 29 | 30 | leg = Legislator(name=name.text, 31 | district='member', 32 | biography=bio, 33 | image=image) 34 | leg.add_source(url) 35 | return leg 36 | 37 | def scrape(self): 38 | page = self.lxmlize(HOMEPAGE) 39 | folks = page.xpath("//div[@class='col-left']/div[2]//" 40 | "div[@class='gutter_text'][1]//" 41 | "ul[@class='gutterlist']/li//a") 42 | for folk in folks: 43 | yield self.scrape_homepage(folk) 44 | -------------------------------------------------------------------------------- /archive/denver/bills.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | from io import StringIO 4 | 5 | from lxml.html import fromstring 6 | 7 | from pupa.scrape import BaseBillScraper 8 | from pupa.utils import convert_pdf 9 | from pupa.scrape import Bill 10 | 11 | from .utils import Urls 12 | 13 | 14 | search_url = ( 15 | 'http://www.denvergov.org/sirepub/items.aspx?' 16 | 'stype=advanced&meettype=-%20All%20Types%20-&meetdate=This%20Year') 17 | 18 | 19 | class BillScraper(BaseBillScraper): 20 | 21 | def get_bill_ids(self): 22 | self.urls = Urls(dict(search=search_url), scraper=self) 23 | 24 | rows = ('id', 'number', 'type', 'status', 'meeting_type', 25 | 'meeting_date', 'district', 'sponsor', 'title') 26 | xpath = '//tr[contains(@class, "datagrid")]' 27 | for tr in self.urls.search.xpath(xpath)[1:]: 28 | bill_id = re.search(r'\((.+)\)', tr.attrib['onclick']).group(1) 29 | data = [td.text_content() for td in tr.xpath('td')[1:]] 30 | yield bill_id, dict(zip(rows, [bill_id] + data)) 31 | 32 | def get_bill(self, bill_id, **kwargs): 33 | url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id 34 | self.urls.add(detail=url) 35 | 36 | bill_id = kwargs.pop('number') 37 | bill = Bill(bill_id, self.session, kwargs['title'], 'butt', 38 | type=['bills']) 39 | bill.add_source(url, note='detail') 40 | 41 | xpath = '//table[contains(@class, "history")]/tr' 42 | for tr in self.urls.detail.xpath(xpath): 43 | import pdb; pdb.set_trace() 44 | 45 | return bill 46 | -------------------------------------------------------------------------------- /archive/holyoke/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sunlight Foundation, 2014, under the terms of the BSD-3 2 | # license, a copy of which is in the root level LICENSE file. 
3 | # 4 | # This scraper was done at Hack for Western Mass, a huge shoutout 5 | # to all the civic hackers and the Hack for Western Mass folks. 6 | 7 | from pupa.scrape import Jurisdiction, Post, Organization 8 | from .people import HolyokePersonScraper 9 | 10 | NAME = "Holyoke City" 11 | 12 | 13 | class Holyoke(Jurisdiction): 14 | division_id = 'ocd-division/country:us/state:ma/place:holyoke' 15 | classification = 'government' 16 | name = NAME 17 | url = 'http://www.holyoke.org/elected-officials/' 18 | 19 | scrapers = { 20 | "people": HolyokePersonScraper 21 | } 22 | 23 | def get_organizations(self): 24 | # XXX: Add division IDs 25 | org = Organization(name='Holyoke City Council', 26 | classification='legislature') 27 | 28 | for x in [ 29 | {"label": "Mayor", "role": "mayor",}, 30 | {"label": "City Clerk", "role": "clerk",}, 31 | {"label": "City Treasurer", "role": "treasurer",}, 32 | {"label": "At Large", "role": "councilmember",}, 33 | 34 | {"label": "Ward 1", "role": "councilmember"}, 35 | {"label": "Ward 2", "role": "councilmember"}, 36 | {"label": "Ward 3", "role": "councilmember"}, 37 | {"label": "Ward 4", "role": "councilmember"}, 38 | {"label": "Ward 5", "role": "councilmember"}, 39 | {"label": "Ward 6", "role": "councilmember"}, 40 | {"label": "Ward 7", "role": "councilmember"}, 41 | ]: 42 | org.add_post(**x) 43 | 44 | yield org 45 | -------------------------------------------------------------------------------- /archive/boise/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Person, Organization 3 | 4 | from .utils import Urls 5 | 6 | legislators_url = 'http://mayor.cityofboise.org/city-council/' 7 | 8 | 9 | class PersonScraper(Scraper): 10 | 11 | def scrape(self): 12 | urls = Urls(dict(list=legislators_url), self) 13 | 14 | council = Organization('Boise City Council') 15 | council.add_source(legislators_url) 16 | yield council 17 | 18 | xpath = '//div[@id="content"]/div/a/@href' 19 | people_urls = urls.list.xpath(xpath) 20 | 21 | # Skip the mayor because his page has no name or email. 22 | people_urls = people_urls[1:] 23 | for url in people_urls: 24 | 25 | urls.add(detail=url) 26 | # Parse some attributes. 27 | 28 | image = urls.detail.xpath('//div[@id="content"]/p/img/@src').pop() 29 | name = urls.detail.xpath('//h1/text()').pop() 30 | 31 | name = name.replace('Council ', '') 32 | role, _, name = name.partition(' ') 33 | 34 | # Create legislator. 35 | person = Person(name, image=image) 36 | 37 | # Add membership on council. 38 | memb = person.add_membership(council, role=role) 39 | memb.add_source(urls.detail.url) 40 | 41 | # Add email address. 42 | email_xpath = '//a[contains(@href, "mailto")]/@href' 43 | email = urls.detail.xpath(email_xpath).pop()[7:] 44 | memb.contact_details.append( 45 | dict(type='email', value=email, note='work')) 46 | 47 | # Add sources.
48 | person.add_source(urls.list.url) 49 | person.add_source(urls.detail.url) 50 | 51 | yield person 52 | -------------------------------------------------------------------------------- /st_louis/utils.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from lxml import html 3 | import requests 4 | 5 | 6 | class StlScraper(Scraper): 7 | 8 | def lxmlize(self, url, payload=None): 9 | if payload: 10 | entry = self.post(url, payload).text 11 | else: 12 | entry = self.get(url).text 13 | page = html.fromstring(entry) 14 | page.make_links_absolute(url) 15 | return page 16 | 17 | class Urls(object): 18 | 19 | BASE_URL = "https://www.stlouis-mo.gov/government" 20 | ALDERMEN_HOME = BASE_URL + "/departments/aldermen" 21 | BILLS_HOME = BASE_URL + "/city-laws/board-bills/index.cfm" 22 | COMMITTEES_HOME = ALDERMEN_HOME + "/committees/committee.cfm" 23 | 24 | class HumanName(object): 25 | """ 26 | custom hack to avoid dependency on https://pypi.python.org/pypi/nameparser 27 | """ 28 | 29 | @staticmethod 30 | def name_firstandlast(raw_name): 31 | """ 32 | given a string (presumed to be a person's name), try to return 33 | just the person's first and last name without cruft 34 | e.g. 'Megan E. Green' => 'Megan Green' 35 | 'Freeman Bosley Sr.' => 'Freeman Bosley' 36 | """ 37 | # FIXME various corner cases fail 38 | # e.g. 'Bill de la Garza' => 'Bill Garza' 39 | # 'Freeman Bosley III' => 'Freeman III' 40 | 41 | 42 | # first of all, check for any particular known typos 43 | known_typos = { 44 | "Freeman M BosleySr.": "Freeman Bosley", 45 | "Megan E.Green": "Megan Green" 46 | } 47 | if raw_name in known_typos: 48 | return known_typos[raw_name] 49 | 50 | words = raw_name.split(" ") 51 | firstname, *rest = words 52 | # last name is the farthest-back word that does not contain "." 53 | clean_rest = [ w for w in rest if "." 
not in w ] 54 | try: 55 | lastname = " " + clean_rest[-1] 56 | except IndexError: 57 | lastname = "" 58 | return (firstname + lastname).strip() 59 | 60 | -------------------------------------------------------------------------------- /ferguson/people.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | from pupa.scrape import Person, Scraper 3 | import re 4 | 5 | 6 | class FergusonPersonScraper(Scraper): 7 | COUNCIL_URL = 'http://www.fergusoncity.com/Directory.aspx?DID=3' 8 | 9 | def lxmlize(self, url): 10 | html = self.get(url).text 11 | doc = lxml.html.fromstring(html) 12 | doc.make_links_absolute(url) 13 | return doc 14 | 15 | def get_council(self): 16 | council_doc = self.lxmlize(self.COUNCIL_URL) 17 | 18 | member_urls = council_doc.xpath( 19 | '//table[@summary="City Directory"]/tr//' 20 | 'a[contains(@href, "/directory.aspx?EID=")]/@href') 21 | for member_url in member_urls: 22 | member_doc = self.lxmlize(member_url) 23 | 24 | (name, ) = member_doc.xpath('//h1[@class="BioName"]/text()') 25 | (name, ) = re.findall(r'^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$', name) 26 | 27 | # Returning everything into a list because the number of values returned varies 28 | # depending on if the person has an email or not 29 | text_list = member_doc.xpath( 30 | '//a[@class="BioLink"]/parent::div/text()') 31 | title = text_list[1].strip() 32 | (title, ) = re.findall( 33 | r'^Title: (Council Member,?(?: Ward \d)|Mayor)\s*$', title) 34 | 35 | try: 36 | (image_url, ) = member_doc.xpath( 37 | '//span[@class="BioText"]//img/@src') 38 | except ValueError: 39 | image_url = '' 40 | 41 | member = Person(name=name, 42 | image=image_url, 43 | primary_org='legislature', 44 | role=title) 45 | 46 | member.add_source(member_url) 47 | 48 | yield member 49 | 50 | def scrape(self): 51 | yield from self.get_council() 52 | -------------------------------------------------------------------------------- /nyc/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .people import NYCPersonScraper 4 | from .events import NYCEventsScraper 5 | from .bills import NYCBillScraper 6 | 7 | class NYC(Jurisdiction): 8 | classification = 'government' 9 | division_id = 'ocd-division/country:us/state:ny/place:new_york' 10 | name = 'New York City' 11 | timezone = 'America/New_York' 12 | url = 'http://nyc.gov' 13 | 14 | parties = [ 15 | {'name': 'Democratic'}, 16 | {'name': 'Republican'} 17 | ] 18 | scrapers = {'people': NYCPersonScraper, 19 | 'bills' : NYCBillScraper, 20 | 'events': NYCEventsScraper 21 | } 22 | 23 | years = [1994, 1998, 2002, 2004, 2006, 2010, 2014] 24 | 25 | legislative_sessions = [] 26 | 27 | for idx, start_year in enumerate(years): 28 | try: 29 | end_year = years[idx + 1] - 1 30 | session = { 31 | "identifier": str(start_year), 32 | "name": ("%s Regular Session" % str(start_year)), 33 | "start_date": ("%s-01-01" % str(start_year)), 34 | "end_date": ("%s-12-31" % str(end_year)), 35 | } 36 | except IndexError: 37 | continue 38 | else: 39 | legislative_sessions.append(session) 40 | 41 | def get_organizations(self): 42 | council = Organization('New York City Council', classification='legislature') 43 | for x in range(1,52): 44 | council.add_post("District {}".format(x), 45 | role='Council Member', 46 | division_id='ocd-division/country:us/state:ny/place:new_york/council_district:{}'.format(x)) 47 | yield council 48 | 49 | mayor = Organization('Mayor', 
classification='executive') 50 | 51 | yield mayor 52 | 53 | LEGISTAR_ROOT_URL = 'http://legistar.council.nyc.gov/' 54 | 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scrapers-us-municipal 2 | ===================== 3 | 4 | Source for municipal scrapers 5 | 6 | To find out more about the ins and outs of these scrapers, as well as how to create your own, head on over to [docs.opencivicdata.org's scraping page](http://docs.opencivicdata.org/en/latest/scrape/index.html). 7 | 8 | Issues? 9 | ------- 10 | 11 | Issues with the data coming from these scrapers should be filed [in this repository](https://github.com/opencivicdata/scrapers-us-municipal/issues). 12 | 13 | ## Development 14 | 15 | ### With Docker 16 | 17 | Requires Docker and Docker Compose 18 | 19 | #### Initialization 20 | 21 | ```bash 22 | docker-compose run --rm scrapers pupa init YOUR_CITY_SCRAPER 23 | ``` 24 | 25 | ### Without Docker 26 | 27 | Requires Python 3, PostGIS 28 | 29 | #### Initialization 30 | Assuming you want your local database to be called `opencivicdata`: 31 | 32 | ```bash 33 | pip install -r requirements.txt 34 | createdb opencivicdata 35 | export DATABASE_URL=postgresql:///opencivicdata 36 | pupa dbinit us 37 | pupa init YOUR_CITY_SCRAPER 38 | ``` 39 | 40 | At times, the release of ocd-django on PyPI differs from the one on GitHub. This may cause problems if you need to create and run migrations. Specifically, you might encounter an `ImproperlyConfigured` error like the following: 41 | 42 | ```bash 43 | You must either define the environment variable DJANGO_SETTINGS_MODULE or call settings.configure() before accessing settings. 44 | ``` 45 | 46 | Fix the problem by running: 47 | 48 | ```bash 49 | export DJANGO_SETTINGS_MODULE=pupa.settings 50 | ``` 51 | 52 | Then, you should be able to successfully run: 53 | 54 | ```bash 55 | django-admin makemigrations 56 | django-admin migrate 57 | ``` 58 | 59 | ## Testing 60 | 61 | Before submitting a PR, please run your scraper. 62 | 63 | ### With Docker 64 | 65 | ```bash 66 | docker-compose run --rm scrapers pupa update YOUR_CITY_SCRAPER 67 | ``` 68 | 69 | ### Without Docker 70 | 71 | ```bash 72 | export DATABASE_URL=postgresql:///opencivicdata 73 | pupa update YOUR_CITY_SCRAPER 74 | ``` 75 | -------------------------------------------------------------------------------- /archive/philadelphia/events.py: -------------------------------------------------------------------------------- 1 | # ~*~ encoding: utf-8 ~*~ 2 | from pupa.scrape import Scraper 3 | from pupa.scrape import Event 4 | import datetime as dt 5 | import lxml.html 6 | 7 | 8 | class PhillyEventsScraper(Scraper): 9 | def lxmlize(self, url): 10 | entry = self.urlopen(url) 11 | page = lxml.html.fromstring(entry) 12 | page.make_links_absolute(url) 13 | return page 14 | 15 | def scrape(self): 16 | url = "http://phila.legistar.com/Calendar.aspx/" 17 | page = self.lxmlize(url) 18 | main = page.xpath("//table[@class='rgMasterTable']")[0] 19 | rows = main.xpath(".//tr")[1:] 20 | for row in rows: 21 | if "No records were found." in row.text_content(): 22 | self.warning("Hum. They don't seem to have events?") 23 | continue 24 | 25 | (name, date, _, time, where, agenda, minutes) = row.xpath(".//td") 26 | # _ nom's the image next to the date on the page.
27 | 28 | name = name.text_content().strip() # leaving an href on the table 29 | time = time.text_content().strip() 30 | location = where.text_content().strip() 31 | 32 | if "Deferred" in time: 33 | continue 34 | 35 | all_day = False 36 | if time == "": 37 | all_day = True 38 | when = dt.datetime.strptime(date.text.strip(), 39 | "%m/%d/%Y") 40 | else: 41 | when = dt.datetime.strptime("%s %s" % (date.text.strip(), time), 42 | "%m/%d/%Y %I:%M %p") 43 | 44 | event = Event(name=name, when=when, location=location) 45 | event.add_source(url) 46 | 47 | agendas = agenda.xpath(".//a[@href]") 48 | for a in agendas: 49 | event.add_link(a.text, a.attrib['href']) 50 | 51 | minutes = minutes.xpath(".//a[@href]") 52 | for minute in minutes: 53 | event.add_link(minute.text, minute.attrib['href']) 54 | 55 | yield event 56 | -------------------------------------------------------------------------------- /st_louis/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .people import StLouisPersonScraper 4 | from .bills import StLouisBillScraper 5 | 6 | 7 | class StLouis(Jurisdiction): 8 | division_id = "ocd-division/country:us/state:mo/place:st_louis" 9 | classification = "legislature" 10 | name = "St. Louis city Board of Aldermen" 11 | url = "https://www.stlouis-mo.gov/government/departments/aldermen/" 12 | scrapers = { 13 | "people": StLouisPersonScraper, 14 | "bills": StLouisBillScraper 15 | } 16 | 17 | WARD_COUNT = 28 18 | 19 | def get_organizations(self): 20 | yield from self.board_of_aldermen() 21 | 22 | def board_of_aldermen(self): 23 | org = Organization(name="St Louis Board of Aldermen", 24 | classification="legislature") 25 | # add a post for each Ward 26 | for ward_num in range(1, self.WARD_COUNT + 1): 27 | org.add_post(label="Ward {} Alderman".format(ward_num), 28 | role="Alderman") 29 | yield org 30 | 31 | 32 | # TODO better way of doing this?
33 | legislative_sessions = [ 34 | { "identifier": "2015-2016", 35 | "name": "2015-2016 Regular Session", 36 | "start_date": "2015-04-20", 37 | "end_date": "2016-04-17" 38 | }, 39 | { "identifier": "2014-2015", 40 | "name": "2014-2015 Regular Session", 41 | "start_date": "2014-04-14", 42 | "end_date": "2015-04-20" 43 | }, 44 | { "identifier": "2013-2014", 45 | "name": "2013-2014 Regular Session", 46 | "start_date": "2013-04-15", 47 | "end_date": "2014-04-07" 48 | }, 49 | { "identifier": "2012-2013", 50 | "name": "2012-2013 Regular Session", 51 | "start_date": "2012-04-16", 52 | "end_date": "2013-04-08" 53 | }, 54 | { "identifier": "2011-2012", 55 | "name": "2011-2012 Regular Session", 56 | "start_date": "2011-04-18", 57 | "end_date": "2012-04-09" 58 | }, 59 | { "identifier": "2010-2011", 60 | "name": "2010-2011 Regular Session", 61 | "start_date": "2010-04-19", 62 | "end_date": "2011-04-11" 63 | } 64 | ] 65 | 66 | 67 | -------------------------------------------------------------------------------- /archive/arlington_va/people.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import lxml, lxml.html 4 | 5 | from pupa.scrape import Scraper 6 | from pupa.scrape.helpers import Legislator, Organization 7 | 8 | class PersonScraper(Scraper): 9 | 10 | COUNTY_BOARD_URL = 'http://www.arlingtonva.us/Departments/CountyBoard/meetings/members/CountyBoardMeetingsMembersMain.aspx' 11 | 12 | def scrape(self): 13 | board_html = self.urlopen(self.COUNTY_BOARD_URL) 14 | board_lxml = lxml.html.fromstring(board_html) 15 | board_lxml.make_links_absolute(base_url=self.COUNTY_BOARD_URL) 16 | 17 | for board_member_lxml in board_lxml.cssselect("div[name=cbo_list] div[name=row]"): 18 | name = board_member_lxml.cssselect("div[name=info] strong")[0].text.strip() 19 | image = board_member_lxml.cssselect("div[name=pictures] img")[0].get('src') 20 | pieces = re.split(r'<br\s*/?>', lxml.html.tostring(board_member_lxml.cssselect("div[name=info]")[0]).decode(), flags=re.I) 21 | position = re.sub(r'<[^>]*>', '', pieces[1]).strip() 22 | links = board_member_lxml.cssselect("div[name=info] a") 23 | email = bio_link = None 24 | for link in links: 25 | if link.text is None: 26 | continue 27 | if 'arlingtonva.us' in link.text.lower(): 28 | email = re.sub(r'\s*\(at\)\s*','@', link.text).strip() 29 | elif 'bio' in link.text.lower(): 30 | bio_link = link 31 | 32 | legislator = Legislator(name=name, district=position, image=image) 33 | legislator.add_contact(type='email', value=email, note='%(name)s email address' % {'name': name} ) 34 | legislator.add_source(self.COUNTY_BOARD_URL) 35 | 36 | bio = None 37 | if bio_link is not None: 38 | bio_href = bio_link.attrib.get('href') 39 | bio_html = self.urlopen(bio_href) 40 | bio_lxml = lxml.html.fromstring(bio_html) 41 | bio_text = re.sub(r'<[^>]*>', '', lxml.html.tostring(bio_lxml.cssselect('#textSection #text')[0]).decode(), flags=re.I).strip() 42 | bio_text = re.sub(r'&nbsp;', ' ', bio_text) 43 | legislator.biography = bio_text 44 | legislator.add_link('bio page', bio_href) 45 | 46 | yield legislator 47 | -------------------------------------------------------------------------------- /archive/albuquerque/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper, Legislator, Committee 2 | import lxml.html 3 | 4 | 5 | class PersonScraper(Scraper): 6 | 7 | def lxmlize(self, url): 8 | entry = self.urlopen(url) 9 | page = lxml.html.fromstring(entry) 10 | page.make_links_absolute(url) 11 |
return page 12 | 13 | def get_people(self): 14 | yield from self._scrape_committees()  # yield from, so consumers receive the committees and people themselves rather than generator objects 15 | yield from self._scrape_people() 16 | 17 | def _scrape_committees(self): 18 | url = "http://www.cabq.gov/council/committees" 19 | page = self.lxmlize(url) 20 | root = page.xpath("//div[@id='parent-fieldname-text']")[0] 21 | h3s = root.xpath("./h3") 22 | ps = root.xpath("./p")[2:] 23 | uls = root.xpath("./ul") 24 | for h3, p, ul in zip(h3s, ps, uls): 25 | name = h3.text_content() 26 | org = Committee(name=name) 27 | org.add_source(url) 28 | 29 | for person in ul.xpath(".//li"): 30 | who = person.text_content() 31 | title = 'member' 32 | if ", chair" in who.lower(): 33 | title = 'chair' 34 | who = who.replace(", Chair", "") 35 | org.add_member(name=who, 36 | role=title) 37 | yield org 38 | 39 | def _scrape_people(self): 40 | url = 'http://www.cabq.gov/council/councilors' 41 | page = self.lxmlize(url) 42 | names = page.xpath("//div[@id='parent-fieldname-text']/*")[3:] 43 | it = iter(names) 44 | for entry in zip(it, it, it): 45 | name, info, _ = entry 46 | image_small = name.xpath(".//img")[0].attrib['src'] 47 | name = name.text_content() 48 | infopage, email, policy_analyst = info.xpath(".//a") 49 | phone = info.xpath(".//b")[-1].tail.strip() 50 | district = infopage.text_content() 51 | homepage = self.lxmlize(infopage.attrib['href']) 52 | photo = homepage.xpath( 53 | "//div[@class='featureContent']//img" 54 | )[0].attrib['src'] 55 | 56 | bio = "\n".join((x.text_content() for x in homepage.xpath( 57 | "//div[@class='featureContent']//div[@class='stx']/p"))) 58 | 59 | p = Legislator(name=name, 60 | district=district, 61 | image=photo, 62 | biography=bio) 63 | 64 | p.add_source(url) 65 | p.add_source(infopage.attrib['href']) 66 | yield p 67 | -------------------------------------------------------------------------------- /archive/cary/events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Event 3 | 4 | import datetime as dt 5 | from functools import partial 6 | import lxml.html 7 | 8 | CAL_URL = "http://www.townofcary.org/Town_Council/Meetings____Public_Notices_Calendar.htm" 9 | 10 | 11 | class CaryEventsScraper(Scraper): 12 | 13 | def lxmlize(self, url): 14 | entry = self.urlopen(url) 15 | page = lxml.html.fromstring(entry) 16 | page.make_links_absolute(url) 17 | return page 18 | 19 | def scrape(self): 20 | page = self.lxmlize(CAL_URL) 21 | events = page.xpath("//div[@id='ctl14_pnlCalendarAll']//td") 22 | for event in events: 23 | when = event.xpath(".//a[contains(@href, 'javascript')]") 24 | if when == []: 25 | continue 26 | when = when[0] 27 | 28 | dom = when.text # day of month 29 | hrefs = event.xpath(".//a[contains(@href, 'htm')]") 30 | for href in hrefs: 31 | for e in self.scrape_event(href): 32 | yield e 33 | 34 | 35 | def scrape_event(self, href): 36 | page = self.lxmlize(href.attrib['href']) 37 | what = page.xpath("//td[@id='ctl14_ctl16_tdTitleCell']")[0].text 38 | info = page.xpath("//div[@id='ctl14_pnlEvent']//table//table//tr")[1:] 39 | ret = { 40 | "Location:": "Unknown" 41 | } 42 | for tr in info: 43 | tds = tr.xpath(".//td") 44 | if len(tds) < 2: 45 | continue 46 | key, data = [tds.pop(0).text_content().strip() for x in range(2)]  # a distinct name, so each row's label doesn't clobber the event title held in 'what' 47 | ret[key] = data 48 | 49 | agendas = page.xpath("//a[contains(@title, 'Meeting Agenda')]") 50 | if agendas: 51 | for agenda in agendas: 52 | print("Agenda:", agenda.attrib['href']) 53 | 54 | t = ret['Time:'] 55 | start_time, end_time = t, None 56 | if "-" in t: 57 |
start_time, end_time = (x.strip() for x in t.split("-", 1)) 58 | 59 | start_time = "%s %s" % (ret['Date:'], start_time) 60 | dts = "%B %d, %Y %I:%M %p" 61 | start = dt.datetime.strptime(start_time, dts) 62 | 63 | end = None 64 | if end_time: 65 | end = "%s %s" % (ret['Date:'], end_time) 66 | end = dt.datetime.strptime(end, dts) 67 | 68 | kwargs = {} 69 | if end: 70 | kwargs['end'] = end 71 | 72 | e = Event(name=what, location=ret['Location:'], when=start, 73 | **kwargs) 74 | e.add_source(href.attrib['href']) 75 | yield e 76 | -------------------------------------------------------------------------------- /archive/madison.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | from legistar.people import LegistarPersonScraper 3 | 4 | 5 | class MadisonPersonScraper(LegistarPersonScraper): 6 | 7 | EXTRA_FIELDS = ('notes',) 8 | DATE_FORMATS = ('%m/%d/%Y', '%m/%d/%Y*',) 9 | 10 | def skip_item(self, item): 11 | #return item['name'] in ('VACANCIES', 'Al Matano') 12 | # TODO: this skips all non-city councilors, check to make sure it doesn't skip other 13 | # interesting people? 14 | return 'district' not in item['url'] 15 | 16 | 17 | class Madison(Jurisdiction): 18 | division_id = 'ocd-division/country:us/state:wi/place:madison' 19 | classification = 'government' 20 | timezone = 'America/Chicago' 21 | name = 'Madison' 22 | url = 'http://www.cityofmadison.com/' 23 | 24 | scrapers = {'people': MadisonPersonScraper} 25 | # HTTPS is vital here, without it pagination doesn't work! 26 | LEGISTAR_ROOT_URL = 'https://madison.legistar.com/' 27 | 28 | def get_organizations(self): 29 | council = Organization('City of Madison Common Council', classification='legislature') 30 | for x in range(1,21): 31 | council.add_post(str(x), role='Alder') 32 | yield council 33 | 34 | #ORG_CLASSIFICATIONS = { 35 | # 'ALLIED AREA TASK FORCE': 'commission', 36 | # 'TRANSPORT 2020 IMPLEMENTATION TASK FORCE': 'commission', 37 | # 'COMMON COUNCIL': 'legislature', 38 | # 'COMMON COUNCIL - DISCUSSION': 'commission', 39 | # 'COMMUNITY ACTION COALITION FOR SOUTH CENTRAL WISCONSIN INC': 'commission', 40 | # 'COMMUNITY DEVELOPMENT AUTHORITY': 'commission', 41 | # 'MADISON COMMUNITY FOUNDATION': 'commission', 42 | # 'MADISON FOOD POLICY COUNCIL': 'commission', 43 | # 'MADISON HOUSING AUTHORITY': 'commission', 44 | # 'PARKING COUNCIL FOR PEOPLE WITH DISABILITIES': 'commission', 45 | #} 46 | 47 | #def person_district(self, data): 48 | # '''This corresponds to the label field on organizations posts. 49 | # ''' 50 | # # First try to get it from bio. 51 | # dist = re.findall(r'District\s+\d+', data['notes']) 52 | # if dist: 53 | # return dist.pop() 54 | 55 | # # Then try website. 56 | # dist = re.findall(r'/district(\d+)/', data['website']) 57 | # if dist: 58 | # return dist.pop() 59 | 60 | # # Then email. 61 | # dist = re.findall(r'district(\d+)', data['email']) 62 | # if dist: 63 | # return dist.pop() 64 | -------------------------------------------------------------------------------- /archive/santa_fe/events.py: -------------------------------------------------------------------------------- 1 | # ~*~ encoding: utf-8 ~*~ 2 | from pupa.scrape import Scraper 3 | from pupa.scrape import Event 4 | 5 | import datetime as dt 6 | import lxml.html 7 | import re 8 | 9 | 10 | CAL_PAGE = "http://www.santafenm.gov/index.aspx?NID=1066" 11 | DT = re.compile(r"(?P