├── tests ├── __init__.py ├── test_config.py ├── test_build.py └── docker-compose.yml ├── docker-entrypoint.sh ├── archive ├── rialto │ ├── mitmdump.out │ ├── bills.py │ ├── __init__.py │ └── people.py ├── README.md ├── cary │ ├── __init__.py │ └── events.py ├── santa_fe │ ├── __init__.py │ └── events.py ├── maricopa │ ├── __init__.py │ ├── bills.py │ └── people.py ├── philadelphia │ ├── __init__.py │ └── events.py ├── roswell │ ├── __init__.py │ └── events.py ├── longbeach │ ├── bills.py │ ├── __init__.py │ └── people.py ├── albuquerque │ ├── bills.py │ ├── __init__.py │ └── people.py ├── arlington_va │ ├── __init__.py │ ├── people.py │ └── events.py ├── columbus │ ├── __init__.py │ ├── people.py │ └── events.py ├── jonesboro │ └── __init__.py ├── cleveland │ ├── __init__.py │ ├── events.py │ └── people.py ├── monterey │ └── __init__.py ├── statecollegepa │ └── __init__.py ├── wellesley │ ├── __init__.py │ └── people.py ├── denver │ ├── __init__.py │ ├── bills.py │ ├── people.py │ └── utils.py ├── temecula │ ├── __init__.py │ ├── people.py │ ├── events.py │ └── utils.py ├── boise │ ├── __init__.py │ ├── people.py │ ├── events.py │ ├── bills.py │ └── utils.py ├── boston │ ├── __init__.py │ ├── events.py │ ├── vote.py │ └── people.py ├── sanfrancisco.py ├── holyoke │ ├── __init__.py │ └── people.py ├── madison.py └── lametro │ └── __init__.py ├── sacramento ├── bills.py ├── events.py ├── vote_events.py ├── __init__.py └── people.py ├── AUTHORS ├── setup.py ├── requirements.txt ├── Dockerfile ├── .gitignore ├── docker-compose.yml ├── LICENSE ├── .github └── workflows │ └── main.yml ├── cookcounty ├── __init__.py └── people.py ├── ferguson ├── __init__.py └── people.py ├── st_louis ├── utils.py ├── __init__.py ├── people.py └── bills.py ├── nyc ├── __init__.py ├── events.py └── people.py ├── README.md ├── chicago ├── base.py ├── __init__.py ├── legistar.py ├── events.py └── people.py ├── miamidade ├── __init__.py ├── events.py └── people.py └── pittsburgh ├── __init__.py ├── people.py └── events.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_build.py: -------------------------------------------------------------------------------- 1 | def test_truth(): 2 | assert True is True 3 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | pupa dbinit us 5 | 6 | exec "$@" 7 | -------------------------------------------------------------------------------- /archive/rialto/mitmdump.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opencivicdata/scrapers-us-municipal/HEAD/archive/rialto/mitmdump.out -------------------------------------------------------------------------------- /tests/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.4' 2 | 3 | services: 4 | scrapers: 5 | restart: "no" 6 | command: pytest -sxv 7 | -------------------------------------------------------------------------------- /archive/README.md: 
-------------------------------------------------------------------------------- 1 | # Archived Scrapers 2 | 3 | These are scrapers that were not finished or were not updated to use the current version of our scraper infrastructure. Resurrection welcome. 4 | 5 | -------------------------------------------------------------------------------- /sacramento/bills.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Bill 3 | 4 | 5 | class SacramentoBillScraper(Scraper): 6 | 7 | def scrape(self): 8 | # needs to be implemented 9 | pass 10 | -------------------------------------------------------------------------------- /sacramento/events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Event 3 | 4 | 5 | class SacramentoEventScraper(Scraper): 6 | 7 | def scrape(self): 8 | # needs to be implemented 9 | pass 10 | -------------------------------------------------------------------------------- /sacramento/vote_events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import VoteEvent 3 | 4 | 5 | class SacramentoVoteEventScraper(Scraper): 6 | 7 | def scrape(self): 8 | # needs to be implemented 9 | pass 10 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | James Turk 2 | Thom Neale 3 | Paul Tagliamonte 4 | 5 | Specific Scrapers 6 | ----------------- 7 | Arlington, Va - Tom Lee 8 | 9 | Pittsburgh, Pa - James O'Toole 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='muniscrapers', 5 | version='1.0.0', 6 | url='https://github.com/opencivicdata/scrapers-us-municipal', 7 | packages=find_packages(), 8 | install_requires=['pupa', 'scraper-legistar'], 9 | ) 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | https://github.com/opencivicdata/python-opencivicdata-django/zipball/master 2 | pupa==0.10.1 3 | https://github.com/opencivicdata/python-legistar-scraper/zipball/master 4 | lxml 5 | sh 6 | pytest==6.2.5 7 | pytest-mock==3.12.0 8 | requests-mock==1.11.0 9 | https://github.com/jamesturk/scrapelib/archive/refs/heads/main.zip 10 | -------------------------------------------------------------------------------- /archive/cary/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .events import CaryEventsScraper 4 | 5 | 6 | class Cary(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nc/place:cary/council' 8 | name = 'Cary Town Council' 9 | url = 'http://www.townofcary.org/town_council/cary_town_council.htm' 10 | 11 | scrapers = {'events': CaryEventsScraper} 12 | -------------------------------------------------------------------------------- /archive/santa_fe/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from .events import SantaFeEventsScraper 3 | 4 | 5 | class 
SantaFe(Jurisdiction): 6 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nm/place:santa_fe/council' 7 | name = 'Santa Fe City Council' 8 | url = 'http://www.santafenm.gov/index.aspx?nid=72' 9 | 10 | scrapers = { 11 | "events": SantaFeEventsScraper 12 | } 13 | -------------------------------------------------------------------------------- /archive/maricopa/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .bills import BillScraper 4 | 5 | 6 | class Maricopa(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:az/place:maricopa' 8 | name = 'Maricopa City Council' 9 | url = 'http://www.maricopa-az.gov/web/' 10 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 11 | scrapers = {'bills': BillScraper} 12 | -------------------------------------------------------------------------------- /archive/philadelphia/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from .events import PhillyEventsScraper 3 | 4 | 5 | class Philadelphia(Jurisdiction): 6 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:pa/place:philadelphia/council' 7 | 8 | name = 'Philadelphia City Council' 9 | url = 'http://philadelphiacitycouncil.net/' 10 | 11 | scrapers = { 12 | "events": PhillyEventsScraper 13 | } 14 | -------------------------------------------------------------------------------- /archive/roswell/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from .events import RoswellEventsScraper 3 | 4 | 5 | class Roswell(Jurisdiction): 6 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nm/place:roswell/council' 7 | name = 'Roswell City Council' 8 | url = 'http://www.roswell-nm.gov/staticpages/index.php/cc1-citycouncil' 9 | 10 | scrapers = { 11 | "events": RoswellEventsScraper 12 | } 13 | -------------------------------------------------------------------------------- /archive/rialto/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | url = 'https://rialto.legistar.com/Legislation.aspx' 13 | columns = ( 14 | 'bill_id', 'type', 'status', 15 | 'created', 'action', 'title') 16 | -------------------------------------------------------------------------------- /archive/longbeach/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | url = 'https://longbeach.legistar.com/Calendar.aspx' 13 | columns = ( 14 | 'bill_id', 'type', 'status', 15 | 'created', 'action', 'title') 16 | -------------------------------------------------------------------------------- /archive/maricopa/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from
pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | url = 'https://maricopa.legistar.com/Legislation.aspx' 13 | columns = ( 14 | 'bill_id', 'type', 'status', 15 | 'created', 'action', 'title') 16 | -------------------------------------------------------------------------------- /archive/longbeach/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .bills import BillScraper 4 | 5 | 6 | class LongBeach(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:longbeach' 8 | name = 'Long Beach City Council' 9 | url = 'http://www.longbeach.gov/cityclerk/council_online.asp' 10 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 11 | scrapers = {'bills': BillScraper} 12 | -------------------------------------------------------------------------------- /archive/albuquerque/bills.py: -------------------------------------------------------------------------------- 1 | from os.path import join, abspath, dirname 2 | 3 | import sh 4 | import lxml.html 5 | from libmproxy import proxy, flow 6 | 7 | from pupa.utils.legistar import LegistarScraper 8 | from pupa.scrape import Bill 9 | 10 | 11 | class BillScraper(LegistarScraper): 12 | 13 | url = 'https://cabq.legistar.com/Legislation.aspx' 14 | columns = ( 15 | 'bill_id', 'enactment_id', 'type', 'status', 16 | 'created', 'action', 'title') 17 | -------------------------------------------------------------------------------- /archive/arlington_va/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .events import EventScraper 5 | 6 | 7 | class Arlington(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:va/place:arlington/council' 9 | name = 'Arlington County Board' 10 | url = 'http://www.arlingtonva.us/Departments/CountyBoard/CountyBoardMain.aspx' 11 | 12 | scrapers = {'people': PersonScraper, 'events': EventScraper} 13 | -------------------------------------------------------------------------------- /archive/rialto/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .bills import BillScraper 5 | 6 | 7 | class Rialto(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:rialto/council' 9 | name = 'Rialto City Council' 10 | url = 'http://www.ci.rialto.ca.us/' 11 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 12 | scrapers = {'bills': BillScraper, 'people': PersonScraper} 13 | -------------------------------------------------------------------------------- /archive/albuquerque/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .bills import BillScraper 5 | 6 | 7 | class Albuquerque(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:nm/place:albuquerque/council' 9 | name = 'Albuquerque City Council' 10 | url = 'http://www.cabq.gov/council/' 11 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 12 | scrapers = {'people': PersonScraper, 'bills': BillScraper} 13 | --------------------------------------------------------------------------------
/archive/columbus/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import ColumbusPersonScraper 4 | from .events import ColumbusEventScraper 5 | 6 | 7 | class Columbus(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:oh/place:columbus/council' 9 | 10 | name = 'Columbus City Council' 11 | url = 'http://council.columbus.gov/' 12 | 13 | scrapers = { 14 | "people": ColumbusPersonScraper, 15 | "events": ColumbusEventScraper, 16 | } 17 | -------------------------------------------------------------------------------- /archive/jonesboro/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from legistar.ext.pupa import LegistarPeopleScraper 3 | 4 | 5 | class Jonesboro(Jurisdiction): 6 | division_id = 'ocd-division/country:us/state:ar/place:jonesboro' 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ar/place:jonesboro/government' 8 | 9 | name = 'Jonesboro City Council' 10 | url = 'http://jonesboro.legistar.com/' 11 | 12 | scrapers = { 13 | "people": LegistarPeopleScraper, 14 | } 15 | -------------------------------------------------------------------------------- /archive/cleveland/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import ClevelandPersonScraper 4 | from .events import ClevelandEventScraper 5 | 6 | 7 | class Cleveland(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:oh/place:cleveland/council' 9 | 10 | name = 'Cleveland City Council' 11 | url = 'http://www.clevelandcitycouncil.org/' 12 | 13 | scrapers = { 14 | "people": ClevelandPersonScraper, 15 | "events": ClevelandEventScraper 16 | } 17 | -------------------------------------------------------------------------------- /archive/monterey/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | from legistar.ext.pupa import LegistarPeopleScraper 3 | 4 | 5 | class Monterey(Jurisdiction): 6 | division_id = 'ocd-division/country:us/state:ca/place:monterey' 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:monterey/government' 8 | 9 | name = 'City of Monterey Board of Supervisors' 10 | url = 'https://monterey.legistar.com/People.aspx' 11 | 12 | scrapers = { 13 | "people": LegistarPeopleScraper, 14 | } 15 | -------------------------------------------------------------------------------- /archive/statecollegepa/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from granicus.pupa.events import make_event_scraper 4 | 5 | 6 | class StateCollege(Jurisdiction): 7 | division_id = 'ocd-division/country:us/state:pa/place:state_college' 8 | name = 'State College' 9 | url = 'http://www.statecollegepa.us/' 10 | classification = "government" 11 | 12 | scrapers = { 13 | # XXX: The server is giving us 500 errors...
14 | # "events": make_event_scraper("statecollegepa"), 15 | } 16 | -------------------------------------------------------------------------------- /archive/wellesley/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import WellesleyPersonScraper 4 | 5 | 6 | class Wellesley(Jurisdiction): 7 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ma/place:wellesley/council' 8 | name = 'Wellesley Board of Selectmen' 9 | url = 'http://www.wellesleyma.gov/Pages/WellesleyMA_Selectmen/index' 10 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 11 | 12 | scrapers = { 13 | "people": WellesleyPersonScraper, 14 | } 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | LABEL maintainer "DataMade " 3 | 4 | ENV PYTHONUNBUFFERED=1 5 | 6 | RUN apt-get update && \ 7 | apt-get install -y libxml2-dev gdal-bin && \ 8 | apt-get clean && \ 9 | rm -rf /var/cache/apt/* /var/lib/apt/lists/* 10 | 11 | RUN mkdir /src 12 | WORKDIR /src 13 | 14 | COPY ./requirements.txt /src/requirements.txt 15 | RUN pip install --upgrade pip && \ 16 | pip install --no-cache-dir -r requirements.txt 17 | 18 | COPY . /src 19 | 20 | ENTRYPOINT ["/src/docker-entrypoint.sh"] 21 | -------------------------------------------------------------------------------- /archive/denver/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | # from .events import BoiseEventScraper 4 | from .people import PersonScraper 5 | from .bills import BillScraper 6 | 7 | 8 | class Denver(Jurisdiction): 9 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:co/place:denver/council' 10 | 11 | name = 'Denver City Council' 12 | url = 'https://www.denvergov.org/citycouncil' 13 | parties = [{'name': 'Democratic' }, {'name': 'Republican' }, ] 14 | 15 | scrapers = {'people': PersonScraper, 'bills': BillScraper} 16 | -------------------------------------------------------------------------------- /archive/temecula/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .people import PersonScraper 4 | from .events import TemeculaEventScraper 5 | 6 | 7 | class Temecula(Jurisdiction): 8 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:ca/place:temecula/council' 9 | 10 | name = 'Temecula City Council' 11 | url = 'http://www.cityoftemecula.org/Temecula/Government/CouncilCommissions/CityCouncil/' 12 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 13 | scrapers = {'people': PersonScraper, 'events': TemeculaEventScraper} 14 | -------------------------------------------------------------------------------- /archive/boise/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction 2 | 3 | from .events import BoiseEventScraper 4 | from .people import PersonScraper 5 | from .bills import BillScraper 6 | 7 | 8 | class Boise(Jurisdiction): 9 | jurisdiction_id = 'ocd-jurisdiction/country:us/state:id/place:boise_city/council' 10 | name = 'Boise City Council' 11 | url = 'http://mayor.cityofboise.org/city-council/' 12 | parties = [ {'name': 'Democratic' }, {'name': 'Republican' } ] 13 | 14 | scrapers = {'people': PersonScraper, 'bills': 
BillScraper, 'events': BoiseEventScraper} 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .#* 2 | *# 3 | nyc/secrets.py 4 | lametro/secrets.py 5 | pupa_settings.py 6 | 7 | # Pupa bits. 8 | _data 9 | _cache 10 | 11 | # scraper bits 12 | HearingSchedule.pdf 13 | HearingSchedule.txt 14 | 15 | # vim 16 | *swp 17 | 18 | # Python objects 19 | *.py[cod] 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Packages 25 | *.egg 26 | *.egg-info 27 | dist 28 | build 29 | eggs 30 | parts 31 | bin 32 | var 33 | sdist 34 | develop-eggs 35 | .installed.cfg 36 | lib 37 | lib64 38 | 39 | # Installer logs 40 | pip-log.txt 41 | 42 | # Unit test / coverage reports 43 | .coverage 44 | .tox 45 | nosetests.xml 46 | 47 | # Translations 48 | *.mo 49 | 50 | # Mr Developer 51 | .mr.developer.cfg 52 | .project 53 | .pydevproject 54 | 55 | __pycache__ 56 | 57 | venv 58 | -------------------------------------------------------------------------------- /archive/rialto/people.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | from pupa.scrape import Scraper, Legislator 4 | from pupa.scrape import Person, Organization 5 | 6 | 7 | class PersonScraper(Scraper): 8 | 9 | url = 'http://www.ci.rialto.ca.us/citycouncil_council-members.php' 10 | def get_people(self): 11 | 12 | html = self.urlopen(self.url) 13 | doc = lxml.html.fromstring(html) 14 | 15 | title_xpath = '//div[contains(@class, "biotitle")]' 16 | name_xpath = '//div[contains(@class, "bioname")]' 17 | for title, name in zip(doc.xpath(title_xpath), doc.xpath(name_xpath)): 18 | name = name.text_content().strip() 19 | title = title.text_content().strip() 20 | p = Legislator(name=name, district=title) 21 | p.add_source(self.url) 22 | yield p 23 | -------------------------------------------------------------------------------- /archive/longbeach/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape.helpers import Legislator, Organization 3 | 4 | 5 | class PersonScraper(Scraper): 6 | 7 | def get_people(self): 8 | # committee 9 | tech = Organization('Technology', classification='committee') 10 | tech.add_post('Chairman', 'chairman') 11 | tech.add_source('https://example.com') 12 | yield tech 13 | 14 | # subcommittee 15 | ecom = Organization('Subcommittee on E-Commerce', 16 | parent=tech, 17 | classification='committee') 18 | ecom.add_source('https://example.com') 19 | yield ecom 20 | 21 | p = Legislator('Paul Tagliamonte', district='6', chamber='upper') 22 | p.add_committee_membership(tech, role='chairman') 23 | p.add_source('https://example.com') 24 | yield p 25 | -------------------------------------------------------------------------------- /archive/maricopa/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape.helpers import Legislator, Organization 3 | 4 | 5 | class PersonScraper(Scraper): 6 | 7 | def get_people(self): 8 | # committee 9 | tech = Organization('Technology', classification='committee') 10 | tech.add_post('Chairman', 'chairman') 11 | tech.add_source('https://example.com') 12 | yield tech 13 | 14 | # subcommittee 15 | ecom = Organization('Subcommittee on E-Commerce', 16 | parent=tech, 17 | classification='committee') 18 | ecom.add_source('https://example.com') 19 | yield ecom 20 |
p = Legislator('Paul Tagliamonte', district='6', chamber='upper') 22 | p.add_committee_membership(tech, role='chairman') 23 | p.add_source('https://example.com') 24 | yield p 25 | -------------------------------------------------------------------------------- /archive/boston/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | 3 | from .events import BostonEventsScraper 4 | from .people import BostonPersonScraper 5 | from .vote import BostonVoteScraper 6 | 7 | 8 | class Boston(Jurisdiction): 9 | division_id = 'ocd-division/country:us/state:ma/place:boston' 10 | classification = 'council' 11 | 12 | name = 'Boston City Council' 13 | url = 'http://www.cityofboston.gov/citycouncil/' 14 | extras = { 15 | "social_media": { 16 | "twitter": "https://twitter.com/BOSCityCouncil", 17 | "facebook": "https://www.facebook.com/pages/Boston-City-Council/106846899335407", 18 | } 19 | } 20 | 21 | scrapers = { 22 | "people": BostonPersonScraper, 23 | "events": BostonEventsScraper, 24 | "votes": BostonVoteScraper, 25 | } 26 | 27 | def get_organizations(self): 28 | org = Organization(name="Boston City Council", classification="legislature") 29 | yield org 30 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2.4' 2 | 3 | services: 4 | scrapers: 5 | image: scrapers-us-municipal 6 | container_name: scrapers-us-municipal 7 | build: . 8 | stdin_open: true 9 | tty: true 10 | depends_on: 11 | postgres: 12 | condition: service_healthy 13 | volumes: 14 | - .:/src 15 | environment: 16 | DATABASE_URL: postgres://postgres:postgres@postgres/opencivicdata 17 | DJANGO_SETTINGS_MODULE: pupa.settings 18 | command: pupa update lametro 19 | 20 | postgres: 21 | container_name: scrapers-us-municipal-postgres 22 | image: postgis/postgis:13-3.4 23 | healthcheck: 24 | test: ["CMD-SHELL", "pg_isready -U postgres"] 25 | interval: 10s 26 | timeout: 5s 27 | retries: 5 28 | environment: 29 | POSTGRES_DB: opencivicdata 30 | POSTGRES_PASSWORD: postgres 31 | volumes: 32 | - scrapers-us-municipal-db-data:/var/lib/postgresql/data 33 | ports: 34 | - 32001:5432 35 | 36 | volumes: 37 | scrapers-us-municipal-db-data: 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 DataMade LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | name: Run tests 14 | runs-on: ubuntu-latest 15 | services: 16 | postgres: 17 | image: postgis/postgis:13-3.4 18 | env: 19 | POSTGRES_DB: lametro 20 | POSTGRES_PASSWORD: postgres 21 | options: >- 22 | --health-cmd pg_isready 23 | --health-interval 10s 24 | --health-timeout 5s 25 | --health-retries 5 26 | ports: 27 | - 5432:5432 28 | steps: 29 | - name: Install system dependencies 30 | run: | 31 | sudo apt-get update 32 | sudo apt-get install gdal-bin libxml2-dev 33 | - uses: actions/checkout@v4 34 | - name: Set up Python 3.9 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: '3.9' 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install "setuptools-scm<7.0" 42 | pip install -r requirements.txt 43 | - name: Test with pytest 44 | run: | 45 | pytest -sv 46 | -------------------------------------------------------------------------------- /cookcounty/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .events import CookcountyEventScraper 4 | from .bills import CookcountyBillScraper 5 | from .people import CookcountyPersonScraper 6 | 7 | 8 | class Cookcounty(Jurisdiction): 9 | division_id = "ocd-division/country:us/state:il/county:cook" 10 | classification = "legislature" 11 | name = "Cook County" 12 | url = "http://www.cookcountyil.gov/board-of-commissioners/" 13 | scrapers = { 14 | #"events": CookcountyEventScraper, 15 | #"bills": CookcountyBillScraper, 16 | "people": CookcountyPersonScraper, 17 | } 18 | 19 | def get_organizations(self): 20 | org = Organization(name="Cook County Board of Commissioners", classification="legislature") 21 | 22 | for x in range(1, 18): 23 | org.add_post( 24 | "District {}".format(x), 25 | "Commissioner", 26 | division_id='ocd-division/country:us/state:il/county:cook/council_district:{}'.format(x)) 27 | 28 | org.add_post( 29 | "Board President", 30 | "Board President", 31 | division_id='ocd-division/country:us/state:il/county:cook') 32 | 33 | yield org 34 | -------------------------------------------------------------------------------- /archive/sanfrancisco.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | from legistar.people import LegistarPersonScraper 3 | 4 | class SFPersonScraper(LegistarPersonScraper): 5 | EXTRA_FIELDS = ('notes',) 6 | 7 | #TODO: add district? 
8 | 9 | class SanFrancisco(Jurisdiction): 10 | name = 'San Francisco' 11 | classification = 'government' 12 | division_id = 'ocd-division/country:us/state:ca/place:san_francisco' 13 | timezone = 'America/Los_Angeles' 14 | url = 'http://sfgov.org' 15 | 16 | LEGISTAR_ROOT_URL = 'https://sfgov.legistar.com' 17 | scrapers = {'people': SFPersonScraper} 18 | 19 | def get_organizations(self): 20 | council = Organization('San Francisco Board of Supervisors', classification='legislature') 21 | for x in range(1,12): 22 | council.add_post(str(x), role='Supervisor') 23 | yield council 24 | 25 | 26 | #TOPLEVEL_ORG_MEMBERSHIP_TITLE = 'Supervisor' 27 | #TOPLEVEL_ORG_MEMBERSHIP_NAME = 'Board of Supervisors' 28 | #EVT_SEARCH_TABLE_TEXT_AUDIO = 'Audio' # sfgov has this 29 | #EVT_SEARCH_TIME_PERIOD = 'This Year' 30 | #BILL_SEARCH_TABLE_TEXT_INTRO_DATE = 'Introduced' 31 | 32 | #def get_district(self, data): 33 | # return self.DEFAULT_AT_LARGE_STRING 34 | -------------------------------------------------------------------------------- /ferguson/__init__.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | 3 | from .people import FergusonPersonScraper 4 | 5 | 6 | class Ferguson(Jurisdiction): 7 | division_id = 'ocd-division/country:us/state:mo/place:ferguson' 8 | classification = 'council' 9 | name = 'Ferguson City Council' 10 | url = 'http://www.fergusoncity.com/56/Government' 11 | parties = [] 12 | 13 | scrapers = { 14 | "people": FergusonPersonScraper, 15 | } 16 | 17 | def get_organizations(self): 18 | org = Organization(name="Ferguson City Council", 19 | classification="legislature") 20 | 21 | org.add_contact_detail( 22 | type='email', 23 | value='citycouncil@fergusoncity.com' 24 | ) 25 | 26 | org.add_post( 27 | label="Mayor", 28 | role="Mayor", 29 | division_id=self.division_id 30 | ) 31 | 32 | WARDS = 3 33 | for ward in range(1, WARDS + 1): 34 | org.add_post( 35 | label="Council Member Ward {}".format(ward), 36 | role="Council Member Ward {}".format(ward), 37 | division_id=self.division_id, 38 | # num_seats=2, 39 | ) 40 | 41 | yield org 42 | -------------------------------------------------------------------------------- /archive/boston/events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Event 3 | 4 | import datetime as dt 5 | import lxml.html 6 | 7 | 8 | class BostonEventsScraper(Scraper): 9 | 10 | def lxmlize(self, url): 11 | entry = self.urlopen(url) 12 | page = lxml.html.fromstring(entry) 13 | page.make_links_absolute(url) 14 | return page 15 | 16 | def scrape(self): 17 | url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx" 18 | 19 | page = self.lxmlize(url) 20 | for entry in page.xpath( 21 | "//tr[@style='font-family: Verdana; font-size: 12px;']"): 22 | name, when, links = entry.xpath(".//td") 23 | name = name.text.strip().replace(u"\xc2\xa0", "") 24 | when = when.text.strip().replace(u"\xc2\xa0", "") 25 | when = dt.datetime.strptime(when, "%m/%d/%Y") 26 | links = links.xpath(".//a") 27 | links = {x.text: x.attrib['href'] for x in links} 28 | e = Event(name=name, 29 | when=when, 30 | location='unknown') 31 | 32 | e.add_source(url) 33 | for note, url in links.items(): 34 | e.add_link(note=note, url=url) 35 | 36 | yield e 37 | -------------------------------------------------------------------------------- /sacramento/__init__.py: 
-------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .bills import SacramentoBillScraper 4 | from .vote_events import SacramentoVoteEventScraper 5 | from .events import SacramentoEventScraper 6 | from .people import SacramentoPersonScraper 7 | 8 | 9 | class Sacramento(Jurisdiction): 10 | division_id = "ocd-division/country:us/state:ca/place:sacramento" 11 | classification = "legislature" 12 | name = "Sacramento City Council" 13 | url = "http://www.cityofsacramento.org/" 14 | scrapers = { 15 | # "bills": SacramentoBillScraper, 16 | # "vote_events": SacramentoVoteEventScraper, 17 | # "events": SacramentoEventScraper, 18 | "people": SacramentoPersonScraper, 19 | } 20 | 21 | def get_organizations(self): 22 | 23 | org = Organization(name="Sacramento City Council", classification="legislature") 24 | 25 | org.add_post(label='Mayor of the City of Sacramento', 26 | role='Mayor', 27 | division_id='ocd-division/country:us/state:ca/place:sacramento') 28 | 29 | for district in range(1, 9): 30 | org.add_post(label='Sacramento City Council Member, District {}'.format(district), 31 | role='Member', 32 | division_id='ocd-division/country:us/state:ca/place:sacramento/council_district:{}'.format(district)) 33 | 34 | yield org 35 | -------------------------------------------------------------------------------- /archive/temecula/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Person, Organization 3 | 4 | from .utils import Urls 5 | 6 | legislators_url = ( 7 | 'http://www.cityoftemecula.org/Temecula/Government/' 8 | 'CouncilCommissions/CityCouncil/') 9 | 10 | 11 | class PersonScraper(Scraper): 12 | 13 | def scrape(self): 14 | urls = Urls(dict(list=legislators_url), self) 15 | 16 | council = Organization( 17 | 'Temecula City Council', 18 | classification='legislature') 19 | council.add_source(urls.list.url) 20 | yield council 21 | 22 | for tr in urls.list.xpath('//table[2]//tr')[1:]: 23 | 24 | # Parse some attributes. 25 | name, role = tr.xpath('td/p[1]//font/text()') 26 | image = tr.xpath('td/img/@src').pop() 27 | 28 | # Create legislator. 29 | person = Person(name, image=image) 30 | 31 | # Add membership on council. 32 | memb = person.add_membership(council, role=role) 33 | 34 | # Add email address. 35 | email, detail_url = tr.xpath('td//a/@href') 36 | email = email[7:] 37 | memb.contact_details.append( 38 | dict(type='email', value=email, note='work')) 39 | 40 | # Add sources. 
41 | person.add_source(urls.list.url) 42 | person.add_source(detail_url) 43 | 44 | yield person 45 | -------------------------------------------------------------------------------- /archive/columbus/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper, Legislator, Committee 2 | 3 | from collections import defaultdict 4 | import lxml.html 5 | 6 | HOMEPAGE = "http://council.columbus.gov/" 7 | 8 | class ColumbusPersonScraper(Scraper): 9 | 10 | def lxmlize(self, url): 11 | entry = self.urlopen(url) 12 | page = lxml.html.fromstring(entry) 13 | page.make_links_absolute(url) 14 | return page 15 | 16 | def scrape_homepage(self, folk): 17 | url = folk.attrib['href'] 18 | page = self.lxmlize(url) 19 | image = page.xpath( 20 | "//img[contains(@src, 'uploadedImages/City_Council/Members/')]" 21 | )[0].attrib['src'] 22 | 23 | name = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3") 24 | name, = name 25 | 26 | bio = "\n\n".join([x.text_content() for x in page.xpath( 27 | "//div[@id='ctl00_ctl00_Body_body_cntCommon']/p" 28 | )]) 29 | 30 | leg = Legislator(name=name.text, 31 | district='member', 32 | biography=bio, 33 | image=image) 34 | leg.add_source(url) 35 | return leg 36 | 37 | def scrape(self): 38 | page = self.lxmlize(HOMEPAGE) 39 | folks = page.xpath("//div[@class='col-left']/div[2]//" 40 | "div[@class='gutter_text'][1]//" 41 | "ul[@class='gutterlist']/li//a") 42 | for folk in folks: 43 | yield self.scrape_homepage(folk) 44 | -------------------------------------------------------------------------------- /archive/denver/bills.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | from io import StringIO 4 | 5 | from lxml.html import fromstring 6 | 7 | from pupa.scrape import BaseBillScraper 8 | from pupa.utils import convert_pdf 9 | from pupa.scrape import Bill 10 | 11 | from .utils import Urls 12 | 13 | 14 | search_url = ( 15 | 'http://www.denvergov.org/sirepub/items.aspx?' 16 | 'stype=advanced&meettype=-%20All%20Types%20-&meetdate=This%20Year') 17 | 18 | 19 | class BillScraper(BaseBillScraper): 20 | 21 | def get_bill_ids(self): 22 | self.urls = Urls(dict(search=search_url), scraper=self) 23 | 24 | rows = ('id', 'number', 'type', 'status', 'meeting_type', 25 | 'meeting_date', 'district', 'sponsor', 'title') 26 | xpath = '//tr[contains(@class, "datagrid")]' 27 | for tr in self.urls.search.xpath(xpath)[1:]: 28 | bill_id = re.search(r'\((.+)\)', tr.attrib['onclick']).group(1) 29 | data = [td.text_content() for td in tr.xpath('td')[1:]] 30 | yield bill_id, dict(zip(rows, [bill_id] + data)) 31 | 32 | def get_bill(self, bill_id, **kwargs): 33 | url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id 34 | self.urls.add(detail=url) 35 | 36 | bill_id = kwargs.pop('number') 37 | bill = Bill(bill_id, self.session, kwargs['title'], 'butt', 38 | type=['bills']) 39 | bill.add_source(url, note='detail') 40 | 41 | xpath = '//table[contains(@class, "history")]/tr' 42 | for tr in self.urls.detail.xpath(xpath): 43 | import pdb; pdb.set_trace() 44 | 45 | return bill 46 | -------------------------------------------------------------------------------- /archive/holyoke/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Sunlight Foundation, 2014, under the terms of the BSD-3 2 | # license, a copy of which is in the root level LICENSE file. 
3 | # 4 | # This scraper was done at Hack for Western Mass, a huge shoutout 5 | # to all the civic hackers and the Hack for Western Mass folks. 6 | 7 | from pupa.scrape import Jurisdiction, Post, Organization 8 | from .people import HolyokePersonScraper 9 | 10 | NAME = "Holyoke City" 11 | 12 | 13 | class Holyoke(Jurisdiction): 14 | division_id = 'ocd-division/country:us/state:ma/place:holyoke' 15 | classification = 'government' 16 | name = NAME 17 | url = 'http://www.holyoke.org/elected-officials/' 18 | 19 | scrapers = { 20 | "people": HolyokePersonScraper 21 | } 22 | 23 | def get_organizations(self): 24 | # XXX: Add division IDs 25 | org = Organization(name='Holyoke City Council', 26 | classification='legislature') 27 | 28 | for x in [ 29 | {"label": "Mayor", "role": "mayor",}, 30 | {"label": "City Clerk", "role": "clerk",}, 31 | {"label": "City Treasurer", "role": "treasurer",}, 32 | {"label": "At Large", "role": "councilmember",}, 33 | 34 | {"label": "Ward 1", "role": "councilmember"}, 35 | {"label": "Ward 2", "role": "councilmember"}, 36 | {"label": "Ward 3", "role": "councilmember"}, 37 | {"label": "Ward 4", "role": "councilmember"}, 38 | {"label": "Ward 5", "role": "councilmember"}, 39 | {"label": "Ward 6", "role": "councilmember"}, 40 | {"label": "Ward 7", "role": "councilmember"}, 41 | ]: 42 | org.add_post(**x) 43 | 44 | yield org 45 | -------------------------------------------------------------------------------- /archive/boise/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Person, Organization 3 | 4 | from .utils import Urls 5 | 6 | legislators_url = 'http://mayor.cityofboise.org/city-council/' 7 | 8 | 9 | class PersonScraper(Scraper): 10 | 11 | def scrape(self): 12 | urls = Urls(dict(list=legislators_url), self) 13 | 14 | council = Organization('Boise City Council') 15 | council.add_source(legislators_url) 16 | yield council 17 | 18 | xpath = '//div[@id="content"]/div/a/@href' 19 | people_urls = urls.list.xpath(xpath) 20 | 21 | # Skip the mayor because his page has no name or email. 22 | people_urls = people_urls[1:] 23 | for url in people_urls: 24 | 25 | urls.add(detail=url) 26 | # Parse some attributes. 27 | 28 | image = urls.detail.xpath('//div[@id="content"]/p/img/@src').pop() 29 | name = urls.detail.xpath('//h1/text()').pop() 30 | 31 | name = name.replace('Council ', '') 32 | role, _, name = name.partition(' ') 33 | 34 | # Create legislator. 35 | person = Person(name, image=image) 36 | 37 | # Add membership on council. 38 | memb = person.add_membership(council, role=role) 39 | memb.add_source(urls.detail.url) 40 | 41 | # Add email address. 42 | email_xpath = '//a[contains(@href, "mailto")]/@href' 43 | email = urls.detail.xpath(email_xpath).pop()[7:] 44 | memb.contact_details.append( 45 | dict(type='email', value=email, note='work')) 46 | 47 | # Add sources.
48 | person.add_source(urls.list.url) 49 | person.add_source(urls.detail.url) 50 | 51 | yield person 52 | -------------------------------------------------------------------------------- /st_louis/utils.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from lxml import html 3 | import requests 4 | 5 | 6 | class StlScraper(Scraper): 7 | 8 | def lxmlize(self, url, payload=None): 9 | if payload: 10 | entry = self.post(url, payload).text 11 | else: 12 | entry = self.get(url).text 13 | page = html.fromstring(entry) 14 | page.make_links_absolute(url) 15 | return page 16 | 17 | class Urls(object): 18 | 19 | BASE_URL = "https://www.stlouis-mo.gov/government" 20 | ALDERMEN_HOME = BASE_URL + "/departments/aldermen" 21 | BILLS_HOME = BASE_URL + "/city-laws/board-bills/index.cfm" 22 | COMMITTEES_HOME = ALDERMEN_HOME + "/committees/committee.cfm" 23 | 24 | class HumanName(object): 25 | """ 26 | custom hack to avoid dependency on https://pypi.python.org/pypi/nameparser 27 | """ 28 | 29 | @staticmethod 30 | def name_firstandlast(raw_name): 31 | """ 32 | given a string (presumed to be a person's name), try to return 33 | just the person's first and last name without cruft 34 | e.g. 'Megan E. Green' => 'Megan Green' 35 | 'Freeman Bosley Sr.' => 'Freeman Bosley' 36 | """ 37 | # FIXME various corner cases fail 38 | # e.g. 'Bill de la Garza' => 'Bill Garza' 39 | # 'Freeman Bosley III' => 'Freeman III' 40 | 41 | 42 | # first of all, check for any particular known typos 43 | known_typos = { 44 | "Freeman M BosleySr.": "Freeman Bosley", 45 | "Megan E.Green": "Megan Green" 46 | } 47 | if raw_name in known_typos: 48 | return known_typos[raw_name] 49 | 50 | words = raw_name.split(" ") 51 | firstname, *rest = words 52 | # last name is the farthest-back word that does not contain "." 53 | clean_rest = [ w for w in rest if "." 
not in w ] 54 | try: 55 | lastname = " " + clean_rest[-1] 56 | except IndexError: 57 | lastname = "" 58 | return (firstname + lastname).strip() 59 | 60 | -------------------------------------------------------------------------------- /ferguson/people.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | from pupa.scrape import Person, Scraper 3 | import re 4 | 5 | 6 | class FergusonPersonScraper(Scraper): 7 | COUNCIL_URL = 'http://www.fergusoncity.com/Directory.aspx?DID=3' 8 | 9 | def lxmlize(self, url): 10 | html = self.get(url).text 11 | doc = lxml.html.fromstring(html) 12 | doc.make_links_absolute(url) 13 | return doc 14 | 15 | def get_council(self): 16 | council_doc = self.lxmlize(self.COUNCIL_URL) 17 | 18 | member_urls = council_doc.xpath( 19 | '//table[@summary="City Directory"]/tr//' 20 | 'a[contains(@href, "/directory.aspx?EID=")]/@href') 21 | for member_url in member_urls: 22 | member_doc = self.lxmlize(member_url) 23 | 24 | (name, ) = member_doc.xpath('//h1[@class="BioName"]/text()') 25 | (name, ) = re.findall(r'^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$', name) 26 | 27 | # Returning everything into a list because the number of values returned varies 28 | # depending on if the person has an email or not 29 | text_list = member_doc.xpath( 30 | '//a[@class="BioLink"]/parent::div/text()') 31 | title = text_list[1].strip() 32 | (title, ) = re.findall( 33 | r'^Title: (Council Member,?(?: Ward \d)|Mayor)\s*$', title) 34 | 35 | try: 36 | (image_url, ) = member_doc.xpath( 37 | '//span[@class="BioText"]//img/@src') 38 | except ValueError: 39 | image_url = '' 40 | 41 | member = Person(name=name, 42 | image=image_url, 43 | primary_org='legislature', 44 | role=title) 45 | 46 | member.add_source(member_url) 47 | 48 | yield member 49 | 50 | def scrape(self): 51 | yield from self.get_council() 52 | -------------------------------------------------------------------------------- /nyc/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .people import NYCPersonScraper 4 | from .events import NYCEventsScraper 5 | from .bills import NYCBillScraper 6 | 7 | class NYC(Jurisdiction): 8 | classification = 'government' 9 | division_id = 'ocd-division/country:us/state:ny/place:new_york' 10 | name = 'New York City' 11 | timezone = 'America/New_York' 12 | url = 'http://nyc.gov' 13 | 14 | parties = [ 15 | {'name': 'Democratic'}, 16 | {'name': 'Republican'} 17 | ] 18 | scrapers = {'people': NYCPersonScraper, 19 | 'bills' : NYCBillScraper, 20 | 'events': NYCEventsScraper 21 | } 22 | 23 | years = [1994, 1998, 2002, 2004, 2006, 2010, 2014] 24 | 25 | legislative_sessions = [] 26 | 27 | for idx, start_year in enumerate(years): 28 | try: 29 | end_year = years[idx + 1] - 1 30 | session = { 31 | "identifier": str(start_year), 32 | "name": ("%s Regular Session" % str(start_year)), 33 | "start_date": ("%s-01-01" % str(start_year)), 34 | "end_date": ("%s-12-31" % str(end_year)), 35 | } 36 | except IndexError: 37 | continue 38 | else: 39 | legislative_sessions.append(session) 40 | 41 | def get_organizations(self): 42 | council = Organization('New York City Council', classification='legislature') 43 | for x in range(1,52): 44 | council.add_post("District {}".format(x), 45 | role='Council Member', 46 | division_id='ocd-division/country:us/state:ny/place:new_york/council_district:{}'.format(x)) 47 | yield council 48 | 49 | mayor = Organization('Mayor', 
classification='executive') 50 | 51 | yield mayor 52 | 53 | LEGISTAR_ROOT_URL = 'http://legistar.council.nyc.gov/' 54 | 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scrapers-us-municipal 2 | ===================== 3 | 4 | Source for municipal scrapers 5 | 6 | To find out more about the ins and outs of these scrapers, as well as how to create your own, head on over to [docs.opencivicdata.org's scraping page](http://docs.opencivicdata.org/en/latest/scrape/index.html). 7 | 8 | Issues? 9 | ------- 10 | 11 | Issues with the data coming from these scrapers should be filed [in this repository](https://github.com/opencivicdata/scrapers-us-municipal/issues). 12 | 13 | ## Development 14 | 15 | ### With Docker 16 | 17 | Requires Docker and Docker Compose 18 | 19 | #### Initialization 20 | 21 | ```bash 22 | docker-compose run --rm scrapers pupa init YOUR_CITY_SCRAPER 23 | ``` 24 | 25 | ### Without Docker 26 | 27 | Requires Python 3, PostGIS 28 | 29 | #### Initialization 30 | Assuming you want your local database to be called `opencivicdata`: 31 | 32 | ```bash 33 | pip install -r requirements.txt 34 | createdb opencivicdata 35 | export DATABASE_URL=postgresql:///opencivicdata 36 | pupa dbinit us 37 | pupa init YOUR_CITY_SCRAPER 38 | ``` 39 | 40 | At times, the release of ocd-django on PyPI differs from the one on GitHub. This may cause problems if you need to create and run migrations. Specifically, you might encounter an `ImproperlyConfigured` error like the following: 41 | 42 | ```bash 43 | You must either define the environment variable DJANGO_SETTINGS_MODULE or call settings.configure() before accessing settings. 44 | ``` 45 | 46 | Fix the problem by running: 47 | 48 | ```bash 49 | export DJANGO_SETTINGS_MODULE=pupa.settings 50 | ``` 51 | 52 | Then, you should be able to successfully run: 53 | 54 | ```bash 55 | django-admin makemigrations 56 | django-admin migrate 57 | ``` 58 | 59 | ## Testing 60 | 61 | Before submitting a PR, please run your scraper. 62 | 63 | ### With Docker 64 | 65 | ```bash 66 | docker-compose run --rm scrapers pupa update YOUR_CITY_SCRAPER 67 | ``` 68 | 69 | ### Without Docker 70 | 71 | ```bash 72 | export DATABASE_URL=postgresql:///opencivicdata 73 | pupa update YOUR_CITY_SCRAPER 74 | ``` 75 | -------------------------------------------------------------------------------- /archive/philadelphia/events.py: -------------------------------------------------------------------------------- 1 | # ~*~ encoding: utf-8 ~*~ 2 | from pupa.scrape import Scraper 3 | from pupa.scrape import Event 4 | import datetime as dt 5 | import lxml.html 6 | 7 | 8 | class PhillyEventsScraper(Scraper): 9 | def lxmlize(self, url): 10 | entry = self.urlopen(url) 11 | page = lxml.html.fromstring(entry) 12 | page.make_links_absolute(url) 13 | return page 14 | 15 | def scrape(self): 16 | url = "http://phila.legistar.com/Calendar.aspx/" 17 | page = self.lxmlize(url) 18 | main = page.xpath("//table[@class='rgMasterTable']")[0] 19 | rows = main.xpath(".//tr")[1:] 20 | for row in rows: 21 | if "No records were found." in row.text_content(): 22 | self.warning("Hum. They don't seem to have events?") 23 | continue 24 | 25 | (name, date, _, time, where, agenda, minutes) = row.xpath(".//td") 26 | # _ nom's the image next to the date on the page.
27 | 28 | name = name.text_content().strip() # leaving an href on the table 29 | time = time.text_content().strip() 30 | location = where.text_content().strip() 31 | 32 | if "Deferred" in time: 33 | continue 34 | 35 | all_day = False 36 | if time == "": 37 | all_day = True 38 | when = dt.datetime.strptime(date.text.strip(), 39 | "%m/%d/%Y") 40 | else: 41 | when = dt.datetime.strptime("%s %s" % (date.text.strip(), time), 42 | "%m/%d/%Y %I:%M %p") 43 | 44 | event = Event(name=name, when=when, location=location) 45 | event.add_source(url) 46 | 47 | agendas = agenda.xpath(".//a[@href]") 48 | for a in agendas: 49 | event.add_link(a.text, a.attrib['href']) 50 | 51 | minutes = minutes.xpath(".//a[@href]") 52 | for minute in minutes: 53 | event.add_link(minute.text, minute.attrib['href']) 54 | 55 | yield event 56 | -------------------------------------------------------------------------------- /st_louis/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | from pupa.scrape import Jurisdiction, Organization 3 | from .people import StLouisPersonScraper 4 | from .bills import StLouisBillScraper 5 | 6 | 7 | class StLouis(Jurisdiction): 8 | division_id = "ocd-division/country:us/state:mo/place:st_louis" 9 | classification = "legislature" 10 | name = "St. Louis city Board of Aldermen" 11 | url = "https://www.stlouis-mo.gov/government/departments/aldermen/" 12 | scrapers = { 13 | "people": StLouisPersonScraper, 14 | "bills": StLouisBillScraper 15 | } 16 | 17 | WARD_COUNT = 28 18 | 19 | def get_organizations(self): 20 | yield from self.board_of_aldermen() 21 | 22 | def board_of_aldermen(self): 23 | org = Organization(name="St Louis Board of Aldermen", 24 | classification="legislature") 25 | # add a post for each Ward 26 | for ward_num in range(1, self.WARD_COUNT + 1): 27 | org.add_post(label="Ward {} Alderman".format(ward_num), 28 | role="Alderman") 29 | yield org 30 | 31 | 32 | # TODO better way of doing this?
33 | legislative_sessions = [ 34 | { "identifier": "2015-2016", 35 | "name": "2015-2016 Regular Session", 36 | "start_date": "2015-04-20", 37 | "end_date": "2016-04-17" 38 | }, 39 | { "identifier": "2014-2015", 40 | "name": "2014-2015 Regular Session", 41 | "start_date": "2014-04-14", 42 | "end_date": "2015-04-20" 43 | }, 44 | { "identifier": "2013-2014", 45 | "name": "2013-2014 Regular Session", 46 | "start_date": "2013-04-15", 47 | "end_date": "2014-04-07" 48 | }, 49 | { "identifier": "2012-2013", 50 | "name": "2012-2013 Regular Session", 51 | "start_date": "2012-04-16", 52 | "end_date": "2013-04-08" 53 | }, 54 | { "identifier": "2011-2012", 55 | "name": "2011-2012 Regular Session", 56 | "start_date": "2011-04-18", 57 | "end_date": "2012-04-09" 58 | }, 59 | { "identifier": "2010-2011", 60 | "name": "2010-2011 Regular Session", 61 | "start_date": "2010-04-19", 62 | "end_date": "2011-04-11" 63 | } 64 | ] 65 | 66 | 67 | -------------------------------------------------------------------------------- /archive/arlington_va/people.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import lxml, lxml.html 4 | 5 | from pupa.scrape import Scraper 6 | from pupa.scrape.helpers import Legislator, Organization 7 | 8 | class PersonScraper(Scraper): 9 | 10 | COUNTY_BOARD_URL = 'http://www.arlingtonva.us/Departments/CountyBoard/meetings/members/CountyBoardMeetingsMembersMain.aspx' 11 | 12 | def scrape(self): 13 | board_html = self.urlopen(self.COUNTY_BOARD_URL) 14 | board_lxml = lxml.html.fromstring(board_html) 15 | board_lxml.make_links_absolute(base_url=self.COUNTY_BOARD_URL) 16 | 17 | for board_member_lxml in board_lxml.cssselect("div[name=cbo_list] div[name=row]"): 18 | name = board_member_lxml.cssselect("div[name=info] strong")[0].text.strip() 19 | image = board_member_lxml.cssselect("div[name=pictures] img")[0].get('src') 20 | pieces = re.split(r'<br\s*/?>', lxml.html.tostring(board_member_lxml.cssselect("div[name=info]")[0]).decode(), flags=re.I) 21 | position = re.sub(r'<[^>]*>', '', pieces[1]).strip() 22 | links = board_member_lxml.cssselect("div[name=info] a") 23 | email = bio_link = None 24 | for link in links: 25 | if link.text is None: 26 | continue 27 | if 'arlingtonva.us' in link.text.lower(): 28 | email = re.sub(r'\s*\(at\)\s*','@', link.text).strip() 29 | elif 'bio' in link.text.lower(): 30 | bio_link = link 31 | 32 | legislator = Legislator(name=name, district=position, image=image) 33 | legislator.add_contact(type='email', value=email, note='%(name)s email address' % {'name': name} ) 34 | legislator.add_source(self.COUNTY_BOARD_URL) 35 | 36 | bio = None 37 | if bio_link is not None: 38 | bio_href = bio_link.attrib.get('href') 39 | bio_html = self.urlopen(bio_href) 40 | bio_lxml = lxml.html.fromstring(bio_html) 41 | bio_text = re.sub(r'<[^>]*>', '', lxml.html.tostring(bio_lxml.cssselect('#textSection #text')[0]).decode(), flags=re.I).strip() 42 | bio_text = re.sub(r'&nbsp;', ' ', bio_text) 43 | legislator.biography = bio_text 44 | legislator.add_link('bio page', bio_href) 45 | 46 | yield legislator 47 | -------------------------------------------------------------------------------- /archive/albuquerque/people.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper, Legislator, Committee 2 | import lxml.html 3 | 4 | 5 | class PersonScraper(Scraper): 6 | 7 | def lxmlize(self, url): 8 | entry = self.urlopen(url) 9 | page = lxml.html.fromstring(entry) 10 | page.make_links_absolute(url) 11 |
return page 12 | 13 | def get_people(self): 14 | yield from self._scrape_committees()  # yield from, so consumers receive the committees and people themselves rather than generator objects 15 | yield from self._scrape_people() 16 | 17 | def _scrape_committees(self): 18 | url = "http://www.cabq.gov/council/committees" 19 | page = self.lxmlize(url) 20 | root = page.xpath("//div[@id='parent-fieldname-text']")[0] 21 | h3s = root.xpath("./h3") 22 | ps = root.xpath("./p")[2:] 23 | uls = root.xpath("./ul") 24 | for h3, p, ul in zip(h3s, ps, uls): 25 | name = h3.text_content() 26 | org = Committee(name=name) 27 | org.add_source(url) 28 | 29 | for person in ul.xpath(".//li"): 30 | who = person.text_content() 31 | title = 'member' 32 | if ", chair" in who.lower(): 33 | title = 'chair' 34 | who = who.replace(", Chair", "") 35 | org.add_member(name=who, 36 | role=title) 37 | yield org 38 | 39 | def _scrape_people(self): 40 | url = 'http://www.cabq.gov/council/councilors' 41 | page = self.lxmlize(url) 42 | names = page.xpath("//div[@id='parent-fieldname-text']/*")[3:] 43 | it = iter(names) 44 | for entry in zip(it, it, it): 45 | name, info, _ = entry 46 | image_small = name.xpath(".//img")[0].attrib['src'] 47 | name = name.text_content() 48 | infopage, email, policy_analyst = info.xpath(".//a") 49 | phone = info.xpath(".//b")[-1].tail.strip() 50 | district = infopage.text_content() 51 | homepage = self.lxmlize(infopage.attrib['href']) 52 | photo = homepage.xpath( 53 | "//div[@class='featureContent']//img" 54 | )[0].attrib['src'] 55 | 56 | bio = "\n".join((x.text_content() for x in homepage.xpath( 57 | "//div[@class='featureContent']//div[@class='stx']/p"))) 58 | 59 | p = Legislator(name=name, 60 | district=district, 61 | image=photo, 62 | biography=bio) 63 | 64 | p.add_source(url) 65 | p.add_source(infopage.attrib['href']) 66 | yield p 67 | -------------------------------------------------------------------------------- /archive/cary/events.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Scraper 2 | from pupa.scrape import Event 3 | 4 | import datetime as dt 5 | from functools import partial 6 | import lxml.html 7 | 8 | CAL_URL = "http://www.townofcary.org/Town_Council/Meetings____Public_Notices_Calendar.htm" 9 | 10 | 11 | class CaryEventsScraper(Scraper): 12 | 13 | def lxmlize(self, url): 14 | entry = self.urlopen(url) 15 | page = lxml.html.fromstring(entry) 16 | page.make_links_absolute(url) 17 | return page 18 | 19 | def scrape(self): 20 | page = self.lxmlize(CAL_URL) 21 | events = page.xpath("//div[@id='ctl14_pnlCalendarAll']//td") 22 | for event in events: 23 | when = event.xpath(".//a[contains(@href, 'javascript')]") 24 | if when == []: 25 | continue 26 | when = when[0] 27 | 28 | dom = when.text # day of month 29 | hrefs = event.xpath(".//a[contains(@href, 'htm')]") 30 | for href in hrefs: 31 | for e in self.scrape_event(href): 32 | yield e 33 | 34 | 35 | def scrape_event(self, href): 36 | page = self.lxmlize(href.attrib['href']) 37 | what = page.xpath("//td[@id='ctl14_ctl16_tdTitleCell']")[0].text 38 | info = page.xpath("//div[@id='ctl14_pnlEvent']//table//table//tr")[1:] 39 | ret = { 40 | "Location:": "Unknown" 41 | } 42 | for tr in info: 43 | tds = tr.xpath(".//td") 44 | if len(tds) < 2: 45 | continue 46 | key, data = [tds.pop(0).text_content().strip() for x in range(2)]  # a distinct name, so each row's label doesn't clobber the event title held in 'what' 47 | ret[key] = data 48 | 49 | agendas = page.xpath("//a[contains(@title, 'Meeting Agenda')]") 50 | if agendas: 51 | for agenda in agendas: 52 | print("Agenda:", agenda.attrib['href']) 53 | 54 | t = ret['Time:'] 55 | start_time, end_time = t, None 56 | if "-" in t: 57 |
start_time, end_time = (x.strip() for x in t.split("-", 1)) 58 | 59 | start_time = "%s %s" % (ret['Date:'], start_time) 60 | dts = "%B %d, %Y %I:%M %p" 61 | start = dt.datetime.strptime(start_time, dts) 62 | 63 | end = None 64 | if end_time: 65 | end = "%s %s" % (ret['Date:'], end_time) 66 | end = dt.datetime.strptime(end, dts) 67 | 68 | kwargs = {} 69 | if end: 70 | kwargs['end'] = end 71 | 72 | e = Event(name=what, location=ret['Location:'], when=start, 73 | **kwargs) 74 | e.add_source(href.attrib['href']) 75 | yield e 76 | -------------------------------------------------------------------------------- /archive/madison.py: -------------------------------------------------------------------------------- 1 | from pupa.scrape import Jurisdiction, Organization 2 | from legistar.people import LegistarPersonScraper 3 | 4 | 5 | class MadisonPersonScraper(LegistarPersonScraper): 6 | 7 | EXTRA_FIELDS = ('notes',) 8 | DATE_FORMATS = ('%m/%d/%Y', '%m/%d/%Y*',) 9 | 10 | def skip_item(self, item): 11 | #return item['name'] in ('VACANCIES', 'Al Matano') 12 | # TODO: this skips all non-city councilors, check to make sure it doesn't skip other 13 | # interesting people? 14 | return 'district' not in item['url'] 15 | 16 | 17 | class Madison(Jurisdiction): 18 | division_id = 'ocd-division/country:us/state:wi/place:madison' 19 | classification = 'government' 20 | timezone = 'America/Chicago' 21 | name = 'Madison' 22 | url = 'http://www.cityofmadison.com/' 23 | 24 | scrapers = {'people': MadisonPersonScraper} 25 | # HTTPS is vital here, without it pagination doesn't work! 26 | LEGISTAR_ROOT_URL = 'https://madison.legistar.com/' 27 | 28 | def get_organizations(self): 29 | council = Organization('City of Madison Common Council', classification='legislature') 30 | for x in range(1,21): 31 | council.add_post(str(x), role='Alder') 32 | yield council 33 | 34 | #ORG_CLASSIFICATIONS = { 35 | # 'ALLIED AREA TASK FORCE': 'commission', 36 | # 'TRANSPORT 2020 IMPLEMENTATION TASK FORCE': 'commission', 37 | # 'COMMON COUNCIL': 'legislature', 38 | # 'COMMON COUNCIL - DISCUSSION': 'commission', 39 | # 'COMMUNITY ACTION COALITION FOR SOUTH CENTRAL WISCONSIN INC': 'commission', 40 | # 'COMMUNITY DEVELOPMENT AUTHORITY': 'commission', 41 | # 'MADISON COMMUNITY FOUNDATION': 'commission', 42 | # 'MADISON FOOD POLICY COUNCIL': 'commission', 43 | # 'MADISON HOUSING AUTHORITY': 'commission', 44 | # 'PARKING COUNCIL FOR PEOPLE WITH DISABILITIES': 'commission', 45 | #} 46 | 47 | #def person_district(self, data): 48 | # '''This corresponds to the label field on organizations posts. 49 | # ''' 50 | # # First try to get it from bio. 51 | # dist = re.findall(r'District\s+\d+', data['notes']) 52 | # if dist: 53 | # return dist.pop() 54 | 55 | # # Then try website. 56 | # dist = re.findall(r'/district(\d+)/', data['website']) 57 | # if dist: 58 | # return dist.pop() 59 | 60 | # # Then email. 61 | # dist = re.findall(r'district(\d+)', data['email']) 62 | # if dist: 63 | # return dist.pop() 64 | -------------------------------------------------------------------------------- /archive/santa_fe/events.py: -------------------------------------------------------------------------------- 1 | # ~*~ encoding: utf-8 ~*~ 2 | from pupa.scrape import Scraper 3 | from pupa.scrape import Event 4 | 5 | import datetime as dt 6 | import lxml.html 7 | import re 8 | 9 | 10 | CAL_PAGE = "http://www.santafenm.gov/index.aspx?NID=1066" 11 | DT = re.compile(r"(?P