├── pupa ├── cli │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── base.py │ │ ├── party.py │ │ ├── dbinit.py │ │ ├── clean.py │ │ └── init.py │ └── __main__.py ├── ext │ ├── __init__.py │ └── ansistrm.py ├── tests │ ├── __init__.py │ ├── importers │ │ ├── __init__.py │ │ ├── test_jurisdiction_importer.py │ │ ├── test_post_importer.py │ │ ├── test_topsort.py │ │ └── test_base_importer.py │ ├── reports │ │ └── __init__.py │ ├── scrape │ │ ├── __init__.py │ │ ├── test_utils.py │ │ ├── test_jurisdiction_scrape.py │ │ ├── test_scraper.py │ │ ├── test_model_basics.py │ │ ├── test_event_scrape.py │ │ ├── test_vote_event_scrape.py │ │ ├── test_people_org_scrape.py │ │ └── test_bill_scrape.py │ ├── django_settings.py │ ├── update │ │ └── test_importer_resolution.py │ └── clean │ │ └── test_clean.py ├── migrations │ ├── __init__.py │ ├── 0005_auto_20170522_1935.py │ ├── 0006_identifier_jurisdiction.py │ ├── 0004_identifier.py │ ├── 0003_auto_20151118_0408.py │ ├── 0002_auto_20150906_1458.py │ ├── 0007_sessiondataqualityreport.py │ └── 0001_initial.py ├── scrape │ ├── schemas │ │ ├── __init__.py │ │ ├── post.py │ │ ├── membership.py │ │ ├── jurisdiction.py │ │ ├── person.py │ │ ├── organization.py │ │ ├── vote_event.py │ │ ├── common.py │ │ ├── bill.py │ │ └── event.py │ ├── __init__.py │ ├── jurisdiction.py │ ├── bill.py │ ├── event.py │ └── vote_event.py ├── __init__.py ├── reports │ ├── __init__.py │ └── session.py ├── utils │ ├── __init__.py │ ├── generic.py │ └── topsort.py ├── importers │ ├── __init__.py │ ├── jurisdiction.py │ ├── posts.py │ ├── memberships.py │ ├── people.py │ ├── bills.py │ ├── events.py │ ├── organizations.py │ └── vote_events.py ├── settings.py ├── exceptions.py ├── admin.py └── models.py ├── setup.cfg ├── run-tests.sh ├── .gitignore ├── .coveragerc ├── tox.ini ├── README.md ├── LICENSE ├── setup.py ├── .github └── workflows │ └── package.yml ├── ARCHITECTURE.md └── CHANGELOG.md /pupa/cli/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/cli/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/importers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/reports/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/scrape/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.11.0" # pragma: no cover 2 | -------------------------------------------------------------------------------- /pupa/reports/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .session import generate_session_report # noqa 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [flake8] 5 | max-line-length = 99 6 | exclude = pupa/migrations 7 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH=. 3 | pytest --cov pupa --cov-report html --ds=pupa.tests.django_settings pupa/tests 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *swp 3 | *egg-info* 4 | .tox 5 | dist 6 | .coverage 7 | htmlcov/ 8 | _data/ 9 | _cache/ 10 | build/ 11 | .cache/ 12 | .idea/ 13 | .env/ -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = pupa/tests/* 3 | pupa/ext/* 4 | pupa/cli/* 5 | [report] 6 | exclude_lines = 7 | if __name__ == .__main__.: 8 | pragma: no cover 9 | -------------------------------------------------------------------------------- /pupa/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .generic import ( 3 | _make_pseudo_id, 4 | get_pseudo_id, 5 | makedirs, 6 | JSONEncoderPlus, 7 | convert_pdf, 8 | utcnow, 9 | format_datetime, 10 | ) 11 | -------------------------------------------------------------------------------- /pupa/scrape/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .jurisdiction import 
Jurisdiction, JurisdictionScraper 3 | from .popolo import Membership, Organization, Person, Post 4 | from .vote_event import VoteEvent, OrderVoteEvent 5 | from .bill import Bill 6 | from .event import Event 7 | from .base import Scraper, BaseBillScraper 8 | -------------------------------------------------------------------------------- /pupa/importers/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .jurisdiction import JurisdictionImporter 3 | from .organizations import OrganizationImporter 4 | from .people import PersonImporter 5 | from .posts import PostImporter 6 | from .memberships import MembershipImporter 7 | from .bills import BillImporter 8 | from .vote_events import VoteEventImporter 9 | from .events import EventImporter 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37-django{22,30},flake8 3 | [testenv] 4 | deps = 5 | django22: Django==2.2 6 | django30: Django==3.0 7 | commands = 8 | pip install -e .[dev] git+https://github.com/opencivicdata/python-opencivicdata.git#egg=opencivicdata 9 | pytest pupa --ds=pupa.tests.django_settings 10 | 11 | [testenv:flake8] 12 | deps = flake8 13 | commands = flake8 pupa 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pupa: A legislative data scraping framework 2 | 3 | ![example workflow](https://github.com/opencivicdata/pupa/actions/workflows/package.yml/badge.svg?branch=master) 4 | [![Coverage Status](https://coveralls.io/repos/opencivicdata/pupa/badge.png?branch=master)](https://coveralls.io/r/opencivicdata/pupa?branch=master) 5 | [![PyPI](https://img.shields.io/pypi/v/pupa.svg)](https://pypi.python.org/pypi/pupa) 6 | 
-------------------------------------------------------------------------------- /pupa/cli/commands/base.py: -------------------------------------------------------------------------------- 1 | class BaseCommand(object): 2 | def __init__(self, subparsers): 3 | self.subparser = subparsers.add_parser(self.name, description=self.help) 4 | self.add_args() 5 | 6 | def add_args(self): 7 | pass 8 | 9 | def add_argument(self, *args, **kwargs): 10 | self.subparser.add_argument(*args, **kwargs) 11 | 12 | def handle(self, args): 13 | raise NotImplementedError("commands must implement handle(args)") 14 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/post.py: -------------------------------------------------------------------------------- 1 | from .common import links, contact_details, extras, fuzzy_date_blank 2 | 3 | schema = { 4 | "properties": { 5 | "label": {"type": "string", "minLength": 1}, 6 | "role": {"type": "string"}, 7 | "maximum_memberships": {"type": "number"}, 8 | "organization_id": {"type": "string", "minLength": 1}, 9 | "division_id": {"type": ["null", "string"], "minLength": 1}, 10 | "start_date": fuzzy_date_blank, 11 | "end_date": fuzzy_date_blank, 12 | "contact_details": contact_details, 13 | "links": links, 14 | "extras": extras, 15 | }, 16 | "type": "object", 17 | } 18 | -------------------------------------------------------------------------------- /pupa/tests/django_settings.py: -------------------------------------------------------------------------------- 1 | # django settings for tests 2 | import os 3 | 4 | SECRET_KEY = 'test' 5 | INSTALLED_APPS = ('django.contrib.contenttypes', 6 | 'opencivicdata.core.apps.BaseConfig', 7 | 'opencivicdata.legislative.apps.BaseConfig', 8 | 'pupa') 9 | DATABASES = { 10 | 'default': { 11 | 'ENGINE': 'django.contrib.gis.db.backends.postgis', 12 | 'NAME': os.getenv('POSTGRES_DB', 'test'), 13 | 'USER': os.getenv('POSTGRES_USER', 'test'), 14 | 'PASSWORD': 
os.getenv('POSTGRES_PASSWORD', 'test'), 15 | 'HOST': os.getenv('POSTGRES_HOST', 'localhost'), 16 | 'PORT': os.getenv('POSTGRES_PORT', 5432), 17 | } 18 | } 19 | MIDDLEWARE_CLASSES = () 20 | -------------------------------------------------------------------------------- /pupa/migrations/0005_auto_20170522_1935.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.10.5 on 2017-05-22 19:35 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ("pupa", "0004_identifier"), 12 | ] 13 | 14 | operations = [ 15 | migrations.AlterField( 16 | model_name="identifier", 17 | name="identifier", 18 | field=models.CharField(max_length=300), 19 | ), 20 | migrations.AlterField( 21 | model_name="identifier", 22 | name="object_id", 23 | field=models.CharField(max_length=300), 24 | ), 25 | ] 26 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pupa.cli.commands.update import override_settings 4 | 5 | 6 | class _Settings: 7 | pass 8 | 9 | 10 | @pytest.fixture 11 | def settings(): 12 | ret = _Settings() 13 | ret.foo = "bar" 14 | ret.baz = "bob" 15 | return ret 16 | 17 | 18 | def test_override_settings(settings): 19 | with override_settings(settings, {"baz": "fez"}): 20 | assert settings.foo == "bar" 21 | assert settings.baz == "fez" 22 | assert settings.foo == "bar" 23 | assert settings.baz == "bob" 24 | 25 | 26 | def test_override_settings_unset(settings): 27 | with override_settings(settings, {"qux": "fez"}): 28 | assert settings.qux == "fez" 29 | assert not hasattr(settings, "qux") 30 | -------------------------------------------------------------------------------- /pupa/importers/jurisdiction.py: 
-------------------------------------------------------------------------------- 1 | from opencivicdata.core.models import Jurisdiction 2 | from opencivicdata.legislative.models import LegislativeSession 3 | from .base import BaseImporter 4 | 5 | 6 | class JurisdictionImporter(BaseImporter): 7 | _type = "jurisdiction" 8 | model_class = Jurisdiction 9 | related_models = { 10 | "legislative_sessions": (LegislativeSession, "jurisdiction_id", {}) 11 | } 12 | merge_related = {"legislative_sessions": ["identifier"]} 13 | 14 | def get_object(self, data): 15 | return self.model_class.objects.get( 16 | division_id=data["division_id"], classification=data["classification"] 17 | ) 18 | 19 | def prepare_for_db(self, data): 20 | for s in data["legislative_sessions"]: 21 | s.pop("_scraped_name", None) 22 | return data 23 | -------------------------------------------------------------------------------- /pupa/migrations/0006_identifier_jurisdiction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.10.5 on 2017-06-15 14:07 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ("core", "0001_initial"), 13 | ("pupa", "0005_auto_20170522_1935"), 14 | ] 15 | 16 | operations = [ 17 | migrations.AddField( 18 | model_name="identifier", 19 | name="jurisdiction", 20 | field=models.ForeignKey( 21 | default="", 22 | on_delete=django.db.models.deletion.CASCADE, 23 | related_name="pupa_ids", 24 | to="core.Jurisdiction", 25 | ), 26 | preserve_default=False, 27 | ), 28 | ] 29 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/membership.py: -------------------------------------------------------------------------------- 1 | from .common import links, contact_details, extras, fuzzy_date_blank 2 | 3 
| schema = { 4 | "properties": { 5 | "label": {"type": "string"}, 6 | "role": {"type": "string"}, 7 | "person_id": {"type": ["string", "null"]}, 8 | "person_name": {"type": ["string"], "minLength": 1}, 9 | "organization_id": {"type": "string", "minLength": 1}, 10 | "post_id": {"type": ["string", "null"]}, 11 | "on_behalf_of_id": {"type": ["string", "null"]}, 12 | "start_date": fuzzy_date_blank, 13 | "end_date": fuzzy_date_blank, 14 | "contact_details": contact_details, 15 | "links": links, 16 | "extras": extras, 17 | # division & jurisdiction are additions to popolo 18 | "division_id": {"type": ["string", "null"]}, 19 | "jurisdiction_id": {"type": "string", "minLength": 1}, 20 | }, 21 | "type": "object", 22 | } 23 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/jurisdiction.py: -------------------------------------------------------------------------------- 1 | from .common import extras, fuzzy_date_blank 2 | 3 | schema = { 4 | "type": "object", 5 | "properties": { 6 | "name": {"type": "string", "minLength": 1}, 7 | "url": {"type": "string", "minLength": 1}, 8 | "classification": {"type": "string", "minLength": 1}, # TODO: enum 9 | "division_id": {"type": "string", "minLength": 1}, 10 | "legislative_sessions": { 11 | "type": "array", 12 | "items": { 13 | "type": "object", 14 | "properties": { 15 | "name": {"type": "string", "minLength": 1}, 16 | "type": {"type": "string", "enum": ["primary", "special"]}, 17 | "start_date": fuzzy_date_blank, 18 | "end_date": fuzzy_date_blank, 19 | }, 20 | }, 21 | }, 22 | "feature_flags": {"type": "array", "items": {"type": "string", "minLength": 1}}, 23 | "extras": extras, 24 | }, 25 | } 26 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/person.py: -------------------------------------------------------------------------------- 1 | from .common import ( 2 | links, 3 | contact_details, 4 | identifiers, 5 | other_names, 6 | 
sources, 7 | extras, 8 | fuzzy_date_blank, 9 | ) 10 | 11 | schema = { 12 | "properties": { 13 | "name": {"type": "string", "minLength": 1}, 14 | "other_names": other_names, 15 | "identifiers": identifiers, 16 | "sort_name": {"type": "string"}, 17 | "family_name": {"type": "string"}, 18 | "given_name": {"type": "string"}, 19 | "gender": {"type": "string"}, 20 | "birth_date": fuzzy_date_blank, 21 | "death_date": fuzzy_date_blank, 22 | "image": {"format": "uri-blank", "type": "string"}, 23 | "summary": {"type": "string"}, 24 | "biography": {"type": "string"}, 25 | "national_identity": {"type": "string"}, 26 | "contact_details": contact_details, 27 | "links": links, 28 | "sources": sources, 29 | "extras": extras, 30 | }, 31 | "type": "object", 32 | } 33 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/organization.py: -------------------------------------------------------------------------------- 1 | from .common import ( 2 | links, 3 | contact_details, 4 | identifiers, 5 | other_names, 6 | sources, 7 | extras, 8 | fuzzy_date_blank, 9 | ) 10 | from opencivicdata import common 11 | 12 | schema = { 13 | "properties": { 14 | "name": {"type": "string", "minLength": 1}, 15 | "other_names": other_names, 16 | "identifiers": identifiers, 17 | "classification": { 18 | "type": ["string", "null"], 19 | "enum": common.ORGANIZATION_CLASSIFICATIONS, 20 | }, 21 | "parent_id": { 22 | "type": ["string", "null"], 23 | }, 24 | "founding_date": fuzzy_date_blank, 25 | "dissolution_date": fuzzy_date_blank, 26 | "image": {"type": "string", "format": "uri-blank"}, 27 | "contact_details": contact_details, 28 | "links": links, 29 | "sources": sources, 30 | # added to popolo 31 | "jurisdiction_id": {"type": "string", "minLength": 1}, 32 | "division_id": {"type": ["string", "null"], "minLength": 1}, 33 | "extras": extras, 34 | }, 35 | "type": "object", 36 | } 37 | 
-------------------------------------------------------------------------------- /pupa/cli/commands/party.py: -------------------------------------------------------------------------------- 1 | import django 2 | from .base import BaseCommand 3 | from pupa.exceptions import CommandError 4 | 5 | 6 | class Command(BaseCommand): 7 | name = "party" 8 | help = "command line tool to manage parties" 9 | 10 | def add_args(self): 11 | self.add_argument("action", type=str, help="add|list") 12 | self.add_argument("party_name", type=str, nargs="?") 13 | 14 | def handle(self, args, other): 15 | django.setup() 16 | from opencivicdata.core.models import Organization 17 | 18 | if args.action == "add": 19 | o, created = Organization.objects.get_or_create( 20 | name=args.party_name, classification="party" 21 | ) 22 | if created: 23 | print("added {}".format(o)) 24 | else: 25 | print("{} already exists".format(o)) 26 | elif args.action == "list": 27 | for party in Organization.objects.filter(classification="party").order_by( 28 | "name" 29 | ): 30 | print(party.name) 31 | else: 32 | raise CommandError('party action must be "add" or "list"') 33 | -------------------------------------------------------------------------------- /pupa/importers/posts.py: -------------------------------------------------------------------------------- 1 | from opencivicdata.core.models import Post, PostContactDetail, PostLink 2 | from .base import BaseImporter 3 | 4 | 5 | class PostImporter(BaseImporter): 6 | _type = "post" 7 | model_class = Post 8 | related_models = { 9 | "contact_details": (PostContactDetail, "post_id", {}), 10 | "links": (PostLink, "post_id", {}), 11 | } 12 | 13 | def __init__(self, jurisdiction_id, org_importer): 14 | super(PostImporter, self).__init__(jurisdiction_id) 15 | self.org_importer = org_importer 16 | 17 | def prepare_for_db(self, data): 18 | data["organization_id"] = self.org_importer.resolve_json_id( 19 | data["organization_id"] 20 | ) 21 | return data 22 | 23 | def 
get_object(self, post): 24 | spec = { 25 | "organization_id": post["organization_id"], 26 | "label": post["label"], 27 | } 28 | 29 | if post["role"]: 30 | spec["role"] = post["role"] 31 | 32 | return self.model_class.objects.get(**spec) 33 | 34 | def limit_spec(self, spec): 35 | spec["organization__jurisdiction_id"] = self.jurisdiction_id 36 | return spec 37 | -------------------------------------------------------------------------------- /pupa/migrations/0004_identifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.10.5 on 2017-05-22 15:51 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ("contenttypes", "0002_remove_content_type_name"), 13 | ("pupa", "0003_auto_20151118_0408"), 14 | ] 15 | 16 | operations = [ 17 | migrations.CreateModel( 18 | name="Identifier", 19 | fields=[ 20 | ( 21 | "id", 22 | models.AutoField( 23 | auto_created=True, 24 | primary_key=True, 25 | serialize=False, 26 | verbose_name="ID", 27 | ), 28 | ), 29 | ("identifier", models.CharField(max_length=500)), 30 | ("object_id", models.PositiveIntegerField()), 31 | ( 32 | "content_type", 33 | models.ForeignKey( 34 | on_delete=django.db.models.deletion.CASCADE, 35 | to="contenttypes.ContentType", 36 | ), 37 | ), 38 | ], 39 | ), 40 | ] 41 | -------------------------------------------------------------------------------- /pupa/migrations/0003_auto_20151118_0408.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9b1 on 2015-11-18 04:08 3 | from __future__ import unicode_literals 4 | 5 | import datetime 6 | from django.db import migrations, models 7 | from django.utils.timezone import utc 8 | 9 | 10 | class Migration(migrations.Migration): 11 | 12 | 
dependencies = [ 13 | ("pupa", "0002_auto_20150906_1458"), 14 | ] 15 | 16 | operations = [ 17 | migrations.AddField( 18 | model_name="runplan", 19 | name="end_time", 20 | field=models.DateTimeField( 21 | default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc) 22 | ), 23 | preserve_default=False, 24 | ), 25 | migrations.AddField( 26 | model_name="runplan", 27 | name="exception", 28 | field=models.TextField(blank=True, default=""), 29 | ), 30 | migrations.AddField( 31 | model_name="runplan", 32 | name="start_time", 33 | field=models.DateTimeField( 34 | default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc) 35 | ), 36 | preserve_default=False, 37 | ), 38 | migrations.AddField( 39 | model_name="runplan", 40 | name="traceback", 41 | field=models.TextField(blank=True, default=""), 42 | ), 43 | ] 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-, Open Civic Data Contributors 2 | Copyright (c) 2014, Sunlight Foundation 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, 10 | this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | * Neither the name of Open Civic Data nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without 16 | specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 22 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /pupa/migrations/0002_auto_20150906_1458.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9.dev20150906080247 on 2015-09-06 14:58 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ("pupa", "0001_initial"), 13 | ] 14 | 15 | operations = [ 16 | migrations.AlterField( 17 | model_name="importobjects", 18 | name="report", 19 | field=models.ForeignKey( 20 | on_delete=django.db.models.deletion.CASCADE, 21 | related_name="imported_objects", 22 | to="pupa.RunPlan", 23 | ), 24 | ), 25 | migrations.AlterField( 26 | model_name="runplan", 27 | name="jurisdiction", 28 | field=models.ForeignKey( 29 | on_delete=django.db.models.deletion.CASCADE, 30 | related_name="runs", 31 | to="core.Jurisdiction", 32 | ), 33 | ), 34 | migrations.AlterField( 35 | model_name="scrapeobjects", 36 | name="report", 37 | field=models.ForeignKey( 38 | 
on_delete=django.db.models.deletion.CASCADE, 39 | related_name="scraped_objects", 40 | to="pupa.ScrapeReport", 41 | ), 42 | ), 43 | migrations.AlterField( 44 | model_name="scrapereport", 45 | name="plan", 46 | field=models.ForeignKey( 47 | on_delete=django.db.models.deletion.CASCADE, 48 | related_name="scrapers", 49 | to="pupa.RunPlan", 50 | ), 51 | ), 52 | ] 53 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/vote_event.py: -------------------------------------------------------------------------------- 1 | from .common import sources, extras, fuzzy_datetime_blank 2 | from opencivicdata import common 3 | 4 | 5 | schema = { 6 | "type": "object", 7 | "properties": { 8 | "identifier": {"type": "string"}, 9 | "motion_text": {"type": "string", "minLength": 1}, 10 | "motion_classification": { 11 | "items": {"type": "string", "minLength": 1}, 12 | "type": "array", 13 | }, 14 | "start_date": fuzzy_datetime_blank, 15 | "end_date": fuzzy_datetime_blank, 16 | "result": {"type": "string", "enum": common.VOTE_RESULTS}, 17 | "organization": {"type": ["string", "null"], "minLength": 1}, 18 | "legislative_session": {"type": "string", "minLength": 1}, 19 | "bill": {"type": ["string", "null"], "minLength": 1}, 20 | "bill_action": {"type": ["string", "null"], "minLength": 1}, 21 | "votes": { 22 | "items": { 23 | "type": "object", 24 | "properties": { 25 | "option": {"type": "string", "enum": common.VOTE_OPTIONS}, 26 | "voter_name": {"type": "string", "minLength": 1}, 27 | "voter_id": {"type": "string", "minLength": 1}, 28 | "note": {"type": "string"}, 29 | }, 30 | }, 31 | }, 32 | "counts": { 33 | "items": { 34 | "properties": { 35 | "option": {"type": "string", "enum": common.VOTE_OPTIONS}, 36 | "value": {"type": "integer", "minimum": 0}, 37 | }, 38 | "type": "object", 39 | }, 40 | }, 41 | "sources": sources, 42 | "extras": extras, 43 | "pupa_id": {"type": ["string", "null"], "minLength": 1}, 44 | }, 45 | } 46 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, find_packages 3 | from pupa import __version__ 4 | 5 | long_description = '' 6 | 7 | setup(name='pupa', 8 | version=__version__, 9 | packages=find_packages(), 10 | author='James Turk', 11 | author_email='james@openstates.org', 12 | license='BSD', 13 | url='https://github.com/opencivicdata/pupa/', 14 | description='scraping framework for muncipal data', 15 | long_description=long_description, 16 | platforms=['any'], 17 | zip_safe=False, 18 | entry_points='''[console_scripts] 19 | pupa = pupa.cli.__main__:main''', 20 | install_requires=[ 21 | 'Django>=2.2,<5', 22 | 'opencivicdata>=3.3.0', 23 | 'dj_database_url>=0.3.0', 24 | 'scrapelib>=1.0', 25 | 'jsonschema>=3.0.0', # TODO: Drop alpha release once stable release available 26 | 'psycopg2-binary', 27 | 'pytz', 28 | ], 29 | extras_require={ 30 | 'dev': [ 31 | 'mock', 32 | 'pytest>=3.6', 33 | 'pytest-cov', 34 | 'pytest-django', 35 | 'freezegun', 36 | 'coveralls', 37 | 'coverage<=6.5.0', 38 | 'flake8', 39 | ], 40 | }, 41 | classifiers=["Development Status :: 4 - Beta", 42 | "Intended Audience :: Developers", 43 | "License :: OSI Approved :: BSD License", 44 | "Natural Language :: English", 45 | "Operating System :: OS Independent", 46 | "Programming Language :: Python :: 3.8", 47 | "Programming Language :: Python :: 3.9", 48 | "Programming Language :: Python :: 3.10", 49 | "Topic :: Software Development :: Libraries :: Python Modules", 50 | ], 51 | ) 52 | -------------------------------------------------------------------------------- /pupa/tests/update/test_importer_resolution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import types 3 | import pytest 4 | 5 | from django.utils.module_loading import import_string 6 | 7 | from 
pupa.cli.commands import update 8 | from pupa.exceptions import CommandError 9 | 10 | 11 | @pytest.fixture(params=[ 12 | "JurisdictionImporter", 13 | "OrganizationImporter", 14 | "PersonImporter", 15 | "PostImporter", 16 | "MembershipImporter", 17 | "BillImporter", 18 | "VoteEventImporter", 19 | "EventImporter", 20 | ]) 21 | def importer_test_case(request): 22 | return request.param 23 | 24 | 25 | @pytest.fixture 26 | def custom_importer(): 27 | """ 28 | Create a module object at runtime with a single class inside, and insert it 29 | into sys.modules so import_string() can load it by dotted path. 30 | """ 31 | module_name, class_name = ["tests.fixtures.custom_importers", "MyCustomImporter"] 32 | 33 | module = types.ModuleType("tests.fixtures.custom_importers") 34 | cls = type(class_name, (), {}) 35 | setattr(module, class_name, cls) 36 | sys.modules[module_name] = module 37 | 38 | return cls, module_name 39 | 40 | 41 | def test_resolve_custom_importer(custom_importer, settings, importer_test_case): 42 | cls, module_name = custom_importer 43 | settings.IMPORTER_CLASSES = {importer_test_case: f"{module_name}.{cls.__name__}"} 44 | resolved = update.resolve_importer(importer_test_case) 45 | assert resolved is cls 46 | 47 | 48 | def test_resolve_default_importer(importer_test_case): 49 | expected_importer = import_string(f"pupa.importers.{importer_test_case}") 50 | resolved_importer = update.resolve_importer(importer_test_case) 51 | assert resolved_importer is expected_importer 52 | 53 | 54 | def test_resolve_bad_path_raises_error(settings): 55 | settings.IMPORTER_CLASSES = {"PersonImporter": "non.existent.Path"} 56 | with pytest.raises(CommandError): 57 | update.resolve_importer("PersonImporter") 58 | -------------------------------------------------------------------------------- /.github/workflows/package.yml: -------------------------------------------------------------------------------- 1 | name: Test and build Python package 2 | 3 | on: 4 | push: 5 | branches: [ 
master ] 6 | tags: 7 | - v* 8 | pull_request: 9 | branches: [ master ] 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | services: 15 | postgres: 16 | image: postgis/postgis:10-2.5 17 | env: 18 | POSTGRES_USER: test 19 | POSTGRES_DB: test 20 | POSTGRES_PASSWORD: test 21 | options: >- 22 | --health-cmd pg_isready 23 | --health-interval 10s 24 | --health-timeout 5s 25 | --health-retries 5 26 | ports: 27 | - 5432:5432 28 | strategy: 29 | matrix: 30 | python-version: ['3.8', '3.9', '3.10'] 31 | django-series: ['2.2', '3.0'] 32 | steps: 33 | - uses: actions/checkout@v2 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v2 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | - name: Install dependencies 39 | run: | 40 | sudo apt update 41 | sudo apt install -y gdal-bin 42 | pip install .[dev] --pre Django==${{ matrix.django-series }} 43 | - name: Lint with flake8 44 | run: | 45 | flake8 pupa 46 | - name: Test with pytest 47 | run: | 48 | ./run-tests.sh 49 | - name: Calculate test coverage 50 | env: 51 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 52 | run: | 53 | coveralls --service=github 54 | 55 | build: 56 | needs: test 57 | name: Build package and upload to PyPI 58 | runs-on: ubuntu-latest 59 | steps: 60 | - uses: actions/checkout@v2 61 | - name: Build and publish 62 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 63 | env: 64 | TWINE_USERNAME: __token__ 65 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 66 | run: | 67 | pip install twine wheel 68 | pip wheel -w dist --no-deps . 
def convert_pdf(filename, type="xml"):
    """
    Convert a PDF file to another format using the poppler command-line tools.

    Parameters:
        filename: path to the PDF file on disk.
        type: one of "text", "text-nolayout", "xml", or "html".
              (Name shadows the builtin ``type``; kept for backward
              compatibility with existing keyword-argument callers.)

    Returns:
        The converted document as bytes (the helper process's stdout).

    Raises:
        KeyError: if ``type`` is not a supported conversion.
        EnvironmentError: if the conversion command cannot be executed,
            e.g. poppler-utils is not installed.
    """
    commands = {
        "text": ["pdftotext", "-layout", filename, "-"],
        "text-nolayout": ["pdftotext", filename, "-"],
        "xml": ["pdftohtml", "-xml", "-stdout", filename],
        "html": ["pdftohtml", "-stdout", filename],
    }
    try:
        pipe = subprocess.Popen(
            commands[type], stdout=subprocess.PIPE, close_fds=True
        ).stdout
    except OSError as e:
        # Bug fix: the format arguments must be a tuple. Previously only
        # the joined command string was %-applied (with two placeholders),
        # so building the message raised "not enough arguments for format
        # string" and hid the real error; ``e`` became a stray second
        # positional argument to EnvironmentError.
        raise EnvironmentError(
            "error running %s, missing executable? [%s]"
            % (" ".join(commands[type]), e)
        ) from e
    data = pipe.read()
    pipe.close()
    return data
to="legislative.LegislativeSession", 50 | ), 51 | ), 52 | ], 53 | ), 54 | ] 55 | -------------------------------------------------------------------------------- /pupa/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | 5 | import dj_database_url 6 | 7 | DATABASE_URL = os.environ.get( 8 | "DATABASE_URL", "postgis://pupa:pupa@localhost/opencivicdata" 9 | ) 10 | SECRET_KEY = "non-secret" 11 | INSTALLED_APPS = ( 12 | "django.contrib.contenttypes", 13 | "opencivicdata.core.apps.BaseConfig", 14 | "opencivicdata.legislative.apps.BaseConfig", 15 | "pupa", 16 | ) 17 | 18 | ALLOWED_HOSTS = ["localhost"] 19 | SILENCED_SYSTEM_CHECKS = ["fields.E904"] 20 | 21 | # scrape settings 22 | 23 | SCRAPELIB_RPM = 60 24 | SCRAPELIB_TIMEOUT = 60 25 | SCRAPELIB_RETRY_ATTEMPTS = 3 26 | SCRAPELIB_RETRY_WAIT_SECONDS = 10 27 | SCRAPELIB_VERIFY = True 28 | 29 | CACHE_DIR = os.path.join(os.getcwd(), "_cache") 30 | SCRAPED_DATA_DIR = os.path.join(os.getcwd(), "_data") 31 | 32 | # import settings 33 | 34 | ENABLE_PEOPLE_AND_ORGS = True 35 | ENABLE_BILLS = True 36 | ENABLE_VOTES = True 37 | ENABLE_EVENTS = True 38 | 39 | IMPORT_TRANSFORMERS = {"bill": []} 40 | 41 | # Django settings 42 | DEBUG = False 43 | TEMPLATE_DEBUG = False 44 | 45 | MIDDLEWARE_CLASSES = () 46 | LOGGING = { 47 | "version": 1, 48 | "disable_existing_loggers": False, 49 | "formatters": { 50 | "standard": { 51 | "format": "%(asctime)s %(levelname)s %(name)s: %(message)s", 52 | "datefmt": "%H:%M:%S", 53 | } 54 | }, 55 | "handlers": { 56 | "default": { 57 | "level": "DEBUG", 58 | "class": "pupa.ext.ansistrm.ColorizingStreamHandler", 59 | "formatter": "standard", 60 | }, 61 | }, 62 | "loggers": { 63 | "": {"handlers": ["default"], "level": "DEBUG", "propagate": True}, 64 | "scrapelib": {"handlers": ["default"], "level": "INFO", "propagate": False}, 65 | "requests": {"handlers": ["default"], "level": "WARN", "propagate": False}, 66 | 
"boto": {"handlers": ["default"], "level": "WARN", "propagate": False}, 67 | }, 68 | } 69 | 70 | 71 | sys.path.insert(1, os.getcwd()) 72 | loader = importlib.util.find_spec("pupa_settings") 73 | if loader is None: 74 | print("no pupa_settings on path, using defaults") 75 | else: 76 | from pupa_settings import * # NOQA 77 | 78 | 79 | DATABASES = {"default": dj_database_url.parse(DATABASE_URL)} 80 | DATABASES["default"]["ENGINE"] = "django.contrib.gis.db.backends.postgis" 81 | -------------------------------------------------------------------------------- /ARCHITECTURE.md: -------------------------------------------------------------------------------- 1 | ================= 2 | pupa architecture 3 | ================= 4 | 5 | pupa.cli 6 | ======== 7 | 8 | * dbinit - initializes a postgres database for use with pupa scrapers 9 | 10 | * init - initializes a local project directory ready for people to write scrapers 11 | 12 | * update - updates data, can be run with --scrape if desire is to examine data locally 13 | 14 | pupa.ext 15 | ======== 16 | 17 | Nothing here is particularly interesting architecturally, this is where a few vendorized files 18 | live. 
19 | 20 | pupa.scrape 21 | =========== 22 | 23 | scrape.Scraper - base class for all scrapers 24 | 25 | self.info 26 | self.debug 27 | self.warning 28 | self.error 29 | self.critical 30 | 31 | self.save_object(obj) - given a scrape object saves it to disk 32 | calls obj.pre_save(jid), obj.as_dict(), and obj.validate() 33 | 34 | self.do_scrape(**kwargs) - the workhorse of the scraper, runs a scrape by calling self.scrape() 35 | passed on all arbitrary args to scrape, which can use them for discrimination 36 | 37 | self.scrape(**kwargs) - the user-implemented method where the scraper should be implemented 38 | 39 | 40 | scrape.BaseBillScraper - special helper for bill scrapers 41 | 42 | ContinueScraping - exception that can be raised to skip a bill 43 | 44 | scrape() defined to call two functions 45 | get_bill_ids(**kwargs) - returns a list of (bill_id, extras) tuples 46 | get_bill(bill_id, **extras) - either gets a bill or raises a ContinueScraping 47 | 48 | 49 | scrape.BaseModel - base class for all scrape models 50 | _type - overriden to the type (???used where???) 51 | _schema - the schema dictionary to use in validate() 52 | 53 | self._id - defaults to a UUID 54 | self._related - list of related models 55 | self._meta - ???used??? 56 | self.extras = {} - dict of all irregular fields 57 | 58 | validate() - validates against _schema 59 | as_dict() - converts to a dict, only includes properties in the schema 60 | 61 | notes: 62 | setattr is overriden to avoid setting properties that will fail on save 63 | __eq__ is overriden (???used???) 
64 | 65 | 66 | scrape.SourceMixin, ContactDetailMixin, LinkMixin, AssociatedLinkMixin 67 | various mixins that add common fields and helper methods for each of these common attributes 68 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/common.py: -------------------------------------------------------------------------------- 1 | from opencivicdata import common 2 | 3 | contact_details = { 4 | "type": "array", 5 | "items": { 6 | "type": "object", 7 | "properties": { 8 | "type": {"type": "string", "enum": common.CONTACT_TYPES}, 9 | "value": {"type": "string", "minLength": 1}, 10 | "note": {"type": "string"}, 11 | "label": {"type": "string"}, 12 | }, 13 | }, 14 | } 15 | 16 | identifiers = { 17 | "items": { 18 | "properties": { 19 | "identifier": {"type": "string", "minLength": 1}, 20 | "scheme": {"type": "string"}, 21 | } 22 | }, 23 | "type": "array", 24 | } 25 | 26 | fuzzy_date_string = {"type": "string", "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"} 27 | fuzzy_date_string_blank = { 28 | "type": "string", 29 | "pattern": "^([0-9]{4})?(-[0-9]{2}){0,2}$", 30 | } 31 | fuzzy_datetime_string_blank = { 32 | "type": "string", 33 | "pattern": ( 34 | "^([0-9]{4}((-[0-9]{2}){0,2}|(-[0-9]{2}){2}T" 35 | "[0-9]{2}(:[0-9]{2}){0,2}" 36 | "(Z|[+-][0-9]{2}(:[0-9]{2})?))?)?$" 37 | ), 38 | } 39 | fuzzy_date = {"type": [fuzzy_date_string, "date"]} 40 | fuzzy_date_blank = {"type": [fuzzy_date_string_blank, "date"]} 41 | fuzzy_datetime = {"type": [fuzzy_datetime_string_blank, "datetime"]} 42 | fuzzy_datetime_blank = {"type": [fuzzy_datetime_string_blank, "datetime"]} 43 | 44 | other_names = { 45 | "items": { 46 | "properties": { 47 | "name": {"type": "string", "minLength": 1}, 48 | "start_date": fuzzy_date_blank, 49 | "end_date": fuzzy_date_blank, 50 | "note": {"type": "string"}, 51 | }, 52 | "type": "object", 53 | }, 54 | "type": "array", 55 | } 56 | 57 | 58 | links = { 59 | "items": { 60 | "properties": { 61 | "note": {"type": 
"string"}, 62 | "url": {"format": "uri", "type": "string"}, 63 | }, 64 | "type": "object", 65 | }, 66 | "type": "array", 67 | } 68 | 69 | 70 | sources = { 71 | "items": { 72 | "properties": { 73 | "url": {"type": "string", "format": "uri"}, 74 | "note": {"type": "string"}, 75 | }, 76 | "type": "object", 77 | }, 78 | "minItems": 1, 79 | "type": "array", 80 | } 81 | 82 | extras = { 83 | "type": "object", 84 | } 85 | -------------------------------------------------------------------------------- /pupa/scrape/jurisdiction.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel, Scraper 3 | from .schemas.jurisdiction import schema 4 | from .popolo import Organization 5 | 6 | 7 | class Jurisdiction(BaseModel): 8 | """Base class for a jurisdiction""" 9 | 10 | _type = "jurisdiction" 11 | _schema = schema 12 | 13 | # schema objects 14 | classification = None 15 | name = None 16 | url = None 17 | legislative_sessions = [] 18 | feature_flags = [] 19 | extras = {} 20 | 21 | # non-db properties 22 | scrapers = {} 23 | default_scrapers = None 24 | parties = [] 25 | ignored_scraped_sessions = [] 26 | 27 | def __init__(self): 28 | super(BaseModel, self).__init__() 29 | self._related = [] 30 | self.extras = {} 31 | 32 | @property 33 | def jurisdiction_id(self): 34 | return "{}/{}".format( 35 | self.division_id.replace("ocd-division", "ocd-jurisdiction"), 36 | self.classification, 37 | ) 38 | 39 | _id = jurisdiction_id 40 | 41 | def as_dict(self): 42 | return { 43 | "_id": self.jurisdiction_id, 44 | "id": self.jurisdiction_id, 45 | "name": self.name, 46 | "url": self.url, 47 | "division_id": self.division_id, 48 | "classification": self.classification, 49 | "legislative_sessions": self.legislative_sessions, 50 | "feature_flags": self.feature_flags, 51 | "extras": self.extras, 52 | } 53 | 54 | def __str__(self): 55 | return self.name 56 | 57 | def get_organizations(self): 58 | raise NotImplementedError( 
class JurisdictionScraper(Scraper):
    """Scraper that emits the Jurisdiction object itself, then its organizations."""

    def scrape(self):
        juris = self.jurisdiction

        # the jurisdiction object comes first
        yield juris

        # then every organization the jurisdiction defines
        yield from juris.get_organizations()

        # legacy path: parties declared directly on the jurisdiction
        if juris.parties:
            warnings.warn(
                "including parties on Jurisdiction is deprecated, "
                'use "pupa party" command instead'
            )
            for party in juris.parties:
                yield Organization(classification="party", name=party["name"])
def copy_tmp(tablename):
    """Snapshot a table's rows into a scratch table named ``tmp_<tablename>``."""
    print("copying data from table " + tablename)
    statements = (
        f"DROP TABLE IF EXISTS tmp_{tablename};",
        f"CREATE TABLE tmp_{tablename} (LIKE {tablename});",
        f"INSERT INTO tmp_{tablename} SELECT * FROM {tablename};",
    )
    cursor = connection.cursor()
    for sql in statements:
        cursor.execute(sql)
def drop_tables(skip_divisions=False):
    """Drop every opencivicdata_* / pupa_* table and erase their migration rows.

    NOTE(review): ``skip_divisions`` is accepted but never referenced in the
    body; presumably it was meant to preserve the division tables during a
    partial reset — confirm intent before relying on it.
    """
    tables = connection.introspection.table_names()
    cursor = connection.cursor()
    for table in tables:
        # only touch tables owned by the OCD/pupa apps; leave Django's own alone
        if table.startswith(("opencivicdata_", "pupa_")):
            print("dropping table " + table)
            cursor.execute("DROP TABLE IF EXISTS {} CASCADE;".format(table))
    # clear the migration records so a subsequent `migrate` recreates the
    # dropped tables from scratch
    cursor.execute("DELETE FROM django_migrations WHERE app='core';")
    cursor.execute("DELETE FROM django_migrations WHERE app='legislative';")
    cursor.execute("DELETE FROM django_migrations WHERE app='pupa';")
def main():
    """Entry point for the ``pupa`` command line interface.

    Builds the argument parser, dynamically loads each subcommand module,
    configures logging (and optionally a post-mortem debugger), then
    dispatches to the chosen subcommand. Exits with status 1 on
    CommandError or an unavailable subcommand.
    """
    parser = argparse.ArgumentParser("pupa", description="pupa CLI")
    parser.add_argument("--debug", action="store_true", help="open debugger on error")
    parser.add_argument(
        "--loglevel",
        default="INFO",
        help=(
            "set log level. options are: "
            "DEBUG|INFO|WARNING|ERROR|CRITICAL "
            "(default is INFO)"
        ),
    )
    subparsers = parser.add_subparsers(dest="subcommand")

    # configure Django before model imports
    if os.environ.get("DJANGO_SETTINGS_MODULE") is None:
        os.environ["DJANGO_SETTINGS_MODULE"] = "pupa.settings"

    subcommands = {}
    for mod in COMMAND_MODULES:
        try:
            cmd = importlib.import_module(mod).Command(subparsers)
            subcommands[cmd.name] = cmd
        except ImportError as e:
            logger.error('exception "%s" prevented loading of %s module', e, mod)

    # process args
    args, other = parser.parse_known_args()

    # set log level from command line; fall back to logging.INFO (the int
    # constant, not the string "INFO") so handler_level is consistently a
    # level value even when an unrecognized name is supplied
    handler_level = getattr(logging, args.loglevel.upper(), logging.INFO)
    settings.LOGGING["handlers"]["default"]["level"] = handler_level
    logging.config.dictConfig(settings.LOGGING)

    # turn debug on
    if args.debug:
        try:
            debug_module = importlib.import_module("ipdb")
        except ImportError:
            debug_module = importlib.import_module("pdb")

        # turn on PDB-on-error mode
        # stolen from http://stackoverflow.com/questions/1237379/
        # if this causes problems in interactive mode check that page
        def _tb_info(type, value, tb):
            traceback.print_exception(type, value, tb)
            debug_module.pm()

        sys.excepthook = _tb_info

    if not args.subcommand:
        parser.print_help()
    elif args.subcommand not in subcommands:
        # the module providing this subcommand failed to import above;
        # exit cleanly instead of raising a bare KeyError
        logger.critical("subcommand %r is unavailable", args.subcommand)
        sys.exit(1)
    else:
        try:
            subcommands[args.subcommand].handle(args, other)
        except CommandError as e:
            logger.critical(str(e))
            sys.exit(1)
| 55 | def __init__(self, data, obj, data_sources=None): 56 | super(DuplicateItemError, self).__init__( 57 | "attempt to import data that would conflict with " 58 | "data already in the import: {} " 59 | "(already imported as {})\n" 60 | "obj1 sources: {}\nobj2 sources: {}".format( 61 | data, 62 | obj, 63 | list( 64 | obj.sources.values_list("url", flat=True) 65 | if hasattr(obj, "sources") 66 | else [] 67 | ), 68 | [s["url"] for s in data_sources or []], 69 | ) 70 | ) 71 | 72 | 73 | class UnresolvedIdError(DataImportError): 74 | """Attempt was made to resolve an id that has no result.""" 75 | 76 | 77 | # scrape-related errors 78 | 79 | 80 | class ScrapeError(PupaError): 81 | """A generic error related to the scrape process.""" 82 | 83 | 84 | class ScrapeValueError(PupaError, ValueError): 85 | """An invalid value was passed to a pupa scrape object.""" 86 | -------------------------------------------------------------------------------- /pupa/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from . 
class ScrapeReportInline(admin.TabularInline):
    # Read-only inline showing each scraper run's report on the RunPlan
    # admin page; rows are created by pupa itself, never through the admin.
    model = models.ScrapeReport
    readonly_fields = ("scraper", "args", "start_time", "end_time", "get_object_list")

    def has_add_permission(self, request):
        # reports are written by the update process, not by admin users
        return False

    can_delete = False

    def get_object_list(self, obj):
        """Render one "object_type (count)" line per scraped-object row."""
        return "\n".join(
            "{} ({})".format(o.object_type, o.count) for o in obj.scraped_objects.all()
        )
"bills_missing_actions", 85 | "bills_missing_sponsors", 86 | "bills_missing_versions", 87 | "votes_missing_voters", 88 | "votes_missing_bill", 89 | "votes_missing_yes_count", 90 | "votes_missing_no_count", 91 | "votes_with_bad_counts", 92 | ) 93 | list_filter = ("legislative_session__jurisdiction__name",) 94 | 95 | def jurisdiction_name(self, obj): 96 | return obj.legislative_session.jurisdiction.name 97 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_jurisdiction_importer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.scrape import Jurisdiction as JurisdictionBase 3 | from pupa.importers import JurisdictionImporter 4 | from opencivicdata.core.models import Jurisdiction, Division 5 | from opencivicdata.legislative.models import LegislativeSession 6 | 7 | 8 | class FakeJurisdiction(JurisdictionBase): 9 | division_id = "ocd-division/country:us" 10 | name = "test" 11 | url = "http://example.com" 12 | classification = "government" 13 | 14 | legislative_sessions = [ 15 | {"identifier": "2015", "name": "2015 Regular Session"}, 16 | {"identifier": "2016", "name": "2016 Regular Session"}, 17 | ] 18 | 19 | 20 | @pytest.mark.django_db 21 | def test_jurisdiction_import(): 22 | Division.objects.create(id="ocd-division/country:us", name="USA") 23 | tj = FakeJurisdiction() 24 | juris_dict = tj.as_dict() 25 | JurisdictionImporter("jurisdiction-id").import_data([juris_dict]) 26 | 27 | dbj = Jurisdiction.objects.get() 28 | assert dbj.id == tj.jurisdiction_id 29 | assert dbj.division_id == tj.division_id 30 | assert dbj.name == tj.name 31 | assert dbj.url == tj.url 32 | 33 | 34 | @pytest.mark.django_db 35 | def test_jurisdiction_update(): 36 | Division.objects.create(id="ocd-division/country:us", name="USA") 37 | tj = FakeJurisdiction() 38 | ji = JurisdictionImporter("jurisdiction-id") 39 | _, what = ji.import_item(tj.as_dict()) 40 | assert what == 
@pytest.mark.django_db
def test_jurisdiction_merge_related():
    """Legislative sessions merge additively: re-imports update and add
    sessions but never delete ones missing from the incoming data
    (deletion could cascade away bills)."""
    Division.objects.create(id="ocd-division/country:us", name="USA")
    # need to ensure legislative_sessions don't get deleted
    ji = JurisdictionImporter("jurisdiction-id")
    tj = FakeJurisdiction()
    # Bug fix: legislative_sessions is a *class* attribute on
    # FakeJurisdiction; pop()/append() below mutated it in place and
    # leaked state into every other test using the fixture class.
    # Give this instance its own copy first.
    tj.legislative_sessions = list(FakeJurisdiction.legislative_sessions)
    ji.import_item(tj.as_dict())

    assert LegislativeSession.objects.count() == 2

    # disallow deletion of legislative sessions as it can remove bills
    tj.legislative_sessions.pop()
    ji.import_item(tj.as_dict())

    # should still have two
    assert LegislativeSession.objects.count() == 2

    # now will have three
    tj.legislative_sessions.append({"identifier": "2017", "name": "2017 Session"})
    ji.import_item(tj.as_dict())
    assert LegislativeSession.objects.count() == 3

    # and test that the non-identifier fields actually update
    tj.legislative_sessions.append({"identifier": "2016", "name": "updated"})
    ji.import_item(tj.as_dict())
    assert LegislativeSession.objects.count() == 3
    assert LegislativeSession.objects.get(identifier="2016").name == "updated"
    def get_object(self, membership):
        """Find the existing Membership row matching this scraped membership.

        Builds an exact-match query spec from the identifying fields and
        returns ``self.model_class.objects.get(**spec)`` — Django's ``get``
        raises DoesNotExist on a miss (presumably treated as an insert by
        the base importer — confirm against BaseImporter).
        """
        spec = {
            "organization_id": membership["organization_id"],
            "person_id": membership["person_id"],
            "label": membership["label"],
            "role": membership["role"],
        }

        # post_id is optional - might exist in DB but not scraped here?
        if membership["post_id"]:
            spec["post_id"] = membership["post_id"]

        if membership["person_name"]:
            spec["person_name"] = membership["person_name"]

        if membership["start_date"]:
            spec["start_date"] = membership["start_date"]
        else:
            # if this is a historical role, only update historical roles
            spec["end_date"] = membership["end_date"]

        return self.model_class.objects.get(**spec)
person 68 | self.seen_person_ids.add(data["person_id"]) 69 | return data 70 | 71 | def postimport(self): 72 | person_ids = ( 73 | set(self.person_importer.json_to_db_id.values()) - self.seen_person_ids 74 | ) 75 | if person_ids: 76 | reverse_id_dict = { 77 | v: k for k, v in self.person_importer.json_to_db_id.items() 78 | } 79 | person_ids = [reverse_id_dict[pid] for pid in person_ids] 80 | raise NoMembershipsError(person_ids) 81 | -------------------------------------------------------------------------------- /pupa/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.contrib.postgres.fields import JSONField 3 | from django.contrib.contenttypes.fields import GenericForeignKey 4 | from django.contrib.contenttypes.models import ContentType 5 | from opencivicdata.core.models import Jurisdiction 6 | from opencivicdata.legislative.models import LegislativeSession 7 | 8 | 9 | OBJECT_TYPES = ( 10 | ("jurisdiction", "Jurisdiction"), 11 | ("person", "Person"), 12 | ("organization", "Organization"), 13 | ("post", "Post"), 14 | ("membership", "Membership"), 15 | ("bill", "Bill"), 16 | ("vote_event", "VoteEvent"), 17 | ("event", "Event"), 18 | ) 19 | 20 | 21 | class RunPlan(models.Model): 22 | jurisdiction = models.ForeignKey( 23 | Jurisdiction, related_name="runs", on_delete=models.CASCADE 24 | ) 25 | success = models.BooleanField(default=True) 26 | start_time = models.DateTimeField() 27 | end_time = models.DateTimeField() 28 | exception = models.TextField(blank=True, default="") 29 | traceback = models.TextField(blank=True, default="") 30 | 31 | 32 | class ScrapeReport(models.Model): 33 | plan = models.ForeignKey(RunPlan, related_name="scrapers", on_delete=models.CASCADE) 34 | scraper = models.CharField(max_length=300) 35 | args = models.CharField(max_length=300) 36 | start_time = models.DateTimeField() 37 | end_time = models.DateTimeField() 38 | 39 | 40 | class 
ScrapeObjects(models.Model):
    # per-object-type scrape counts attached to a ScrapeReport
    report = models.ForeignKey(
        ScrapeReport, related_name="scraped_objects", on_delete=models.CASCADE
    )
    object_type = models.CharField(max_length=20, choices=OBJECT_TYPES)
    count = models.PositiveIntegerField()


class ImportObjects(models.Model):
    # per-object-type insert/update/noop counts for an entire RunPlan
    report = models.ForeignKey(
        RunPlan, related_name="imported_objects", on_delete=models.CASCADE
    )
    object_type = models.CharField(max_length=20, choices=OBJECT_TYPES)
    insert_count = models.PositiveIntegerField()
    update_count = models.PositiveIntegerField()
    noop_count = models.PositiveIntegerField()
    start_time = models.DateTimeField()
    end_time = models.DateTimeField()


class Identifier(models.Model):
    # generic external identifier attached to any object via contenttypes
    identifier = models.CharField(max_length=300)
    jurisdiction = models.ForeignKey(
        Jurisdiction,
        related_name="pupa_ids",
        on_delete=models.CASCADE,
    )
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    object_id = models.CharField(max_length=300)
    content_object = GenericForeignKey("content_type", "object_id")

    def __str__(self):  # __unicode__ on Python 2
        return self.identifier


class SessionDataQualityReport(models.Model):
    # counters produced by pupa.reports.session.generate_session_report
    legislative_session = models.ForeignKey(
        LegislativeSession, on_delete=models.CASCADE
    )

    bills_missing_actions = models.PositiveIntegerField()
    bills_missing_sponsors = models.PositiveIntegerField()
    bills_missing_versions = models.PositiveIntegerField()

    votes_missing_voters = models.PositiveIntegerField()
    votes_missing_bill = models.PositiveIntegerField()
    votes_missing_yes_count = models.PositiveIntegerField()
    votes_missing_no_count = models.PositiveIntegerField()
    votes_with_bad_counts = models.PositiveIntegerField()

    # these fields store lists of names mapped to numbers of occurances
    unmatched_sponsor_people = JSONField()
    unmatched_sponsor_organizations = JSONField()
    unmatched_voters = JSONField()


# --- pupa/importers/people.py ---
from collections import defaultdict
from django.db.models import Q
from opencivicdata.core.models import (
    Person,
    PersonIdentifier,
    PersonName,
    PersonContactDetail,
    PersonLink,
    PersonSource,
)
from .base import BaseImporter
from ..exceptions import SameNameError


class PersonImporter(BaseImporter):
    _type = "person"
    model_class = Person
    # child models: field name -> (model, FK back to Person, nested related)
    related_models = {
        "identifiers": (PersonIdentifier, "person_id", {}),
        "other_names": (PersonName, "person_id", {}),
        "contact_details": (PersonContactDetail, "person_id", {}),
        "links": (PersonLink, "person_id", {}),
        "sources": (PersonSource, "person_id", {}),
    }

    def _prepare_imports(self, dicts):
        """Prepare imports, raising SameNameError for ambiguous duplicates."""
        dicts = list(super(PersonImporter, self)._prepare_imports(dicts))

        # index every incoming person under each name they are known by
        by_name = defaultdict(list)
        for _, person in dicts:
            by_name[person["name"]].append(person)
            for other in person["other_names"]:
                by_name[other["name"]].append(person)

        # check for duplicates
        for name, people in by_name.items():
            if len(people) > 1:
                for person in people:
                    # birth_date is the only accepted disambiguator; an empty
                    # one makes the shared name unresolvable
                    if person["birth_date"] == "":
                        raise SameNameError(name)

        return dicts

    def limit_spec(self, spec):
        """
        Whenever we do a Pseudo ID lookup from the database, we need to limit
        based on the memberships -> organization -> jurisdiction, so we scope
        the resolution.
49 | """ 50 | if list(spec.keys()) == ["name"]: 51 | # if we're just resolving on name, include other names and family name 52 | name = spec["name"] 53 | return (Q(name=name) | Q(other_names__name=name) | Q(family_name=name)) & Q( 54 | memberships__organization__jurisdiction_id=self.jurisdiction_id 55 | ) 56 | spec["memberships__organization__jurisdiction_id"] = self.jurisdiction_id 57 | return spec 58 | 59 | def get_object(self, person): 60 | all_names = [person["name"]] + [o["name"] for o in person["other_names"]] 61 | 62 | matches = list( 63 | self.model_class.objects.filter( 64 | Q(memberships__organization__jurisdiction_id=self.jurisdiction_id), 65 | (Q(name__in=all_names) | Q(other_names__name__in=all_names)), 66 | ).distinct("id") 67 | ) 68 | 69 | matches_length = len(matches) 70 | if matches_length == 1 and not matches[0].birth_date: 71 | return matches[0] 72 | elif matches_length == 0: 73 | raise self.model_class.DoesNotExist( 74 | "No Person: {} in {}".format(all_names, self.jurisdiction_id) 75 | ) 76 | else: 77 | # Try and match based on birth_date. 78 | if person["birth_date"]: 79 | for match in matches: 80 | if ( 81 | person["birth_date"] 82 | and match.birth_date == person["birth_date"] 83 | ): 84 | return match 85 | 86 | # If we got here, no match based on birth_date, a new person? 
87 | raise self.model_class.DoesNotExist( 88 | "No Person: {} in {} with birth_date {}".format( 89 | all_names, self.jurisdiction_id, person["birth_date"] 90 | ) 91 | ) 92 | 93 | raise SameNameError(person["name"]) 94 | -------------------------------------------------------------------------------- /pupa/reports/session.py: -------------------------------------------------------------------------------- 1 | from django.db.models import Count, Subquery, OuterRef, Q, F 2 | from opencivicdata.legislative.models import ( 3 | Bill, 4 | VoteEvent, 5 | VoteCount, 6 | PersonVote, 7 | BillSponsorship, 8 | ) 9 | from ..models import SessionDataQualityReport 10 | 11 | 12 | def _simple_count(ModelCls, session, **filter): 13 | return ( 14 | ModelCls.objects.filter(legislative_session_id=session).filter(**filter).count() 15 | ) 16 | 17 | 18 | def generate_session_report(session): 19 | report = { 20 | "bills_missing_actions": _simple_count(Bill, session, actions__isnull=True), 21 | "bills_missing_sponsors": _simple_count( 22 | Bill, session, sponsorships__isnull=True 23 | ), 24 | "bills_missing_versions": _simple_count(Bill, session, versions__isnull=True), 25 | "votes_missing_bill": _simple_count(VoteEvent, session, bill__isnull=True), 26 | "votes_missing_voters": _simple_count(VoteEvent, session, votes__isnull=True), 27 | "votes_missing_yes_count": 0, 28 | "votes_missing_no_count": 0, 29 | "votes_with_bad_counts": 0, 30 | } 31 | 32 | voteevents = VoteEvent.objects.filter(legislative_session_id=session) 33 | queryset = voteevents.annotate( 34 | yes_sum=Count("pk", filter=Q(votes__option="yes")), 35 | no_sum=Count("pk", filter=Q(votes__option="no")), 36 | other_sum=Count("pk", filter=Q(votes__option="other")), 37 | yes_count=Subquery( 38 | VoteCount.objects.filter(vote_event=OuterRef("pk"), option="yes").values( 39 | "value" 40 | ) 41 | ), 42 | no_count=Subquery( 43 | VoteCount.objects.filter(vote_event=OuterRef("pk"), option="no").values( 44 | "value" 45 | ) 46 | ), 47 | 
other_count=Subquery( 48 | VoteCount.objects.filter(vote_event=OuterRef("pk"), option="other").values( 49 | "value" 50 | ) 51 | ), 52 | ) 53 | 54 | for vote in queryset: 55 | if vote.yes_count is None: 56 | report["votes_missing_yes_count"] += 1 57 | vote.yes_count = 0 58 | if vote.no_count is None: 59 | report["votes_missing_no_count"] += 1 60 | vote.no_count = 0 61 | if vote.other_count is None: 62 | vote.other_count = 0 63 | if ( 64 | vote.yes_sum != vote.yes_count 65 | or vote.no_sum != vote.no_count 66 | or vote.other_sum != vote.other_count 67 | ): 68 | report["votes_with_bad_counts"] += 1 69 | 70 | # handle unmatched 71 | queryset = ( 72 | BillSponsorship.objects.filter( 73 | bill__legislative_session_id=session, entity_type="person", person_id=None 74 | ) 75 | .values("name") 76 | .annotate(num=Count("name")) 77 | ) 78 | report["unmatched_sponsor_people"] = { 79 | item["name"]: item["num"] for item in queryset 80 | } 81 | queryset = ( 82 | BillSponsorship.objects.filter( 83 | bill__legislative_session_id=session, 84 | entity_type="organization", 85 | person_id=None, 86 | ) 87 | .values("name") 88 | .annotate(num=Count("name")) 89 | ) 90 | report["unmatched_sponsor_organizations"] = { 91 | item["name"]: item["num"] for item in queryset 92 | } 93 | queryset = ( 94 | PersonVote.objects.filter( 95 | vote_event__legislative_session_id=session, voter__isnull=True 96 | ) 97 | .values(name=F("voter_name")) 98 | .annotate(num=Count("voter_name")) 99 | ) 100 | report["unmatched_voters"] = {item["name"]: item["num"] for item in queryset} 101 | 102 | return SessionDataQualityReport(legislative_session_id=session, **report) 103 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_scraper.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | from pupa.scrape import Person, Organization, Bill, Jurisdiction 4 | from pupa.scrape.base import Scraper, 
ScrapeError, BaseBillScraper

# Unit tests for Scraper.save_object / do_scrape and BaseBillScraper,
# with json.dump mocked so nothing is written to disk.


class FakeJurisdiction(Jurisdiction):
    # minimal jurisdiction stub shared by all tests below
    jurisdiction_id = "jurisdiction"


juris = FakeJurisdiction()


def test_save_object_basics():
    # ensure that save object dumps a file
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    p.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    # ensure object is saved in right place
    filename = "person_" + p._id + ".json"
    assert filename in s.output_names["person"]
    json_dump.assert_called_once_with(p.as_dict(), mock.ANY, cls=mock.ANY)


def test_save_object_invalid():
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    # no source, won't validate

    with pytest.raises(ValueError):
        s.save_object(p)


def test_save_related():
    # related objects appended to _related are saved alongside the parent
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    p.add_source("http://example.com")
    o = Organization("Chicago Bulls", classification="committee")
    o.add_source("http://example.com")
    p._related.append(o)

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    assert json_dump.mock_calls == [
        mock.call(p.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(o.as_dict(), mock.ANY, cls=mock.ANY),
    ]


def test_simple_scrape():
    class FakeScraper(Scraper):
        def scrape(self):
            p = Person("Michael Jordan")
            p.add_source("http://example.com")
            yield p

    with mock.patch("json.dump") as json_dump:
        record = FakeScraper(juris, "/tmp/").do_scrape()

    assert len(json_dump.mock_calls) == 1
    assert record["objects"]["person"] == 1
    assert record["end"] > record["start"]
    assert record["skipped"] == 0


def test_double_iter():
    """tests that scrapers that yield iterables work OK"""

    class IterScraper(Scraper):
        def scrape(self):
            yield self.scrape_people()

        def scrape_people(self):
            p = Person("Michael Jordan")
            p.add_source("http://example.com")
            yield p

    with mock.patch("json.dump") as json_dump:
        record = IterScraper(juris, "/tmp/").do_scrape()

    assert len(json_dump.mock_calls) == 1
    assert record["objects"]["person"] == 1


def test_no_objects():
    # a scrape that yields nothing is an error
    class NullScraper(Scraper):
        def scrape(self):
            pass

    with pytest.raises(ScrapeError):
        NullScraper(juris, "/tmp/", fastmode=True).do_scrape()


def test_no_scrape():
    # scrape() must be overridden
    class NonScraper(Scraper):
        pass

    with pytest.raises(NotImplementedError):
        NonScraper(juris, "/tmp/").do_scrape()


def test_bill_scraper():
    # ContinueScraping skips a bill and increments the skipped counter
    class BillScraper(BaseBillScraper):
        def get_bill_ids(self):
            yield "1", {"extra": "param"}
            yield "2", {}

        def get_bill(self, bill_id, **kwargs):
            if bill_id == "1":
                assert kwargs == {"extra": "param"}
                raise self.ContinueScraping
            else:
                assert bill_id == "2"
                assert kwargs == {}
                b = Bill("1", self.legislative_session, "title")
                b.add_source("http://example.com")
                return b

    bs = BillScraper(juris, "/tmp/")
    with mock.patch("json.dump") as json_dump:
        record = bs.do_scrape(legislative_session="2020")

    assert len(json_dump.mock_calls) == 1
    assert record["objects"]["bill"] == 1
    assert record["skipped"] == 1


# --- pupa/cli/commands/clean.py ---
from datetime import datetime, timezone, timedelta
import sys

import django
from django.apps import apps
from .base import BaseCommand


def get_subclasses(app_list, abstract_class):
    """
    Finds and returns all subclasses of an abstract class.
12 | """ 13 | result = [] 14 | for app in app_list: 15 | for model in apps.get_app_config(app).get_models(): 16 | if issubclass(model, abstract_class) and model is not abstract_class: 17 | result.append(model) 18 | return result 19 | 20 | 21 | class Command(BaseCommand): 22 | name = "clean" 23 | help = "Removes database objects that haven't been seen in recent scrapes" 24 | 25 | def add_args(self): 26 | self.add_argument( 27 | "--window", 28 | type=int, 29 | default=7, 30 | help=( 31 | "objects not seen in this many days will be deleted from the database" 32 | ), 33 | ) 34 | self.add_argument( 35 | "--max", 36 | type=int, 37 | default=10, 38 | help="max number of objects to delete without triggering failsafe", 39 | ) 40 | self.add_argument( 41 | "--report", 42 | action="store_true", 43 | help=( 44 | "generate a report of what objects this command" 45 | " would delete without making any changes to the database" 46 | ), 47 | ) 48 | self.add_argument( 49 | "--yes", 50 | action="store_true", 51 | help="assumes an answer of 'yes' to all interactive prompts", 52 | default=False, 53 | ) 54 | 55 | def get_stale_objects(self, window): 56 | """ 57 | Find all database objects that haven't seen been in {window} days. 58 | """ 59 | 60 | from opencivicdata.core.models.base import OCDBase 61 | 62 | ocd_apps = ["core", "legislative"] 63 | # Check all subclasses of OCDBase 64 | models = get_subclasses(ocd_apps, OCDBase) 65 | # Top-level models are protected from deletion 66 | protected_models = ("Division", "Jurisdiction", "Post") 67 | 68 | for model in models: 69 | if model.__name__ not in protected_models: 70 | cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=window) 71 | yield from model.objects.filter(last_seen__lte=cutoff_date).iterator() 72 | 73 | def remove_stale_objects(self, window): 74 | """ 75 | Remove all database objects that haven't seen been in {window} days. 
76 | """ 77 | 78 | for obj in self.get_stale_objects(window): 79 | print(f"Deleting {obj}...") 80 | obj.delete() 81 | 82 | def report_stale_objects(self, window): 83 | """ 84 | Print all database objects that haven't seen been in {window} days. 85 | """ 86 | for obj in self.get_stale_objects(window): 87 | print(obj) 88 | 89 | def handle(self, args, other): 90 | django.setup() 91 | 92 | if args.report: 93 | print( 94 | "These objects have not been seen in a scrape within the last" 95 | f" {args.window} days:" 96 | ) 97 | self.report_stale_objects(args.window) 98 | else: 99 | stale_objects = list(self.get_stale_objects(args.window)) 100 | num_stale_objects = len(stale_objects) 101 | 102 | print( 103 | f"{num_stale_objects} objects in your database have not been seen " 104 | f"in {args.window} days." 105 | ) 106 | 107 | if num_stale_objects > args.max: 108 | print( 109 | f"{num_stale_objects} exceeds the failsafe limit of {args.max}. " 110 | "Run this command with a larger --max value to proceed." 111 | ) 112 | sys.exit() 113 | 114 | if args.yes: 115 | print("Proceeding to deletion because you specified --yes.") 116 | 117 | else: 118 | print(f"Permanently delete {num_stale_objects} objects? 
[Y/n]") 119 | response = input() 120 | 121 | if args.yes or response == "Y": 122 | self.remove_stale_objects(args.window) 123 | print(f"Removed {num_stale_objects} from your database.") 124 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_post_importer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.scrape import Post as ScrapePost 3 | from pupa.importers import PostImporter, OrganizationImporter 4 | from opencivicdata.core.models import Organization, Post, Division, Jurisdiction 5 | import datetime 6 | 7 | 8 | def create_jurisdictions(): 9 | Division.objects.create(id="ocd-division/country:us", name="USA") 10 | Division.objects.create(id="ocd-division/country:us/state:nc", name="NC") 11 | Jurisdiction.objects.create(id="us", division_id="ocd-division/country:us") 12 | Jurisdiction.objects.create(id="nc", division_id="ocd-division/country:us/state:nc") 13 | 14 | 15 | @pytest.mark.django_db 16 | def test_full_post(): 17 | create_jurisdictions() 18 | org = Organization.objects.create( 19 | name="United States Executive Branch", 20 | classification="executive", 21 | jurisdiction_id="us", 22 | ) 23 | post = ScrapePost( 24 | label="executive", 25 | role="President", 26 | organization_id='~{"classification": "executive"}', 27 | start_date=datetime.date(2015, 5, 18), 28 | end_date="2015-05-19", 29 | maximum_memberships=2, 30 | ) 31 | post.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") 32 | post.add_link("http://example.com/link") 33 | 34 | # import post 35 | oi = OrganizationImporter("us") 36 | PostImporter("jurisdiction-id", oi).import_data([post.as_dict()]) 37 | print(post.as_dict()) 38 | 39 | # get person from db and assert it imported correctly 40 | p = Post.objects.get() 41 | assert "ocd-post" in p.id 42 | assert p.label == post.label 43 | assert p.role == post.role 44 | assert p.organization_id == org.id 45 | 
assert p.maximum_memberships == 2 46 | 47 | assert p.contact_details.all()[0].type == "phone" 48 | assert p.contact_details.all()[0].value == "555-555-1234" 49 | assert p.contact_details.all()[0].note == "this is fake" 50 | 51 | assert p.links.all()[0].url == "http://example.com/link" 52 | 53 | assert p.start_date == "2015-05-18" 54 | assert p.end_date == "2015-05-19" 55 | 56 | 57 | @pytest.mark.django_db 58 | def test_deduplication(): 59 | create_jurisdictions() 60 | Organization.objects.create( 61 | id="us", 62 | name="United States Executive Branch", 63 | classification="executive", 64 | jurisdiction_id="us", 65 | ) 66 | Organization.objects.create( 67 | id="nc", 68 | name="North Carolina Executive Branch", 69 | classification="executive", 70 | jurisdiction_id="nc", 71 | ) 72 | pres = ScrapePost( 73 | label="executive", 74 | role="President", 75 | organization_id='~{"classification": "executive"}', 76 | ) 77 | vp = ScrapePost( 78 | label="vice-executive", 79 | role="Vice President", 80 | organization_id='~{"classification": "executive"}', 81 | ) 82 | gov = ScrapePost( 83 | label="executive", 84 | role="Governor", 85 | organization_id='~{"classification": "executive"}', 86 | ) 87 | 88 | # ensure pres, vp and gov are all imported 89 | # pres & gov - same label, different jurisdiction 90 | # vp & pres - same jurisdiction, different label 91 | us_oi = OrganizationImporter("us") 92 | nc_oi = OrganizationImporter("nc") 93 | PostImporter("us", us_oi).import_data([pres.as_dict(), vp.as_dict()]) 94 | PostImporter("nc", nc_oi).import_data([gov.as_dict()]) 95 | assert Post.objects.count() == 3 96 | 97 | 98 | @pytest.mark.django_db 99 | def test_resolve_special_json_id(): 100 | create_jurisdictions() 101 | Organization.objects.create( 102 | id="us", 103 | name="United States Executive Branch", 104 | classification="executive", 105 | jurisdiction_id="us", 106 | ) 107 | Organization.objects.create( 108 | id="nc", 109 | name="North Carolina Executive Branch", 110 | 
classification="executive", 111 | jurisdiction_id="nc", 112 | ) 113 | Post.objects.create( 114 | id="pres", label="executive", role="President", organization_id="us" 115 | ) 116 | Post.objects.create( 117 | id="vpres", label="vice-executive", role="Vice President", organization_id="us" 118 | ) 119 | Post.objects.create( 120 | id="gov", label="executive", role="Governor", organization_id="nc" 121 | ) 122 | 123 | oi = OrganizationImporter("") 124 | assert PostImporter("us", oi).resolve_json_id('~{"label": "executive"}') == "pres" 125 | assert ( 126 | PostImporter("us", oi).resolve_json_id('~{"label": "vice-executive"}') 127 | == "vpres" 128 | ) 129 | assert PostImporter("nc", oi).resolve_json_id('~{"label": "executive"}') == "gov" 130 | -------------------------------------------------------------------------------- /pupa/importers/bills.py: -------------------------------------------------------------------------------- 1 | from opencivicdata.legislative.models import ( 2 | Bill, 3 | RelatedBill, 4 | BillAbstract, 5 | BillTitle, 6 | BillIdentifier, 7 | BillAction, 8 | BillActionRelatedEntity, 9 | BillSponsorship, 10 | BillSource, 11 | BillDocument, 12 | BillVersion, 13 | BillDocumentLink, 14 | BillVersionLink, 15 | ) 16 | from .base import BaseImporter 17 | from ..exceptions import PupaInternalError 18 | 19 | 20 | class BillImporter(BaseImporter): 21 | _type = "bill" 22 | model_class = Bill 23 | related_models = { 24 | "abstracts": (BillAbstract, "bill_id", {}), 25 | "other_titles": (BillTitle, "bill_id", {}), 26 | "other_identifiers": (BillIdentifier, "bill_id", {}), 27 | "actions": ( 28 | BillAction, 29 | "bill_id", 30 | {"related_entities": (BillActionRelatedEntity, "action_id", {})}, 31 | ), 32 | "related_bills": (RelatedBill, "bill_id", {}), 33 | "sponsorships": (BillSponsorship, "bill_id", {}), 34 | "sources": (BillSource, "bill_id", {}), 35 | "documents": ( 36 | BillDocument, 37 | "bill_id", 38 | {"links": (BillDocumentLink, "document_id", {})}, 39 | ), 40 | 
"versions": ( 41 | BillVersion, 42 | "bill_id", 43 | {"links": (BillVersionLink, "version_id", {})}, 44 | ), 45 | } 46 | preserve_order = {"actions"} 47 | 48 | def __init__(self, jurisdiction_id, org_importer, person_importer): 49 | super(BillImporter, self).__init__(jurisdiction_id) 50 | self.org_importer = org_importer 51 | self.person_importer = person_importer 52 | 53 | def get_object(self, bill): 54 | spec = { 55 | "legislative_session_id": bill["legislative_session_id"], 56 | "identifier": bill["identifier"], 57 | } 58 | if "from_organization_id" in bill: 59 | spec["from_organization_id"] = bill["from_organization_id"] 60 | 61 | return self.model_class.objects.prefetch_related( 62 | "actions__related_entities", 63 | "versions__links", 64 | "documents__links", 65 | ).get(**spec) 66 | 67 | def limit_spec(self, spec): 68 | spec["legislative_session__jurisdiction_id"] = self.jurisdiction_id 69 | return spec 70 | 71 | def prepare_for_db(self, data): 72 | data["legislative_session_id"] = self.get_session_id( 73 | data.pop("legislative_session") 74 | ) 75 | 76 | if data["from_organization"]: 77 | data["from_organization_id"] = self.org_importer.resolve_json_id( 78 | data.pop("from_organization") 79 | ) 80 | 81 | for action in data["actions"]: 82 | action["organization_id"] = self.org_importer.resolve_json_id( 83 | action["organization_id"] 84 | ) 85 | for entity in action["related_entities"]: 86 | if "organization_id" in entity: 87 | entity["organization_id"] = self.org_importer.resolve_json_id( 88 | entity["organization_id"] 89 | ) 90 | elif "person_id" in entity: 91 | entity["person_id"] = self.person_importer.resolve_json_id( 92 | entity["person_id"] 93 | ) 94 | 95 | for sponsor in data["sponsorships"]: 96 | if "person_id" in sponsor: 97 | sponsor["person_id"] = self.person_importer.resolve_json_id( 98 | sponsor["person_id"], allow_no_match=True 99 | ) 100 | 101 | if "organization_id" in sponsor: 102 | sponsor["organization_id"] = 
self.org_importer.resolve_json_id( 103 | sponsor["organization_id"], allow_no_match=True 104 | ) 105 | 106 | return data 107 | 108 | def postimport(self): 109 | # go through all RelatedBill objs that are attached to a bill in this 110 | # jurisdiction and are currently unresolved 111 | for rb in RelatedBill.objects.filter( 112 | bill__legislative_session__jurisdiction_id=self.jurisdiction_id, 113 | related_bill=None, 114 | ): 115 | candidates = list( 116 | Bill.objects.filter( 117 | legislative_session__identifier=rb.legislative_session, 118 | legislative_session__jurisdiction_id=self.jurisdiction_id, 119 | identifier=rb.identifier, 120 | ) 121 | ) 122 | if len(candidates) == 1: 123 | rb.related_bill = candidates[0] 124 | rb.save() 125 | elif len(candidates) > 1: # pragma: no cover 126 | # if we ever see this, we need to add additional fields on the relation 127 | raise PupaInternalError( 128 | "multiple related_bill candidates found for {}".format(rb) 129 | ) 130 | -------------------------------------------------------------------------------- /pupa/importers/events.py: -------------------------------------------------------------------------------- 1 | from .base import BaseImporter 2 | from pupa.utils import get_pseudo_id, _make_pseudo_id 3 | from opencivicdata.legislative.models import ( 4 | Event, 5 | EventLocation, 6 | EventSource, 7 | EventDocument, 8 | EventDocumentLink, 9 | EventLink, 10 | EventParticipant, 11 | EventMedia, 12 | EventMediaLink, 13 | EventAgendaItem, 14 | EventRelatedEntity, 15 | EventAgendaMedia, 16 | EventAgendaMediaLink, 17 | ) 18 | 19 | 20 | class EventImporter(BaseImporter): 21 | _type = "event" 22 | model_class = Event 23 | related_models = { 24 | "sources": (EventSource, "event_id", {}), 25 | "documents": ( 26 | EventDocument, 27 | "event_id", 28 | {"links": (EventDocumentLink, "document_id", {})}, 29 | ), 30 | "links": (EventLink, "event_id", {}), 31 | "participants": (EventParticipant, "event_id", {}), 32 | "media": ( 33 | 
EventMedia,
            "event_id",
            {
                "links": (EventMediaLink, "media_id", {}),
            },
        ),
        "agenda": (
            EventAgendaItem,
            "event_id",
            {
                "related_entities": (EventRelatedEntity, "agenda_item_id", {}),
                "media": (
                    EventAgendaMedia,
                    "agenda_item_id",
                    {
                        "links": (EventAgendaMediaLink, "media_id", {}),
                    },
                ),
            },
        ),
    }
    # agenda items must keep their scraped ordering
    preserve_order = ("agenda",)

    def __init__(
        self,
        jurisdiction_id,
        org_importer,
        person_importer,
        bill_importer,
        vote_event_importer,
    ):
        """Store sibling importers used to resolve scrape-time JSON ids."""
        super(EventImporter, self).__init__(jurisdiction_id)
        self.org_importer = org_importer
        self.person_importer = person_importer
        self.bill_importer = bill_importer
        self.vote_event_importer = vote_event_importer

    def get_object(self, event):
        """Match an existing Event by pupa_id if present, else by natural key.

        Returns None when a pupa_id is supplied but unknown (treated as an
        insert by the base importer).
        """
        if event.get("pupa_id"):
            e_id = self.lookup_obj_id(event["pupa_id"], Event)
            if e_id:
                spec = {"id": e_id}
            else:
                return None
        else:
            spec = {
                "name": event["name"],
                "description": event["description"],
                "start_date": event["start_date"],
                "end_date": event["end_date"],
                "jurisdiction_id": self.jurisdiction_id,
            }
        return self.model_class.objects.get(**spec)

    def get_location(self, location_data):
        """Get or create the EventLocation row for the scraped location."""
        obj, created = EventLocation.objects.get_or_create(
            name=location_data["name"],
            url=location_data.get("url", ""),
            jurisdiction_id=self.jurisdiction_id,
        )
        # TODO: geocode here?
        return obj

    def prepare_for_db(self, data):
        """Resolve location, participant and agenda entity ids for storage."""
        data["jurisdiction_id"] = self.jurisdiction_id
        if data["location"]:
            data["location"] = self.get_location(data["location"])

        data["start_date"] = data["start_date"]
        data["end_date"] = data.get("end_date", "")

        for participant in data["participants"]:
            if "person_id" in participant:
                participant["person_id"] = self.person_importer.resolve_json_id(
                    participant["person_id"], allow_no_match=True
                )
            elif "organization_id" in participant:
                participant["organization_id"] = self.org_importer.resolve_json_id(
                    participant["organization_id"], allow_no_match=True
                )

        for item in data["agenda"]:
            for entity in item["related_entities"]:
                if "person_id" in entity:
                    entity["person_id"] = self.person_importer.resolve_json_id(
                        entity["person_id"], allow_no_match=True
                    )
                elif "organization_id" in entity:
                    entity["organization_id"] = self.org_importer.resolve_json_id(
                        entity["organization_id"], allow_no_match=True
                    )
                elif "bill_id" in entity:
                    # unpack and repack bill psuedo id in case filters alter it
                    bill = get_pseudo_id(entity["bill_id"])
                    self.bill_importer.apply_transformers(bill)
                    bill = _make_pseudo_id(**bill)
                    entity["bill_id"] = self.bill_importer.resolve_json_id(
                        bill, allow_no_match=True
                    )
                elif "vote_event_id" in entity:
                    entity["vote_event_id"] = self.vote_event_importer.resolve_json_id(
                        entity["vote_event_id"], allow_no_match=True
                    )

        return data


# --- pupa/importers/organizations.py ---
from django.db.models import Q
from opencivicdata.core.models import (
    Organization,
    OrganizationIdentifier,
    OrganizationName,
OrganizationContactDetail,
    OrganizationLink,
    OrganizationSource,
)
from .base import BaseImporter
from ..utils import get_pseudo_id
from ..utils.topsort import Network
from ..exceptions import UnresolvedIdError, PupaInternalError, SameOrgNameError


class OrganizationImporter(BaseImporter):
    _type = "organization"
    model_class = Organization
    # child models: field name -> (model, FK back to Organization, nested)
    related_models = {
        "identifiers": (OrganizationIdentifier, "organization_id", {}),
        "other_names": (OrganizationName, "organization_id", {}),
        "contact_details": (OrganizationContactDetail, "organization_id", {}),
        "links": (OrganizationLink, "organization_id", {}),
        "sources": (OrganizationSource, "organization_id", {}),
    }

    def get_object(self, org):
        """Find an existing Organization by classification/parent and any name.

        Raises Organization.DoesNotExist when nothing matches and
        SameOrgNameError on ambiguous multiple matches.
        """
        spec = {"classification": org["classification"], "parent_id": org["parent_id"]}

        # add jurisdiction_id unless this is a party
        jid = org.get("jurisdiction_id")
        if jid:
            spec["jurisdiction_id"] = jid

        all_names = [org["name"]] + [o["name"] for o in org["other_names"]]

        query = Q(**spec) & (Q(name__in=all_names) | Q(other_names__name__in=all_names))
        matches = list(self.model_class.objects.filter(query).distinct("id"))
        matches_length = len(matches)
        if matches_length == 1:
            return matches[0]
        elif matches_length == 0:
            raise self.model_class.DoesNotExist(
                "No Organization: {} in {}".format(all_names, self.jurisdiction_id)
            )
        else:
            raise SameOrgNameError(org["name"])

    def prepare_for_db(self, data):
        """Resolve the parent id; parties are not tied to a jurisdiction."""
        data["parent_id"] = self.resolve_json_id(data["parent_id"])

        if data["classification"] != "party":
            data["jurisdiction_id"] = self.jurisdiction_id
        return data

    def limit_spec(self, spec):
        """Scope pseudo-id lookups to this jurisdiction (except parties)."""
        if spec.get("classification") != "party":
            spec["jurisdiction_id"] = self.jurisdiction_id

        name = spec.pop("name", None)
        if name:
            # name may match the primary name or any alternate name
            return Q(**spec) & (Q(name=name) | Q(other_names__name=name))
        return spec

    def _prepare_imports(self, dicts):
        """an override for prepare imports that sorts the imports
        by parent_id dependencies"""
        # all pseudo parent ids we've seen
        pseudo_ids = set()
        # pseudo matches
        pseudo_matches = {}

        # get prepared imports from parent
        prepared = dict(super(OrganizationImporter, self)._prepare_imports(dicts))

        # collect parent pseudo_ids
        for _, data in prepared.items():
            parent_id = data.get("parent_id", None) or ""
            if parent_id.startswith("~"):
                pseudo_ids.add(parent_id)

        # turn pseudo_ids into a tuple of dictionaries
        pseudo_ids = [(ppid, get_pseudo_id(ppid)) for ppid in pseudo_ids]

        # loop over all data again, finding the pseudo ids true json id
        for json_id, data in prepared.items():
            # check if this matches one of our ppids
            for ppid, spec in pseudo_ids:
                match = True
                for k, v in spec.items():
                    if data[k] != v:
                        match = False
                        break
                if match:
                    if ppid in pseudo_matches:
                        raise UnresolvedIdError(
                            "multiple matches for pseudo id: " + ppid
                        )
                    pseudo_matches[ppid] = json_id

        # toposort the nodes so parents are imported first
        network = Network()
        in_network = set()
        import_order = []

        for json_id, data in prepared.items():
            parent_id = data.get("parent_id", None)

            # resolve pseudo_ids to their json id before building the network
            if parent_id in pseudo_matches:
                parent_id = pseudo_matches[parent_id]

            network.add_node(json_id)
            if parent_id:
                # Right. There's an import dep. We need to add the edge from
                # the parent to the current node, so that we import the parent
                # before the current node.
118 | network.add_edge(parent_id, json_id) 119 | 120 | # resolve the sorted import order 121 | for jid in network.sort(): 122 | import_order.append((jid, prepared[jid])) 123 | in_network.add(jid) 124 | 125 | # ensure all data made it into network (paranoid check, should never fail) 126 | if in_network != set(prepared.keys()): # pragma: no cover 127 | raise PupaInternalError("import is missing nodes in network set") 128 | 129 | return import_order 130 | -------------------------------------------------------------------------------- /pupa/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ("core", "0001_initial"), 11 | ("legislative", "0001_initial"), 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name="ImportObjects", 17 | fields=[ 18 | ( 19 | "id", 20 | models.AutoField( 21 | primary_key=True, 22 | auto_created=True, 23 | serialize=False, 24 | verbose_name="ID", 25 | ), 26 | ), 27 | ( 28 | "object_type", 29 | models.CharField( 30 | max_length=20, 31 | choices=[ 32 | ("jurisdiction", "Jurisdiction"), 33 | ("person", "Person"), 34 | ("organization", "Organization"), 35 | ("post", "Post"), 36 | ("membership", "Membership"), 37 | ("bill", "Bill"), 38 | ("vote_event", "VoteEvent"), 39 | ("event", "Event"), 40 | ], 41 | ), 42 | ), 43 | ("insert_count", models.PositiveIntegerField()), 44 | ("update_count", models.PositiveIntegerField()), 45 | ("noop_count", models.PositiveIntegerField()), 46 | ("start_time", models.DateTimeField()), 47 | ("end_time", models.DateTimeField()), 48 | ], 49 | options={}, 50 | bases=(models.Model,), 51 | ), 52 | migrations.CreateModel( 53 | name="RunPlan", 54 | fields=[ 55 | ( 56 | "id", 57 | models.AutoField( 58 | primary_key=True, 59 | auto_created=True, 60 | 
serialize=False, 61 | verbose_name="ID", 62 | ), 63 | ), 64 | ("success", models.BooleanField(default=True)), 65 | ( 66 | "jurisdiction", 67 | models.ForeignKey(to="core.Jurisdiction", on_delete=models.CASCADE), 68 | ), 69 | ], 70 | options={}, 71 | bases=(models.Model,), 72 | ), 73 | migrations.AddField( 74 | model_name="importobjects", 75 | name="report", 76 | field=models.ForeignKey(to="pupa.RunPlan", on_delete=models.CASCADE), 77 | preserve_default=True, 78 | ), 79 | migrations.CreateModel( 80 | name="ScrapeObjects", 81 | fields=[ 82 | ( 83 | "id", 84 | models.AutoField( 85 | primary_key=True, 86 | auto_created=True, 87 | serialize=False, 88 | verbose_name="ID", 89 | ), 90 | ), 91 | ( 92 | "object_type", 93 | models.CharField( 94 | max_length=20, 95 | choices=[ 96 | ("jurisdiction", "Jurisdiction"), 97 | ("person", "Person"), 98 | ("organization", "Organization"), 99 | ("post", "Post"), 100 | ("membership", "Membership"), 101 | ("bill", "Bill"), 102 | ("vote_event", "VoteEvent"), 103 | ("event", "Event"), 104 | ], 105 | ), 106 | ), 107 | ("count", models.PositiveIntegerField()), 108 | ], 109 | options={}, 110 | bases=(models.Model,), 111 | ), 112 | migrations.CreateModel( 113 | name="ScrapeReport", 114 | fields=[ 115 | ( 116 | "id", 117 | models.AutoField( 118 | primary_key=True, 119 | auto_created=True, 120 | serialize=False, 121 | verbose_name="ID", 122 | ), 123 | ), 124 | ("scraper", models.CharField(max_length=300)), 125 | ("args", models.CharField(max_length=300)), 126 | ("start_time", models.DateTimeField()), 127 | ("end_time", models.DateTimeField()), 128 | ( 129 | "plan", 130 | models.ForeignKey(to="pupa.RunPlan", on_delete=models.CASCADE), 131 | ), 132 | ], 133 | options={}, 134 | bases=(models.Model,), 135 | ), 136 | migrations.AddField( 137 | model_name="scrapeobjects", 138 | name="report", 139 | field=models.ForeignKey(to="pupa.ScrapeReport", on_delete=models.CASCADE), 140 | preserve_default=True, 141 | ), 142 | ] 143 | 
"""
Schema for bill objects.
"""

from .common import sources, extras, fuzzy_date_blank, fuzzy_datetime
from opencivicdata import common

# Shared subschema: bill "versions" and "documents" have an identical shape --
# a non-empty note, an optionally-fuzzy (and possibly blank) date, and a list
# of typed links.
versions_or_documents = {
    "items": {
        "properties": {
            "note": {"type": "string", "minLength": 1},
            "date": fuzzy_date_blank,
            "links": {
                "items": {
                    "properties": {
                        "media_type": {"type": "string"},
                        "url": {"type": "string", "format": "uri"},
                    },
                    "type": "object",
                },
                "type": "array",
            },
        },
        "type": "object",
    },
    "type": "array",
}

# JSON schema used to validate scraped Bill objects before import.
schema = {
    "type": "object",
    "properties": {
        "legislative_session": {"type": "string", "minLength": 1},
        "identifier": {"type": "string", "minLength": 1},
        "title": {"type": "string", "minLength": 1},
        # pseudo-id string, resolved to an Organization at import time
        "from_organization": {"type": ["string", "null"]},
        "classification": {
            "items": {"type": "string", "enum": common.BILL_CLASSIFICATIONS},
            "type": "array",
        },
        "subject": {"items": {"type": "string", "minLength": 1}, "type": "array"},
        "abstracts": {
            "items": {
                "properties": {
                    "abstract": {"type": "string", "minLength": 1},
                    "note": {"type": "string"},
                    # NOTE(review): plain string here, unlike the fuzzy-date
                    # types used elsewhere in this schema -- confirm whether
                    # that is intentional before tightening.
                    "date": {"type": "string"},
                },
                "type": "object",
            },
            "type": "array",
        },
        "other_titles": {
            "items": {
                "properties": {
                    "title": {"type": "string", "minLength": 1},
                    "note": {"type": "string"},
                },
                "type": "object",
            },
            "type": "array",
        },
        "other_identifiers": {
            "items": {
                "properties": {
                    "identifier": {"type": "string", "minLength": 1},
                    "note": {"type": "string"},
                    "scheme": {"type": "string"},
                },
                "type": "object",
            },
            "type": "array",
        },
        "actions": {
            "items": {
                "properties": {
                    # pseudo-id of the acting organization
                    "organization": {"type": ["string", "null"]},
                    "date": fuzzy_datetime,
                    "description": {"type": "string", "minLength": 1},
                    "classification": {
                        "items": {
                            "type": "string",
                            "enum": common.BILL_ACTION_CLASSIFICATIONS,
                        },
                        "type": "array",
                    },
                    "related_entities": {
                        "items": {
                            "properties": {
                                "name": {"type": "string", "minLength": 1},
                                "entity_type": {
                                    "enum": ["organization", "person", ""],
                                    "type": "string",
                                },
                                "person_id": {"type": ["string", "null"]},
                                "organization_id": {"type": ["string", "null"]},
                            },
                            "type": "object",
                        },
                        "type": "array",
                    },
                },
                "type": "object",
            },
            "type": "array",
        },
        "sponsorships": {
            "items": {
                "properties": {
                    "primary": {"type": "boolean"},
                    "classification": {"type": "string", "minLength": 1},
                    "name": {"type": "string", "minLength": 1},
                    "entity_type": {
                        "enum": ["organization", "person", ""],
                        "type": "string",
                    },
                    # exactly one of these is filled in by Bill.add_sponsorship;
                    # both keys are always present
                    "person_id": {"type": ["string", "null"]},
                    "organization_id": {"type": ["string", "null"]},
                },
                "type": "object",
            },
            "type": "array",
        },
        "related_bills": {
            "items": {
                "properties": {
                    "identifier": {"type": "string", "minLength": 1},
                    "legislative_session": {"type": "string", "minLength": 1},
                    "relation_type": {
                        "enum": common.BILL_RELATION_TYPES,
                        "type": "string",
                    },
                },
                "type": "object",
            },
            "type": "array",
        },
        "versions": versions_or_documents,
        "documents": versions_or_documents,
        "sources": sources,
        "extras": extras,
    },
}
class ColorizingStreamHandler(logging.StreamHandler):
    """StreamHandler that colorizes log records by level when writing to a TTY.

    Vendored third-party code (Vinay Sajip, BSD licensed; see file header) --
    keep modifications to a minimum.  On POSIX the ANSI escape codes are
    written straight to the stream; on Windows ("nt") they are translated to
    console attribute calls via ctypes.
    """

    # color names to indices
    color_map = {
        "black": 0,
        "red": 1,
        "green": 2,
        "yellow": 3,
        "blue": 4,
        "magenta": 5,
        "cyan": 6,
        "white": 7,
    }

    # levels to (background, foreground, bold/intense)
    if os.name == "nt":
        level_map = {
            logging.DEBUG: (None, "blue", True),
            logging.INFO: (None, "white", False),
            logging.WARNING: (None, "yellow", True),
            logging.ERROR: (None, "red", True),
            logging.CRITICAL: ("red", "white", True),
        }
    else:
        level_map = {
            logging.DEBUG: (None, "blue", False),
            logging.INFO: (None, "white", False),
            logging.WARNING: (None, "yellow", False),
            logging.ERROR: (None, "red", False),
            logging.CRITICAL: ("red", "white", True),
        }
    # ANSI Control Sequence Introducer and the "reset all attributes" code
    csi = "\x1b["
    reset = "\x1b[0m"

    @property
    def is_tty(self):
        # bluff for Jenkins
        if os.environ.get("JENKINS_URL"):
            return True
        isatty = getattr(self.stream, "isatty", None)
        return isatty and isatty()

    def emit(self, record):
        """Write the record, colorized only when the stream is a TTY."""
        try:
            message = self.format(record)
            stream = self.stream
            if not self.is_tty:
                stream.write(message)
            else:
                self.output_colorized(message)
            stream.write(getattr(self, "terminator", "\n"))
            self.flush()
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            # bare except mirrors logging.StreamHandler.emit's own policy:
            # logging must never crash the program
            self.handleError(record)

    if os.name != "nt":

        def output_colorized(self, message):
            # POSIX terminals understand the ANSI codes natively
            self.stream.write(message)

    else:
        import re

        # matches one ANSI SGR escape, capturing its ;-separated parameters
        ansi_esc = re.compile(r"\x1b\[((?:\d+)(?:;(?:\d+))*)m")

        # ANSI color index -> Windows console attribute bits
        nt_color_map = {
            0: 0x00,  # black
            1: 0x04,  # red
            2: 0x02,  # green
            3: 0x06,  # yellow
            4: 0x01,  # blue
            5: 0x05,  # magenta
            6: 0x03,  # cyan
            7: 0x07,  # white
        }

        def output_colorized(self, message):
            # Split the message on escape sequences and replay each escape as
            # a SetConsoleTextAttribute call on the console handle.
            parts = self.ansi_esc.split(message)
            write = self.stream.write
            h = None
            fd = getattr(self.stream, "fileno", None)
            if fd is not None:
                fd = fd()
                if fd in (1, 2):  # stdout or stderr
                    h = ctypes.windll.kernel32.GetStdHandle(-10 - fd)
            while parts:
                text = parts.pop(0)
                if text:
                    write(text)
                if parts:
                    params = parts.pop(0)
                    if h is not None:
                        params = [int(p) for p in params.split(";")]
                        color = 0
                        for p in params:
                            if 40 <= p <= 47:
                                color |= self.nt_color_map[p - 40] << 4
                            elif 30 <= p <= 37:
                                color |= self.nt_color_map[p - 30]
                            elif p == 1:
                                color |= 0x08  # foreground intensity on
                            elif p == 0:  # reset to default color
                                color = 0x07
                            else:
                                pass  # error condition ignored
                        ctypes.windll.kernel32.SetConsoleTextAttribute(h, color)

    def colorize(self, message, record):
        """Wrap *message* in the ANSI codes mapped to the record's level."""
        if record.levelno in self.level_map:
            bg, fg, bold = self.level_map[record.levelno]
            params = []
            if bg in self.color_map:
                params.append(str(self.color_map[bg] + 40))
            if fg in self.color_map:
                params.append(str(self.color_map[fg] + 30))
            if bold:
                params.append("1")
            if params:
                message = "".join(
                    (self.csi, ";".join(params), "m", message, self.reset)
                )
        return message

    def format(self, record):
        message = logging.StreamHandler.format(self, record)
        if self.is_tty:
            # Don't colorize any traceback
            parts = message.split("\n", 1)
            parts[0] = self.colorize(parts[0], record)
            message = "\n".join(parts)
        return message
from collections import defaultdict
from itertools import chain


class CyclicGraphError(ValueError):
    """
    This exception is raised if the graph is Cyclic (or rather, when the
    sorting algorithm *knows* that the graph is Cyclic by hitting a snag
    in the top-sort)
    """

    pass


class Network(object):
    """
    This object (the `Network` object) handles keeping track of all the
    graph's nodes, and links between the nodes.

    The `Network` object is mostly used to topologically sort the nodes,
    to handle dependency resolution.
    """

    def __init__(self):
        # every node ever added
        self.nodes = set()
        # adjacency map: edges[parent] -> set of children depending on it
        self.edges = defaultdict(set)

    def add_node(self, node):
        """Add a node to the graph (with no edges)"""
        self.nodes.add(node)

    def add_edge(self, fro, to):
        """
        When doing topological sorting, the semantics of the edge mean that
        the dependency runs from the parent to the child - which is to say
        that the parent is required to be sorted *before* the child.

            [ FROM ] ------> [ TO ]
            Committee on Finance -> Subcommittee of the Finance Committee on Budget
                                 -> Subcommittee of the Finance Committee on Roads
        """
        self.add_node(fro)
        self.add_node(to)
        self.edges[fro].add(to)

    def leaf_nodes(self):
        """
        Return an iterable of nodes with no edges pointing at them. This is
        helpful to find all nodes without dependencies.
        """
        # every node that appears as an edge target has a dependency
        deps = {item for sublist in self.edges.values() for item in sublist}
        # contains all nodes *without* any dependencies (leaf nodes)
        return self.nodes - deps

    def prune_node(self, node, remove_backrefs=False):
        """
        remove node `node` from the network (including any edges that may
        have been pointing at `node`).

        Raises ValueError if other nodes still point at `node` and
        `remove_backrefs` is false.
        """
        # refuse to silently drop edges that still point at this node
        # (fixed: single-line message instead of one embedding source indentation)
        if not remove_backrefs and any(
            node in targets for targets in self.edges.values()
        ):
            raise ValueError(
                "Attempting to remove a node with backrefs. "
                "You may consider setting `remove_backrefs` to true."
            )

        self.nodes.remove(node)
        # Remove all edges from this node if we're pruning it.
        if node in self.edges:
            self.edges.pop(node)

        # Remove any links to this node (no-op sets when the check above passed)
        for targets in self.edges.values():
            targets.discard(node)

    def sort(self):
        """
        Return an iterable of nodes, topologically sorted to correctly import
        dependencies before leaf nodes.

        Consumes the network as it goes; raises CyclicGraphError when no
        dependency-free node remains but the graph is non-empty.
        """
        while self.nodes:
            iterated = False
            for node in self.leaf_nodes():
                iterated = True
                self.prune_node(node)
                yield node
            if not iterated:
                raise CyclicGraphError("Sorting has found a cyclic graph.")

    def dot(self):
        """
        Return a buffer that represents something dot(1) can render.
        """
        buff = "digraph graphname {"
        for fro in self.edges:
            for to in self.edges[fro]:
                buff += "%s -> %s;" % (fro, to)
        buff += "}"
        return buff

    def cycles(self):
        """
        Fairly expensive cycle detection algorithm. This method
        will return the shortest unique cycles that were detected.

        Debug usage may look something like:

        print("The following cycles were found:")
        for cycle in network.cycles():
            print("   ", " -> ".join(cycle))
        """

        def walk_node(node, seen):
            """
            Walk each top-level node we know about, and recurse
            along the graph, yielding a tuple whenever a path revisits
            an already-seen node (i.e. closes a cycle).
            """
            if node in seen:
                yield (node,)
                return
            seen.add(node)
            for edge in self.edges[node]:
                for cycle in walk_node(edge, set(seen)):
                    yield (node,) + cycle

        # First, let's get a iterable of all known cycles.
        cycles = chain.from_iterable((walk_node(node, set()) for node in self.nodes))

        shortest = set()
        # Now, let's go through and sift through the cycles, finding
        # the shortest unique cycle known, ignoring cycles which contain
        # already known cycles.
        for cycle in sorted(cycles, key=len):
            for el in shortest:
                if set(el).issubset(set(cycle)):
                    break
            else:
                shortest.add(cycle)
        # And return that unique list.
        return shortest
def test_add_associated_link_match():
    model = GenericModel()
    # two links with the same note collapse onto one associated 'document'
    for url, media_type in (
        ("http://example.com/1.txt", "text/plain"),
        ("http://example.com/1.pdf", "application/pdf"),
    ):
        model._add_associated_link(
            "_associated",
            "something",
            url,
            text="",
            media_type=media_type,
            on_duplicate="error",
        )
    assert len(model._associated) == 1
    assert len(model._associated[0]["links"]) == 2


def test_add_associated_link_on_duplicate_bad():
    model = GenericModel()

    # an unknown on_duplicate policy is rejected outright
    with pytest.raises(ValueError):
        model._add_associated_link(
            "_associated",
            "something",
            "http://example.com",
            text="",
            media_type="text/html",
            on_duplicate="idk",
        )


def test_add_associated_link_on_duplicate_error():
    model = GenericModel()
    model._add_associated_link(
        "_associated",
        "something",
        "http://example.com",
        text="",
        media_type="text/html",
        on_duplicate="error",
    )

    # same URL under a different note must raise with the "error" policy
    with pytest.raises(ValueError):
        model._add_associated_link(
            "_associated",
            "something else",
            "http://example.com",
            text="",
            media_type="text/html",
            on_duplicate="error",
        )


def test_add_associated_link_on_duplicate_ignore():
    model = GenericModel()
    for note in ("something", "something else"):
        model._add_associated_link(
            "_associated",
            note,
            "http://example.com",
            text="",
            media_type="text/html",
            on_duplicate="ignore",
        )
    # one 'document' added, single link for it, and the first note wins
    assert len(model._associated) == 1
    assert len(model._associated[0]["links"]) == 1
    assert model._associated[0]["note"] == "something"


def test_add_name():
    model = GenericModel()

    model.add_name("Thiston", note="What my friends call me")
    assert model.other_names == [
        {"name": "Thiston", "note": "What my friends call me"}
    ]

    model.add_name(
        "Johnseph Q. Publico",
        note="Birth name",
        start_date="1920-01",
        end_date="1949-12-31",
    )
    assert model.other_names == [
        {"name": "Thiston", "note": "What my friends call me"},
        {
            "name": "Johnseph Q. Publico",
            "note": "Birth name",
            "start_date": "1920-01",
            "end_date": "1949-12-31",
        },
    ]


def test_add_identifier():
    model = GenericModel()

    # unknown keyword arguments are rejected
    with pytest.raises(TypeError):
        model.add_identifier("id10t", foo="bar")

    model.add_identifier("id10t")
    model.add_identifier("l0l", scheme="kruft")

    assert model.identifiers[0]["identifier"] == "id10t"
    assert model.identifiers[-1]["scheme"] == "kruft"
def test_event_str():
    event = event_obj()
    assert event.name in str(event)


def test_bad_event():
    event = event_obj()
    # start_date must be a string; an int fails schema validation
    event.start_date = 6

    with pytest.raises(ValueError):
        event.validate()


def test_basic_agenda():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["description"] == "foo bar"
    assert event.agenda[0] == item
    event.validate()


def test_agenda_add_person():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []

    item.add_person(person="John Q. Hacker", note="chair")
    assert len(event.agenda[0]["related_entities"]) == 1
    event.validate()


def test_agenda_add_vote_event():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []

    item.add_vote_event(vote_event="Roll no. 12")
    assert len(event.agenda[0]["related_entities"]) == 1
    event.validate()


def test_agenda_add_subject():
    event = event_obj()
    item = event.add_agenda_item("foo bar")

    # subjects accumulate in insertion order
    item.add_subject("test")
    assert event.agenda[0]["subjects"] == ["test"]
    item.add_subject("test2")
    assert event.agenda[0]["subjects"] == ["test", "test2"]

    event.validate()


def test_agenda_add_classification():
    event = event_obj()
    item = event.add_agenda_item("foo bar")

    # classifications accumulate in insertion order
    item.add_classification("test")
    assert event.agenda[0]["classification"] == ["test"]
    item.add_classification("test2")
    assert event.agenda[0]["classification"] == ["test", "test2"]

    event.validate()


def test_agenda_add_extra():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    item["extras"] = {"foo": 1, "bar": ["baz"]}

    assert event.agenda[0]["extras"] == {"foo": 1, "bar": ["baz"]}


def test_add_committee():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []

    item.add_committee(committee="Hello, World", note="host")
    event.validate()


def test_add_bill():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []
    item.add_bill(bill="HB 101", note="consideration")
    event.validate()


def test_add_document():
    event = event_obj()
    assert event.documents == []
    event.add_document(note="hello", url="http://example.com", media_type="text/html")
    assert len(event.documents) == 1
    doc = event.documents[0]
    assert doc["note"] == "hello"
    assert doc["links"] == [
        {"url": "http://example.com", "media_type": "text/html", "text": ""}
    ]
    event.validate()


def test_participants():
    event = event_obj()
    event.add_participant("Committee of the Whole", type="committee", note="everyone")
    assert len(event.participants) == 1
    first = event.participants[0]
    assert first["name"] == "Committee of the Whole"
    assert first["entity_type"] == "committee"
    assert first["note"] == "everyone"

    # and add_person, which is a shortcut
    event.add_person("Bill Stevenson")
    assert len(event.participants) == 2
    second = event.participants[1]
    assert second["name"] == "Bill Stevenson"
    assert second["entity_type"] == "person"
    assert second["note"] == "participant"
from ..utils import _make_pseudo_id
from .popolo import pseudo_organization
from .base import BaseModel, SourceMixin, AssociatedLinkMixin, cleanup_list
from .schemas.bill import schema


class Action(dict):
    """A single action on a bill: a plain dict plus a related-entity helper."""

    def add_related_entity(self, name, entity_type, entity_id=None):
        entity = {
            "name": name,
            "entity_type": entity_type,
            entity_type + "_id": entity_id,
        }
        self["related_entities"].append(entity)
        return entity


class Bill(SourceMixin, AssociatedLinkMixin, BaseModel):
    """
    An Open Civic Data bill.
    """

    _type = "bill"
    _schema = schema

    def __init__(
        self,
        identifier,
        legislative_session,
        title,
        *,
        chamber=None,
        from_organization=None,
        classification=None
    ):
        super(Bill, self).__init__()

        self.identifier = identifier
        self.legislative_session = legislative_session
        self.title = title
        self.classification = cleanup_list(classification, ["bill"])
        self.from_organization = pseudo_organization(
            from_organization, chamber, "legislature"
        )

        # every collection starts empty; scrapers fill them via the helpers below
        for collection in (
            "actions",
            "other_identifiers",
            "other_titles",
            "documents",
            "related_bills",
            "sponsorships",
            "subject",
            "abstracts",
            "versions",
        ):
            setattr(self, collection, [])

    def add_action(
        self,
        description,
        date,
        *,
        organization=None,
        chamber=None,
        classification=None,
        related_entities=None,
        extras=None
    ):
        """Append an Action and return it so callers can attach related entities."""
        act = Action(
            description=description,
            date=date,
            organization_id=pseudo_organization(organization, chamber, "legislature"),
            classification=cleanup_list(classification, []),
            related_entities=[],
            extras=extras or {},
        )
        self.actions.append(act)
        return act

    def add_related_bill(self, identifier, legislative_session, relation_type):
        """Record a relationship to another bill."""
        # will we need jurisdiction, organization?
        related = dict(
            identifier=identifier,
            legislative_session=legislative_session,
            relation_type=relation_type,
        )
        self.related_bills.append(related)

    def add_sponsorship(
        self,
        name,
        classification,
        entity_type,
        primary,
        *,
        chamber=None,
        entity_id=None
    ):
        """Record a sponsor of this bill."""
        sponsorship = {
            "name": name,
            "classification": classification,
            "entity_type": entity_type,
            "primary": primary,
            # both id keys are always present so every JSON object shares the
            # same key set, which prevents import errors
            "person_id": None,
            "organization_id": None,
        }
        if entity_type:
            # fall back to a name-based pseudo id when no concrete id was given
            sponsorship[entity_type + "_id"] = entity_id or _make_pseudo_id(name=name)
        self.sponsorships.append(sponsorship)

    def add_sponsorship_by_identifier(
        self,
        name,
        classification,
        entity_type,
        primary,
        *,
        scheme,
        identifier,
        chamber=None
    ):
        """Record a sponsor resolved via a scheme/identifier pseudo id."""
        return self.add_sponsorship(
            name,
            classification,
            entity_type,
            primary,
            chamber=chamber,
            entity_id=_make_pseudo_id(
                identifiers__scheme=scheme, identifiers__identifier=identifier
            ),
        )

    def add_subject(self, subject):
        """Tag the bill with a subject string."""
        self.subject.append(subject)

    def add_abstract(self, abstract, note, date=""):
        """Attach an abstract (summary) with a note and optional date."""
        self.abstracts.append({"note": note, "abstract": abstract, "date": date})

    def add_title(self, title, note=""):
        """Attach an alternate title."""
        self.other_titles.append({"note": note, "title": title})

    def add_identifier(self, identifier, note="", scheme=""):
        """Attach an alternate identifier."""
        self.other_identifiers.append(
            {"note": note, "identifier": identifier, "scheme": scheme}
        )

    def add_document_link(
        self, note, url, *, date="", media_type="", text="", on_duplicate="error"
    ):
        """Attach a link to a supporting document."""
        return self._add_associated_link(
            collection="documents",
            note=note,
            url=url,
            date=date,
            text=text,
            media_type=media_type,
            on_duplicate=on_duplicate,
        )

    def add_version_link(
        self, note, url, *, date="", media_type="", text="", on_duplicate="error"
    ):
        """Attach a link to a version of the bill text."""
        return self._add_associated_link(
            collection="versions",
            note=note,
            url=url,
            date=date,
            text=text,
            media_type=media_type,
            on_duplicate=on_duplicate,
        )

    def __str__(self):
        return " in ".join((self.identifier, self.legislative_session))
start_date="2009-01-07", 43 | result="pass", 44 | classification="bill-passage", 45 | organization=o, 46 | ) 47 | assert ve.organization == o._id 48 | 49 | 50 | def test_vote_event_org_dict(): 51 | odict = {"name": "Random Committee", "classification": "committee"} 52 | ve = VoteEvent( 53 | legislative_session="2009", 54 | motion_text="passage of the bill", 55 | start_date="2009-01-07", 56 | result="pass", 57 | classification="bill-passage", 58 | organization=odict, 59 | ) 60 | assert get_pseudo_id(ve.organization) == odict 61 | 62 | 63 | def test_vote_event_org_chamber(): 64 | ve = VoteEvent( 65 | legislative_session="2009", 66 | motion_text="passage of the bill", 67 | start_date="2009-01-07", 68 | result="pass", 69 | classification="bill-passage", 70 | chamber="upper", 71 | ) 72 | assert get_pseudo_id(ve.organization) == {"classification": "upper"} 73 | 74 | 75 | def test_org_and_chamber_conflict(): 76 | with pytest.raises(ValueError): 77 | VoteEvent( 78 | legislative_session="2009", 79 | motion_text="passage of the bill", 80 | start_date="2009-01-07", 81 | result="pass", 82 | classification="passage", 83 | organization="test", 84 | chamber="lower", 85 | ) 86 | 87 | 88 | def test_set_count(): 89 | ve = toy_vote_event() 90 | ve.set_count("yes", 2) 91 | ve.set_count("no", 100) 92 | ve.set_count("yes", 0) 93 | assert ve.counts == [{"option": "yes", "value": 0}, {"option": "no", "value": 100}] 94 | 95 | 96 | def test_set_bill_obj(): 97 | ve = toy_vote_event() 98 | b = Bill("HB 1", legislative_session="2009", title="fake bill") 99 | ve.set_bill(b) 100 | assert ve.bill == b._id 101 | 102 | 103 | def test_set_bill_obj_no_extra_args(): 104 | ve = toy_vote_event() 105 | b = Bill("HB 1", legislative_session="2009", title="fake bill") 106 | with pytest.raises(ValueError): 107 | ve.set_bill(b, chamber="lower") 108 | 109 | 110 | def test_set_bill_pseudo_id(): 111 | ve = toy_vote_event() 112 | ve.set_bill("HB 1", chamber="lower") 113 | assert get_pseudo_id(ve.bill) == { 114 | 
"identifier": "HB 1", 115 | "from_organization__classification": "lower", 116 | "legislative_session__identifier": "2009", 117 | } 118 | 119 | 120 | def test_str(): 121 | ve = toy_vote_event() 122 | s = str(ve) 123 | assert ve.legislative_session in s 124 | assert ve.motion_text in s 125 | 126 | 127 | def test_order_vote_event(): 128 | ve = toy_vote_event() 129 | order_vote_event = OrderVoteEvent() 130 | 131 | # add order as seconds to date with no time 132 | ve.start_date = "2019-01-01" 133 | ve.end_date = None 134 | order_vote_event("2019", "1", ve) 135 | assert ve.start_date == "2019-01-01T00:00:01" 136 | assert ve.end_date is None 137 | 138 | # add order as seconds to time with explicit midnight time and 139 | # zone, preserving timezone 140 | ve.start_date = "2019-01-01T00:00:00+05:00" 141 | ve.end_date = "" 142 | order_vote_event("2019", "1", ve) 143 | assert ve.start_date == "2019-01-01T00:00:02+05:00" 144 | assert ve.end_date == "" 145 | 146 | # a second bill should start with '00:00:01' again 147 | ve.start_date = "2019-01-01" 148 | ve.end_date = None 149 | order_vote_event("2019", "2", ve) 150 | assert ve.start_date == "2019-01-01T00:00:01" 151 | assert ve.end_date is None 152 | 153 | # the same bill id in a different session should start with '00:00:01' again 154 | ve.start_date = "2019-01-01" 155 | ve.end_date = None 156 | order_vote_event("2020", "1", ve) 157 | assert ve.start_date == "2019-01-01T00:00:01" 158 | assert ve.end_date is None 159 | 160 | # add order as seconds to time with explicit midnight time and no timezone 161 | ve.start_date = ve.end_date = "2019-01-01T00:00:00" 162 | order_vote_event("2019", "1", ve) 163 | assert ve.start_date == "2019-01-01T00:00:03" 164 | assert ve.end_date == "2019-01-01T00:00:03" 165 | 166 | # don't change a date with a non-midnight time 167 | ve.start_date = "2019-01-01T00:00:55+05:00" 168 | order_vote_event("2019", "1", ve) 169 | assert ve.start_date == "2019-01-01T00:00:55+05:00" 170 | 
-------------------------------------------------------------------------------- /pupa/cli/commands/init.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .base import BaseCommand 4 | from pupa.exceptions import CommandError 5 | from opencivicdata.common import JURISDICTION_CLASSIFICATIONS 6 | from opencivicdata.divisions import Division 7 | 8 | 9 | def prompt(ps, default=""): 10 | return input(ps).strip() or default 11 | 12 | 13 | CLASS_DICT = { 14 | "events": "Event", 15 | "people": "Person", 16 | "bills": "Bill", 17 | "vote_events": "VoteEvent", 18 | } 19 | 20 | 21 | def write_jurisdiction_template( 22 | dirname, short_name, long_name, division_id, classification, url, scraper_types 23 | ): 24 | camel_case = short_name.title().replace(" ", "") 25 | 26 | # write __init__ 27 | lines = ["# encoding=utf-8", "from pupa.scrape import Jurisdiction, Organization"] 28 | for stype in scraper_types: 29 | lines.append( 30 | "from .{} import {}{}Scraper".format(stype, camel_case, CLASS_DICT[stype]) 31 | ) 32 | lines.append("") 33 | lines.append("") 34 | lines.append("class {}(Jurisdiction):".format(camel_case)) 35 | lines.append(' division_id = "{}"'.format(division_id)) 36 | lines.append(' classification = "{}"'.format(classification)) 37 | lines.append(' name = "{}"'.format(long_name)) 38 | lines.append(' url = "{}"'.format(url)) 39 | lines.append(" scrapers = {") 40 | for stype in scraper_types: 41 | lines.append( 42 | ' "{}": {}{}Scraper,'.format(stype, camel_case, CLASS_DICT[stype]) 43 | ) 44 | lines.append(" }") 45 | lines.append("") 46 | lines.append(" def get_organizations(self):") 47 | lines.append(" #REQUIRED: define an organization using this format") 48 | lines.append(" #where org_name is something like Seattle City Council") 49 | lines.append(" #and classification is described here:") 50 | 51 | lines.append( 52 | ' org = Organization(name="org_name", classification="legislature")' 53 | ) 54 | 
lines.append("") 55 | 56 | lines.append(" # OPTIONAL: add posts to your organizaion using this format,") 57 | lines.append( 58 | " # where label is a human-readable description of the post " 59 | '(eg "Ward 8 councilmember")' 60 | ) 61 | lines.append( 62 | " # and role is the position type (eg councilmember, alderman, mayor...)" 63 | ) 64 | lines.append(" # skip entirely if you're not writing a people scraper.") 65 | lines.append( 66 | ' org.add_post(label="position_description", role="position_type")' 67 | ) 68 | lines.append("") 69 | lines.append(" #REQUIRED: yield the organization") 70 | lines.append(" yield org") 71 | lines.append("") 72 | 73 | with open(os.path.join(dirname, "__init__.py"), "w") as of: 74 | of.write("\n".join(lines)) 75 | 76 | # write scraper files 77 | for stype in scraper_types: 78 | lines = ["from pupa.scrape import Scraper"] 79 | lines.append("from pupa.scrape import {}".format(CLASS_DICT[stype])) 80 | lines.append("") 81 | lines.append("") 82 | lines.append( 83 | "class {}{}Scraper(Scraper):".format(camel_case, CLASS_DICT[stype]) 84 | ) 85 | lines.append("") 86 | lines.append(" def scrape(self):") 87 | lines.append(" # needs to be implemented") 88 | lines.append(" pass") 89 | lines.append("") 90 | with open(os.path.join(dirname, stype + ".py"), "w") as of: 91 | of.write("\n".join(lines)) 92 | 93 | 94 | class Command(BaseCommand): 95 | name = "init" 96 | help = "start a new pupa scraper" 97 | 98 | def add_args(self): 99 | self.add_argument("module", type=str, help="name of the new scraper module") 100 | 101 | def handle(self, args, other): 102 | if os.path.exists(args.module): 103 | raise CommandError("Directory {} already exists".format(repr(args.module))) 104 | 105 | division = None 106 | while not division: 107 | division = prompt( 108 | "division id (see https://github.com/opencivicdata/" 109 | "ocd-division-ids/tree/master/identifiers): " 110 | ) 111 | if not division: 112 | print("\nERROR: Division ID is required.\n") 113 | 114 | 
try: 115 | Division.get(division) 116 | except (ValueError, IndexError): 117 | raise CommandError("Division ID {} is invalid".format(repr(division))) 118 | 119 | name = prompt("jurisdiction name (e.g. City of Seattle): ") 120 | classification = prompt( 121 | "classification (can be: {}): ".format( 122 | ", ".join(JURISDICTION_CLASSIFICATIONS) 123 | ) 124 | ) 125 | url = prompt("official url (e.g. http://www.seattle.gov/): ") 126 | 127 | os.makedirs(args.module) 128 | 129 | # Will default to True until they pick one, then defaults to False. 130 | selected_scraper_types = [] 131 | for stype in CLASS_DICT.keys(): 132 | if selected_scraper_types: 133 | default = "N" 134 | hint = "[y/N]" 135 | else: 136 | default = "Y" 137 | hint = "[Y/n]" 138 | result = prompt( 139 | "create {} scraper? {}: ".format(stype, hint), default 140 | ).upper() 141 | if result == "Y": 142 | selected_scraper_types.append(stype) 143 | 144 | write_jurisdiction_template( 145 | args.module, 146 | args.module, 147 | name, 148 | division, 149 | classification, 150 | url, 151 | selected_scraper_types, 152 | ) 153 | -------------------------------------------------------------------------------- /pupa/importers/vote_events.py: -------------------------------------------------------------------------------- 1 | from opencivicdata.legislative.models import ( 2 | VoteEvent, 3 | VoteCount, 4 | PersonVote, 5 | VoteSource, 6 | BillAction, 7 | ) 8 | from pupa.utils import get_pseudo_id, _make_pseudo_id 9 | from .base import BaseImporter 10 | from ..exceptions import InvalidVoteEventError 11 | 12 | 13 | class VoteEventImporter(BaseImporter): 14 | _type = "vote_event" 15 | model_class = VoteEvent 16 | related_models = { 17 | "counts": (VoteCount, "vote_event_id", {}), 18 | "votes": (PersonVote, "vote_event_id", {}), 19 | "sources": (VoteSource, "vote_event_id", {}), 20 | } 21 | 22 | def __init__(self, jurisdiction_id, person_importer, org_importer, bill_importer): 23 | 24 | super(VoteEventImporter, 
self).__init__(jurisdiction_id) 25 | self.person_importer = person_importer 26 | self.bill_importer = bill_importer 27 | self.org_importer = org_importer 28 | self.seen_bill_ids = set() 29 | self.seen_action_ids = set() 30 | self.vote_events_to_delete = set() 31 | 32 | def get_object(self, vote_event): 33 | spec = {"legislative_session_id": vote_event["legislative_session_id"]} 34 | 35 | if not vote_event["identifier"] and not vote_event["bill_id"]: 36 | raise InvalidVoteEventError( 37 | 'attempt to save a VoteEvent without an "identifier" or "bill_id"' 38 | ) 39 | 40 | if vote_event["bill_id"]: 41 | if vote_event["bill_id"] not in self.seen_bill_ids: 42 | self.seen_bill_ids.add(vote_event["bill_id"]) 43 | # keep a list of all the vote event ids that should be deleted 44 | self.vote_events_to_delete.update( 45 | self.model_class.objects.filter( 46 | bill_id=vote_event["bill_id"] 47 | ).values_list("id", flat=True) 48 | ) 49 | spec["bill_id"] = vote_event["bill_id"] 50 | 51 | if vote_event.get("pupa_id"): 52 | ve_id = self.lookup_obj_id(vote_event["pupa_id"], VoteEvent) 53 | if ve_id: 54 | spec = {"id": ve_id} 55 | else: 56 | return None 57 | elif vote_event["identifier"]: 58 | # if there's an identifier, just use it and the bill_id and the session 59 | spec["identifier"] = vote_event["identifier"] 60 | else: 61 | # otherwise use the motion, start_date, and org as well 62 | spec.update( 63 | { 64 | "motion_text": vote_event["motion_text"], 65 | "start_date": vote_event["start_date"], 66 | "organization_id": vote_event["organization_id"], 67 | } 68 | ) 69 | 70 | return self.model_class.objects.prefetch_related("votes__voter").get(**spec) 71 | 72 | def limit_spec(self, spec): 73 | spec["legislative_session__jurisdiction_id"] = self.jurisdiction_id 74 | return spec 75 | 76 | def prepare_for_db(self, data): 77 | data["legislative_session_id"] = self.get_session_id( 78 | data.pop("legislative_session") 79 | ) 80 | data["organization_id"] = 
self.org_importer.resolve_json_id( 81 | data.pop("organization") 82 | ) 83 | 84 | bill = data.pop("bill") 85 | if bill and bill.startswith("~"): 86 | # unpack psuedo id and apply filter in case there are any that alter it 87 | bill = get_pseudo_id(bill) 88 | self.bill_importer.apply_transformers(bill) 89 | bill = _make_pseudo_id(**bill) 90 | 91 | data["bill_id"] = self.bill_importer.resolve_json_id(bill) 92 | bill_action = data.pop("bill_action") 93 | if bill_action: 94 | try: 95 | action = BillAction.objects.get( 96 | bill_id=data["bill_id"], 97 | description=bill_action, 98 | date=data["start_date"], 99 | organization_id=data["organization_id"], 100 | ) 101 | # seen_action_ids is for ones being added in this import 102 | # action.vote is already set if action was set on prior import 103 | if action.id in self.seen_action_ids or hasattr(action, "vote"): 104 | self.warning( 105 | "can not match two VoteEvents to %s: %s", action.id, bill_action 106 | ) 107 | else: 108 | data["bill_action_id"] = action.id 109 | self.seen_action_ids.add(action.id) 110 | except BillAction.DoesNotExist: 111 | self.warning( 112 | "could not match VoteEvent to %s %s %s", 113 | bill, 114 | bill_action, 115 | data["start_date"], 116 | ) 117 | except BillAction.MultipleObjectsReturned as e: 118 | self.warning( 119 | "could not match VoteEvent to %s %s %s: %s", 120 | bill, 121 | bill_action, 122 | data["start_date"], 123 | e, 124 | ) 125 | 126 | for vote in data["votes"]: 127 | vote["voter_id"] = self.person_importer.resolve_json_id( 128 | vote["voter_id"], allow_no_match=True 129 | ) 130 | return data 131 | 132 | def postimport(self): 133 | # be sure not to delete vote events that were 134 | # imported (meaning updated) this time through 135 | self.vote_events_to_delete.difference_update(self.json_to_db_id.values()) 136 | # everything remaining, goodbye 137 | self.model_class.objects.filter(id__in=self.vote_events_to_delete).delete() 138 | 
-------------------------------------------------------------------------------- /pupa/scrape/event.py: -------------------------------------------------------------------------------- 1 | from ..utils import _make_pseudo_id 2 | from .base import BaseModel, SourceMixin, AssociatedLinkMixin, LinkMixin 3 | from .schemas.event import schema 4 | from pupa.exceptions import ScrapeValueError 5 | 6 | 7 | class EventAgendaItem(dict, AssociatedLinkMixin): 8 | event = None 9 | 10 | def __init__(self, description, event): 11 | super(EventAgendaItem, self).__init__( 12 | { 13 | "description": description, 14 | "classification": [], 15 | "related_entities": [], 16 | "subjects": [], 17 | "media": [], 18 | "notes": [], 19 | "order": str(len(event.agenda)), 20 | "extras": {}, 21 | } 22 | ) 23 | self.event = event 24 | 25 | def add_subject(self, what): 26 | self["subjects"].append(what) 27 | 28 | def add_classification(self, what): 29 | self["classification"].append(what) 30 | 31 | def add_vote_event(self, vote_event, *, id=None, note="consideration"): 32 | self.add_entity(name=vote_event, entity_type="vote_event", id=id, note=note) 33 | 34 | def add_committee(self, committee, *, id=None, note="participant"): 35 | self.add_entity(name=committee, entity_type="organization", id=id, note=note) 36 | 37 | def add_bill(self, bill, *, id=None, note="consideration"): 38 | self.add_entity(name=bill, entity_type="bill", id=id, note=note) 39 | 40 | def add_person(self, person, *, id=None, note="participant"): 41 | self.add_entity(name=person, entity_type="person", id=id, note=note) 42 | 43 | def add_media_link( 44 | self, note, url, media_type, *, text="", type="media", on_duplicate="error" 45 | ): 46 | return self._add_associated_link( 47 | collection="media", 48 | note=note, 49 | url=url, 50 | text=text, 51 | media_type=media_type, 52 | on_duplicate=on_duplicate, 53 | ) 54 | 55 | def add_entity(self, name, entity_type, *, id, note): 56 | ret = {"name": name, "entity_type": entity_type, 
"note": note} 57 | if id: 58 | ret["id"] = id 59 | elif entity_type: 60 | if entity_type in ("organization", "person"): 61 | id = _make_pseudo_id(name=name) 62 | elif entity_type in ("bill", "vote_event"): 63 | id = _make_pseudo_id(identifier=name) 64 | else: 65 | raise ScrapeValueError( 66 | "attempt to call add_entity with unsupported " 67 | "entity type: {}".format(entity_type) 68 | ) 69 | ret[entity_type + "_id"] = id 70 | 71 | self["related_entities"].append(ret) 72 | 73 | 74 | class Event(BaseModel, SourceMixin, AssociatedLinkMixin, LinkMixin): 75 | """ 76 | Details for an event in .format 77 | """ 78 | 79 | _type = "event" 80 | _schema = schema 81 | 82 | def __init__( 83 | self, 84 | name, 85 | start_date, 86 | *, 87 | location_name=None, 88 | all_day=False, 89 | description="", 90 | end_date="", 91 | status="confirmed", 92 | classification="event" 93 | ): 94 | super(Event, self).__init__() 95 | self.start_date = start_date 96 | self.all_day = all_day 97 | self.end_date = end_date 98 | self.name = name 99 | self.description = description 100 | self.status = status 101 | self.classification = classification 102 | if location_name: 103 | self.location = {"name": location_name, "note": "", "coordinates": None} 104 | else: 105 | self.location = None 106 | self.documents = [] 107 | self.participants = [] 108 | self.media = [] 109 | self.agenda = [] 110 | 111 | def __str__(self): 112 | return "{} {}".format(self.start_date, self.name.strip()) 113 | 114 | def set_location(self, name, *, note="", url="", coordinates=None): 115 | self.location = { 116 | "name": name, 117 | "note": note, 118 | "url": url, 119 | "coordinates": coordinates, 120 | } 121 | 122 | def add_participant(self, name, type, *, id=None, note="participant"): 123 | p = {"name": name, "entity_type": type, "note": note} 124 | if id: 125 | p["id"] = id 126 | elif type: 127 | id = _make_pseudo_id(name=name) 128 | p[type + "_id"] = id 129 | 130 | self.participants.append(p) 131 | 132 | def 
add_person(self, name, *, id=None, note="participant"): 133 | return self.add_participant(name=name, type="person", id=id, note=note) 134 | 135 | def add_committee(self, name, *, id=None, note="participant"): 136 | return self.add_participant(name=name, type="organization", id=id, note=note) 137 | 138 | def add_agenda_item(self, description): 139 | obj = EventAgendaItem(description, self) 140 | self.agenda.append(obj) 141 | return obj 142 | 143 | def add_media_link( 144 | self, 145 | note, 146 | url, 147 | media_type, 148 | *, 149 | text="", 150 | type="media", 151 | on_duplicate="error", 152 | date="" 153 | ): 154 | return self._add_associated_link( 155 | collection="media", 156 | note=note, 157 | url=url, 158 | text=text, 159 | media_type=media_type, 160 | on_duplicate=on_duplicate, 161 | date=date, 162 | ) 163 | 164 | def add_document( 165 | self, note, url, *, text="", media_type="", on_duplicate="error", date="" 166 | ): 167 | return self._add_associated_link( 168 | collection="documents", 169 | note=note, 170 | url=url, 171 | text=text, 172 | media_type=media_type, 173 | on_duplicate=on_duplicate, 174 | date=date, 175 | ) 176 | -------------------------------------------------------------------------------- /pupa/tests/clean/test_clean.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import argparse 3 | from datetime import datetime, timezone, timedelta 4 | from freezegun import freeze_time 5 | 6 | from opencivicdata.core.models import Person, Organization, Jurisdiction, Division, Post 7 | 8 | from pupa.cli.commands.clean import Command as CleanCommand 9 | 10 | 11 | @pytest.fixture 12 | def subparsers(): 13 | parser = argparse.ArgumentParser("pupa", description="pupa CLI") 14 | parser.add_argument("--debug", action="store_true", help="open debugger on error") 15 | parser.add_argument( 16 | "--loglevel", 17 | default="INFO", 18 | help=( 19 | "set log level. 
options are: " 20 | "DEBUG|INFO|WARNING|ERROR|CRITICAL " 21 | "(default is INFO)" 22 | ), 23 | ) 24 | return parser.add_subparsers(dest="subcommand") 25 | 26 | 27 | @pytest.fixture 28 | def division(): 29 | return Division.objects.create(id="ocd-division/country:us", name="USA") 30 | 31 | 32 | @pytest.fixture 33 | def jurisdiction(division): 34 | return Jurisdiction.objects.create(id="jid", division=division) 35 | 36 | 37 | @pytest.fixture 38 | def organization(jurisdiction): 39 | return Organization.objects.create(name="WWE", jurisdiction=jurisdiction) 40 | 41 | 42 | @pytest.fixture 43 | def post(organization): 44 | return Post.objects.create(organization=organization, label="Some post", role="Some post") 45 | 46 | 47 | @pytest.fixture 48 | def person(): 49 | class PersonFactory: 50 | def build(self, **kwargs): 51 | person_info = { 52 | "name": "George Washington", 53 | "family_name": "Washington", 54 | } 55 | 56 | person_info.update(kwargs) 57 | 58 | return Person.objects.create(**person_info) 59 | 60 | return PersonFactory() 61 | 62 | 63 | @pytest.mark.django_db 64 | def test_get_stale_objects(subparsers, division, jurisdiction, organization, post, person): 65 | stale_person = person.build() 66 | membership = stale_person.memberships.create(organization=organization) 67 | 68 | protected_objects = {division, jurisdiction, post} 69 | expected_stale_objects = {stale_person, organization, membership} 70 | 71 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 72 | with freeze_time(a_week_from_now): 73 | fresh_person = person.build(name="Thomas Jefferson", family_name="Jefferson") 74 | fresh_person.memberships.create(organization=organization) 75 | 76 | stale_objects = set(CleanCommand(subparsers).get_stale_objects(7)) 77 | assert stale_objects == expected_stale_objects 78 | 79 | # This is implied by the above check, but it's important, so we'll check 80 | # for it explicitly. 
81 | assert protected_objects not in stale_objects 82 | 83 | 84 | @pytest.mark.django_db 85 | def test_remove_stale_objects(subparsers, organization, person): 86 | stale_person = person.build() 87 | membership = stale_person.memberships.create(organization=organization) 88 | 89 | expected_stale_objects = {stale_person, organization, membership} 90 | 91 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 92 | with freeze_time(a_week_from_now): 93 | fresh_person = person.build(name="Thomas Jefferson", family_name="Jefferson") 94 | fresh_person.memberships.create(organization=organization) 95 | 96 | CleanCommand(subparsers).remove_stale_objects(7) 97 | for obj in expected_stale_objects: 98 | was_deleted = not type(obj).objects.filter(id=obj.id).exists() 99 | assert was_deleted 100 | 101 | 102 | @pytest.mark.django_db 103 | def test_clean_command(subparsers, organization, person): 104 | stale_person = person.build() 105 | stale_membership = stale_person.memberships.create(organization=organization) 106 | 107 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 108 | with freeze_time(a_week_from_now): 109 | fresh_person = person.build(name="Thomas Jefferson", family_name="Jefferson") 110 | not_stale_membership = fresh_person.memberships.create( 111 | organization=organization 112 | ) 113 | organization.save() # Update org's last_seen field 114 | 115 | # Call clean command 116 | CleanCommand(subparsers).handle( 117 | argparse.Namespace(report=False, window=7, yes=True, max=10), [] 118 | ) 119 | 120 | expected_stale_objects = {stale_person, stale_membership} 121 | for obj in expected_stale_objects: 122 | was_deleted = not type(obj).objects.filter(id=obj.id).exists() 123 | assert was_deleted 124 | 125 | expected_not_stale_objects = {organization, fresh_person, not_stale_membership} 126 | for obj in expected_not_stale_objects: 127 | was_not_deleted = type(obj).objects.filter(id=obj.id).exists() 128 | assert was_not_deleted 129 | 130 | 131 
| @pytest.mark.django_db 132 | def test_clean_command_failsafe(subparsers, organization, person): 133 | stale_people = [person.build() for i in range(20)] 134 | for p in stale_people: 135 | p.memberships.create(organization=organization) 136 | 137 | cmd = CleanCommand(subparsers) 138 | 139 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 140 | with freeze_time(a_week_from_now): 141 | with pytest.raises(SystemExit): 142 | # Should trigger failsafe exist when deleting more than 10 objects 143 | cmd.handle( 144 | argparse.Namespace(report=False, window=7, yes=False, max=10), [] 145 | ) 146 | 147 | with pytest.raises(SystemExit): 148 | # Should trigger failsafe exist when deleting more than 10 objects, 149 | # even when yes is specified 150 | cmd.handle( 151 | argparse.Namespace(report=False, window=7, yes=True, max=10), [] 152 | ) 153 | 154 | # Should proceed without error, since max is increased (1 organization, 155 | # 20 people, 20 memberships) 156 | cmd.handle( 157 | argparse.Namespace(report=False, window=7, max=41, yes=True), [] 158 | ) 159 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # pupa changelog 2 | 3 | ## 0.11.0 - April 3 2023 4 | 5 | Improvements: 6 | 7 | * Add `pupa clean` command to delete database objects that haven't been seen in recent scrapes 8 | 9 | ## 0.10.2 - March 18 2021 10 | 11 | Improvements: 12 | 13 | * allow null event locations 14 | * resolve memberships based on start date, falling back to end date if no start date is available 15 | 16 | Fixes: 17 | 18 | * raise exception for whitespace in urls 19 | 20 | ## 0.10.1 - March 23 2020 21 | 22 | Improvements: 23 | 24 | * fire post-save hook on imports 25 | * check family_name when resolving names 26 | 27 | Fixes: 28 | 29 | * if scrapers are omitted on command line, use all scrapers again 30 | 31 | ## 0.10.0 - December 11 2019 32 | 33 | 
Improvements: 34 | 35 | * add data quality reports that update after each import (requires migration) 36 | * add flags to disable person, bill, vote, event import 37 | * let date be set on event media & document 38 | 39 | Fixes: 40 | 41 | * Fix warnings from obsolete usage of importlib & jsonschema 42 | * remove parties from tests 43 | * don't call check_session_list if running import only 44 | * add support for Post.maximum_memberships 45 | * add support for Person given & family names 46 | * stop testing on Postgres 9.x 47 | 48 | ## 0.9.1 - October 23 2018 49 | 50 | Fixes: 51 | 52 | * minor packaging fixes & dependency pinning tweaks 53 | 54 | 55 | ## 0.9.0 - February 14 2018 56 | 57 | Backwards-incompatible changes: 58 | 59 | * fix_bill_id is no longer called on bill identifiers 60 | 61 | Improvements: 62 | 63 | * django 2.0 compatibility fixes (on_delete on models) 64 | * require python-opencivicdata 2.1 fixes 65 | * drop validictory for jsonschema 66 | * add 'pupa party' command for atomic addition of parties, deprecate Jurisdiction.parties 67 | * add IMPORT_TRANSFORMERS setting allowing alterations of data on import 68 | 69 | Fixes: 70 | 71 | * bugfix for OrganizationImporter other_names 72 | * bugfix for VoteEvent bill resolution 73 | * bugfix for VoteEvent bill action resolution (#307) 74 | 75 | 76 | ## 0.8.0 - July 19 2017 77 | 78 | Backwards-incompatible changes: 79 | 80 | * role no longer defaults to 'member' and is now optional in Person constructor 81 | when used w/ primary_org. 
If primary_org alone is unambiguous, scrapers can set primary_org alone and role will be set automatically
* Accept an organization name in `Person.add_membership` for the second parameter #233 131 | * Accept `datetime` dates wherever string dates are accepted #218 132 | * Improve error reporting #214, #230, #231 133 | * Compatible with Django 1.10 134 | 135 | Fixes: 136 | 137 | * Allow people to hold multiple posts in an organization #244, #247 138 | * Add a `primary_org_name` parameter to `Person.add_term`, to disambiguate organizations with the same classification #223 139 | * Update an object if the explicit order of its related objects has changed #242 140 | * Touch an object's `updated_at` whenever its related objects are updated #226 141 | * Correctly resolve a new person with the same name #232 142 | * Don't raise a resolution error due to multiple matches in cases where zero matches are acceptable 143 | 144 | ## 0.5.2 - November 18 2015 145 | 146 | * show run logs in the admin 147 | * start tracking failed runs 148 | 149 | ## 0.5.1 - November 13 2015 150 | 151 | * use other\_names for psuedo\_id resolution on people 152 | * fix for postgis:// on Heroku 153 | * remove dump command that required imago 154 | * require py-ocd-django 0.8.0 models 155 | 156 | ## 0.5.0 - October 8 2015 157 | 158 | * fix major bug causing deadlock on party import 159 | * fix major bug where legislative\_session changes would wipe the database 160 | * update from Django 1.7 to Django 1.9 161 | * now uses Django's ArrayField, JSONField, etc. 
instead of external deps 162 | * also now requires Postgres 9.4 163 | * changes to be consistent with Popolo in naming of legislative\_session and vote\_event 164 | * some speedups on import by changing how we use bulk\_create 165 | * experimental Kafka support 166 | * actually use other\_names for person import 167 | * allow delayed resolution of people 168 | * respect locked\_fields during import 169 | * renamed make\_psuedo\_id() to discourage use 170 | * lots of other bugfixes 171 | 172 | ## 0.4.1 - August 13 2014 173 | 174 | * bugfix release for packaging issue w/ 0.4.0 175 | 176 | ## 0.4.0 - August 13 2014 177 | 178 | * near-complete rewrite from MongoDB to Postgres dependency 179 | 180 | ## 0.3.0 - March 27 2014 181 | 182 | * Initial PyPI release, MongoDB version heavily based on billy 183 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_topsort.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.utils.topsort import Network, CyclicGraphError 3 | 4 | 5 | def chash(cycles): 6 | """ 7 | Hash a cycle, useful for comparing sets of cycles. 8 | 9 | This checks the sorted set of each of the nodes in the cycle. This 10 | is *not* a perfect check, but it's useful so that we can create a set 11 | of these hashes, and check that they all match. 12 | 13 | It's not perfect, since D -> A -> B will be the same as B -> A -> D, 14 | but since this is only used in the testing logic, we can ensure 15 | that we handle it correctly in the testcases. 16 | 17 | (Implicit warning: Don't use this anywhere important.) 
18 | """ 19 | return {"".join(sorted(set(x))) for x in cycles} 20 | 21 | 22 | def test_sort_order_basic(): 23 | network = Network() 24 | network.add_node("A") 25 | network.add_node("B") 26 | network.add_node("C") 27 | 28 | network.add_edge("A", "B") 29 | network.add_edge("B", "C") 30 | 31 | assert (list(network.sort())) == ["A", "B", "C"] 32 | 33 | 34 | def test_sort_order_double(): 35 | network = Network() 36 | network.add_node("A") 37 | network.add_node("B") 38 | network.add_node("C") 39 | 40 | network.add_edge("A", "B") 41 | network.add_edge("A", "C") 42 | network.add_edge("C", "B") 43 | 44 | # A => B 45 | # / 46 | # A => C 47 | 48 | assert (list(network.sort())) == ["A", "C", "B"] 49 | 50 | 51 | def test_sort_order_staged(): 52 | network = Network() 53 | 54 | network.add_node("A1") 55 | network.add_node("A2") 56 | network.add_node("A3") 57 | 58 | network.add_edge("A1", "A2") 59 | network.add_edge("A1", "A3") 60 | network.add_edge("A2", "A3") 61 | 62 | network.add_node("B1") 63 | network.add_node("B2") 64 | network.add_node("B3") 65 | 66 | network.add_edge("B1", "B2") 67 | network.add_edge("B1", "B3") 68 | network.add_edge("B2", "B3") 69 | 70 | network.add_edge("B1", "A1") 71 | 72 | network.add_node("C1") 73 | network.add_node("C2") 74 | network.add_node("C3") 75 | 76 | network.add_edge("C1", "C2") 77 | network.add_edge("C1", "C3") 78 | network.add_edge("C2", "C3") 79 | 80 | network.add_edge("C1", "A1") 81 | network.add_edge("C1", "B1") 82 | 83 | network.add_edge("C1", "B1") 84 | network.add_edge("B1", "A1") 85 | network.add_edge("A1", "C2") 86 | network.add_edge("A1", "C3") 87 | 88 | # with open("/home/tag/debug.dot", 'w') as fd: 89 | # fd.write(network.dot()) 90 | 91 | sorted_order = list(network.sort()) 92 | 93 | assert sorted_order.pop(0) == "C1" 94 | assert sorted_order.pop(0) == "B1" 95 | assert sorted_order.pop(0) in ("A1", "B2") 96 | # ^^ This makes more sense after you dot debug it 97 | assert sorted_order.pop(0) in ("A1", "B2") 98 | 99 | 100 | def 
test_cyclic_graph_error_simple(): 101 | network = Network() 102 | network.add_node("A") 103 | network.add_node("B") 104 | network.add_edge("A", "B") 105 | network.add_edge("B", "A") 106 | 107 | with pytest.raises(CyclicGraphError): 108 | list(network.sort()) 109 | 110 | 111 | def test_cyclic_graph_error_indirect(): 112 | network = Network() 113 | network.add_node("A") 114 | network.add_node("B") 115 | network.add_node("C") 116 | 117 | network.add_edge("A", "B") 118 | network.add_edge("B", "C") 119 | network.add_edge("C", "A") 120 | 121 | with pytest.raises(CyclicGraphError): 122 | list(network.sort()) 123 | 124 | 125 | def test_cyclic_graph_error_massive(): 126 | network = Network() 127 | 128 | entries = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "A"] 129 | for i, e in enumerate(entries[:-1]): 130 | network.add_node(e) 131 | network.add_edge(e, entries[1 + i]) 132 | 133 | with pytest.raises(CyclicGraphError): 134 | list(network.sort()) 135 | 136 | 137 | def test_link_before_nodes(): 138 | network = Network() 139 | 140 | network.add_edge("A", "B") 141 | network.add_edge("B", "C") 142 | network.add_edge("C", "D") 143 | 144 | network.add_node("A") 145 | network.add_node("B") 146 | network.add_node("C") 147 | network.add_node("D") 148 | 149 | assert list(network.sort()) == ["A", "B", "C", "D"] 150 | 151 | 152 | def test_internal_node_removal(): 153 | network = Network() 154 | 155 | network.add_node("A") 156 | network.add_node("B") 157 | network.add_node("C") 158 | network.add_node("D") 159 | 160 | network.add_edge("A", "B") 161 | network.add_edge("B", "C") 162 | network.add_edge("C", "D") 163 | network.add_edge("A", "C") # Useful for ensuring the ending list 164 | # is deterministic. 165 | 166 | # Ensure that we can't remove an internal node without a ValueError 167 | # by default. 168 | with pytest.raises(ValueError): 169 | network.prune_node("B") 170 | 171 | # OK. Now that we know that works, let's prune it harder. 
172 | network.prune_node("B", remove_backrefs=True) 173 | 174 | # And make sure "B" is gone. 175 | assert list(network.sort()) == ["A", "C", "D"] 176 | 177 | 178 | def test_dot_debug(): 179 | network = Network() 180 | 181 | network.add_node("A") 182 | network.add_node("B") 183 | network.add_edge("A", "B") 184 | 185 | dot = network.dot() 186 | assert dot == "digraph graphname {A -> B;}" 187 | 188 | 189 | def test_cycles_simple(): 190 | network = Network() 191 | network.add_node("A") 192 | network.add_node("B") 193 | network.add_edge("A", "B") 194 | network.add_edge("B", "A") 195 | assert chash(network.cycles()) == chash([("A", "B", "A")]) 196 | 197 | 198 | def test_cycles_complex(): 199 | network = Network() 200 | network.add_node("A") 201 | network.add_node("B") 202 | network.add_node("C") 203 | network.add_node("D") 204 | 205 | network.add_edge("A", "B") 206 | network.add_edge("B", "C") 207 | network.add_edge("C", "D") 208 | network.add_edge("D", "A") 209 | 210 | network.add_edge("D", "C") 211 | network.add_edge("C", "B") 212 | network.add_edge("B", "D") 213 | 214 | # with open("/home/tag/debug.dot", 'w') as fd: 215 | # fd.write(network.dot()) 216 | 217 | assert chash(network.cycles()) == chash( 218 | [("B", "C", "B"), ("C", "D", "C"), ("A", "B", "D", "A")] 219 | ) 220 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/event.py: -------------------------------------------------------------------------------- 1 | """ 2 | Schema for event objects. 
3 | """ 4 | 5 | from .common import ( 6 | sources, 7 | extras, 8 | fuzzy_date_blank, 9 | fuzzy_datetime, 10 | fuzzy_datetime_blank, 11 | ) 12 | 13 | media_schema = { 14 | "items": { 15 | "properties": { 16 | "name": {"type": "string", "minLength": 1}, 17 | "type": {"type": "string", "minLength": 1}, 18 | "date": fuzzy_date_blank, 19 | "offset": {"type": ["number", "null"]}, 20 | "links": { 21 | "items": { 22 | "properties": { 23 | "media_type": {"type": "string"}, 24 | "url": {"type": "string", "format": "uri"}, 25 | }, 26 | "type": "object", 27 | }, 28 | "type": "array", 29 | }, 30 | }, 31 | "type": "object", 32 | }, 33 | "type": "array", 34 | } 35 | 36 | schema = { 37 | "properties": { 38 | "name": {"type": "string", "minLength": 1}, 39 | "all_day": {"type": "boolean"}, 40 | "start_date": fuzzy_datetime, 41 | "end_date": fuzzy_datetime_blank, 42 | "status": { 43 | "type": "string", 44 | "enum": ["cancelled", "tentative", "confirmed", "passed"], 45 | }, 46 | "classification": {"type": "string", "minLength": 1}, # TODO: enum 47 | "description": {"type": "string"}, 48 | "location": { 49 | "type": ["object", "null"], 50 | "properties": { 51 | "name": {"type": "string", "minLength": 1}, 52 | "note": { 53 | "type": "string", 54 | }, 55 | "url": { 56 | "type": ["string", "null"], 57 | "format": "uri", 58 | }, 59 | "coordinates": { 60 | "type": ["object", "null"], 61 | "properties": { 62 | "latitude": { 63 | "type": "string", 64 | "minLength": 1, 65 | }, 66 | "longitude": { 67 | "type": "string", 68 | "minLength": 1, 69 | }, 70 | }, 71 | }, 72 | }, 73 | }, 74 | "media": media_schema, 75 | "documents": { 76 | "items": { 77 | "properties": { 78 | "note": {"type": "string", "minLength": 1}, 79 | "url": {"type": "string", "minLength": 1}, 80 | "media_type": {"type": "string", "minLength": 1}, 81 | "date": fuzzy_date_blank, 82 | }, 83 | "type": "object", 84 | }, 85 | "type": "array", 86 | }, 87 | "links": { 88 | "items": { 89 | "properties": { 90 | "note": { 91 | "type": 
"string", 92 | }, 93 | "url": {"format": "uri", "type": "string"}, 94 | }, 95 | "type": "object", 96 | }, 97 | "type": "array", 98 | }, 99 | "participants": { 100 | "items": { 101 | "properties": { 102 | "name": { 103 | "type": "string", 104 | "minLength": 1, 105 | }, 106 | "type": { 107 | "enum": ["organization", "person"], 108 | "type": "string", 109 | }, 110 | "note": { 111 | "type": "string", 112 | "minLength": 1, 113 | }, 114 | }, 115 | "type": "object", 116 | }, 117 | "type": "array", 118 | }, 119 | "agenda": { 120 | "items": { 121 | "properties": { 122 | "description": {"type": "string", "minLength": 1}, 123 | "classification": { 124 | "items": {"type": "string", "minLength": 1}, 125 | "type": "array", 126 | }, 127 | "order": { 128 | "type": ["string", "null"], 129 | }, 130 | "subjects": { 131 | "items": {"type": "string", "minLength": 1}, 132 | "type": "array", 133 | }, 134 | "media": media_schema, 135 | "notes": { 136 | "items": { 137 | "type": "string", 138 | "minLength": 1, 139 | }, 140 | "type": "array", 141 | }, 142 | "related_entities": { 143 | "items": { 144 | "properties": { 145 | "entity_type": { 146 | "type": "string", 147 | "minLength": 1, 148 | }, 149 | "name": { 150 | "type": "string", 151 | "minLength": 1, 152 | }, 153 | "note": { 154 | "type": [ 155 | "string", 156 | "null", 157 | ], 158 | "minLength": 1, 159 | }, 160 | }, 161 | "type": "object", 162 | }, 163 | "minItems": 0, 164 | "type": "array", 165 | }, 166 | }, 167 | "type": "object", 168 | }, 169 | "minItems": 0, 170 | "type": "array", 171 | }, 172 | "sources": sources, 173 | "extras": extras, 174 | "pupa_id": { 175 | "type": ["string", "null"], 176 | "minLength": 1, 177 | }, 178 | }, 179 | "type": "object", 180 | } 181 | -------------------------------------------------------------------------------- /pupa/scrape/vote_event.py: -------------------------------------------------------------------------------- 1 | from ..utils import _make_pseudo_id 2 | from .base import BaseModel, 
cleanup_list, SourceMixin
from .bill import Bill
from .popolo import pseudo_organization
from .schemas.vote_event import schema
from pupa.exceptions import ScrapeValueError
import re


class VoteEvent(BaseModel, SourceMixin):
    """A scraped record of a single vote taken by a legislative body."""

    _type = "vote_event"
    _schema = schema

    def __init__(
        self,
        *,
        motion_text,
        start_date,
        classification,
        result,
        legislative_session=None,
        identifier="",
        bill=None,
        bill_chamber=None,
        bill_action=None,
        organization=None,
        chamber=None
    ):
        # NOTE: either legislative_session or bill must be provided;
        # a ScrapeValueError is raised below when neither is.
        super(VoteEvent, self).__init__()

        self.legislative_session = legislative_session
        self.motion_text = motion_text
        self.motion_classification = cleanup_list(classification, [])
        self.start_date = start_date
        self.result = result
        self.identifier = identifier
        self.bill_action = bill_action

        self.set_bill(bill, chamber=bill_chamber)

        # Fall back to the bill's session when one wasn't given explicitly.
        if isinstance(bill, Bill) and not self.legislative_session:
            self.legislative_session = bill.legislative_session

        if not self.legislative_session:
            raise ScrapeValueError("must set legislative_session or bill")

        self.organization = pseudo_organization(organization, chamber, "legislature")
        self.votes = []
        self.counts = []

    def __str__(self):
        return "{0} - {1} - {2}".format(
            self.legislative_session, self.start_date, self.motion_text
        )

    def set_bill(self, bill_or_identifier, *, chamber=None):
        """Associate this vote with a bill.

        Accepts either a `Bill` object (in which case `chamber` must not be
        passed) or a bill identifier string, which is resolved at import time
        via a pseudo-id.
        """
        if not bill_or_identifier:
            self.bill = None
        elif isinstance(bill_or_identifier, Bill):
            if chamber:
                raise ScrapeValueError(
                    "set_bill takes no arguments when using a `Bill` object"
                )
            self.bill = bill_or_identifier._id
        else:
            if chamber is None:
                chamber = "legislature"
            kwargs = {
                "identifier": bill_or_identifier,
                "from_organization__classification": chamber,
                "legislative_session__identifier": self.legislative_session,
            }
            self.bill = _make_pseudo_id(**kwargs)

    def vote(self, option, voter, *, note=""):
        """Record an individual voter's choice (e.g. "yes"/"no"/"other")."""
        self.votes.append(
            {
                "option": option,
                "voter_name": voter,
                # voter is resolved to a person at import time via pseudo-id
                "voter_id": _make_pseudo_id(name=voter),
                "note": note,
            }
        )

    def yes(self, name, *, id=None, note=""):
        """Shorthand for a "yes" vote (`id` is accepted but unused)."""
        return self.vote("yes", name, note=note)

    def no(self, name, *, id=None, note=""):
        """Shorthand for a "no" vote (`id` is accepted but unused)."""
        return self.vote("no", name, note=note)

    def set_count(self, option, value):
        """Set the tally for `option`, replacing any existing count."""
        for co in self.counts:
            if co["option"] == option:
                co["value"] = value
                break
        else:
            self.counts.append({"option": option, "value": value})


class OrderVoteEvent:
    """A functor for applying order to voteEvents.
    A single OrderVoteEvent instance should be used for all bills in a scrape.
    The vote events of each bill must be processed in chronological order,
    but the processing of bills may be interleaved (needed in e.g. NH).
    Currently, it only fudges midnight dates (start_date and end_date)
    by adding the event sequence number in seconds
    to the start_date and end_date (if they are well-formed string dates)
    In the future, when there is an 'order' field on voteEvents,
    it should fill that as well.
    This fails softly and silently;
    if a valid string date is not found in start_date or end_date,
    the date is not touched.
    This assumes that times are reported as local time, not UTC.
    A UTC time that is local midnight will not be touched.
    Sometimes one chamber reports the time of a vote,
    but the other chamber reports only the date. This is handled.
    See the unit tests for examples and more behavior.
118 | """ 119 | 120 | _midnight = r"\d\d\d\d-\d\d-\d\dT00:00:00.*" 121 | _timeless = r"\d\d\d\d-\d\d-\d\d" 122 | 123 | class OrderBillVoteEvent: 124 | """Order VoteEvents for a single bill""" 125 | 126 | def __init__(self): 127 | self.order = 0 # voteEvent sequence number. 1st voteEvent is 1. 128 | 129 | def __call__(self, voteEvent): 130 | 131 | self.order += 1 132 | voteEvent.start_date = self._adjust_date(voteEvent.start_date) 133 | if hasattr(voteEvent, "end_date"): 134 | voteEvent.end_date = self._adjust_date(voteEvent.end_date) 135 | 136 | def _adjust_date(self, date): 137 | 138 | if not isinstance(date, str): 139 | return date 140 | 141 | if re.fullmatch(OrderVoteEvent._timeless, date): 142 | d2 = date + "T00:00:00" 143 | elif re.fullmatch(OrderVoteEvent._midnight, date): 144 | d2 = date 145 | else: 146 | return date 147 | 148 | assert self.order <= 60 * 60 149 | mins = "{:02d}".format(self.order // 60) 150 | secs = "{:02d}".format(self.order % 60) 151 | 152 | # yyyy-mm-ddThh:mm:dd+05:00 153 | # 0123456789012345678 154 | return d2[:14] + mins + ":" + secs + d2[19:] 155 | 156 | def __init__(self): 157 | self.orderers = {} 158 | 159 | def __call__(self, session_id, bill_id, voteEvent): 160 | """ 161 | Record order of voteEvent within bill. 162 | 163 | The "order" field is not yet implemented; this fudges voteEvent 164 | start_date and end_date. 165 | See OrderVoteEvent docstring for details. 166 | 167 | :param session_id: session id 168 | :param bill_id: an identifier for the vote's bill 169 | that is at least unique within the session. 
170 | :param voteEvent: 171 | :return: None 172 | """ 173 | bill_orderer = self.orderers.get((session_id, bill_id)) 174 | 175 | if not bill_orderer: 176 | bill_orderer = self.OrderBillVoteEvent() 177 | self.orderers[(session_id, bill_id)] = bill_orderer 178 | 179 | bill_orderer(voteEvent) 180 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_people_org_scrape.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import pytest 3 | from pupa.scrape import Person, Organization, Membership, Post 4 | from pupa.utils import get_pseudo_id 5 | from pupa.exceptions import ScrapeValueError 6 | 7 | 8 | def test_basic_post(): 9 | post = Post(label="1", role="Representative", organization_id="fake_org") 10 | assert "1" in str(post) 11 | post.validate() 12 | 13 | 14 | def test_basic_invalid_post(): 15 | post = Post(label=1, role="Representative", organization_id="fake_org") 16 | with pytest.raises(ValueError): 17 | post.validate() 18 | 19 | 20 | def test_basic_membership(): 21 | m = Membership(person_id="person", organization_id="org") 22 | assert "person" in str(m) and "org" in str(m) 23 | 24 | 25 | def test_basic_invalid_membership(): 26 | membership = Membership(person_id=33, organization_id="orga_id") 27 | with pytest.raises(ValueError): 28 | membership.validate() 29 | 30 | 31 | def test_basic_invalid_person(): 32 | bob = Person("Bob B. Johnson") 33 | bob.add_source(url="http://example.com") 34 | bob.validate() 35 | 36 | bob.name = None 37 | 38 | with pytest.raises(ScrapeValueError): 39 | bob.validate() 40 | 41 | 42 | def test_basic_person(): 43 | p = Person("Bob B. Bear") 44 | p.add_source("http://example.com") 45 | assert p.name in str(p) 46 | p.validate() 47 | 48 | 49 | def test_person_add_membership_org(): 50 | p = Person("Bob B. 
Bear") 51 | p.add_source("http://example.com") 52 | o = Organization("test org", classification="unknown") 53 | p.add_membership( 54 | o, role="member", start_date="2007", end_date=datetime.date(2015, 5, 8) 55 | ) 56 | assert len(p._related) == 1 57 | p._related[0].validate() 58 | assert p._related[0].person_id == p._id 59 | assert p._related[0].organization_id == o._id 60 | assert p._related[0].start_date == "2007" 61 | assert p._related[0].end_date == datetime.date(2015, 5, 8) 62 | 63 | 64 | def test_basic_organization(): 65 | org = Organization("some org", classification="committee") 66 | org.add_source("http://example.com") 67 | assert org.name in str(org) 68 | org.validate() 69 | 70 | 71 | def test_no_source_on_party_org(): 72 | org = Organization("Hat", classification="party") 73 | # no source? no problem because classification = party 74 | org.validate() 75 | 76 | 77 | def test_basic_invalid_organization(): 78 | orga = Organization("name") 79 | 80 | # no source 81 | with pytest.raises(ScrapeValueError): 82 | orga.validate() 83 | 84 | 85 | def test_org_add_post(): 86 | """Test that we can hack posts in on the fly'""" 87 | orga = Organization("name", classification="committee") 88 | orga.add_source(url="http://example.com") 89 | orga.validate() 90 | 91 | orga.add_post("Human Readable Name", "Chef") 92 | 93 | assert orga._related[0].role == "Chef" 94 | assert orga._related[0].label == "Human Readable Name" 95 | 96 | 97 | def test_legislator_related_district(): 98 | leg = Person("John Adams", district="1", primary_org="legislature") 99 | leg.pre_save("jurisdiction-id") 100 | 101 | assert len(leg._related) == 1 102 | assert leg._related[0].person_id == leg._id 103 | assert get_pseudo_id(leg._related[0].organization_id) == { 104 | "classification": "legislature" 105 | } 106 | assert get_pseudo_id(leg._related[0].post_id) == { 107 | "organization__classification": "legislature", 108 | "label": "1", 109 | } 110 | 111 | 112 | def 
test_legislator_related_chamber_district(): 113 | leg = Person("John Adams", district="1", primary_org="upper") 114 | leg.pre_save("jurisdiction-id") 115 | 116 | assert len(leg._related) == 1 117 | assert leg._related[0].person_id == leg._id 118 | assert get_pseudo_id(leg._related[0].organization_id) == {"classification": "upper"} 119 | assert get_pseudo_id(leg._related[0].post_id) == { 120 | "organization__classification": "upper", 121 | "label": "1", 122 | } 123 | 124 | 125 | def test_legislator_related_chamber_district_role(): 126 | leg = Person("John Adams", district="1", primary_org="lower", role="Speaker") 127 | leg.pre_save("jurisdiction-id") 128 | 129 | assert len(leg._related) == 1 130 | assert leg._related[0].person_id == leg._id 131 | assert get_pseudo_id(leg._related[0].organization_id) == {"classification": "lower"} 132 | assert get_pseudo_id(leg._related[0].post_id) == { 133 | "organization__classification": "lower", 134 | "label": "1", 135 | "role": "Speaker", 136 | } 137 | assert leg._related[0].role == "Speaker" 138 | 139 | 140 | def test_legislator_related_party(): 141 | leg = Person("John Adams", party="Democratic-Republican") 142 | leg.pre_save("jurisdiction-id") 143 | 144 | # a party membership 145 | assert len(leg._related) == 1 146 | assert leg._related[0].person_id == leg._id 147 | assert get_pseudo_id(leg._related[0].organization_id) == { 148 | "classification": "party", 149 | "name": "Democratic-Republican", 150 | } 151 | assert leg._related[0].role == "member" 152 | 153 | 154 | def test_committee_add_member_person(): 155 | c = Organization("Defense", classification="committee") 156 | p = Person("John Adams") 157 | c.add_member(p, role="chairman") 158 | assert c._related[0].person_id == p._id 159 | assert c._related[0].organization_id == c._id 160 | assert c._related[0].role == "chairman" 161 | 162 | 163 | def test_committee_add_member_name(): 164 | c = Organization("Defense", classification="committee") 165 | c.add_member("John Adams") 
166 | assert get_pseudo_id(c._related[0].person_id) == {"name": "John Adams"} 167 | assert c._related[0].organization_id == c._id 168 | assert c._related[0].role == "member" 169 | 170 | 171 | def test_person_add_membership_name(): 172 | p = Person("Leonardo DiCaprio") 173 | p.add_membership( 174 | "Academy of Motion Picture Arts and Sciences", role="winner", start_date="2016" 175 | ) 176 | p._related[0].validate() 177 | assert get_pseudo_id(p._related[0].organization_id) == { 178 | "name": "Academy of Motion Picture Arts and Sciences" 179 | } 180 | assert p._related[0].person_id == p._id 181 | assert p._related[0].role == "winner" 182 | assert p._related[0].start_date == "2016" 183 | 184 | 185 | def test_person_add_party(): 186 | p = Person("Groot") 187 | p.add_party("Green") 188 | p._related[0].validate() 189 | assert get_pseudo_id(p._related[0].organization_id) == { 190 | "name": "Green", 191 | "classification": "party", 192 | } 193 | 194 | 195 | def test_person_add_term(): 196 | p = Person("Eternal") 197 | p.add_term("eternal", "council", start_date="0001", end_date="9999") 198 | p._related[0].validate() 199 | assert get_pseudo_id(p._related[0].organization_id) == { 200 | "classification": "council", 201 | } 202 | assert p._related[0].start_date == "0001" 203 | assert p._related[0].end_date == "9999" 204 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_bill_scrape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.scrape import Bill 3 | from pupa.utils.generic import get_pseudo_id 4 | from pupa.exceptions import ScrapeValueError 5 | 6 | 7 | def toy_bill(): 8 | b = Bill( 9 | identifier="HB 2017", 10 | legislative_session="2012A", 11 | title="A bill for an act to raise the cookie budget by 200%", 12 | from_organization="Foo Senate", 13 | classification="bill", 14 | ) 15 | b.add_source("http://uri.example.com/", note="foo") 16 | return b 17 | 
18 | 19 | def test_basic_valid_bill(): 20 | b = toy_bill() 21 | b.validate() 22 | assert "we got here" 23 | 24 | 25 | def test_bill_type_setting(): 26 | # default 27 | b = Bill(identifier="some bill", legislative_session="session", title="the title") 28 | assert b.classification == ["bill"] 29 | 30 | # string -> list 31 | b = Bill( 32 | identifier="some bill", 33 | legislative_session="session", 34 | title="the title", 35 | classification="string", 36 | ) 37 | assert b.classification == ["string"] 38 | 39 | # list unmodified 40 | b = Bill( 41 | identifier="some bill", 42 | legislative_session="session", 43 | title="the title", 44 | classification=["two", "items"], 45 | ) 46 | assert b.classification == ["two", "items"] 47 | 48 | # tuple -> list 49 | b = Bill( 50 | identifier="some bill", 51 | legislative_session="session", 52 | title="the title", 53 | classification=("two", "items"), 54 | ) 55 | assert b.classification == ["two", "items"] 56 | 57 | 58 | def test_basic_invalid_bill(): 59 | """Test that we can create an invalid bill, and validation will fail""" 60 | b = toy_bill() 61 | b.identifier = None 62 | with pytest.raises(ValueError): 63 | b.validate() 64 | 65 | 66 | def test_from_organization(): 67 | # none set 68 | assert get_pseudo_id(Bill("HB 1", "2014", "Some Bill").from_organization) == { 69 | "classification": "legislature" 70 | } 71 | 72 | # chamber set 73 | assert get_pseudo_id( 74 | Bill("SB 1", "2014", "Some Bill", chamber="upper").from_organization 75 | ) == {"classification": "upper"} 76 | # org direct set 77 | assert ( 78 | Bill("HB 1", "2014", "Some Bill", from_organization="test").from_organization 79 | == "test" 80 | ) 81 | 82 | # can't set both 83 | with pytest.raises(ValueError): 84 | Bill("HB 1", "2014", "Some Bill", from_organization="upper", chamber="upper") 85 | 86 | 87 | def test_add_action(): 88 | """Make sure actions work""" 89 | b = toy_bill() 90 | b.add_action("Some dude liked it.", "2013-04-29T20:00Z", chamber="lower") 91 | assert 
len(b.actions) == 1 92 | assert b.actions[0]["description"] == "Some dude liked it." 93 | assert get_pseudo_id(b.actions[0]["organization_id"]) == {"classification": "lower"} 94 | assert b.actions[0]["date"] == "2013-04-29T20:00Z" 95 | b.validate() 96 | 97 | 98 | def test_action_extra(): 99 | b = toy_bill() 100 | b.add_action( 101 | "an action with some extra information", 102 | "2017-01-01", 103 | extras=dict(sitting_chair="Adams"), 104 | ) 105 | assert b.actions[0]["extras"] == {"sitting_chair": "Adams"} 106 | 107 | 108 | def test_add_related_bill(): 109 | """Make sure related bills work""" 110 | b = toy_bill() 111 | b.add_related_bill( 112 | identifier="HB 2020", legislative_session="2011A", relation_type="companion" 113 | ) 114 | assert len(b.related_bills) == 1 115 | assert b.related_bills[0] == { 116 | "identifier": "HB 2020", 117 | "legislative_session": "2011A", 118 | "relation_type": "companion", 119 | } 120 | b.validate() 121 | 122 | 123 | def test_add_sponsor(): 124 | b = toy_bill() 125 | b.add_sponsorship( 126 | name="Joe Bleu", 127 | classification="Author", 128 | entity_type="person", 129 | primary=True, 130 | chamber="upper", 131 | ) 132 | assert len(b.sponsorships) == 1 133 | assert b.sponsorships[0] == { 134 | "person_id": '~{"name": "Joe Bleu"}', 135 | "name": "Joe Bleu", 136 | "classification": "Author", 137 | "entity_type": "person", 138 | "primary": True, 139 | "organization_id": None, 140 | } 141 | b.validate() 142 | 143 | 144 | def test_subjects(): 145 | b = toy_bill() 146 | b.add_subject("Foo") 147 | b.add_subject("Bar") 148 | assert b.subject == ["Foo", "Bar"] 149 | b.validate() 150 | 151 | 152 | def test_abstract(): 153 | b = toy_bill() 154 | b.add_abstract("this bill is stupid", "K-5", "1969-10-20") 155 | b.add_abstract("this legislative document is ignorant", "6-12", "2010-10-10") 156 | assert b.abstracts == [ 157 | {"note": "K-5", "abstract": "this bill is stupid", "date": "1969-10-20"}, 158 | { 159 | "note": "6-12", 160 | "abstract": 
"this legislative document is ignorant", 161 | "date": "2010-10-10", 162 | }, 163 | ] 164 | 165 | 166 | def test_add_documents(): 167 | b = toy_bill() 168 | 169 | # should only add one document since they all have same note 170 | b.add_document_link( 171 | note="Fiscal Impact", 172 | date="2013-04", 173 | url="http://hi.example.com/foo#bar", 174 | media_type="text/html", 175 | ) 176 | b.add_document_link(note="Fiscal Impact", date="2013-04", url="http://foobar.baz") 177 | assert len(b.documents) == 1 178 | 179 | # should now be two documents 180 | b.add_document_link( 181 | note="Other Document", date="2013-04", url="http://foobar.baz/other" 182 | ) 183 | assert len(b.documents) == 2 184 | 185 | # valid documents so far 186 | b.validate() 187 | 188 | # an invalid document 189 | b.add_document_link( 190 | note="Fiscal Impact", date="2013-04", url=None, media_type="foo" 191 | ) 192 | with pytest.raises(ScrapeValueError): 193 | b.validate() 194 | 195 | 196 | def test_versions(): 197 | b = toy_bill() 198 | 199 | # only one document, multiple links 200 | b.add_version_link(url="http://pault.ag/", note="Final Version", date="2013-04") 201 | b.add_version_link(url="http://pault.ag/foo", note="Final Version", date="2013-04") 202 | b.validate() 203 | assert len(b.versions) == 1 204 | assert len(b.versions[0]["links"]) == 2 205 | 206 | # duplicate! 
207 | with pytest.raises(ValueError): 208 | b.add_version_link( 209 | url="http://pault.ag/foo", note="Final Version", date="2013-04" 210 | ) 211 | 212 | # ignore duplicate - nothing should change 213 | b.add_version_link( 214 | url="http://pault.ag/foo", 215 | note="Final Version", 216 | date="2013-04", 217 | on_duplicate="ignore", 218 | ) 219 | assert len(b.versions) == 1 220 | assert len(b.versions[0]["links"]) == 2 221 | 222 | # duplicate URL 223 | with pytest.raises(ValueError): 224 | b.add_version_link( 225 | url="http://pault.ag/foo", note="Finals Versions", date="2013-04" 226 | ) 227 | assert len(b.versions) == 1 228 | assert len(b.versions[0]["links"]) == 2 229 | 230 | # a new doc, numbers go up 231 | b.add_version_link( 232 | url="http://pault.ag/foovbar", note="Finals Versions", date="2013-04" 233 | ) 234 | assert len(b.versions) == 2 235 | assert len(b.versions[1]["links"]) == 1 236 | 237 | # still validates 238 | b.validate() 239 | 240 | 241 | def test_str(): 242 | b = toy_bill() 243 | assert b.identifier in str(b) 244 | 245 | 246 | def test_no_whitespace_in_uri(): 247 | b = Bill( 248 | identifier="HB 2017", 249 | legislative_session="2012A", 250 | title="A bill for an act to raise the cookie budget by 200%", 251 | from_organization="Foo Senate", 252 | classification="bill", 253 | ) 254 | b.add_source("http://uri.example.com/fail here", note="foo") 255 | with pytest.raises(ScrapeValueError): 256 | b.validate() 257 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_base_importer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | import tempfile 5 | import mock 6 | import pytest 7 | from opencivicdata.core.models import Person, Organization, Jurisdiction, Division 8 | from pupa.scrape import Person as ScrapePerson 9 | from pupa.scrape import Organization as ScrapeOrganization 10 | from pupa.importers.base import 
omnihash, BaseImporter
from pupa.importers import PersonImporter, OrganizationImporter
from pupa.exceptions import UnresolvedIdError, DataImportError


def create_jurisdiction():
    """Create the Division + Jurisdiction rows the importers expect."""
    Division.objects.create(id="ocd-division/country:us", name="USA")
    Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us")


class FakeImporter(BaseImporter):
    """Minimal concrete importer used to exercise BaseImporter plumbing."""

    _type = "test"


def test_omnihash_python_types():
    # string
    assert omnihash("test") == omnihash("test")
    # list
    assert omnihash(["this", "is", "a", "list"]) == omnihash(
        ["this", "is", "a", "list"]
    )
    # set
    assert omnihash({"and", "a", "set"}) == omnihash({"set", "set", "and", "a"})
    # dict w/ set and tuple as well
    assert omnihash({"a": {("fancy", "nested"): {"dict"}}}) == omnihash(
        {"a": {("fancy", "nested"): {"dict"}}}
    )


def test_import_directory():
    # write out some temp data to filesystem
    datadir = tempfile.mkdtemp()
    dicta = {"test": "A"}
    dictb = {"test": "B"}
    # use context managers so the files are closed (and flushed) before
    # import_directory reads them; the old open(...).write(...) leaked the
    # handles and relied on refcounting to flush
    with open(os.path.join(datadir, "test_a.json"), "w") as f:
        json.dump(dicta, f)
    with open(os.path.join(datadir, "test_b.json"), "w") as f:
        json.dump(dictb, f)

    # simply ensure that import directory calls import_data with all dicts
    ti = FakeImporter("jurisdiction-id")
    with mock.patch.object(ti, attribute="import_data") as mockobj:
        ti.import_directory(datadir)

    # import_data should be called once
    assert mockobj.call_count == 1
    # kind of hacky, get the total list of args passed in
    arg_objs = list(mockobj.call_args[0][0])

    # 2 args only, make sure a and b are in there
    assert len(arg_objs) == 2
    assert dicta in arg_objs
    assert dictb in arg_objs

    # clean up datadir
    shutil.rmtree(datadir)


def test_apply_transformers():
    transformers = {
        "capitalize": lambda x: x.upper(),
        "cap_and_reverse": [lambda x: x.upper(), lambda y: y[::-1]],
        "never_used": lambda x: 1 / 0,
        "nested": {"replace": lambda x: "replaced"},
    }
    data = {
        "capitalize": "words",
        "cap_and_reverse": "simple",
        "nested": {"replace": None},
    }
    ti = FakeImporter("jid")
    ti.cached_transformers = transformers
    output = ti.apply_transformers(data)
    assert output["capitalize"] == "WORDS"
    assert output["cap_and_reverse"] == "ELPMIS"
    assert output["nested"]["replace"] == "replaced"


# doing these next few tests just on a Person because it is the same
# code that handles it but for completeness maybe it is better to do
# these on each type?


@pytest.mark.django_db
def test_last_seen_updates_on_scrape():
    create_jurisdiction()
    o = Organization.objects.create(name="WWE", jurisdiction_id="jid")

    p = Person.objects.create(name="George Washington", family_name="Washington")
    p.memberships.create(organization=o)

    expected_updated_at = p.updated_at
    last_seen_before_scrape = p.last_seen

    # Simulate no-op scrape
    scraped_p = ScrapePerson("George Washington").as_dict()
    PersonImporter("jid").import_data([scraped_p])

    p.refresh_from_db()

    assert p.updated_at < p.last_seen, "Should refresh last_seen but not updated_at"
    assert (
        p.updated_at == expected_updated_at
    ), "Should not refresh updated_at when there's no update"

    assert (
        p.last_seen > last_seen_before_scrape
    ), "Should refresh last_seen even when there's no update"


@pytest.mark.django_db
def test_deduplication_identical_object():
    p1 = ScrapePerson("Dwayne").as_dict()
    p2 = ScrapePerson("Dwayne").as_dict()
    PersonImporter("jid").import_data([p1, p2])

    assert Person.objects.count() == 1


@pytest.mark.django_db
def test_exception_on_identical_objects_in_import_stream():
    create_jurisdiction()
    # these two objects aren't identical, but refer to the same thing
    # at the moment we consider this an error (but there may be a better
    # way to handle this?)
    o1 = ScrapeOrganization("X-Men", classification="unknown").as_dict()
    o2 = ScrapeOrganization(
        "X-Men", founding_date="1970", classification="unknown"
    ).as_dict()

    with pytest.raises(Exception):
        OrganizationImporter("jid").import_data([o1, o2])


@pytest.mark.django_db
def test_resolve_json_id():
    p1 = ScrapePerson("Dwayne").as_dict()
    p2 = ScrapePerson("Dwayne").as_dict()
    pi = PersonImporter("jid")

    # do import and get database id
    p1_id = p1["_id"]
    p2_id = p2["_id"]
    pi.import_data([p1, p2])
    db_id = Person.objects.get().id

    # simplest case
    assert pi.resolve_json_id(p1_id) == db_id
    # duplicate should resolve to same id
    assert pi.resolve_json_id(p2_id) == db_id
    # a null id should map to None
    assert pi.resolve_json_id(None) is None
    # no such id
    with pytest.raises(UnresolvedIdError):
        pi.resolve_json_id("this-is-invalid")


@pytest.mark.django_db
def test_invalid_fields():
    p1 = ScrapePerson("Dwayne").as_dict()
    p1["newfield"] = "shouldn't happen"

    with pytest.raises(DataImportError):
        PersonImporter("jid").import_data([p1])


@pytest.mark.django_db
def test_invalid_fields_related_item():
    p1 = ScrapePerson("Dwayne")
    p1.add_link("http://example.com")
    p1 = p1.as_dict()
    p1["links"][0]["test"] = 3

    with pytest.raises(DataImportError):
        PersonImporter("jid").import_data([p1])


@pytest.mark.django_db
def test_locked_field():
    create_jurisdiction()
    org = ScrapeOrganization("SHIELD").as_dict()
    oi = OrganizationImporter("jid")
    oi.import_data([org])
192 | # set date and lock field 193 | o = Organization.objects.get() 194 | o.dissolution_date = "2015" 195 | o.locked_fields = ["dissolution_date"] 196 | o.save() 197 | 198 | # reimport 199 | org = ScrapeOrganization("SHIELD").as_dict() 200 | oi = OrganizationImporter("jid") 201 | oi.import_data([org]) 202 | 203 | o = Organization.objects.get() 204 | assert o.dissolution_date == "2015" 205 | assert o.locked_fields == ["dissolution_date"] 206 | 207 | # do it a third time to check for the locked_fields reversion issue 208 | org = ScrapeOrganization("SHIELD").as_dict() 209 | oi = OrganizationImporter("jid") 210 | oi.import_data([org]) 211 | 212 | o = Organization.objects.get() 213 | assert o.dissolution_date == "2015" 214 | assert o.locked_fields == ["dissolution_date"] 215 | 216 | 217 | @pytest.mark.django_db 218 | def test_locked_field_subitem(): 219 | create_jurisdiction() 220 | org = ScrapeOrganization("SHIELD") 221 | org.add_name("S.H.I.E.L.D.") 222 | oi = OrganizationImporter("jid") 223 | oi.import_data([org.as_dict()]) 224 | 225 | # lock the field 226 | o = Organization.objects.get() 227 | o.locked_fields = ["other_names"] 228 | o.save() 229 | 230 | # reimport 231 | org = ScrapeOrganization("SHIELD").as_dict() 232 | oi = OrganizationImporter("jid") 233 | oi.import_data([org]) 234 | 235 | o = Organization.objects.get() 236 | assert o.other_names.get().name == "S.H.I.E.L.D." 237 | --------------------------------------------------------------------------------