├── pupa ├── cli │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── base.py │ │ ├── party.py │ │ ├── dbinit.py │ │ ├── clean.py │ │ └── init.py │ └── __main__.py ├── ext │ ├── __init__.py │ └── ansistrm.py ├── tests │ ├── __init__.py │ ├── importers │ │ ├── __init__.py │ │ ├── test_jurisdiction_importer.py │ │ ├── test_post_importer.py │ │ ├── test_topsort.py │ │ └── test_base_importer.py │ ├── reports │ │ └── __init__.py │ ├── scrape │ │ ├── __init__.py │ │ ├── test_utils.py │ │ ├── test_jurisdiction_scrape.py │ │ ├── test_scraper.py │ │ ├── test_model_basics.py │ │ ├── test_event_scrape.py │ │ ├── test_vote_event_scrape.py │ │ ├── test_people_org_scrape.py │ │ └── test_bill_scrape.py │ ├── django_settings.py │ ├── update │ │ └── test_importer_resolution.py │ └── clean │ │ └── test_clean.py ├── migrations │ ├── __init__.py │ ├── 0005_auto_20170522_1935.py │ ├── 0006_identifier_jurisdiction.py │ ├── 0004_identifier.py │ ├── 0003_auto_20151118_0408.py │ ├── 0002_auto_20150906_1458.py │ ├── 0007_sessiondataqualityreport.py │ └── 0001_initial.py ├── scrape │ ├── schemas │ │ ├── __init__.py │ │ ├── post.py │ │ ├── membership.py │ │ ├── jurisdiction.py │ │ ├── person.py │ │ ├── organization.py │ │ ├── vote_event.py │ │ ├── common.py │ │ ├── bill.py │ │ └── event.py │ ├── __init__.py │ ├── jurisdiction.py │ ├── bill.py │ ├── event.py │ └── vote_event.py ├── __init__.py ├── reports │ ├── __init__.py │ └── session.py ├── utils │ ├── __init__.py │ ├── generic.py │ └── topsort.py ├── importers │ ├── __init__.py │ ├── jurisdiction.py │ ├── posts.py │ ├── memberships.py │ ├── people.py │ ├── bills.py │ ├── events.py │ ├── organizations.py │ └── vote_events.py ├── settings.py ├── exceptions.py ├── admin.py └── models.py ├── setup.cfg ├── run-tests.sh ├── .gitignore ├── .coveragerc ├── tox.ini ├── README.md ├── LICENSE ├── setup.py ├── .github └── workflows │ └── package.yml ├── ARCHITECTURE.md └── CHANGELOG.md /pupa/cli/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/ext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/cli/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/importers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/reports/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/tests/scrape/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pupa/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.11.0" # pragma: no cover 2 | -------------------------------------------------------------------------------- /pupa/reports/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .session import generate_session_report # noqa 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [flake8] 5 | max-line-length = 99 6 | exclude = pupa/migrations 7 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH=. 3 | pytest --cov pupa --cov-report html --ds=pupa.tests.django_settings pupa/tests 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *swp 3 | *egg-info* 4 | .tox 5 | dist 6 | .coverage 7 | htmlcov/ 8 | _data/ 9 | _cache/ 10 | build/ 11 | .cache/ 12 | .idea/ 13 | .env/ -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = pupa/tests/* 3 | pupa/ext/* 4 | pupa/cli/* 5 | [report] 6 | exclude_lines = 7 | if __name__ == .__main__.: 8 | pragma: no cover 9 | -------------------------------------------------------------------------------- /pupa/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .generic import ( 3 | _make_pseudo_id, 4 | get_pseudo_id, 5 | makedirs, 6 | JSONEncoderPlus, 7 | convert_pdf, 8 | utcnow, 9 | format_datetime, 10 | ) 11 | -------------------------------------------------------------------------------- /pupa/scrape/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .jurisdiction import 
Jurisdiction, JurisdictionScraper 3 | from .popolo import Membership, Organization, Person, Post 4 | from .vote_event import VoteEvent, OrderVoteEvent 5 | from .bill import Bill 6 | from .event import Event 7 | from .base import Scraper, BaseBillScraper 8 | -------------------------------------------------------------------------------- /pupa/importers/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .jurisdiction import JurisdictionImporter 3 | from .organizations import OrganizationImporter 4 | from .people import PersonImporter 5 | from .posts import PostImporter 6 | from .memberships import MembershipImporter 7 | from .bills import BillImporter 8 | from .vote_events import VoteEventImporter 9 | from .events import EventImporter 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37-django{22,30},flake8 3 | [testenv] 4 | deps = 5 | django22: Django==2.2 6 | django30: Django==3.0 7 | commands = 8 | pip install -e .[dev] git+https://github.com/opencivicdata/python-opencivicdata.git#egg=opencivicdata 9 | pytest pupa --ds=pupa.tests.django_settings 10 | 11 | [testenv:flake8] 12 | deps = flake8 13 | commands = flake8 pupa 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pupa: A legislative data scraping framework 2 | 3 | ![example workflow](https://github.com/opencivicdata/pupa/actions/workflows/package.yml/badge.svg?branch=master) 4 | [![Coverage Status](https://coveralls.io/repos/opencivicdata/pupa/badge.png?branch=master)](https://coveralls.io/r/opencivicdata/pupa?branch=master) 5 | [![PyPI](https://img.shields.io/pypi/v/pupa.svg)](https://pypi.python.org/pypi/pupa) 6 | 
-------------------------------------------------------------------------------- /pupa/cli/commands/base.py: -------------------------------------------------------------------------------- 1 | class BaseCommand(object): 2 | def __init__(self, subparsers): 3 | self.subparser = subparsers.add_parser(self.name, description=self.help) 4 | self.add_args() 5 | 6 | def add_args(self): 7 | pass 8 | 9 | def add_argument(self, *args, **kwargs): 10 | self.subparser.add_argument(*args, **kwargs) 11 | 12 | def handle(self, args): 13 | raise NotImplementedError("commands must implement handle(args)") 14 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/post.py: -------------------------------------------------------------------------------- 1 | from .common import links, contact_details, extras, fuzzy_date_blank 2 | 3 | schema = { 4 | "properties": { 5 | "label": {"type": "string", "minLength": 1}, 6 | "role": {"type": "string"}, 7 | "maximum_memberships": {"type": "number"}, 8 | "organization_id": {"type": "string", "minLength": 1}, 9 | "division_id": {"type": ["null", "string"], "minLength": 1}, 10 | "start_date": fuzzy_date_blank, 11 | "end_date": fuzzy_date_blank, 12 | "contact_details": contact_details, 13 | "links": links, 14 | "extras": extras, 15 | }, 16 | "type": "object", 17 | } 18 | -------------------------------------------------------------------------------- /pupa/tests/django_settings.py: -------------------------------------------------------------------------------- 1 | # django settings for tests 2 | import os 3 | 4 | SECRET_KEY = 'test' 5 | INSTALLED_APPS = ('django.contrib.contenttypes', 6 | 'opencivicdata.core.apps.BaseConfig', 7 | 'opencivicdata.legislative.apps.BaseConfig', 8 | 'pupa') 9 | DATABASES = { 10 | 'default': { 11 | 'ENGINE': 'django.contrib.gis.db.backends.postgis', 12 | 'NAME': os.getenv('POSTGRES_DB', 'test'), 13 | 'USER': os.getenv('POSTGRES_USER', 'test'), 14 | 'PASSWORD': 
os.getenv('POSTGRES_PASSWORD', 'test'), 15 | 'HOST': os.getenv('POSTGRES_HOST', 'localhost'), 16 | 'PORT': os.getenv('POSTGRES_PORT', 5432), 17 | } 18 | } 19 | MIDDLEWARE_CLASSES = () 20 | -------------------------------------------------------------------------------- /pupa/migrations/0005_auto_20170522_1935.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.10.5 on 2017-05-22 19:35 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ("pupa", "0004_identifier"), 12 | ] 13 | 14 | operations = [ 15 | migrations.AlterField( 16 | model_name="identifier", 17 | name="identifier", 18 | field=models.CharField(max_length=300), 19 | ), 20 | migrations.AlterField( 21 | model_name="identifier", 22 | name="object_id", 23 | field=models.CharField(max_length=300), 24 | ), 25 | ] 26 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pupa.cli.commands.update import override_settings 4 | 5 | 6 | class _Settings: 7 | pass 8 | 9 | 10 | @pytest.fixture 11 | def settings(): 12 | ret = _Settings() 13 | ret.foo = "bar" 14 | ret.baz = "bob" 15 | return ret 16 | 17 | 18 | def test_override_settings(settings): 19 | with override_settings(settings, {"baz": "fez"}): 20 | assert settings.foo == "bar" 21 | assert settings.baz == "fez" 22 | assert settings.foo == "bar" 23 | assert settings.baz == "bob" 24 | 25 | 26 | def test_override_settings_unset(settings): 27 | with override_settings(settings, {"qux": "fez"}): 28 | assert settings.qux == "fez" 29 | assert not hasattr(settings, "qux") 30 | -------------------------------------------------------------------------------- /pupa/importers/jurisdiction.py: 
-------------------------------------------------------------------------------- 1 | from opencivicdata.core.models import Jurisdiction 2 | from opencivicdata.legislative.models import LegislativeSession 3 | from .base import BaseImporter 4 | 5 | 6 | class JurisdictionImporter(BaseImporter): 7 | _type = "jurisdiction" 8 | model_class = Jurisdiction 9 | related_models = { 10 | "legislative_sessions": (LegislativeSession, "jurisdiction_id", {}) 11 | } 12 | merge_related = {"legislative_sessions": ["identifier"]} 13 | 14 | def get_object(self, data): 15 | return self.model_class.objects.get( 16 | division_id=data["division_id"], classification=data["classification"] 17 | ) 18 | 19 | def prepare_for_db(self, data): 20 | for s in data["legislative_sessions"]: 21 | s.pop("_scraped_name", None) 22 | return data 23 | -------------------------------------------------------------------------------- /pupa/migrations/0006_identifier_jurisdiction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.10.5 on 2017-06-15 14:07 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ("core", "0001_initial"), 13 | ("pupa", "0005_auto_20170522_1935"), 14 | ] 15 | 16 | operations = [ 17 | migrations.AddField( 18 | model_name="identifier", 19 | name="jurisdiction", 20 | field=models.ForeignKey( 21 | default="", 22 | on_delete=django.db.models.deletion.CASCADE, 23 | related_name="pupa_ids", 24 | to="core.Jurisdiction", 25 | ), 26 | preserve_default=False, 27 | ), 28 | ] 29 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/membership.py: -------------------------------------------------------------------------------- 1 | from .common import links, contact_details, extras, fuzzy_date_blank 2 | 3 
| schema = { 4 | "properties": { 5 | "label": {"type": "string"}, 6 | "role": {"type": "string"}, 7 | "person_id": {"type": ["string", "null"]}, 8 | "person_name": {"type": ["string"], "minLength": 1}, 9 | "organization_id": {"type": "string", "minLength": 1}, 10 | "post_id": {"type": ["string", "null"]}, 11 | "on_behalf_of_id": {"type": ["string", "null"]}, 12 | "start_date": fuzzy_date_blank, 13 | "end_date": fuzzy_date_blank, 14 | "contact_details": contact_details, 15 | "links": links, 16 | "extras": extras, 17 | # division & jurisdiction are additions to popolo 18 | "division_id": {"type": ["string", "null"]}, 19 | "jurisdiction_id": {"type": "string", "minLength": 1}, 20 | }, 21 | "type": "object", 22 | } 23 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/jurisdiction.py: -------------------------------------------------------------------------------- 1 | from .common import extras, fuzzy_date_blank 2 | 3 | schema = { 4 | "type": "object", 5 | "properties": { 6 | "name": {"type": "string", "minLength": 1}, 7 | "url": {"type": "string", "minLength": 1}, 8 | "classification": {"type": "string", "minLength": 1}, # TODO: enum 9 | "division_id": {"type": "string", "minLength": 1}, 10 | "legislative_sessions": { 11 | "type": "array", 12 | "items": { 13 | "type": "object", 14 | "properties": { 15 | "name": {"type": "string", "minLength": 1}, 16 | "type": {"type": "string", "enum": ["primary", "special"]}, 17 | "start_date": fuzzy_date_blank, 18 | "end_date": fuzzy_date_blank, 19 | }, 20 | }, 21 | }, 22 | "feature_flags": {"type": "array", "items": {"type": "string", "minLength": 1}}, 23 | "extras": extras, 24 | }, 25 | } 26 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/person.py: -------------------------------------------------------------------------------- 1 | from .common import ( 2 | links, 3 | contact_details, 4 | identifiers, 5 | other_names, 6 | 
sources, 7 | extras, 8 | fuzzy_date_blank, 9 | ) 10 | 11 | schema = { 12 | "properties": { 13 | "name": {"type": "string", "minLength": 1}, 14 | "other_names": other_names, 15 | "identifiers": identifiers, 16 | "sort_name": {"type": "string"}, 17 | "family_name": {"type": "string"}, 18 | "given_name": {"type": "string"}, 19 | "gender": {"type": "string"}, 20 | "birth_date": fuzzy_date_blank, 21 | "death_date": fuzzy_date_blank, 22 | "image": {"format": "uri-blank", "type": "string"}, 23 | "summary": {"type": "string"}, 24 | "biography": {"type": "string"}, 25 | "national_identity": {"type": "string"}, 26 | "contact_details": contact_details, 27 | "links": links, 28 | "sources": sources, 29 | "extras": extras, 30 | }, 31 | "type": "object", 32 | } 33 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/organization.py: -------------------------------------------------------------------------------- 1 | from .common import ( 2 | links, 3 | contact_details, 4 | identifiers, 5 | other_names, 6 | sources, 7 | extras, 8 | fuzzy_date_blank, 9 | ) 10 | from opencivicdata import common 11 | 12 | schema = { 13 | "properties": { 14 | "name": {"type": "string", "minLength": 1}, 15 | "other_names": other_names, 16 | "identifiers": identifiers, 17 | "classification": { 18 | "type": ["string", "null"], 19 | "enum": common.ORGANIZATION_CLASSIFICATIONS, 20 | }, 21 | "parent_id": { 22 | "type": ["string", "null"], 23 | }, 24 | "founding_date": fuzzy_date_blank, 25 | "dissolution_date": fuzzy_date_blank, 26 | "image": {"type": "string", "format": "uri-blank"}, 27 | "contact_details": contact_details, 28 | "links": links, 29 | "sources": sources, 30 | # added to popolo 31 | "jurisdiction_id": {"type": "string", "minLength": 1}, 32 | "division_id": {"type": ["string", "null"], "minLength": 1}, 33 | "extras": extras, 34 | }, 35 | "type": "object", 36 | } 37 | 
-------------------------------------------------------------------------------- /pupa/cli/commands/party.py: -------------------------------------------------------------------------------- 1 | import django 2 | from .base import BaseCommand 3 | from pupa.exceptions import CommandError 4 | 5 | 6 | class Command(BaseCommand): 7 | name = "party" 8 | help = "command line tool to manage parties" 9 | 10 | def add_args(self): 11 | self.add_argument("action", type=str, help="add|list") 12 | self.add_argument("party_name", type=str, nargs="?") 13 | 14 | def handle(self, args, other): 15 | django.setup() 16 | from opencivicdata.core.models import Organization 17 | 18 | if args.action == "add": 19 | o, created = Organization.objects.get_or_create( 20 | name=args.party_name, classification="party" 21 | ) 22 | if created: 23 | print("added {}".format(o)) 24 | else: 25 | print("{} already exists".format(o)) 26 | elif args.action == "list": 27 | for party in Organization.objects.filter(classification="party").order_by( 28 | "name" 29 | ): 30 | print(party.name) 31 | else: 32 | raise CommandError('party action must be "add" or "list"') 33 | -------------------------------------------------------------------------------- /pupa/importers/posts.py: -------------------------------------------------------------------------------- 1 | from opencivicdata.core.models import Post, PostContactDetail, PostLink 2 | from .base import BaseImporter 3 | 4 | 5 | class PostImporter(BaseImporter): 6 | _type = "post" 7 | model_class = Post 8 | related_models = { 9 | "contact_details": (PostContactDetail, "post_id", {}), 10 | "links": (PostLink, "post_id", {}), 11 | } 12 | 13 | def __init__(self, jurisdiction_id, org_importer): 14 | super(PostImporter, self).__init__(jurisdiction_id) 15 | self.org_importer = org_importer 16 | 17 | def prepare_for_db(self, data): 18 | data["organization_id"] = self.org_importer.resolve_json_id( 19 | data["organization_id"] 20 | ) 21 | return data 22 | 23 | def 
get_object(self, post): 24 | spec = { 25 | "organization_id": post["organization_id"], 26 | "label": post["label"], 27 | } 28 | 29 | if post["role"]: 30 | spec["role"] = post["role"] 31 | 32 | return self.model_class.objects.get(**spec) 33 | 34 | def limit_spec(self, spec): 35 | spec["organization__jurisdiction_id"] = self.jurisdiction_id 36 | return spec 37 | -------------------------------------------------------------------------------- /pupa/migrations/0004_identifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.10.5 on 2017-05-22 15:51 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ("contenttypes", "0002_remove_content_type_name"), 13 | ("pupa", "0003_auto_20151118_0408"), 14 | ] 15 | 16 | operations = [ 17 | migrations.CreateModel( 18 | name="Identifier", 19 | fields=[ 20 | ( 21 | "id", 22 | models.AutoField( 23 | auto_created=True, 24 | primary_key=True, 25 | serialize=False, 26 | verbose_name="ID", 27 | ), 28 | ), 29 | ("identifier", models.CharField(max_length=500)), 30 | ("object_id", models.PositiveIntegerField()), 31 | ( 32 | "content_type", 33 | models.ForeignKey( 34 | on_delete=django.db.models.deletion.CASCADE, 35 | to="contenttypes.ContentType", 36 | ), 37 | ), 38 | ], 39 | ), 40 | ] 41 | -------------------------------------------------------------------------------- /pupa/migrations/0003_auto_20151118_0408.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9b1 on 2015-11-18 04:08 3 | from __future__ import unicode_literals 4 | 5 | import datetime 6 | from django.db import migrations, models 7 | from django.utils.timezone import utc 8 | 9 | 10 | class Migration(migrations.Migration): 11 | 12 | 
dependencies = [ 13 | ("pupa", "0002_auto_20150906_1458"), 14 | ] 15 | 16 | operations = [ 17 | migrations.AddField( 18 | model_name="runplan", 19 | name="end_time", 20 | field=models.DateTimeField( 21 | default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc) 22 | ), 23 | preserve_default=False, 24 | ), 25 | migrations.AddField( 26 | model_name="runplan", 27 | name="exception", 28 | field=models.TextField(blank=True, default=""), 29 | ), 30 | migrations.AddField( 31 | model_name="runplan", 32 | name="start_time", 33 | field=models.DateTimeField( 34 | default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc) 35 | ), 36 | preserve_default=False, 37 | ), 38 | migrations.AddField( 39 | model_name="runplan", 40 | name="traceback", 41 | field=models.TextField(blank=True, default=""), 42 | ), 43 | ] 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-, Open Civic Data Contributors 2 | Copyright (c) 2014, Sunlight Foundation 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, 7 | are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, 10 | this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | * Neither the name of Open Civic Data nor the names of its contributors may be 15 | used to endorse or promote products derived from this software without 16 | specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 22 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /pupa/migrations/0002_auto_20150906_1458.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9.dev20150906080247 on 2015-09-06 14:58 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ("pupa", "0001_initial"), 13 | ] 14 | 15 | operations = [ 16 | migrations.AlterField( 17 | model_name="importobjects", 18 | name="report", 19 | field=models.ForeignKey( 20 | on_delete=django.db.models.deletion.CASCADE, 21 | related_name="imported_objects", 22 | to="pupa.RunPlan", 23 | ), 24 | ), 25 | migrations.AlterField( 26 | model_name="runplan", 27 | name="jurisdiction", 28 | field=models.ForeignKey( 29 | on_delete=django.db.models.deletion.CASCADE, 30 | related_name="runs", 31 | to="core.Jurisdiction", 32 | ), 33 | ), 34 | migrations.AlterField( 35 | model_name="scrapeobjects", 36 | name="report", 37 | field=models.ForeignKey( 38 | 
on_delete=django.db.models.deletion.CASCADE, 39 | related_name="scraped_objects", 40 | to="pupa.ScrapeReport", 41 | ), 42 | ), 43 | migrations.AlterField( 44 | model_name="scrapereport", 45 | name="plan", 46 | field=models.ForeignKey( 47 | on_delete=django.db.models.deletion.CASCADE, 48 | related_name="scrapers", 49 | to="pupa.RunPlan", 50 | ), 51 | ), 52 | ] 53 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/vote_event.py: -------------------------------------------------------------------------------- 1 | from .common import sources, extras, fuzzy_datetime_blank 2 | from opencivicdata import common 3 | 4 | 5 | schema = { 6 | "type": "object", 7 | "properties": { 8 | "identifier": {"type": "string"}, 9 | "motion_text": {"type": "string", "minLength": 1}, 10 | "motion_classification": { 11 | "items": {"type": "string", "minLength": 1}, 12 | "type": "array", 13 | }, 14 | "start_date": fuzzy_datetime_blank, 15 | "end_date": fuzzy_datetime_blank, 16 | "result": {"type": "string", "enum": common.VOTE_RESULTS}, 17 | "organization": {"type": ["string", "null"], "minLength": 1}, 18 | "legislative_session": {"type": "string", "minLength": 1}, 19 | "bill": {"type": ["string", "null"], "minLength": 1}, 20 | "bill_action": {"type": ["string", "null"], "minLength": 1}, 21 | "votes": { 22 | "items": { 23 | "type": "object", 24 | "properties": { 25 | "option": {"type": "string", "enum": common.VOTE_OPTIONS}, 26 | "voter_name": {"type": "string", "minLength": 1}, 27 | "voter_id": {"type": "string", "minLength": 1}, 28 | "note": {"type": "string"}, 29 | }, 30 | }, 31 | }, 32 | "counts": { 33 | "items": { 34 | "properties": { 35 | "option": {"type": "string", "enum": common.VOTE_OPTIONS}, 36 | "value": {"type": "integer", "minimum": 0}, 37 | }, 38 | "type": "object", 39 | }, 40 | }, 41 | "sources": sources, 42 | "extras": extras, 43 | "pupa_id": {"type": ["string", "null"], "minLength": 1}, 44 | }, 45 | } 46 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup, find_packages 3 | from pupa import __version__ 4 | 5 | long_description = '' 6 | 7 | setup(name='pupa', 8 | version=__version__, 9 | packages=find_packages(), 10 | author='James Turk', 11 | author_email='james@openstates.org', 12 | license='BSD', 13 | url='https://github.com/opencivicdata/pupa/', 14 | description='scraping framework for muncipal data', 15 | long_description=long_description, 16 | platforms=['any'], 17 | zip_safe=False, 18 | entry_points='''[console_scripts] 19 | pupa = pupa.cli.__main__:main''', 20 | install_requires=[ 21 | 'Django>=2.2,<5', 22 | 'opencivicdata>=3.3.0', 23 | 'dj_database_url>=0.3.0', 24 | 'scrapelib>=1.0', 25 | 'jsonschema>=3.0.0', # TODO: Drop alpha release once stable release available 26 | 'psycopg2-binary', 27 | 'pytz', 28 | ], 29 | extras_require={ 30 | 'dev': [ 31 | 'mock', 32 | 'pytest>=3.6', 33 | 'pytest-cov', 34 | 'pytest-django', 35 | 'freezegun', 36 | 'coveralls', 37 | 'coverage<=6.5.0', 38 | 'flake8', 39 | ], 40 | }, 41 | classifiers=["Development Status :: 4 - Beta", 42 | "Intended Audience :: Developers", 43 | "License :: OSI Approved :: BSD License", 44 | "Natural Language :: English", 45 | "Operating System :: OS Independent", 46 | "Programming Language :: Python :: 3.8", 47 | "Programming Language :: Python :: 3.9", 48 | "Programming Language :: Python :: 3.10", 49 | "Topic :: Software Development :: Libraries :: Python Modules", 50 | ], 51 | ) 52 | -------------------------------------------------------------------------------- /pupa/tests/update/test_importer_resolution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import types 3 | import pytest 4 | 5 | from django.utils.module_loading import import_string 6 | 7 | from 
pupa.cli.commands import update 8 | from pupa.exceptions import CommandError 9 | 10 | 11 | @pytest.fixture(params=[ 12 | "JurisdictionImporter", 13 | "OrganizationImporter", 14 | "PersonImporter", 15 | "PostImporter", 16 | "MembershipImporter", 17 | "BillImporter", 18 | "VoteEventImporter", 19 | "EventImporter", 20 | ]) 21 | def importer_test_case(request): 22 | return request.param 23 | 24 | 25 | @pytest.fixture 26 | def custom_importer(): 27 | """ 28 | Create a module object at runtime with a single class inside, and insert it 29 | into sys.modules so import_string() can load it by dotted path. 30 | """ 31 | module_name, class_name = ["tests.fixtures.custom_importers", "MyCustomImporter"] 32 | 33 | module = types.ModuleType("tests.fixtures.custom_importers") 34 | cls = type(class_name, (), {}) 35 | setattr(module, class_name, cls) 36 | sys.modules[module_name] = module 37 | 38 | return cls, module_name 39 | 40 | 41 | def test_resolve_custom_importer(custom_importer, settings, importer_test_case): 42 | cls, module_name = custom_importer 43 | settings.IMPORTER_CLASSES = {importer_test_case: f"{module_name}.{cls.__name__}"} 44 | resolved = update.resolve_importer(importer_test_case) 45 | assert resolved is cls 46 | 47 | 48 | def test_resolve_default_importer(importer_test_case): 49 | expected_importer = import_string(f"pupa.importers.{importer_test_case}") 50 | resolved_importer = update.resolve_importer(importer_test_case) 51 | assert resolved_importer is expected_importer 52 | 53 | 54 | def test_resolve_bad_path_raises_error(settings): 55 | settings.IMPORTER_CLASSES = {"PersonImporter": "non.existent.Path"} 56 | with pytest.raises(CommandError): 57 | update.resolve_importer("PersonImporter") 58 | -------------------------------------------------------------------------------- /.github/workflows/package.yml: -------------------------------------------------------------------------------- 1 | name: Test and build Python package 2 | 3 | on: 4 | push: 5 | branches: [ 
master ] 6 | tags: 7 | - v* 8 | pull_request: 9 | branches: [ master ] 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | services: 15 | postgres: 16 | image: postgis/postgis:10-2.5 17 | env: 18 | POSTGRES_USER: test 19 | POSTGRES_DB: test 20 | POSTGRES_PASSWORD: test 21 | options: >- 22 | --health-cmd pg_isready 23 | --health-interval 10s 24 | --health-timeout 5s 25 | --health-retries 5 26 | ports: 27 | - 5432:5432 28 | strategy: 29 | matrix: 30 | python-version: ['3.8', '3.9', '3.10'] 31 | django-series: ['2.2', '3.0'] 32 | steps: 33 | - uses: actions/checkout@v2 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v2 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | - name: Install dependencies 39 | run: | 40 | sudo apt update 41 | sudo apt install -y gdal-bin 42 | pip install .[dev] --pre Django==${{ matrix.django-series }} 43 | - name: Lint with flake8 44 | run: | 45 | flake8 pupa 46 | - name: Test with pytest 47 | run: | 48 | ./run-tests.sh 49 | - name: Calculate test coverage 50 | env: 51 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 52 | run: | 53 | coveralls --service=github 54 | 55 | build: 56 | needs: test 57 | name: Build package and upload to PyPI 58 | runs-on: ubuntu-latest 59 | steps: 60 | - uses: actions/checkout@v2 61 | - name: Build and publish 62 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 63 | env: 64 | TWINE_USERNAME: __token__ 65 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 66 | run: | 67 | pip install twine wheel 68 | pip wheel -w dist --no-deps . 
def convert_pdf(filename, type="xml"):
    """
    Convert a PDF file to another format using the poppler command-line tools.

    Parameters:
        filename: path to the PDF file on disk.
        type: one of "text", "text-nolayout", "xml", or "html".
              (Name shadows the builtin ``type``; kept for backward
              compatibility with existing keyword-argument callers.)

    Returns:
        The converted document as bytes (the helper process's stdout).

    Raises:
        KeyError: if ``type`` is not a supported conversion.
        EnvironmentError: if the conversion command cannot be executed,
            e.g. poppler-utils is not installed.
    """
    commands = {
        "text": ["pdftotext", "-layout", filename, "-"],
        "text-nolayout": ["pdftotext", filename, "-"],
        "xml": ["pdftohtml", "-xml", "-stdout", filename],
        "html": ["pdftohtml", "-stdout", filename],
    }
    try:
        pipe = subprocess.Popen(
            commands[type], stdout=subprocess.PIPE, close_fds=True
        ).stdout
    except OSError as e:
        # Bug fix: the format arguments must be a tuple. Previously only
        # the joined command string was %-applied (with two placeholders),
        # so building the message raised "not enough arguments for format
        # string" and hid the real error; ``e`` became a stray second
        # positional argument to EnvironmentError.
        raise EnvironmentError(
            "error running %s, missing executable? [%s]"
            % (" ".join(commands[type]), e)
        ) from e
    data = pipe.read()
    pipe.close()
    return data
to="legislative.LegislativeSession", 50 | ), 51 | ), 52 | ], 53 | ), 54 | ] 55 | -------------------------------------------------------------------------------- /pupa/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | 5 | import dj_database_url 6 | 7 | DATABASE_URL = os.environ.get( 8 | "DATABASE_URL", "postgis://pupa:pupa@localhost/opencivicdata" 9 | ) 10 | SECRET_KEY = "non-secret" 11 | INSTALLED_APPS = ( 12 | "django.contrib.contenttypes", 13 | "opencivicdata.core.apps.BaseConfig", 14 | "opencivicdata.legislative.apps.BaseConfig", 15 | "pupa", 16 | ) 17 | 18 | ALLOWED_HOSTS = ["localhost"] 19 | SILENCED_SYSTEM_CHECKS = ["fields.E904"] 20 | 21 | # scrape settings 22 | 23 | SCRAPELIB_RPM = 60 24 | SCRAPELIB_TIMEOUT = 60 25 | SCRAPELIB_RETRY_ATTEMPTS = 3 26 | SCRAPELIB_RETRY_WAIT_SECONDS = 10 27 | SCRAPELIB_VERIFY = True 28 | 29 | CACHE_DIR = os.path.join(os.getcwd(), "_cache") 30 | SCRAPED_DATA_DIR = os.path.join(os.getcwd(), "_data") 31 | 32 | # import settings 33 | 34 | ENABLE_PEOPLE_AND_ORGS = True 35 | ENABLE_BILLS = True 36 | ENABLE_VOTES = True 37 | ENABLE_EVENTS = True 38 | 39 | IMPORT_TRANSFORMERS = {"bill": []} 40 | 41 | # Django settings 42 | DEBUG = False 43 | TEMPLATE_DEBUG = False 44 | 45 | MIDDLEWARE_CLASSES = () 46 | LOGGING = { 47 | "version": 1, 48 | "disable_existing_loggers": False, 49 | "formatters": { 50 | "standard": { 51 | "format": "%(asctime)s %(levelname)s %(name)s: %(message)s", 52 | "datefmt": "%H:%M:%S", 53 | } 54 | }, 55 | "handlers": { 56 | "default": { 57 | "level": "DEBUG", 58 | "class": "pupa.ext.ansistrm.ColorizingStreamHandler", 59 | "formatter": "standard", 60 | }, 61 | }, 62 | "loggers": { 63 | "": {"handlers": ["default"], "level": "DEBUG", "propagate": True}, 64 | "scrapelib": {"handlers": ["default"], "level": "INFO", "propagate": False}, 65 | "requests": {"handlers": ["default"], "level": "WARN", "propagate": False}, 66 | 
"boto": {"handlers": ["default"], "level": "WARN", "propagate": False}, 67 | }, 68 | } 69 | 70 | 71 | sys.path.insert(1, os.getcwd()) 72 | loader = importlib.util.find_spec("pupa_settings") 73 | if loader is None: 74 | print("no pupa_settings on path, using defaults") 75 | else: 76 | from pupa_settings import * # NOQA 77 | 78 | 79 | DATABASES = {"default": dj_database_url.parse(DATABASE_URL)} 80 | DATABASES["default"]["ENGINE"] = "django.contrib.gis.db.backends.postgis" 81 | -------------------------------------------------------------------------------- /ARCHITECTURE.md: -------------------------------------------------------------------------------- 1 | ================= 2 | pupa architecture 3 | ================= 4 | 5 | pupa.cli 6 | ======== 7 | 8 | * dbinit - initializes a postgres database for use with pupa scrapers 9 | 10 | * init - initializes a local project directory ready for people to write scrapers 11 | 12 | * update - updates data, can be run with --scrape if desire is to examine data locally 13 | 14 | pupa.ext 15 | ======== 16 | 17 | Nothing here is particularly interesting architecturally, this is where a few vendorized files 18 | live. 
19 | 20 | pupa.scrape 21 | =========== 22 | 23 | scrape.Scraper - base class for all scrapers 24 | 25 | self.info 26 | self.debug 27 | self.warning 28 | self.error 29 | self.critical 30 | 31 | self.save_object(obj) - given a scrape object saves it to disk 32 | calls obj.pre_save(jid), obj.as_dict(), and obj.validate() 33 | 34 | self.do_scrape(**kwargs) - the workhorse of the scraper, runs a scrape by calling self.scrape() 35 | passed on all arbitrary args to scrape, which can use them for discrimination 36 | 37 | self.scrape(**kwargs) - the user-implemented method where the scraper should be implemented 38 | 39 | 40 | scrape.BaseBillScraper - special helper for bill scrapers 41 | 42 | ContinueScraping - exception that can be raised to skip a bill 43 | 44 | scrape() defined to call two functions 45 | get_bill_ids(**kwargs) - returns a list of (bill_id, extras) tuples 46 | get_bill(bill_id, **extras) - either gets a bill or raises a ContinueScraping 47 | 48 | 49 | scrape.BaseModel - base class for all scrape models 50 | _type - overriden to the type (???used where???) 51 | _schema - the schema dictionary to use in validate() 52 | 53 | self._id - defaults to a UUID 54 | self._related - list of related models 55 | self._meta - ???used??? 56 | self.extras = {} - dict of all irregular fields 57 | 58 | validate() - validates against _schema 59 | as_dict() - converts to a dict, only includes properties in the schema 60 | 61 | notes: 62 | setattr is overriden to avoid setting properties that will fail on save 63 | __eq__ is overriden (???used???) 
64 | 65 | 66 | scrape.SourceMixin, ContactDetailMixin, LinkMixin, AssociatedLinkMixin 67 | various mixins that add common fields and helper methods for each of these common attributes 68 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/common.py: -------------------------------------------------------------------------------- 1 | from opencivicdata import common 2 | 3 | contact_details = { 4 | "type": "array", 5 | "items": { 6 | "type": "object", 7 | "properties": { 8 | "type": {"type": "string", "enum": common.CONTACT_TYPES}, 9 | "value": {"type": "string", "minLength": 1}, 10 | "note": {"type": "string"}, 11 | "label": {"type": "string"}, 12 | }, 13 | }, 14 | } 15 | 16 | identifiers = { 17 | "items": { 18 | "properties": { 19 | "identifier": {"type": "string", "minLength": 1}, 20 | "scheme": {"type": "string"}, 21 | } 22 | }, 23 | "type": "array", 24 | } 25 | 26 | fuzzy_date_string = {"type": "string", "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"} 27 | fuzzy_date_string_blank = { 28 | "type": "string", 29 | "pattern": "^([0-9]{4})?(-[0-9]{2}){0,2}$", 30 | } 31 | fuzzy_datetime_string_blank = { 32 | "type": "string", 33 | "pattern": ( 34 | "^([0-9]{4}((-[0-9]{2}){0,2}|(-[0-9]{2}){2}T" 35 | "[0-9]{2}(:[0-9]{2}){0,2}" 36 | "(Z|[+-][0-9]{2}(:[0-9]{2})?))?)?$" 37 | ), 38 | } 39 | fuzzy_date = {"type": [fuzzy_date_string, "date"]} 40 | fuzzy_date_blank = {"type": [fuzzy_date_string_blank, "date"]} 41 | fuzzy_datetime = {"type": [fuzzy_datetime_string_blank, "datetime"]} 42 | fuzzy_datetime_blank = {"type": [fuzzy_datetime_string_blank, "datetime"]} 43 | 44 | other_names = { 45 | "items": { 46 | "properties": { 47 | "name": {"type": "string", "minLength": 1}, 48 | "start_date": fuzzy_date_blank, 49 | "end_date": fuzzy_date_blank, 50 | "note": {"type": "string"}, 51 | }, 52 | "type": "object", 53 | }, 54 | "type": "array", 55 | } 56 | 57 | 58 | links = { 59 | "items": { 60 | "properties": { 61 | "note": {"type": 
"string"}, 62 | "url": {"format": "uri", "type": "string"}, 63 | }, 64 | "type": "object", 65 | }, 66 | "type": "array", 67 | } 68 | 69 | 70 | sources = { 71 | "items": { 72 | "properties": { 73 | "url": {"type": "string", "format": "uri"}, 74 | "note": {"type": "string"}, 75 | }, 76 | "type": "object", 77 | }, 78 | "minItems": 1, 79 | "type": "array", 80 | } 81 | 82 | extras = { 83 | "type": "object", 84 | } 85 | -------------------------------------------------------------------------------- /pupa/scrape/jurisdiction.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel, Scraper 3 | from .schemas.jurisdiction import schema 4 | from .popolo import Organization 5 | 6 | 7 | class Jurisdiction(BaseModel): 8 | """Base class for a jurisdiction""" 9 | 10 | _type = "jurisdiction" 11 | _schema = schema 12 | 13 | # schema objects 14 | classification = None 15 | name = None 16 | url = None 17 | legislative_sessions = [] 18 | feature_flags = [] 19 | extras = {} 20 | 21 | # non-db properties 22 | scrapers = {} 23 | default_scrapers = None 24 | parties = [] 25 | ignored_scraped_sessions = [] 26 | 27 | def __init__(self): 28 | super(BaseModel, self).__init__() 29 | self._related = [] 30 | self.extras = {} 31 | 32 | @property 33 | def jurisdiction_id(self): 34 | return "{}/{}".format( 35 | self.division_id.replace("ocd-division", "ocd-jurisdiction"), 36 | self.classification, 37 | ) 38 | 39 | _id = jurisdiction_id 40 | 41 | def as_dict(self): 42 | return { 43 | "_id": self.jurisdiction_id, 44 | "id": self.jurisdiction_id, 45 | "name": self.name, 46 | "url": self.url, 47 | "division_id": self.division_id, 48 | "classification": self.classification, 49 | "legislative_sessions": self.legislative_sessions, 50 | "feature_flags": self.feature_flags, 51 | "extras": self.extras, 52 | } 53 | 54 | def __str__(self): 55 | return self.name 56 | 57 | def get_organizations(self): 58 | raise NotImplementedError( 
class JurisdictionScraper(Scraper):
    """Scraper that emits the Jurisdiction object itself, then its organizations."""

    def scrape(self):
        juris = self.jurisdiction

        # the jurisdiction object comes first
        yield juris

        # then every organization the jurisdiction defines
        yield from juris.get_organizations()

        # legacy path: parties declared directly on the jurisdiction
        if juris.parties:
            warnings.warn(
                "including parties on Jurisdiction is deprecated, "
                'use "pupa party" command instead'
            )
            for party in juris.parties:
                yield Organization(classification="party", name=party["name"])
def copy_tmp(tablename):
    """Snapshot a table's rows into a scratch table named ``tmp_<tablename>``."""
    print("copying data from table " + tablename)
    statements = (
        f"DROP TABLE IF EXISTS tmp_{tablename};",
        f"CREATE TABLE tmp_{tablename} (LIKE {tablename});",
        f"INSERT INTO tmp_{tablename} SELECT * FROM {tablename};",
    )
    cursor = connection.cursor()
    for sql in statements:
        cursor.execute(sql)
def drop_tables(skip_divisions=False):
    """Drop every opencivicdata_* / pupa_* table and erase their migration rows.

    NOTE(review): ``skip_divisions`` is accepted but never referenced in the
    body; presumably it was meant to preserve the division tables during a
    partial reset — confirm intent before relying on it.
    """
    tables = connection.introspection.table_names()
    cursor = connection.cursor()
    for table in tables:
        # only touch tables owned by the OCD/pupa apps; leave Django's own alone
        if table.startswith(("opencivicdata_", "pupa_")):
            print("dropping table " + table)
            cursor.execute("DROP TABLE IF EXISTS {} CASCADE;".format(table))
    # clear the migration records so a subsequent `migrate` recreates the
    # dropped tables from scratch
    cursor.execute("DELETE FROM django_migrations WHERE app='core';")
    cursor.execute("DELETE FROM django_migrations WHERE app='legislative';")
    cursor.execute("DELETE FROM django_migrations WHERE app='pupa';")
def main():
    """Entry point for the ``pupa`` command line interface.

    Builds the argument parser, dynamically loads each subcommand module,
    configures logging (and optionally a post-mortem debugger), then
    dispatches to the chosen subcommand. Exits with status 1 on
    CommandError or an unavailable subcommand.
    """
    parser = argparse.ArgumentParser("pupa", description="pupa CLI")
    parser.add_argument("--debug", action="store_true", help="open debugger on error")
    parser.add_argument(
        "--loglevel",
        default="INFO",
        help=(
            "set log level. options are: "
            "DEBUG|INFO|WARNING|ERROR|CRITICAL "
            "(default is INFO)"
        ),
    )
    subparsers = parser.add_subparsers(dest="subcommand")

    # configure Django before model imports
    if os.environ.get("DJANGO_SETTINGS_MODULE") is None:
        os.environ["DJANGO_SETTINGS_MODULE"] = "pupa.settings"

    subcommands = {}
    for mod in COMMAND_MODULES:
        try:
            cmd = importlib.import_module(mod).Command(subparsers)
            subcommands[cmd.name] = cmd
        except ImportError as e:
            logger.error('exception "%s" prevented loading of %s module', e, mod)

    # process args
    args, other = parser.parse_known_args()

    # set log level from command line; fall back to logging.INFO (the int
    # constant, not the string "INFO") so handler_level is consistently a
    # level value even when an unrecognized name is supplied
    handler_level = getattr(logging, args.loglevel.upper(), logging.INFO)
    settings.LOGGING["handlers"]["default"]["level"] = handler_level
    logging.config.dictConfig(settings.LOGGING)

    # turn debug on
    if args.debug:
        try:
            debug_module = importlib.import_module("ipdb")
        except ImportError:
            debug_module = importlib.import_module("pdb")

        # turn on PDB-on-error mode
        # stolen from http://stackoverflow.com/questions/1237379/
        # if this causes problems in interactive mode check that page
        def _tb_info(type, value, tb):
            traceback.print_exception(type, value, tb)
            debug_module.pm()

        sys.excepthook = _tb_info

    if not args.subcommand:
        parser.print_help()
    elif args.subcommand not in subcommands:
        # the module providing this subcommand failed to import above;
        # exit cleanly instead of raising a bare KeyError
        logger.critical("subcommand %r is unavailable", args.subcommand)
        sys.exit(1)
    else:
        try:
            subcommands[args.subcommand].handle(args, other)
        except CommandError as e:
            logger.critical(str(e))
            sys.exit(1)
| 55 | def __init__(self, data, obj, data_sources=None): 56 | super(DuplicateItemError, self).__init__( 57 | "attempt to import data that would conflict with " 58 | "data already in the import: {} " 59 | "(already imported as {})\n" 60 | "obj1 sources: {}\nobj2 sources: {}".format( 61 | data, 62 | obj, 63 | list( 64 | obj.sources.values_list("url", flat=True) 65 | if hasattr(obj, "sources") 66 | else [] 67 | ), 68 | [s["url"] for s in data_sources or []], 69 | ) 70 | ) 71 | 72 | 73 | class UnresolvedIdError(DataImportError): 74 | """Attempt was made to resolve an id that has no result.""" 75 | 76 | 77 | # scrape-related errors 78 | 79 | 80 | class ScrapeError(PupaError): 81 | """A generic error related to the scrape process.""" 82 | 83 | 84 | class ScrapeValueError(PupaError, ValueError): 85 | """An invalid value was passed to a pupa scrape object.""" 86 | -------------------------------------------------------------------------------- /pupa/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from . 
class ScrapeReportInline(admin.TabularInline):
    # Read-only inline showing each scraper run's report on the RunPlan
    # admin page; rows are created by pupa itself, never through the admin.
    model = models.ScrapeReport
    readonly_fields = ("scraper", "args", "start_time", "end_time", "get_object_list")

    def has_add_permission(self, request):
        # reports are written by the update process, not by admin users
        return False

    can_delete = False

    def get_object_list(self, obj):
        """Render one "object_type (count)" line per scraped-object row."""
        return "\n".join(
            "{} ({})".format(o.object_type, o.count) for o in obj.scraped_objects.all()
        )
"bills_missing_actions", 85 | "bills_missing_sponsors", 86 | "bills_missing_versions", 87 | "votes_missing_voters", 88 | "votes_missing_bill", 89 | "votes_missing_yes_count", 90 | "votes_missing_no_count", 91 | "votes_with_bad_counts", 92 | ) 93 | list_filter = ("legislative_session__jurisdiction__name",) 94 | 95 | def jurisdiction_name(self, obj): 96 | return obj.legislative_session.jurisdiction.name 97 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_jurisdiction_importer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.scrape import Jurisdiction as JurisdictionBase 3 | from pupa.importers import JurisdictionImporter 4 | from opencivicdata.core.models import Jurisdiction, Division 5 | from opencivicdata.legislative.models import LegislativeSession 6 | 7 | 8 | class FakeJurisdiction(JurisdictionBase): 9 | division_id = "ocd-division/country:us" 10 | name = "test" 11 | url = "http://example.com" 12 | classification = "government" 13 | 14 | legislative_sessions = [ 15 | {"identifier": "2015", "name": "2015 Regular Session"}, 16 | {"identifier": "2016", "name": "2016 Regular Session"}, 17 | ] 18 | 19 | 20 | @pytest.mark.django_db 21 | def test_jurisdiction_import(): 22 | Division.objects.create(id="ocd-division/country:us", name="USA") 23 | tj = FakeJurisdiction() 24 | juris_dict = tj.as_dict() 25 | JurisdictionImporter("jurisdiction-id").import_data([juris_dict]) 26 | 27 | dbj = Jurisdiction.objects.get() 28 | assert dbj.id == tj.jurisdiction_id 29 | assert dbj.division_id == tj.division_id 30 | assert dbj.name == tj.name 31 | assert dbj.url == tj.url 32 | 33 | 34 | @pytest.mark.django_db 35 | def test_jurisdiction_update(): 36 | Division.objects.create(id="ocd-division/country:us", name="USA") 37 | tj = FakeJurisdiction() 38 | ji = JurisdictionImporter("jurisdiction-id") 39 | _, what = ji.import_item(tj.as_dict()) 40 | assert what == 
@pytest.mark.django_db
def test_jurisdiction_merge_related():
    """Legislative sessions merge additively: re-imports update and add
    sessions but never delete ones missing from the incoming data
    (deletion could cascade away bills)."""
    Division.objects.create(id="ocd-division/country:us", name="USA")
    # need to ensure legislative_sessions don't get deleted
    ji = JurisdictionImporter("jurisdiction-id")
    tj = FakeJurisdiction()
    # Bug fix: legislative_sessions is a *class* attribute on
    # FakeJurisdiction; pop()/append() below mutated it in place and
    # leaked state into every other test using the fixture class.
    # Give this instance its own copy first.
    tj.legislative_sessions = list(FakeJurisdiction.legislative_sessions)
    ji.import_item(tj.as_dict())

    assert LegislativeSession.objects.count() == 2

    # disallow deletion of legislative sessions as it can remove bills
    tj.legislative_sessions.pop()
    ji.import_item(tj.as_dict())

    # should still have two
    assert LegislativeSession.objects.count() == 2

    # now will have three
    tj.legislative_sessions.append({"identifier": "2017", "name": "2017 Session"})
    ji.import_item(tj.as_dict())
    assert LegislativeSession.objects.count() == 3

    # and test that the non-identifier fields actually update
    tj.legislative_sessions.append({"identifier": "2016", "name": "updated"})
    ji.import_item(tj.as_dict())
    assert LegislativeSession.objects.count() == 3
    assert LegislativeSession.objects.get(identifier="2016").name == "updated"
    def get_object(self, membership):
        """Find the existing Membership row matching this scraped membership.

        Builds an exact-match query spec from the identifying fields and
        returns ``self.model_class.objects.get(**spec)`` — Django's ``get``
        raises DoesNotExist on a miss (presumably treated as an insert by
        the base importer — confirm against BaseImporter).
        """
        spec = {
            "organization_id": membership["organization_id"],
            "person_id": membership["person_id"],
            "label": membership["label"],
            "role": membership["role"],
        }

        # post_id is optional - might exist in DB but not scraped here?
        if membership["post_id"]:
            spec["post_id"] = membership["post_id"]

        if membership["person_name"]:
            spec["person_name"] = membership["person_name"]

        if membership["start_date"]:
            spec["start_date"] = membership["start_date"]
        else:
            # if this is a historical role, only update historical roles
            spec["end_date"] = membership["end_date"]

        return self.model_class.objects.get(**spec)
person 68 | self.seen_person_ids.add(data["person_id"]) 69 | return data 70 | 71 | def postimport(self): 72 | person_ids = ( 73 | set(self.person_importer.json_to_db_id.values()) - self.seen_person_ids 74 | ) 75 | if person_ids: 76 | reverse_id_dict = { 77 | v: k for k, v in self.person_importer.json_to_db_id.items() 78 | } 79 | person_ids = [reverse_id_dict[pid] for pid in person_ids] 80 | raise NoMembershipsError(person_ids) 81 | -------------------------------------------------------------------------------- /pupa/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.contrib.postgres.fields import JSONField 3 | from django.contrib.contenttypes.fields import GenericForeignKey 4 | from django.contrib.contenttypes.models import ContentType 5 | from opencivicdata.core.models import Jurisdiction 6 | from opencivicdata.legislative.models import LegislativeSession 7 | 8 | 9 | OBJECT_TYPES = ( 10 | ("jurisdiction", "Jurisdiction"), 11 | ("person", "Person"), 12 | ("organization", "Organization"), 13 | ("post", "Post"), 14 | ("membership", "Membership"), 15 | ("bill", "Bill"), 16 | ("vote_event", "VoteEvent"), 17 | ("event", "Event"), 18 | ) 19 | 20 | 21 | class RunPlan(models.Model): 22 | jurisdiction = models.ForeignKey( 23 | Jurisdiction, related_name="runs", on_delete=models.CASCADE 24 | ) 25 | success = models.BooleanField(default=True) 26 | start_time = models.DateTimeField() 27 | end_time = models.DateTimeField() 28 | exception = models.TextField(blank=True, default="") 29 | traceback = models.TextField(blank=True, default="") 30 | 31 | 32 | class ScrapeReport(models.Model): 33 | plan = models.ForeignKey(RunPlan, related_name="scrapers", on_delete=models.CASCADE) 34 | scraper = models.CharField(max_length=300) 35 | args = models.CharField(max_length=300) 36 | start_time = models.DateTimeField() 37 | end_time = models.DateTimeField() 38 | 39 | 40 | class 
ScrapeObjects(models.Model):
    # per-object-type scrape counts attached to a ScrapeReport
    report = models.ForeignKey(
        ScrapeReport, related_name="scraped_objects", on_delete=models.CASCADE
    )
    object_type = models.CharField(max_length=20, choices=OBJECT_TYPES)
    count = models.PositiveIntegerField()


class ImportObjects(models.Model):
    # per-object-type insert/update/noop counts for an entire RunPlan
    report = models.ForeignKey(
        RunPlan, related_name="imported_objects", on_delete=models.CASCADE
    )
    object_type = models.CharField(max_length=20, choices=OBJECT_TYPES)
    insert_count = models.PositiveIntegerField()
    update_count = models.PositiveIntegerField()
    noop_count = models.PositiveIntegerField()
    start_time = models.DateTimeField()
    end_time = models.DateTimeField()


class Identifier(models.Model):
    # generic external identifier attached to any object via contenttypes
    identifier = models.CharField(max_length=300)
    jurisdiction = models.ForeignKey(
        Jurisdiction,
        related_name="pupa_ids",
        on_delete=models.CASCADE,
    )
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    object_id = models.CharField(max_length=300)
    content_object = GenericForeignKey("content_type", "object_id")

    def __str__(self):  # __unicode__ on Python 2
        return self.identifier


class SessionDataQualityReport(models.Model):
    # counters produced by pupa.reports.session.generate_session_report
    legislative_session = models.ForeignKey(
        LegislativeSession, on_delete=models.CASCADE
    )

    bills_missing_actions = models.PositiveIntegerField()
    bills_missing_sponsors = models.PositiveIntegerField()
    bills_missing_versions = models.PositiveIntegerField()

    votes_missing_voters = models.PositiveIntegerField()
    votes_missing_bill = models.PositiveIntegerField()
    votes_missing_yes_count = models.PositiveIntegerField()
    votes_missing_no_count = models.PositiveIntegerField()
    votes_with_bad_counts = models.PositiveIntegerField()

    # these fields store lists of names mapped to numbers of occurances
    unmatched_sponsor_people = JSONField()
    unmatched_sponsor_organizations = JSONField()
    unmatched_voters = JSONField()


# --- pupa/importers/people.py ---
from collections import defaultdict
from django.db.models import Q
from opencivicdata.core.models import (
    Person,
    PersonIdentifier,
    PersonName,
    PersonContactDetail,
    PersonLink,
    PersonSource,
)
from .base import BaseImporter
from ..exceptions import SameNameError


class PersonImporter(BaseImporter):
    _type = "person"
    model_class = Person
    # child models: field name -> (model, FK back to Person, nested related)
    related_models = {
        "identifiers": (PersonIdentifier, "person_id", {}),
        "other_names": (PersonName, "person_id", {}),
        "contact_details": (PersonContactDetail, "person_id", {}),
        "links": (PersonLink, "person_id", {}),
        "sources": (PersonSource, "person_id", {}),
    }

    def _prepare_imports(self, dicts):
        """Prepare imports, raising SameNameError for ambiguous duplicates."""
        dicts = list(super(PersonImporter, self)._prepare_imports(dicts))

        # index every incoming person under each name they are known by
        by_name = defaultdict(list)
        for _, person in dicts:
            by_name[person["name"]].append(person)
            for other in person["other_names"]:
                by_name[other["name"]].append(person)

        # check for duplicates
        for name, people in by_name.items():
            if len(people) > 1:
                for person in people:
                    # birth_date is the only accepted disambiguator; an empty
                    # one makes the shared name unresolvable
                    if person["birth_date"] == "":
                        raise SameNameError(name)

        return dicts

    def limit_spec(self, spec):
        """
        Whenever we do a Pseudo ID lookup from the database, we need to limit
        based on the memberships -> organization -> jurisdiction, so we scope
        the resolution.
49 | """ 50 | if list(spec.keys()) == ["name"]: 51 | # if we're just resolving on name, include other names and family name 52 | name = spec["name"] 53 | return (Q(name=name) | Q(other_names__name=name) | Q(family_name=name)) & Q( 54 | memberships__organization__jurisdiction_id=self.jurisdiction_id 55 | ) 56 | spec["memberships__organization__jurisdiction_id"] = self.jurisdiction_id 57 | return spec 58 | 59 | def get_object(self, person): 60 | all_names = [person["name"]] + [o["name"] for o in person["other_names"]] 61 | 62 | matches = list( 63 | self.model_class.objects.filter( 64 | Q(memberships__organization__jurisdiction_id=self.jurisdiction_id), 65 | (Q(name__in=all_names) | Q(other_names__name__in=all_names)), 66 | ).distinct("id") 67 | ) 68 | 69 | matches_length = len(matches) 70 | if matches_length == 1 and not matches[0].birth_date: 71 | return matches[0] 72 | elif matches_length == 0: 73 | raise self.model_class.DoesNotExist( 74 | "No Person: {} in {}".format(all_names, self.jurisdiction_id) 75 | ) 76 | else: 77 | # Try and match based on birth_date. 78 | if person["birth_date"]: 79 | for match in matches: 80 | if ( 81 | person["birth_date"] 82 | and match.birth_date == person["birth_date"] 83 | ): 84 | return match 85 | 86 | # If we got here, no match based on birth_date, a new person? 
87 | raise self.model_class.DoesNotExist( 88 | "No Person: {} in {} with birth_date {}".format( 89 | all_names, self.jurisdiction_id, person["birth_date"] 90 | ) 91 | ) 92 | 93 | raise SameNameError(person["name"]) 94 | -------------------------------------------------------------------------------- /pupa/reports/session.py: -------------------------------------------------------------------------------- 1 | from django.db.models import Count, Subquery, OuterRef, Q, F 2 | from opencivicdata.legislative.models import ( 3 | Bill, 4 | VoteEvent, 5 | VoteCount, 6 | PersonVote, 7 | BillSponsorship, 8 | ) 9 | from ..models import SessionDataQualityReport 10 | 11 | 12 | def _simple_count(ModelCls, session, **filter): 13 | return ( 14 | ModelCls.objects.filter(legislative_session_id=session).filter(**filter).count() 15 | ) 16 | 17 | 18 | def generate_session_report(session): 19 | report = { 20 | "bills_missing_actions": _simple_count(Bill, session, actions__isnull=True), 21 | "bills_missing_sponsors": _simple_count( 22 | Bill, session, sponsorships__isnull=True 23 | ), 24 | "bills_missing_versions": _simple_count(Bill, session, versions__isnull=True), 25 | "votes_missing_bill": _simple_count(VoteEvent, session, bill__isnull=True), 26 | "votes_missing_voters": _simple_count(VoteEvent, session, votes__isnull=True), 27 | "votes_missing_yes_count": 0, 28 | "votes_missing_no_count": 0, 29 | "votes_with_bad_counts": 0, 30 | } 31 | 32 | voteevents = VoteEvent.objects.filter(legislative_session_id=session) 33 | queryset = voteevents.annotate( 34 | yes_sum=Count("pk", filter=Q(votes__option="yes")), 35 | no_sum=Count("pk", filter=Q(votes__option="no")), 36 | other_sum=Count("pk", filter=Q(votes__option="other")), 37 | yes_count=Subquery( 38 | VoteCount.objects.filter(vote_event=OuterRef("pk"), option="yes").values( 39 | "value" 40 | ) 41 | ), 42 | no_count=Subquery( 43 | VoteCount.objects.filter(vote_event=OuterRef("pk"), option="no").values( 44 | "value" 45 | ) 46 | ), 47 | 
other_count=Subquery( 48 | VoteCount.objects.filter(vote_event=OuterRef("pk"), option="other").values( 49 | "value" 50 | ) 51 | ), 52 | ) 53 | 54 | for vote in queryset: 55 | if vote.yes_count is None: 56 | report["votes_missing_yes_count"] += 1 57 | vote.yes_count = 0 58 | if vote.no_count is None: 59 | report["votes_missing_no_count"] += 1 60 | vote.no_count = 0 61 | if vote.other_count is None: 62 | vote.other_count = 0 63 | if ( 64 | vote.yes_sum != vote.yes_count 65 | or vote.no_sum != vote.no_count 66 | or vote.other_sum != vote.other_count 67 | ): 68 | report["votes_with_bad_counts"] += 1 69 | 70 | # handle unmatched 71 | queryset = ( 72 | BillSponsorship.objects.filter( 73 | bill__legislative_session_id=session, entity_type="person", person_id=None 74 | ) 75 | .values("name") 76 | .annotate(num=Count("name")) 77 | ) 78 | report["unmatched_sponsor_people"] = { 79 | item["name"]: item["num"] for item in queryset 80 | } 81 | queryset = ( 82 | BillSponsorship.objects.filter( 83 | bill__legislative_session_id=session, 84 | entity_type="organization", 85 | person_id=None, 86 | ) 87 | .values("name") 88 | .annotate(num=Count("name")) 89 | ) 90 | report["unmatched_sponsor_organizations"] = { 91 | item["name"]: item["num"] for item in queryset 92 | } 93 | queryset = ( 94 | PersonVote.objects.filter( 95 | vote_event__legislative_session_id=session, voter__isnull=True 96 | ) 97 | .values(name=F("voter_name")) 98 | .annotate(num=Count("voter_name")) 99 | ) 100 | report["unmatched_voters"] = {item["name"]: item["num"] for item in queryset} 101 | 102 | return SessionDataQualityReport(legislative_session_id=session, **report) 103 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_scraper.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import pytest 3 | from pupa.scrape import Person, Organization, Bill, Jurisdiction 4 | from pupa.scrape.base import Scraper, 
ScrapeError, BaseBillScraper

# Unit tests for Scraper.save_object / do_scrape and BaseBillScraper,
# with json.dump mocked so nothing is written to disk.


class FakeJurisdiction(Jurisdiction):
    # minimal jurisdiction stub shared by all tests below
    jurisdiction_id = "jurisdiction"


juris = FakeJurisdiction()


def test_save_object_basics():
    # ensure that save object dumps a file
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    p.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    # ensure object is saved in right place
    filename = "person_" + p._id + ".json"
    assert filename in s.output_names["person"]
    json_dump.assert_called_once_with(p.as_dict(), mock.ANY, cls=mock.ANY)


def test_save_object_invalid():
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    # no source, won't validate

    with pytest.raises(ValueError):
        s.save_object(p)


def test_save_related():
    # related objects appended to _related are saved alongside the parent
    s = Scraper(juris, "/tmp/")
    p = Person("Michael Jordan")
    p.add_source("http://example.com")
    o = Organization("Chicago Bulls", classification="committee")
    o.add_source("http://example.com")
    p._related.append(o)

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    assert json_dump.mock_calls == [
        mock.call(p.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(o.as_dict(), mock.ANY, cls=mock.ANY),
    ]


def test_simple_scrape():
    class FakeScraper(Scraper):
        def scrape(self):
            p = Person("Michael Jordan")
            p.add_source("http://example.com")
            yield p

    with mock.patch("json.dump") as json_dump:
        record = FakeScraper(juris, "/tmp/").do_scrape()

    assert len(json_dump.mock_calls) == 1
    assert record["objects"]["person"] == 1
    assert record["end"] > record["start"]
    assert record["skipped"] == 0


def test_double_iter():
    """tests that scrapers that yield iterables work OK"""

    class IterScraper(Scraper):
        def scrape(self):
            yield self.scrape_people()

        def scrape_people(self):
            p = Person("Michael Jordan")
            p.add_source("http://example.com")
            yield p

    with mock.patch("json.dump") as json_dump:
        record = IterScraper(juris, "/tmp/").do_scrape()

    assert len(json_dump.mock_calls) == 1
    assert record["objects"]["person"] == 1


def test_no_objects():
    # a scrape that yields nothing is an error
    class NullScraper(Scraper):
        def scrape(self):
            pass

    with pytest.raises(ScrapeError):
        NullScraper(juris, "/tmp/", fastmode=True).do_scrape()


def test_no_scrape():
    # scrape() must be overridden
    class NonScraper(Scraper):
        pass

    with pytest.raises(NotImplementedError):
        NonScraper(juris, "/tmp/").do_scrape()


def test_bill_scraper():
    # ContinueScraping skips a bill and increments the skipped counter
    class BillScraper(BaseBillScraper):
        def get_bill_ids(self):
            yield "1", {"extra": "param"}
            yield "2", {}

        def get_bill(self, bill_id, **kwargs):
            if bill_id == "1":
                assert kwargs == {"extra": "param"}
                raise self.ContinueScraping
            else:
                assert bill_id == "2"
                assert kwargs == {}
                b = Bill("1", self.legislative_session, "title")
                b.add_source("http://example.com")
                return b

    bs = BillScraper(juris, "/tmp/")
    with mock.patch("json.dump") as json_dump:
        record = bs.do_scrape(legislative_session="2020")

    assert len(json_dump.mock_calls) == 1
    assert record["objects"]["bill"] == 1
    assert record["skipped"] == 1


# --- pupa/cli/commands/clean.py ---
from datetime import datetime, timezone, timedelta
import sys

import django
from django.apps import apps
from .base import BaseCommand


def get_subclasses(app_list, abstract_class):
    """
    Finds and returns all subclasses of an abstract class.
12 | """ 13 | result = [] 14 | for app in app_list: 15 | for model in apps.get_app_config(app).get_models(): 16 | if issubclass(model, abstract_class) and model is not abstract_class: 17 | result.append(model) 18 | return result 19 | 20 | 21 | class Command(BaseCommand): 22 | name = "clean" 23 | help = "Removes database objects that haven't been seen in recent scrapes" 24 | 25 | def add_args(self): 26 | self.add_argument( 27 | "--window", 28 | type=int, 29 | default=7, 30 | help=( 31 | "objects not seen in this many days will be deleted from the database" 32 | ), 33 | ) 34 | self.add_argument( 35 | "--max", 36 | type=int, 37 | default=10, 38 | help="max number of objects to delete without triggering failsafe", 39 | ) 40 | self.add_argument( 41 | "--report", 42 | action="store_true", 43 | help=( 44 | "generate a report of what objects this command" 45 | " would delete without making any changes to the database" 46 | ), 47 | ) 48 | self.add_argument( 49 | "--yes", 50 | action="store_true", 51 | help="assumes an answer of 'yes' to all interactive prompts", 52 | default=False, 53 | ) 54 | 55 | def get_stale_objects(self, window): 56 | """ 57 | Find all database objects that haven't seen been in {window} days. 58 | """ 59 | 60 | from opencivicdata.core.models.base import OCDBase 61 | 62 | ocd_apps = ["core", "legislative"] 63 | # Check all subclasses of OCDBase 64 | models = get_subclasses(ocd_apps, OCDBase) 65 | # Top-level models are protected from deletion 66 | protected_models = ("Division", "Jurisdiction", "Post") 67 | 68 | for model in models: 69 | if model.__name__ not in protected_models: 70 | cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=window) 71 | yield from model.objects.filter(last_seen__lte=cutoff_date).iterator() 72 | 73 | def remove_stale_objects(self, window): 74 | """ 75 | Remove all database objects that haven't seen been in {window} days. 
76 | """ 77 | 78 | for obj in self.get_stale_objects(window): 79 | print(f"Deleting {obj}...") 80 | obj.delete() 81 | 82 | def report_stale_objects(self, window): 83 | """ 84 | Print all database objects that haven't seen been in {window} days. 85 | """ 86 | for obj in self.get_stale_objects(window): 87 | print(obj) 88 | 89 | def handle(self, args, other): 90 | django.setup() 91 | 92 | if args.report: 93 | print( 94 | "These objects have not been seen in a scrape within the last" 95 | f" {args.window} days:" 96 | ) 97 | self.report_stale_objects(args.window) 98 | else: 99 | stale_objects = list(self.get_stale_objects(args.window)) 100 | num_stale_objects = len(stale_objects) 101 | 102 | print( 103 | f"{num_stale_objects} objects in your database have not been seen " 104 | f"in {args.window} days." 105 | ) 106 | 107 | if num_stale_objects > args.max: 108 | print( 109 | f"{num_stale_objects} exceeds the failsafe limit of {args.max}. " 110 | "Run this command with a larger --max value to proceed." 111 | ) 112 | sys.exit() 113 | 114 | if args.yes: 115 | print("Proceeding to deletion because you specified --yes.") 116 | 117 | else: 118 | print(f"Permanently delete {num_stale_objects} objects? 
[Y/n]") 119 | response = input() 120 | 121 | if args.yes or response == "Y": 122 | self.remove_stale_objects(args.window) 123 | print(f"Removed {num_stale_objects} from your database.") 124 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_post_importer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.scrape import Post as ScrapePost 3 | from pupa.importers import PostImporter, OrganizationImporter 4 | from opencivicdata.core.models import Organization, Post, Division, Jurisdiction 5 | import datetime 6 | 7 | 8 | def create_jurisdictions(): 9 | Division.objects.create(id="ocd-division/country:us", name="USA") 10 | Division.objects.create(id="ocd-division/country:us/state:nc", name="NC") 11 | Jurisdiction.objects.create(id="us", division_id="ocd-division/country:us") 12 | Jurisdiction.objects.create(id="nc", division_id="ocd-division/country:us/state:nc") 13 | 14 | 15 | @pytest.mark.django_db 16 | def test_full_post(): 17 | create_jurisdictions() 18 | org = Organization.objects.create( 19 | name="United States Executive Branch", 20 | classification="executive", 21 | jurisdiction_id="us", 22 | ) 23 | post = ScrapePost( 24 | label="executive", 25 | role="President", 26 | organization_id='~{"classification": "executive"}', 27 | start_date=datetime.date(2015, 5, 18), 28 | end_date="2015-05-19", 29 | maximum_memberships=2, 30 | ) 31 | post.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") 32 | post.add_link("http://example.com/link") 33 | 34 | # import post 35 | oi = OrganizationImporter("us") 36 | PostImporter("jurisdiction-id", oi).import_data([post.as_dict()]) 37 | print(post.as_dict()) 38 | 39 | # get person from db and assert it imported correctly 40 | p = Post.objects.get() 41 | assert "ocd-post" in p.id 42 | assert p.label == post.label 43 | assert p.role == post.role 44 | assert p.organization_id == org.id 45 | 
assert p.maximum_memberships == 2 46 | 47 | assert p.contact_details.all()[0].type == "phone" 48 | assert p.contact_details.all()[0].value == "555-555-1234" 49 | assert p.contact_details.all()[0].note == "this is fake" 50 | 51 | assert p.links.all()[0].url == "http://example.com/link" 52 | 53 | assert p.start_date == "2015-05-18" 54 | assert p.end_date == "2015-05-19" 55 | 56 | 57 | @pytest.mark.django_db 58 | def test_deduplication(): 59 | create_jurisdictions() 60 | Organization.objects.create( 61 | id="us", 62 | name="United States Executive Branch", 63 | classification="executive", 64 | jurisdiction_id="us", 65 | ) 66 | Organization.objects.create( 67 | id="nc", 68 | name="North Carolina Executive Branch", 69 | classification="executive", 70 | jurisdiction_id="nc", 71 | ) 72 | pres = ScrapePost( 73 | label="executive", 74 | role="President", 75 | organization_id='~{"classification": "executive"}', 76 | ) 77 | vp = ScrapePost( 78 | label="vice-executive", 79 | role="Vice President", 80 | organization_id='~{"classification": "executive"}', 81 | ) 82 | gov = ScrapePost( 83 | label="executive", 84 | role="Governor", 85 | organization_id='~{"classification": "executive"}', 86 | ) 87 | 88 | # ensure pres, vp and gov are all imported 89 | # pres & gov - same label, different jurisdiction 90 | # vp & pres - same jurisdiction, different label 91 | us_oi = OrganizationImporter("us") 92 | nc_oi = OrganizationImporter("nc") 93 | PostImporter("us", us_oi).import_data([pres.as_dict(), vp.as_dict()]) 94 | PostImporter("nc", nc_oi).import_data([gov.as_dict()]) 95 | assert Post.objects.count() == 3 96 | 97 | 98 | @pytest.mark.django_db 99 | def test_resolve_special_json_id(): 100 | create_jurisdictions() 101 | Organization.objects.create( 102 | id="us", 103 | name="United States Executive Branch", 104 | classification="executive", 105 | jurisdiction_id="us", 106 | ) 107 | Organization.objects.create( 108 | id="nc", 109 | name="North Carolina Executive Branch", 110 | 
classification="executive", 111 | jurisdiction_id="nc", 112 | ) 113 | Post.objects.create( 114 | id="pres", label="executive", role="President", organization_id="us" 115 | ) 116 | Post.objects.create( 117 | id="vpres", label="vice-executive", role="Vice President", organization_id="us" 118 | ) 119 | Post.objects.create( 120 | id="gov", label="executive", role="Governor", organization_id="nc" 121 | ) 122 | 123 | oi = OrganizationImporter("") 124 | assert PostImporter("us", oi).resolve_json_id('~{"label": "executive"}') == "pres" 125 | assert ( 126 | PostImporter("us", oi).resolve_json_id('~{"label": "vice-executive"}') 127 | == "vpres" 128 | ) 129 | assert PostImporter("nc", oi).resolve_json_id('~{"label": "executive"}') == "gov" 130 | -------------------------------------------------------------------------------- /pupa/importers/bills.py: -------------------------------------------------------------------------------- 1 | from opencivicdata.legislative.models import ( 2 | Bill, 3 | RelatedBill, 4 | BillAbstract, 5 | BillTitle, 6 | BillIdentifier, 7 | BillAction, 8 | BillActionRelatedEntity, 9 | BillSponsorship, 10 | BillSource, 11 | BillDocument, 12 | BillVersion, 13 | BillDocumentLink, 14 | BillVersionLink, 15 | ) 16 | from .base import BaseImporter 17 | from ..exceptions import PupaInternalError 18 | 19 | 20 | class BillImporter(BaseImporter): 21 | _type = "bill" 22 | model_class = Bill 23 | related_models = { 24 | "abstracts": (BillAbstract, "bill_id", {}), 25 | "other_titles": (BillTitle, "bill_id", {}), 26 | "other_identifiers": (BillIdentifier, "bill_id", {}), 27 | "actions": ( 28 | BillAction, 29 | "bill_id", 30 | {"related_entities": (BillActionRelatedEntity, "action_id", {})}, 31 | ), 32 | "related_bills": (RelatedBill, "bill_id", {}), 33 | "sponsorships": (BillSponsorship, "bill_id", {}), 34 | "sources": (BillSource, "bill_id", {}), 35 | "documents": ( 36 | BillDocument, 37 | "bill_id", 38 | {"links": (BillDocumentLink, "document_id", {})}, 39 | ), 40 | 
"versions": ( 41 | BillVersion, 42 | "bill_id", 43 | {"links": (BillVersionLink, "version_id", {})}, 44 | ), 45 | } 46 | preserve_order = {"actions"} 47 | 48 | def __init__(self, jurisdiction_id, org_importer, person_importer): 49 | super(BillImporter, self).__init__(jurisdiction_id) 50 | self.org_importer = org_importer 51 | self.person_importer = person_importer 52 | 53 | def get_object(self, bill): 54 | spec = { 55 | "legislative_session_id": bill["legislative_session_id"], 56 | "identifier": bill["identifier"], 57 | } 58 | if "from_organization_id" in bill: 59 | spec["from_organization_id"] = bill["from_organization_id"] 60 | 61 | return self.model_class.objects.prefetch_related( 62 | "actions__related_entities", 63 | "versions__links", 64 | "documents__links", 65 | ).get(**spec) 66 | 67 | def limit_spec(self, spec): 68 | spec["legislative_session__jurisdiction_id"] = self.jurisdiction_id 69 | return spec 70 | 71 | def prepare_for_db(self, data): 72 | data["legislative_session_id"] = self.get_session_id( 73 | data.pop("legislative_session") 74 | ) 75 | 76 | if data["from_organization"]: 77 | data["from_organization_id"] = self.org_importer.resolve_json_id( 78 | data.pop("from_organization") 79 | ) 80 | 81 | for action in data["actions"]: 82 | action["organization_id"] = self.org_importer.resolve_json_id( 83 | action["organization_id"] 84 | ) 85 | for entity in action["related_entities"]: 86 | if "organization_id" in entity: 87 | entity["organization_id"] = self.org_importer.resolve_json_id( 88 | entity["organization_id"] 89 | ) 90 | elif "person_id" in entity: 91 | entity["person_id"] = self.person_importer.resolve_json_id( 92 | entity["person_id"] 93 | ) 94 | 95 | for sponsor in data["sponsorships"]: 96 | if "person_id" in sponsor: 97 | sponsor["person_id"] = self.person_importer.resolve_json_id( 98 | sponsor["person_id"], allow_no_match=True 99 | ) 100 | 101 | if "organization_id" in sponsor: 102 | sponsor["organization_id"] = 
self.org_importer.resolve_json_id( 103 | sponsor["organization_id"], allow_no_match=True 104 | ) 105 | 106 | return data 107 | 108 | def postimport(self): 109 | # go through all RelatedBill objs that are attached to a bill in this 110 | # jurisdiction and are currently unresolved 111 | for rb in RelatedBill.objects.filter( 112 | bill__legislative_session__jurisdiction_id=self.jurisdiction_id, 113 | related_bill=None, 114 | ): 115 | candidates = list( 116 | Bill.objects.filter( 117 | legislative_session__identifier=rb.legislative_session, 118 | legislative_session__jurisdiction_id=self.jurisdiction_id, 119 | identifier=rb.identifier, 120 | ) 121 | ) 122 | if len(candidates) == 1: 123 | rb.related_bill = candidates[0] 124 | rb.save() 125 | elif len(candidates) > 1: # pragma: no cover 126 | # if we ever see this, we need to add additional fields on the relation 127 | raise PupaInternalError( 128 | "multiple related_bill candidates found for {}".format(rb) 129 | ) 130 | -------------------------------------------------------------------------------- /pupa/importers/events.py: -------------------------------------------------------------------------------- 1 | from .base import BaseImporter 2 | from pupa.utils import get_pseudo_id, _make_pseudo_id 3 | from opencivicdata.legislative.models import ( 4 | Event, 5 | EventLocation, 6 | EventSource, 7 | EventDocument, 8 | EventDocumentLink, 9 | EventLink, 10 | EventParticipant, 11 | EventMedia, 12 | EventMediaLink, 13 | EventAgendaItem, 14 | EventRelatedEntity, 15 | EventAgendaMedia, 16 | EventAgendaMediaLink, 17 | ) 18 | 19 | 20 | class EventImporter(BaseImporter): 21 | _type = "event" 22 | model_class = Event 23 | related_models = { 24 | "sources": (EventSource, "event_id", {}), 25 | "documents": ( 26 | EventDocument, 27 | "event_id", 28 | {"links": (EventDocumentLink, "document_id", {})}, 29 | ), 30 | "links": (EventLink, "event_id", {}), 31 | "participants": (EventParticipant, "event_id", {}), 32 | "media": ( 33 | 
EventMedia,
            "event_id",
            {
                "links": (EventMediaLink, "media_id", {}),
            },
        ),
        "agenda": (
            EventAgendaItem,
            "event_id",
            {
                "related_entities": (EventRelatedEntity, "agenda_item_id", {}),
                "media": (
                    EventAgendaMedia,
                    "agenda_item_id",
                    {
                        "links": (EventAgendaMediaLink, "media_id", {}),
                    },
                ),
            },
        ),
    }
    # agenda items must keep their scraped ordering
    preserve_order = ("agenda",)

    def __init__(
        self,
        jurisdiction_id,
        org_importer,
        person_importer,
        bill_importer,
        vote_event_importer,
    ):
        """Store sibling importers used to resolve scrape-time JSON ids."""
        super(EventImporter, self).__init__(jurisdiction_id)
        self.org_importer = org_importer
        self.person_importer = person_importer
        self.bill_importer = bill_importer
        self.vote_event_importer = vote_event_importer

    def get_object(self, event):
        """Match an existing Event by pupa_id if present, else by natural key.

        Returns None when a pupa_id is supplied but unknown (treated as an
        insert by the base importer).
        """
        if event.get("pupa_id"):
            e_id = self.lookup_obj_id(event["pupa_id"], Event)
            if e_id:
                spec = {"id": e_id}
            else:
                return None
        else:
            spec = {
                "name": event["name"],
                "description": event["description"],
                "start_date": event["start_date"],
                "end_date": event["end_date"],
                "jurisdiction_id": self.jurisdiction_id,
            }
        return self.model_class.objects.get(**spec)

    def get_location(self, location_data):
        """Get or create the EventLocation row for the scraped location."""
        obj, created = EventLocation.objects.get_or_create(
            name=location_data["name"],
            url=location_data.get("url", ""),
            jurisdiction_id=self.jurisdiction_id,
        )
        # TODO: geocode here?
        return obj

    def prepare_for_db(self, data):
        """Resolve location, participant and agenda entity ids for storage."""
        data["jurisdiction_id"] = self.jurisdiction_id
        if data["location"]:
            data["location"] = self.get_location(data["location"])

        data["start_date"] = data["start_date"]
        data["end_date"] = data.get("end_date", "")

        for participant in data["participants"]:
            if "person_id" in participant:
                participant["person_id"] = self.person_importer.resolve_json_id(
                    participant["person_id"], allow_no_match=True
                )
            elif "organization_id" in participant:
                participant["organization_id"] = self.org_importer.resolve_json_id(
                    participant["organization_id"], allow_no_match=True
                )

        for item in data["agenda"]:
            for entity in item["related_entities"]:
                if "person_id" in entity:
                    entity["person_id"] = self.person_importer.resolve_json_id(
                        entity["person_id"], allow_no_match=True
                    )
                elif "organization_id" in entity:
                    entity["organization_id"] = self.org_importer.resolve_json_id(
                        entity["organization_id"], allow_no_match=True
                    )
                elif "bill_id" in entity:
                    # unpack and repack bill psuedo id in case filters alter it
                    bill = get_pseudo_id(entity["bill_id"])
                    self.bill_importer.apply_transformers(bill)
                    bill = _make_pseudo_id(**bill)
                    entity["bill_id"] = self.bill_importer.resolve_json_id(
                        bill, allow_no_match=True
                    )
                elif "vote_event_id" in entity:
                    entity["vote_event_id"] = self.vote_event_importer.resolve_json_id(
                        entity["vote_event_id"], allow_no_match=True
                    )

        return data


# --- pupa/importers/organizations.py ---
from django.db.models import Q
from opencivicdata.core.models import (
    Organization,
    OrganizationIdentifier,
    OrganizationName,
OrganizationContactDetail,
    OrganizationLink,
    OrganizationSource,
)
from .base import BaseImporter
from ..utils import get_pseudo_id
from ..utils.topsort import Network
from ..exceptions import UnresolvedIdError, PupaInternalError, SameOrgNameError


class OrganizationImporter(BaseImporter):
    _type = "organization"
    model_class = Organization
    # child models: field name -> (model, FK back to Organization, nested)
    related_models = {
        "identifiers": (OrganizationIdentifier, "organization_id", {}),
        "other_names": (OrganizationName, "organization_id", {}),
        "contact_details": (OrganizationContactDetail, "organization_id", {}),
        "links": (OrganizationLink, "organization_id", {}),
        "sources": (OrganizationSource, "organization_id", {}),
    }

    def get_object(self, org):
        """Find an existing Organization by classification/parent and any name.

        Raises Organization.DoesNotExist when nothing matches and
        SameOrgNameError on ambiguous multiple matches.
        """
        spec = {"classification": org["classification"], "parent_id": org["parent_id"]}

        # add jurisdiction_id unless this is a party
        jid = org.get("jurisdiction_id")
        if jid:
            spec["jurisdiction_id"] = jid

        all_names = [org["name"]] + [o["name"] for o in org["other_names"]]

        query = Q(**spec) & (Q(name__in=all_names) | Q(other_names__name__in=all_names))
        matches = list(self.model_class.objects.filter(query).distinct("id"))
        matches_length = len(matches)
        if matches_length == 1:
            return matches[0]
        elif matches_length == 0:
            raise self.model_class.DoesNotExist(
                "No Organization: {} in {}".format(all_names, self.jurisdiction_id)
            )
        else:
            raise SameOrgNameError(org["name"])

    def prepare_for_db(self, data):
        """Resolve the parent id; parties are not tied to a jurisdiction."""
        data["parent_id"] = self.resolve_json_id(data["parent_id"])

        if data["classification"] != "party":
            data["jurisdiction_id"] = self.jurisdiction_id
        return data

    def limit_spec(self, spec):
        """Scope pseudo-id lookups to this jurisdiction (except parties)."""
        if spec.get("classification") != "party":
            spec["jurisdiction_id"] = self.jurisdiction_id

        name = spec.pop("name", None)
        if name:
            # name may match the primary name or any alternate name
            return Q(**spec) & (Q(name=name) | Q(other_names__name=name))
        return spec

    def _prepare_imports(self, dicts):
        """an override for prepare imports that sorts the imports
        by parent_id dependencies"""
        # all pseudo parent ids we've seen
        pseudo_ids = set()
        # pseudo matches
        pseudo_matches = {}

        # get prepared imports from parent
        prepared = dict(super(OrganizationImporter, self)._prepare_imports(dicts))

        # collect parent pseudo_ids
        for _, data in prepared.items():
            parent_id = data.get("parent_id", None) or ""
            if parent_id.startswith("~"):
                pseudo_ids.add(parent_id)

        # turn pseudo_ids into a tuple of dictionaries
        pseudo_ids = [(ppid, get_pseudo_id(ppid)) for ppid in pseudo_ids]

        # loop over all data again, finding the pseudo ids true json id
        for json_id, data in prepared.items():
            # check if this matches one of our ppids
            for ppid, spec in pseudo_ids:
                match = True
                for k, v in spec.items():
                    if data[k] != v:
                        match = False
                        break
                if match:
                    if ppid in pseudo_matches:
                        raise UnresolvedIdError(
                            "multiple matches for pseudo id: " + ppid
                        )
                    pseudo_matches[ppid] = json_id

        # toposort the nodes so parents are imported first
        network = Network()
        in_network = set()
        import_order = []

        for json_id, data in prepared.items():
            parent_id = data.get("parent_id", None)

            # resolve pseudo_ids to their json id before building the network
            if parent_id in pseudo_matches:
                parent_id = pseudo_matches[parent_id]

            network.add_node(json_id)
            if parent_id:
                # Right. There's an import dep. We need to add the edge from
                # the parent to the current node, so that we import the parent
                # before the current node.
118 | network.add_edge(parent_id, json_id) 119 | 120 | # resolve the sorted import order 121 | for jid in network.sort(): 122 | import_order.append((jid, prepared[jid])) 123 | in_network.add(jid) 124 | 125 | # ensure all data made it into network (paranoid check, should never fail) 126 | if in_network != set(prepared.keys()): # pragma: no cover 127 | raise PupaInternalError("import is missing nodes in network set") 128 | 129 | return import_order 130 | -------------------------------------------------------------------------------- /pupa/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models, migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | dependencies = [ 10 | ("core", "0001_initial"), 11 | ("legislative", "0001_initial"), 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name="ImportObjects", 17 | fields=[ 18 | ( 19 | "id", 20 | models.AutoField( 21 | primary_key=True, 22 | auto_created=True, 23 | serialize=False, 24 | verbose_name="ID", 25 | ), 26 | ), 27 | ( 28 | "object_type", 29 | models.CharField( 30 | max_length=20, 31 | choices=[ 32 | ("jurisdiction", "Jurisdiction"), 33 | ("person", "Person"), 34 | ("organization", "Organization"), 35 | ("post", "Post"), 36 | ("membership", "Membership"), 37 | ("bill", "Bill"), 38 | ("vote_event", "VoteEvent"), 39 | ("event", "Event"), 40 | ], 41 | ), 42 | ), 43 | ("insert_count", models.PositiveIntegerField()), 44 | ("update_count", models.PositiveIntegerField()), 45 | ("noop_count", models.PositiveIntegerField()), 46 | ("start_time", models.DateTimeField()), 47 | ("end_time", models.DateTimeField()), 48 | ], 49 | options={}, 50 | bases=(models.Model,), 51 | ), 52 | migrations.CreateModel( 53 | name="RunPlan", 54 | fields=[ 55 | ( 56 | "id", 57 | models.AutoField( 58 | primary_key=True, 59 | auto_created=True, 60 | 
serialize=False, 61 | verbose_name="ID", 62 | ), 63 | ), 64 | ("success", models.BooleanField(default=True)), 65 | ( 66 | "jurisdiction", 67 | models.ForeignKey(to="core.Jurisdiction", on_delete=models.CASCADE), 68 | ), 69 | ], 70 | options={}, 71 | bases=(models.Model,), 72 | ), 73 | migrations.AddField( 74 | model_name="importobjects", 75 | name="report", 76 | field=models.ForeignKey(to="pupa.RunPlan", on_delete=models.CASCADE), 77 | preserve_default=True, 78 | ), 79 | migrations.CreateModel( 80 | name="ScrapeObjects", 81 | fields=[ 82 | ( 83 | "id", 84 | models.AutoField( 85 | primary_key=True, 86 | auto_created=True, 87 | serialize=False, 88 | verbose_name="ID", 89 | ), 90 | ), 91 | ( 92 | "object_type", 93 | models.CharField( 94 | max_length=20, 95 | choices=[ 96 | ("jurisdiction", "Jurisdiction"), 97 | ("person", "Person"), 98 | ("organization", "Organization"), 99 | ("post", "Post"), 100 | ("membership", "Membership"), 101 | ("bill", "Bill"), 102 | ("vote_event", "VoteEvent"), 103 | ("event", "Event"), 104 | ], 105 | ), 106 | ), 107 | ("count", models.PositiveIntegerField()), 108 | ], 109 | options={}, 110 | bases=(models.Model,), 111 | ), 112 | migrations.CreateModel( 113 | name="ScrapeReport", 114 | fields=[ 115 | ( 116 | "id", 117 | models.AutoField( 118 | primary_key=True, 119 | auto_created=True, 120 | serialize=False, 121 | verbose_name="ID", 122 | ), 123 | ), 124 | ("scraper", models.CharField(max_length=300)), 125 | ("args", models.CharField(max_length=300)), 126 | ("start_time", models.DateTimeField()), 127 | ("end_time", models.DateTimeField()), 128 | ( 129 | "plan", 130 | models.ForeignKey(to="pupa.RunPlan", on_delete=models.CASCADE), 131 | ), 132 | ], 133 | options={}, 134 | bases=(models.Model,), 135 | ), 136 | migrations.AddField( 137 | model_name="scrapeobjects", 138 | name="report", 139 | field=models.ForeignKey(to="pupa.ScrapeReport", on_delete=models.CASCADE), 140 | preserve_default=True, 141 | ), 142 | ] 143 | 
"""
Schema for bill objects.
"""

from .common import sources, extras, fuzzy_date_blank, fuzzy_datetime
from opencivicdata import common

# Shared subschema: bill "versions" and "documents" have an identical shape --
# a non-empty note, an optionally-fuzzy (and possibly blank) date, and a list
# of typed links.
versions_or_documents = {
    "items": {
        "properties": {
            "note": {"type": "string", "minLength": 1},
            "date": fuzzy_date_blank,
            "links": {
                "items": {
                    "properties": {
                        "media_type": {"type": "string"},
                        "url": {"type": "string", "format": "uri"},
                    },
                    "type": "object",
                },
                "type": "array",
            },
        },
        "type": "object",
    },
    "type": "array",
}

# JSON schema used to validate scraped Bill objects before import.
schema = {
    "type": "object",
    "properties": {
        "legislative_session": {"type": "string", "minLength": 1},
        "identifier": {"type": "string", "minLength": 1},
        "title": {"type": "string", "minLength": 1},
        # pseudo-id string, resolved to an Organization at import time
        "from_organization": {"type": ["string", "null"]},
        "classification": {
            "items": {"type": "string", "enum": common.BILL_CLASSIFICATIONS},
            "type": "array",
        },
        "subject": {"items": {"type": "string", "minLength": 1}, "type": "array"},
        "abstracts": {
            "items": {
                "properties": {
                    "abstract": {"type": "string", "minLength": 1},
                    "note": {"type": "string"},
                    # NOTE(review): plain string here, unlike the fuzzy-date
                    # types used elsewhere in this schema -- confirm whether
                    # that is intentional before tightening.
                    "date": {"type": "string"},
                },
                "type": "object",
            },
            "type": "array",
        },
        "other_titles": {
            "items": {
                "properties": {
                    "title": {"type": "string", "minLength": 1},
                    "note": {"type": "string"},
                },
                "type": "object",
            },
            "type": "array",
        },
        "other_identifiers": {
            "items": {
                "properties": {
                    "identifier": {"type": "string", "minLength": 1},
                    "note": {"type": "string"},
                    "scheme": {"type": "string"},
                },
                "type": "object",
            },
            "type": "array",
        },
        "actions": {
            "items": {
                "properties": {
                    # pseudo-id of the acting organization
                    "organization": {"type": ["string", "null"]},
                    "date": fuzzy_datetime,
                    "description": {"type": "string", "minLength": 1},
                    "classification": {
                        "items": {
                            "type": "string",
                            "enum": common.BILL_ACTION_CLASSIFICATIONS,
                        },
                        "type": "array",
                    },
                    "related_entities": {
                        "items": {
                            "properties": {
                                "name": {"type": "string", "minLength": 1},
                                "entity_type": {
                                    "enum": ["organization", "person", ""],
                                    "type": "string",
                                },
                                "person_id": {"type": ["string", "null"]},
                                "organization_id": {"type": ["string", "null"]},
                            },
                            "type": "object",
                        },
                        "type": "array",
                    },
                },
                "type": "object",
            },
            "type": "array",
        },
        "sponsorships": {
            "items": {
                "properties": {
                    "primary": {"type": "boolean"},
                    "classification": {"type": "string", "minLength": 1},
                    "name": {"type": "string", "minLength": 1},
                    "entity_type": {
                        "enum": ["organization", "person", ""],
                        "type": "string",
                    },
                    # exactly one of these is filled in by Bill.add_sponsorship;
                    # both keys are always present
                    "person_id": {"type": ["string", "null"]},
                    "organization_id": {"type": ["string", "null"]},
                },
                "type": "object",
            },
            "type": "array",
        },
        "related_bills": {
            "items": {
                "properties": {
                    "identifier": {"type": "string", "minLength": 1},
                    "legislative_session": {"type": "string", "minLength": 1},
                    "relation_type": {
                        "enum": common.BILL_RELATION_TYPES,
                        "type": "string",
                    },
                },
                "type": "object",
            },
            "type": "array",
        },
        "versions": versions_or_documents,
        "documents": versions_or_documents,
        "sources": sources,
        "extras": extras,
    },
}
class ColorizingStreamHandler(logging.StreamHandler):
    """StreamHandler that colorizes log records by level when writing to a TTY.

    Vendored third-party code (Vinay Sajip, BSD licensed; see file header) --
    keep modifications to a minimum.  On POSIX the ANSI escape codes are
    written straight to the stream; on Windows ("nt") they are translated to
    console attribute calls via ctypes.
    """

    # color names to indices
    color_map = {
        "black": 0,
        "red": 1,
        "green": 2,
        "yellow": 3,
        "blue": 4,
        "magenta": 5,
        "cyan": 6,
        "white": 7,
    }

    # levels to (background, foreground, bold/intense)
    if os.name == "nt":
        level_map = {
            logging.DEBUG: (None, "blue", True),
            logging.INFO: (None, "white", False),
            logging.WARNING: (None, "yellow", True),
            logging.ERROR: (None, "red", True),
            logging.CRITICAL: ("red", "white", True),
        }
    else:
        level_map = {
            logging.DEBUG: (None, "blue", False),
            logging.INFO: (None, "white", False),
            logging.WARNING: (None, "yellow", False),
            logging.ERROR: (None, "red", False),
            logging.CRITICAL: ("red", "white", True),
        }
    # ANSI Control Sequence Introducer and the "reset all attributes" code
    csi = "\x1b["
    reset = "\x1b[0m"

    @property
    def is_tty(self):
        # bluff for Jenkins
        if os.environ.get("JENKINS_URL"):
            return True
        isatty = getattr(self.stream, "isatty", None)
        return isatty and isatty()

    def emit(self, record):
        """Write the record, colorized only when the stream is a TTY."""
        try:
            message = self.format(record)
            stream = self.stream
            if not self.is_tty:
                stream.write(message)
            else:
                self.output_colorized(message)
            stream.write(getattr(self, "terminator", "\n"))
            self.flush()
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            # bare except mirrors logging.StreamHandler.emit's own policy:
            # logging must never crash the program
            self.handleError(record)

    if os.name != "nt":

        def output_colorized(self, message):
            # POSIX terminals understand the ANSI codes natively
            self.stream.write(message)

    else:
        import re

        # matches one ANSI SGR escape, capturing its ;-separated parameters
        ansi_esc = re.compile(r"\x1b\[((?:\d+)(?:;(?:\d+))*)m")

        # ANSI color index -> Windows console attribute bits
        nt_color_map = {
            0: 0x00,  # black
            1: 0x04,  # red
            2: 0x02,  # green
            3: 0x06,  # yellow
            4: 0x01,  # blue
            5: 0x05,  # magenta
            6: 0x03,  # cyan
            7: 0x07,  # white
        }

        def output_colorized(self, message):
            # Split the message on escape sequences and replay each escape as
            # a SetConsoleTextAttribute call on the console handle.
            parts = self.ansi_esc.split(message)
            write = self.stream.write
            h = None
            fd = getattr(self.stream, "fileno", None)
            if fd is not None:
                fd = fd()
                if fd in (1, 2):  # stdout or stderr
                    h = ctypes.windll.kernel32.GetStdHandle(-10 - fd)
            while parts:
                text = parts.pop(0)
                if text:
                    write(text)
                if parts:
                    params = parts.pop(0)
                    if h is not None:
                        params = [int(p) for p in params.split(";")]
                        color = 0
                        for p in params:
                            if 40 <= p <= 47:
                                color |= self.nt_color_map[p - 40] << 4
                            elif 30 <= p <= 37:
                                color |= self.nt_color_map[p - 30]
                            elif p == 1:
                                color |= 0x08  # foreground intensity on
                            elif p == 0:  # reset to default color
                                color = 0x07
                            else:
                                pass  # error condition ignored
                        ctypes.windll.kernel32.SetConsoleTextAttribute(h, color)

    def colorize(self, message, record):
        """Wrap *message* in the ANSI codes mapped to the record's level."""
        if record.levelno in self.level_map:
            bg, fg, bold = self.level_map[record.levelno]
            params = []
            if bg in self.color_map:
                params.append(str(self.color_map[bg] + 40))
            if fg in self.color_map:
                params.append(str(self.color_map[fg] + 30))
            if bold:
                params.append("1")
            if params:
                message = "".join(
                    (self.csi, ";".join(params), "m", message, self.reset)
                )
        return message

    def format(self, record):
        message = logging.StreamHandler.format(self, record)
        if self.is_tty:
            # Don't colorize any traceback
            parts = message.split("\n", 1)
            parts[0] = self.colorize(parts[0], record)
            message = "\n".join(parts)
        return message
from collections import defaultdict
from itertools import chain


class CyclicGraphError(ValueError):
    """
    This exception is raised if the graph is Cyclic (or rather, when the
    sorting algorithm *knows* that the graph is Cyclic by hitting a snag
    in the top-sort)
    """

    pass


class Network(object):
    """
    This object (the `Network` object) handles keeping track of all the
    graph's nodes, and links between the nodes.

    The `Network` object is mostly used to topologically sort the nodes,
    to handle dependency resolution.
    """

    def __init__(self):
        # every node ever added
        self.nodes = set()
        # adjacency map: edges[parent] -> set of children depending on it
        self.edges = defaultdict(set)

    def add_node(self, node):
        """Add a node to the graph (with no edges)"""
        self.nodes.add(node)

    def add_edge(self, fro, to):
        """
        When doing topological sorting, the semantics of the edge mean that
        the dependency runs from the parent to the child - which is to say
        that the parent is required to be sorted *before* the child.

            [ FROM ] ------> [ TO ]
            Committee on Finance -> Subcommittee of the Finance Committee on Budget
                                 -> Subcommittee of the Finance Committee on Roads
        """
        self.add_node(fro)
        self.add_node(to)
        self.edges[fro].add(to)

    def leaf_nodes(self):
        """
        Return an iterable of nodes with no edges pointing at them. This is
        helpful to find all nodes without dependencies.
        """
        # every node that appears as an edge target has a dependency
        deps = {item for sublist in self.edges.values() for item in sublist}
        # contains all nodes *without* any dependencies (leaf nodes)
        return self.nodes - deps

    def prune_node(self, node, remove_backrefs=False):
        """
        remove node `node` from the network (including any edges that may
        have been pointing at `node`).

        Raises ValueError if other nodes still point at `node` and
        `remove_backrefs` is false.
        """
        # refuse to silently drop edges that still point at this node
        # (fixed: single-line message instead of one embedding source indentation)
        if not remove_backrefs and any(
            node in targets for targets in self.edges.values()
        ):
            raise ValueError(
                "Attempting to remove a node with backrefs. "
                "You may consider setting `remove_backrefs` to true."
            )

        self.nodes.remove(node)
        # Remove all edges from this node if we're pruning it.
        if node in self.edges:
            self.edges.pop(node)

        # Remove any links to this node (no-op sets when the check above passed)
        for targets in self.edges.values():
            targets.discard(node)

    def sort(self):
        """
        Return an iterable of nodes, topologically sorted to correctly import
        dependencies before leaf nodes.

        Consumes the network as it goes; raises CyclicGraphError when no
        dependency-free node remains but the graph is non-empty.
        """
        while self.nodes:
            iterated = False
            for node in self.leaf_nodes():
                iterated = True
                self.prune_node(node)
                yield node
            if not iterated:
                raise CyclicGraphError("Sorting has found a cyclic graph.")

    def dot(self):
        """
        Return a buffer that represents something dot(1) can render.
        """
        buff = "digraph graphname {"
        for fro in self.edges:
            for to in self.edges[fro]:
                buff += "%s -> %s;" % (fro, to)
        buff += "}"
        return buff

    def cycles(self):
        """
        Fairly expensive cycle detection algorithm. This method
        will return the shortest unique cycles that were detected.

        Debug usage may look something like:

        print("The following cycles were found:")
        for cycle in network.cycles():
            print("   ", " -> ".join(cycle))
        """

        def walk_node(node, seen):
            """
            Walk each top-level node we know about, and recurse
            along the graph, yielding a tuple whenever a path revisits
            an already-seen node (i.e. closes a cycle).
            """
            if node in seen:
                yield (node,)
                return
            seen.add(node)
            for edge in self.edges[node]:
                for cycle in walk_node(edge, set(seen)):
                    yield (node,) + cycle

        # First, let's get a iterable of all known cycles.
        cycles = chain.from_iterable((walk_node(node, set()) for node in self.nodes))

        shortest = set()
        # Now, let's go through and sift through the cycles, finding
        # the shortest unique cycle known, ignoring cycles which contain
        # already known cycles.
        for cycle in sorted(cycles, key=len):
            for el in shortest:
                if set(el).issubset(set(cycle)):
                    break
            else:
                shortest.add(cycle)
        # And return that unique list.
        return shortest
def test_add_associated_link_match():
    model = GenericModel()
    # two links with the same note collapse onto one associated 'document'
    for url, media_type in (
        ("http://example.com/1.txt", "text/plain"),
        ("http://example.com/1.pdf", "application/pdf"),
    ):
        model._add_associated_link(
            "_associated",
            "something",
            url,
            text="",
            media_type=media_type,
            on_duplicate="error",
        )
    assert len(model._associated) == 1
    assert len(model._associated[0]["links"]) == 2


def test_add_associated_link_on_duplicate_bad():
    model = GenericModel()

    # an unknown on_duplicate policy is rejected outright
    with pytest.raises(ValueError):
        model._add_associated_link(
            "_associated",
            "something",
            "http://example.com",
            text="",
            media_type="text/html",
            on_duplicate="idk",
        )


def test_add_associated_link_on_duplicate_error():
    model = GenericModel()
    model._add_associated_link(
        "_associated",
        "something",
        "http://example.com",
        text="",
        media_type="text/html",
        on_duplicate="error",
    )

    # same URL under a different note must raise with the "error" policy
    with pytest.raises(ValueError):
        model._add_associated_link(
            "_associated",
            "something else",
            "http://example.com",
            text="",
            media_type="text/html",
            on_duplicate="error",
        )


def test_add_associated_link_on_duplicate_ignore():
    model = GenericModel()
    for note in ("something", "something else"):
        model._add_associated_link(
            "_associated",
            note,
            "http://example.com",
            text="",
            media_type="text/html",
            on_duplicate="ignore",
        )
    # one 'document' added, single link for it, and the first note wins
    assert len(model._associated) == 1
    assert len(model._associated[0]["links"]) == 1
    assert model._associated[0]["note"] == "something"


def test_add_name():
    model = GenericModel()

    model.add_name("Thiston", note="What my friends call me")
    assert model.other_names == [
        {"name": "Thiston", "note": "What my friends call me"}
    ]

    model.add_name(
        "Johnseph Q. Publico",
        note="Birth name",
        start_date="1920-01",
        end_date="1949-12-31",
    )
    assert model.other_names == [
        {"name": "Thiston", "note": "What my friends call me"},
        {
            "name": "Johnseph Q. Publico",
            "note": "Birth name",
            "start_date": "1920-01",
            "end_date": "1949-12-31",
        },
    ]


def test_add_identifier():
    model = GenericModel()

    # unknown keyword arguments are rejected
    with pytest.raises(TypeError):
        model.add_identifier("id10t", foo="bar")

    model.add_identifier("id10t")
    model.add_identifier("l0l", scheme="kruft")

    assert model.identifiers[0]["identifier"] == "id10t"
    assert model.identifiers[-1]["scheme"] == "kruft"
def test_event_str():
    event = event_obj()
    assert event.name in str(event)


def test_bad_event():
    event = event_obj()
    # start_date must be a string; an int fails schema validation
    event.start_date = 6

    with pytest.raises(ValueError):
        event.validate()


def test_basic_agenda():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["description"] == "foo bar"
    assert event.agenda[0] == item
    event.validate()


def test_agenda_add_person():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []

    item.add_person(person="John Q. Hacker", note="chair")
    assert len(event.agenda[0]["related_entities"]) == 1
    event.validate()


def test_agenda_add_vote_event():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []

    item.add_vote_event(vote_event="Roll no. 12")
    assert len(event.agenda[0]["related_entities"]) == 1
    event.validate()


def test_agenda_add_subject():
    event = event_obj()
    item = event.add_agenda_item("foo bar")

    # subjects accumulate in insertion order
    item.add_subject("test")
    assert event.agenda[0]["subjects"] == ["test"]
    item.add_subject("test2")
    assert event.agenda[0]["subjects"] == ["test", "test2"]

    event.validate()


def test_agenda_add_classification():
    event = event_obj()
    item = event.add_agenda_item("foo bar")

    # classifications accumulate in insertion order
    item.add_classification("test")
    assert event.agenda[0]["classification"] == ["test"]
    item.add_classification("test2")
    assert event.agenda[0]["classification"] == ["test", "test2"]

    event.validate()


def test_agenda_add_extra():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    item["extras"] = {"foo": 1, "bar": ["baz"]}

    assert event.agenda[0]["extras"] == {"foo": 1, "bar": ["baz"]}


def test_add_committee():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []

    item.add_committee(committee="Hello, World", note="host")
    event.validate()


def test_add_bill():
    event = event_obj()
    item = event.add_agenda_item("foo bar")
    assert item["related_entities"] == []
    item.add_bill(bill="HB 101", note="consideration")
    event.validate()


def test_add_document():
    event = event_obj()
    assert event.documents == []
    event.add_document(note="hello", url="http://example.com", media_type="text/html")
    assert len(event.documents) == 1
    doc = event.documents[0]
    assert doc["note"] == "hello"
    assert doc["links"] == [
        {"url": "http://example.com", "media_type": "text/html", "text": ""}
    ]
    event.validate()


def test_participants():
    event = event_obj()
    event.add_participant("Committee of the Whole", type="committee", note="everyone")
    assert len(event.participants) == 1
    first = event.participants[0]
    assert first["name"] == "Committee of the Whole"
    assert first["entity_type"] == "committee"
    assert first["note"] == "everyone"

    # and add_person, which is a shortcut
    event.add_person("Bill Stevenson")
    assert len(event.participants) == 2
    second = event.participants[1]
    assert second["name"] == "Bill Stevenson"
    assert second["entity_type"] == "person"
    assert second["note"] == "participant"
from ..utils import _make_pseudo_id
from .popolo import pseudo_organization
from .base import BaseModel, SourceMixin, AssociatedLinkMixin, cleanup_list
from .schemas.bill import schema


class Action(dict):
    """A single action on a bill: a plain dict plus a related-entity helper."""

    def add_related_entity(self, name, entity_type, entity_id=None):
        entity = {
            "name": name,
            "entity_type": entity_type,
            entity_type + "_id": entity_id,
        }
        self["related_entities"].append(entity)
        return entity


class Bill(SourceMixin, AssociatedLinkMixin, BaseModel):
    """
    An Open Civic Data bill.
    """

    _type = "bill"
    _schema = schema

    def __init__(
        self,
        identifier,
        legislative_session,
        title,
        *,
        chamber=None,
        from_organization=None,
        classification=None
    ):
        super(Bill, self).__init__()

        self.identifier = identifier
        self.legislative_session = legislative_session
        self.title = title
        self.classification = cleanup_list(classification, ["bill"])
        self.from_organization = pseudo_organization(
            from_organization, chamber, "legislature"
        )

        # every collection starts empty; scrapers fill them via the helpers below
        for collection in (
            "actions",
            "other_identifiers",
            "other_titles",
            "documents",
            "related_bills",
            "sponsorships",
            "subject",
            "abstracts",
            "versions",
        ):
            setattr(self, collection, [])

    def add_action(
        self,
        description,
        date,
        *,
        organization=None,
        chamber=None,
        classification=None,
        related_entities=None,
        extras=None
    ):
        """Append an Action and return it so callers can attach related entities."""
        act = Action(
            description=description,
            date=date,
            organization_id=pseudo_organization(organization, chamber, "legislature"),
            classification=cleanup_list(classification, []),
            related_entities=[],
            extras=extras or {},
        )
        self.actions.append(act)
        return act

    def add_related_bill(self, identifier, legislative_session, relation_type):
        """Record a relationship to another bill."""
        # will we need jurisdiction, organization?
        related = dict(
            identifier=identifier,
            legislative_session=legislative_session,
            relation_type=relation_type,
        )
        self.related_bills.append(related)

    def add_sponsorship(
        self,
        name,
        classification,
        entity_type,
        primary,
        *,
        chamber=None,
        entity_id=None
    ):
        """Record a sponsor of this bill."""
        sponsorship = {
            "name": name,
            "classification": classification,
            "entity_type": entity_type,
            "primary": primary,
            # both id keys are always present so every JSON object shares the
            # same key set, which prevents import errors
            "person_id": None,
            "organization_id": None,
        }
        if entity_type:
            # fall back to a name-based pseudo id when no concrete id was given
            sponsorship[entity_type + "_id"] = entity_id or _make_pseudo_id(name=name)
        self.sponsorships.append(sponsorship)

    def add_sponsorship_by_identifier(
        self,
        name,
        classification,
        entity_type,
        primary,
        *,
        scheme,
        identifier,
        chamber=None
    ):
        """Record a sponsor resolved via a scheme/identifier pseudo id."""
        return self.add_sponsorship(
            name,
            classification,
            entity_type,
            primary,
            chamber=chamber,
            entity_id=_make_pseudo_id(
                identifiers__scheme=scheme, identifiers__identifier=identifier
            ),
        )

    def add_subject(self, subject):
        """Tag the bill with a subject string."""
        self.subject.append(subject)

    def add_abstract(self, abstract, note, date=""):
        """Attach an abstract (summary) with a note and optional date."""
        self.abstracts.append({"note": note, "abstract": abstract, "date": date})

    def add_title(self, title, note=""):
        """Attach an alternate title."""
        self.other_titles.append({"note": note, "title": title})

    def add_identifier(self, identifier, note="", scheme=""):
        """Attach an alternate identifier."""
        self.other_identifiers.append(
            {"note": note, "identifier": identifier, "scheme": scheme}
        )

    def add_document_link(
        self, note, url, *, date="", media_type="", text="", on_duplicate="error"
    ):
        """Attach a link to a supporting document."""
        return self._add_associated_link(
            collection="documents",
            note=note,
            url=url,
            date=date,
            text=text,
            media_type=media_type,
            on_duplicate=on_duplicate,
        )

    def add_version_link(
        self, note, url, *, date="", media_type="", text="", on_duplicate="error"
    ):
        """Attach a link to a version of the bill text."""
        return self._add_associated_link(
            collection="versions",
            note=note,
            url=url,
            date=date,
            text=text,
            media_type=media_type,
            on_duplicate=on_duplicate,
        )

    def __str__(self):
        return " in ".join((self.identifier, self.legislative_session))
start_date="2009-01-07", 43 | result="pass", 44 | classification="bill-passage", 45 | organization=o, 46 | ) 47 | assert ve.organization == o._id 48 | 49 | 50 | def test_vote_event_org_dict(): 51 | odict = {"name": "Random Committee", "classification": "committee"} 52 | ve = VoteEvent( 53 | legislative_session="2009", 54 | motion_text="passage of the bill", 55 | start_date="2009-01-07", 56 | result="pass", 57 | classification="bill-passage", 58 | organization=odict, 59 | ) 60 | assert get_pseudo_id(ve.organization) == odict 61 | 62 | 63 | def test_vote_event_org_chamber(): 64 | ve = VoteEvent( 65 | legislative_session="2009", 66 | motion_text="passage of the bill", 67 | start_date="2009-01-07", 68 | result="pass", 69 | classification="bill-passage", 70 | chamber="upper", 71 | ) 72 | assert get_pseudo_id(ve.organization) == {"classification": "upper"} 73 | 74 | 75 | def test_org_and_chamber_conflict(): 76 | with pytest.raises(ValueError): 77 | VoteEvent( 78 | legislative_session="2009", 79 | motion_text="passage of the bill", 80 | start_date="2009-01-07", 81 | result="pass", 82 | classification="passage", 83 | organization="test", 84 | chamber="lower", 85 | ) 86 | 87 | 88 | def test_set_count(): 89 | ve = toy_vote_event() 90 | ve.set_count("yes", 2) 91 | ve.set_count("no", 100) 92 | ve.set_count("yes", 0) 93 | assert ve.counts == [{"option": "yes", "value": 0}, {"option": "no", "value": 100}] 94 | 95 | 96 | def test_set_bill_obj(): 97 | ve = toy_vote_event() 98 | b = Bill("HB 1", legislative_session="2009", title="fake bill") 99 | ve.set_bill(b) 100 | assert ve.bill == b._id 101 | 102 | 103 | def test_set_bill_obj_no_extra_args(): 104 | ve = toy_vote_event() 105 | b = Bill("HB 1", legislative_session="2009", title="fake bill") 106 | with pytest.raises(ValueError): 107 | ve.set_bill(b, chamber="lower") 108 | 109 | 110 | def test_set_bill_pseudo_id(): 111 | ve = toy_vote_event() 112 | ve.set_bill("HB 1", chamber="lower") 113 | assert get_pseudo_id(ve.bill) == { 114 | 
"identifier": "HB 1", 115 | "from_organization__classification": "lower", 116 | "legislative_session__identifier": "2009", 117 | } 118 | 119 | 120 | def test_str(): 121 | ve = toy_vote_event() 122 | s = str(ve) 123 | assert ve.legislative_session in s 124 | assert ve.motion_text in s 125 | 126 | 127 | def test_order_vote_event(): 128 | ve = toy_vote_event() 129 | order_vote_event = OrderVoteEvent() 130 | 131 | # add order as seconds to date with no time 132 | ve.start_date = "2019-01-01" 133 | ve.end_date = None 134 | order_vote_event("2019", "1", ve) 135 | assert ve.start_date == "2019-01-01T00:00:01" 136 | assert ve.end_date is None 137 | 138 | # add order as seconds to time with explicit midnight time and 139 | # zone, preserving timezone 140 | ve.start_date = "2019-01-01T00:00:00+05:00" 141 | ve.end_date = "" 142 | order_vote_event("2019", "1", ve) 143 | assert ve.start_date == "2019-01-01T00:00:02+05:00" 144 | assert ve.end_date == "" 145 | 146 | # a second bill should start with '00:00:01' again 147 | ve.start_date = "2019-01-01" 148 | ve.end_date = None 149 | order_vote_event("2019", "2", ve) 150 | assert ve.start_date == "2019-01-01T00:00:01" 151 | assert ve.end_date is None 152 | 153 | # the same bill id in a different session should start with '00:00:01' again 154 | ve.start_date = "2019-01-01" 155 | ve.end_date = None 156 | order_vote_event("2020", "1", ve) 157 | assert ve.start_date == "2019-01-01T00:00:01" 158 | assert ve.end_date is None 159 | 160 | # add order as seconds to time with explicit midnight time and no timezone 161 | ve.start_date = ve.end_date = "2019-01-01T00:00:00" 162 | order_vote_event("2019", "1", ve) 163 | assert ve.start_date == "2019-01-01T00:00:03" 164 | assert ve.end_date == "2019-01-01T00:00:03" 165 | 166 | # don't change a date with a non-midnight time 167 | ve.start_date = "2019-01-01T00:00:55+05:00" 168 | order_vote_event("2019", "1", ve) 169 | assert ve.start_date == "2019-01-01T00:00:55+05:00" 170 | 
-------------------------------------------------------------------------------- /pupa/cli/commands/init.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .base import BaseCommand 4 | from pupa.exceptions import CommandError 5 | from opencivicdata.common import JURISDICTION_CLASSIFICATIONS 6 | from opencivicdata.divisions import Division 7 | 8 | 9 | def prompt(ps, default=""): 10 | return input(ps).strip() or default 11 | 12 | 13 | CLASS_DICT = { 14 | "events": "Event", 15 | "people": "Person", 16 | "bills": "Bill", 17 | "vote_events": "VoteEvent", 18 | } 19 | 20 | 21 | def write_jurisdiction_template( 22 | dirname, short_name, long_name, division_id, classification, url, scraper_types 23 | ): 24 | camel_case = short_name.title().replace(" ", "") 25 | 26 | # write __init__ 27 | lines = ["# encoding=utf-8", "from pupa.scrape import Jurisdiction, Organization"] 28 | for stype in scraper_types: 29 | lines.append( 30 | "from .{} import {}{}Scraper".format(stype, camel_case, CLASS_DICT[stype]) 31 | ) 32 | lines.append("") 33 | lines.append("") 34 | lines.append("class {}(Jurisdiction):".format(camel_case)) 35 | lines.append(' division_id = "{}"'.format(division_id)) 36 | lines.append(' classification = "{}"'.format(classification)) 37 | lines.append(' name = "{}"'.format(long_name)) 38 | lines.append(' url = "{}"'.format(url)) 39 | lines.append(" scrapers = {") 40 | for stype in scraper_types: 41 | lines.append( 42 | ' "{}": {}{}Scraper,'.format(stype, camel_case, CLASS_DICT[stype]) 43 | ) 44 | lines.append(" }") 45 | lines.append("") 46 | lines.append(" def get_organizations(self):") 47 | lines.append(" #REQUIRED: define an organization using this format") 48 | lines.append(" #where org_name is something like Seattle City Council") 49 | lines.append(" #and classification is described here:") 50 | 51 | lines.append( 52 | ' org = Organization(name="org_name", classification="legislature")' 53 | ) 54 | 
lines.append("") 55 | 56 | lines.append(" # OPTIONAL: add posts to your organizaion using this format,") 57 | lines.append( 58 | " # where label is a human-readable description of the post " 59 | '(eg "Ward 8 councilmember")' 60 | ) 61 | lines.append( 62 | " # and role is the position type (eg councilmember, alderman, mayor...)" 63 | ) 64 | lines.append(" # skip entirely if you're not writing a people scraper.") 65 | lines.append( 66 | ' org.add_post(label="position_description", role="position_type")' 67 | ) 68 | lines.append("") 69 | lines.append(" #REQUIRED: yield the organization") 70 | lines.append(" yield org") 71 | lines.append("") 72 | 73 | with open(os.path.join(dirname, "__init__.py"), "w") as of: 74 | of.write("\n".join(lines)) 75 | 76 | # write scraper files 77 | for stype in scraper_types: 78 | lines = ["from pupa.scrape import Scraper"] 79 | lines.append("from pupa.scrape import {}".format(CLASS_DICT[stype])) 80 | lines.append("") 81 | lines.append("") 82 | lines.append( 83 | "class {}{}Scraper(Scraper):".format(camel_case, CLASS_DICT[stype]) 84 | ) 85 | lines.append("") 86 | lines.append(" def scrape(self):") 87 | lines.append(" # needs to be implemented") 88 | lines.append(" pass") 89 | lines.append("") 90 | with open(os.path.join(dirname, stype + ".py"), "w") as of: 91 | of.write("\n".join(lines)) 92 | 93 | 94 | class Command(BaseCommand): 95 | name = "init" 96 | help = "start a new pupa scraper" 97 | 98 | def add_args(self): 99 | self.add_argument("module", type=str, help="name of the new scraper module") 100 | 101 | def handle(self, args, other): 102 | if os.path.exists(args.module): 103 | raise CommandError("Directory {} already exists".format(repr(args.module))) 104 | 105 | division = None 106 | while not division: 107 | division = prompt( 108 | "division id (see https://github.com/opencivicdata/" 109 | "ocd-division-ids/tree/master/identifiers): " 110 | ) 111 | if not division: 112 | print("\nERROR: Division ID is required.\n") 113 | 114 | 
try: 115 | Division.get(division) 116 | except (ValueError, IndexError): 117 | raise CommandError("Division ID {} is invalid".format(repr(division))) 118 | 119 | name = prompt("jurisdiction name (e.g. City of Seattle): ") 120 | classification = prompt( 121 | "classification (can be: {}): ".format( 122 | ", ".join(JURISDICTION_CLASSIFICATIONS) 123 | ) 124 | ) 125 | url = prompt("official url (e.g. http://www.seattle.gov/): ") 126 | 127 | os.makedirs(args.module) 128 | 129 | # Will default to True until they pick one, then defaults to False. 130 | selected_scraper_types = [] 131 | for stype in CLASS_DICT.keys(): 132 | if selected_scraper_types: 133 | default = "N" 134 | hint = "[y/N]" 135 | else: 136 | default = "Y" 137 | hint = "[Y/n]" 138 | result = prompt( 139 | "create {} scraper? {}: ".format(stype, hint), default 140 | ).upper() 141 | if result == "Y": 142 | selected_scraper_types.append(stype) 143 | 144 | write_jurisdiction_template( 145 | args.module, 146 | args.module, 147 | name, 148 | division, 149 | classification, 150 | url, 151 | selected_scraper_types, 152 | ) 153 | -------------------------------------------------------------------------------- /pupa/importers/vote_events.py: -------------------------------------------------------------------------------- 1 | from opencivicdata.legislative.models import ( 2 | VoteEvent, 3 | VoteCount, 4 | PersonVote, 5 | VoteSource, 6 | BillAction, 7 | ) 8 | from pupa.utils import get_pseudo_id, _make_pseudo_id 9 | from .base import BaseImporter 10 | from ..exceptions import InvalidVoteEventError 11 | 12 | 13 | class VoteEventImporter(BaseImporter): 14 | _type = "vote_event" 15 | model_class = VoteEvent 16 | related_models = { 17 | "counts": (VoteCount, "vote_event_id", {}), 18 | "votes": (PersonVote, "vote_event_id", {}), 19 | "sources": (VoteSource, "vote_event_id", {}), 20 | } 21 | 22 | def __init__(self, jurisdiction_id, person_importer, org_importer, bill_importer): 23 | 24 | super(VoteEventImporter, 
self).__init__(jurisdiction_id) 25 | self.person_importer = person_importer 26 | self.bill_importer = bill_importer 27 | self.org_importer = org_importer 28 | self.seen_bill_ids = set() 29 | self.seen_action_ids = set() 30 | self.vote_events_to_delete = set() 31 | 32 | def get_object(self, vote_event): 33 | spec = {"legislative_session_id": vote_event["legislative_session_id"]} 34 | 35 | if not vote_event["identifier"] and not vote_event["bill_id"]: 36 | raise InvalidVoteEventError( 37 | 'attempt to save a VoteEvent without an "identifier" or "bill_id"' 38 | ) 39 | 40 | if vote_event["bill_id"]: 41 | if vote_event["bill_id"] not in self.seen_bill_ids: 42 | self.seen_bill_ids.add(vote_event["bill_id"]) 43 | # keep a list of all the vote event ids that should be deleted 44 | self.vote_events_to_delete.update( 45 | self.model_class.objects.filter( 46 | bill_id=vote_event["bill_id"] 47 | ).values_list("id", flat=True) 48 | ) 49 | spec["bill_id"] = vote_event["bill_id"] 50 | 51 | if vote_event.get("pupa_id"): 52 | ve_id = self.lookup_obj_id(vote_event["pupa_id"], VoteEvent) 53 | if ve_id: 54 | spec = {"id": ve_id} 55 | else: 56 | return None 57 | elif vote_event["identifier"]: 58 | # if there's an identifier, just use it and the bill_id and the session 59 | spec["identifier"] = vote_event["identifier"] 60 | else: 61 | # otherwise use the motion, start_date, and org as well 62 | spec.update( 63 | { 64 | "motion_text": vote_event["motion_text"], 65 | "start_date": vote_event["start_date"], 66 | "organization_id": vote_event["organization_id"], 67 | } 68 | ) 69 | 70 | return self.model_class.objects.prefetch_related("votes__voter").get(**spec) 71 | 72 | def limit_spec(self, spec): 73 | spec["legislative_session__jurisdiction_id"] = self.jurisdiction_id 74 | return spec 75 | 76 | def prepare_for_db(self, data): 77 | data["legislative_session_id"] = self.get_session_id( 78 | data.pop("legislative_session") 79 | ) 80 | data["organization_id"] = 
self.org_importer.resolve_json_id( 81 | data.pop("organization") 82 | ) 83 | 84 | bill = data.pop("bill") 85 | if bill and bill.startswith("~"): 86 | # unpack psuedo id and apply filter in case there are any that alter it 87 | bill = get_pseudo_id(bill) 88 | self.bill_importer.apply_transformers(bill) 89 | bill = _make_pseudo_id(**bill) 90 | 91 | data["bill_id"] = self.bill_importer.resolve_json_id(bill) 92 | bill_action = data.pop("bill_action") 93 | if bill_action: 94 | try: 95 | action = BillAction.objects.get( 96 | bill_id=data["bill_id"], 97 | description=bill_action, 98 | date=data["start_date"], 99 | organization_id=data["organization_id"], 100 | ) 101 | # seen_action_ids is for ones being added in this import 102 | # action.vote is already set if action was set on prior import 103 | if action.id in self.seen_action_ids or hasattr(action, "vote"): 104 | self.warning( 105 | "can not match two VoteEvents to %s: %s", action.id, bill_action 106 | ) 107 | else: 108 | data["bill_action_id"] = action.id 109 | self.seen_action_ids.add(action.id) 110 | except BillAction.DoesNotExist: 111 | self.warning( 112 | "could not match VoteEvent to %s %s %s", 113 | bill, 114 | bill_action, 115 | data["start_date"], 116 | ) 117 | except BillAction.MultipleObjectsReturned as e: 118 | self.warning( 119 | "could not match VoteEvent to %s %s %s: %s", 120 | bill, 121 | bill_action, 122 | data["start_date"], 123 | e, 124 | ) 125 | 126 | for vote in data["votes"]: 127 | vote["voter_id"] = self.person_importer.resolve_json_id( 128 | vote["voter_id"], allow_no_match=True 129 | ) 130 | return data 131 | 132 | def postimport(self): 133 | # be sure not to delete vote events that were 134 | # imported (meaning updated) this time through 135 | self.vote_events_to_delete.difference_update(self.json_to_db_id.values()) 136 | # everything remaining, goodbye 137 | self.model_class.objects.filter(id__in=self.vote_events_to_delete).delete() 138 | 
-------------------------------------------------------------------------------- /pupa/scrape/event.py: -------------------------------------------------------------------------------- 1 | from ..utils import _make_pseudo_id 2 | from .base import BaseModel, SourceMixin, AssociatedLinkMixin, LinkMixin 3 | from .schemas.event import schema 4 | from pupa.exceptions import ScrapeValueError 5 | 6 | 7 | class EventAgendaItem(dict, AssociatedLinkMixin): 8 | event = None 9 | 10 | def __init__(self, description, event): 11 | super(EventAgendaItem, self).__init__( 12 | { 13 | "description": description, 14 | "classification": [], 15 | "related_entities": [], 16 | "subjects": [], 17 | "media": [], 18 | "notes": [], 19 | "order": str(len(event.agenda)), 20 | "extras": {}, 21 | } 22 | ) 23 | self.event = event 24 | 25 | def add_subject(self, what): 26 | self["subjects"].append(what) 27 | 28 | def add_classification(self, what): 29 | self["classification"].append(what) 30 | 31 | def add_vote_event(self, vote_event, *, id=None, note="consideration"): 32 | self.add_entity(name=vote_event, entity_type="vote_event", id=id, note=note) 33 | 34 | def add_committee(self, committee, *, id=None, note="participant"): 35 | self.add_entity(name=committee, entity_type="organization", id=id, note=note) 36 | 37 | def add_bill(self, bill, *, id=None, note="consideration"): 38 | self.add_entity(name=bill, entity_type="bill", id=id, note=note) 39 | 40 | def add_person(self, person, *, id=None, note="participant"): 41 | self.add_entity(name=person, entity_type="person", id=id, note=note) 42 | 43 | def add_media_link( 44 | self, note, url, media_type, *, text="", type="media", on_duplicate="error" 45 | ): 46 | return self._add_associated_link( 47 | collection="media", 48 | note=note, 49 | url=url, 50 | text=text, 51 | media_type=media_type, 52 | on_duplicate=on_duplicate, 53 | ) 54 | 55 | def add_entity(self, name, entity_type, *, id, note): 56 | ret = {"name": name, "entity_type": entity_type, 
"note": note} 57 | if id: 58 | ret["id"] = id 59 | elif entity_type: 60 | if entity_type in ("organization", "person"): 61 | id = _make_pseudo_id(name=name) 62 | elif entity_type in ("bill", "vote_event"): 63 | id = _make_pseudo_id(identifier=name) 64 | else: 65 | raise ScrapeValueError( 66 | "attempt to call add_entity with unsupported " 67 | "entity type: {}".format(entity_type) 68 | ) 69 | ret[entity_type + "_id"] = id 70 | 71 | self["related_entities"].append(ret) 72 | 73 | 74 | class Event(BaseModel, SourceMixin, AssociatedLinkMixin, LinkMixin): 75 | """ 76 | Details for an event in .format 77 | """ 78 | 79 | _type = "event" 80 | _schema = schema 81 | 82 | def __init__( 83 | self, 84 | name, 85 | start_date, 86 | *, 87 | location_name=None, 88 | all_day=False, 89 | description="", 90 | end_date="", 91 | status="confirmed", 92 | classification="event" 93 | ): 94 | super(Event, self).__init__() 95 | self.start_date = start_date 96 | self.all_day = all_day 97 | self.end_date = end_date 98 | self.name = name 99 | self.description = description 100 | self.status = status 101 | self.classification = classification 102 | if location_name: 103 | self.location = {"name": location_name, "note": "", "coordinates": None} 104 | else: 105 | self.location = None 106 | self.documents = [] 107 | self.participants = [] 108 | self.media = [] 109 | self.agenda = [] 110 | 111 | def __str__(self): 112 | return "{} {}".format(self.start_date, self.name.strip()) 113 | 114 | def set_location(self, name, *, note="", url="", coordinates=None): 115 | self.location = { 116 | "name": name, 117 | "note": note, 118 | "url": url, 119 | "coordinates": coordinates, 120 | } 121 | 122 | def add_participant(self, name, type, *, id=None, note="participant"): 123 | p = {"name": name, "entity_type": type, "note": note} 124 | if id: 125 | p["id"] = id 126 | elif type: 127 | id = _make_pseudo_id(name=name) 128 | p[type + "_id"] = id 129 | 130 | self.participants.append(p) 131 | 132 | def 
add_person(self, name, *, id=None, note="participant"): 133 | return self.add_participant(name=name, type="person", id=id, note=note) 134 | 135 | def add_committee(self, name, *, id=None, note="participant"): 136 | return self.add_participant(name=name, type="organization", id=id, note=note) 137 | 138 | def add_agenda_item(self, description): 139 | obj = EventAgendaItem(description, self) 140 | self.agenda.append(obj) 141 | return obj 142 | 143 | def add_media_link( 144 | self, 145 | note, 146 | url, 147 | media_type, 148 | *, 149 | text="", 150 | type="media", 151 | on_duplicate="error", 152 | date="" 153 | ): 154 | return self._add_associated_link( 155 | collection="media", 156 | note=note, 157 | url=url, 158 | text=text, 159 | media_type=media_type, 160 | on_duplicate=on_duplicate, 161 | date=date, 162 | ) 163 | 164 | def add_document( 165 | self, note, url, *, text="", media_type="", on_duplicate="error", date="" 166 | ): 167 | return self._add_associated_link( 168 | collection="documents", 169 | note=note, 170 | url=url, 171 | text=text, 172 | media_type=media_type, 173 | on_duplicate=on_duplicate, 174 | date=date, 175 | ) 176 | -------------------------------------------------------------------------------- /pupa/tests/clean/test_clean.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import argparse 3 | from datetime import datetime, timezone, timedelta 4 | from freezegun import freeze_time 5 | 6 | from opencivicdata.core.models import Person, Organization, Jurisdiction, Division, Post 7 | 8 | from pupa.cli.commands.clean import Command as CleanCommand 9 | 10 | 11 | @pytest.fixture 12 | def subparsers(): 13 | parser = argparse.ArgumentParser("pupa", description="pupa CLI") 14 | parser.add_argument("--debug", action="store_true", help="open debugger on error") 15 | parser.add_argument( 16 | "--loglevel", 17 | default="INFO", 18 | help=( 19 | "set log level. 
options are: " 20 | "DEBUG|INFO|WARNING|ERROR|CRITICAL " 21 | "(default is INFO)" 22 | ), 23 | ) 24 | return parser.add_subparsers(dest="subcommand") 25 | 26 | 27 | @pytest.fixture 28 | def division(): 29 | return Division.objects.create(id="ocd-division/country:us", name="USA") 30 | 31 | 32 | @pytest.fixture 33 | def jurisdiction(division): 34 | return Jurisdiction.objects.create(id="jid", division=division) 35 | 36 | 37 | @pytest.fixture 38 | def organization(jurisdiction): 39 | return Organization.objects.create(name="WWE", jurisdiction=jurisdiction) 40 | 41 | 42 | @pytest.fixture 43 | def post(organization): 44 | return Post.objects.create(organization=organization, label="Some post", role="Some post") 45 | 46 | 47 | @pytest.fixture 48 | def person(): 49 | class PersonFactory: 50 | def build(self, **kwargs): 51 | person_info = { 52 | "name": "George Washington", 53 | "family_name": "Washington", 54 | } 55 | 56 | person_info.update(kwargs) 57 | 58 | return Person.objects.create(**person_info) 59 | 60 | return PersonFactory() 61 | 62 | 63 | @pytest.mark.django_db 64 | def test_get_stale_objects(subparsers, division, jurisdiction, organization, post, person): 65 | stale_person = person.build() 66 | membership = stale_person.memberships.create(organization=organization) 67 | 68 | protected_objects = {division, jurisdiction, post} 69 | expected_stale_objects = {stale_person, organization, membership} 70 | 71 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 72 | with freeze_time(a_week_from_now): 73 | fresh_person = person.build(name="Thomas Jefferson", family_name="Jefferson") 74 | fresh_person.memberships.create(organization=organization) 75 | 76 | stale_objects = set(CleanCommand(subparsers).get_stale_objects(7)) 77 | assert stale_objects == expected_stale_objects 78 | 79 | # This is implied by the above check, but it's important, so we'll check 80 | # for it explicitly. 
81 | assert protected_objects not in stale_objects 82 | 83 | 84 | @pytest.mark.django_db 85 | def test_remove_stale_objects(subparsers, organization, person): 86 | stale_person = person.build() 87 | membership = stale_person.memberships.create(organization=organization) 88 | 89 | expected_stale_objects = {stale_person, organization, membership} 90 | 91 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 92 | with freeze_time(a_week_from_now): 93 | fresh_person = person.build(name="Thomas Jefferson", family_name="Jefferson") 94 | fresh_person.memberships.create(organization=organization) 95 | 96 | CleanCommand(subparsers).remove_stale_objects(7) 97 | for obj in expected_stale_objects: 98 | was_deleted = not type(obj).objects.filter(id=obj.id).exists() 99 | assert was_deleted 100 | 101 | 102 | @pytest.mark.django_db 103 | def test_clean_command(subparsers, organization, person): 104 | stale_person = person.build() 105 | stale_membership = stale_person.memberships.create(organization=organization) 106 | 107 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 108 | with freeze_time(a_week_from_now): 109 | fresh_person = person.build(name="Thomas Jefferson", family_name="Jefferson") 110 | not_stale_membership = fresh_person.memberships.create( 111 | organization=organization 112 | ) 113 | organization.save() # Update org's last_seen field 114 | 115 | # Call clean command 116 | CleanCommand(subparsers).handle( 117 | argparse.Namespace(report=False, window=7, yes=True, max=10), [] 118 | ) 119 | 120 | expected_stale_objects = {stale_person, stale_membership} 121 | for obj in expected_stale_objects: 122 | was_deleted = not type(obj).objects.filter(id=obj.id).exists() 123 | assert was_deleted 124 | 125 | expected_not_stale_objects = {organization, fresh_person, not_stale_membership} 126 | for obj in expected_not_stale_objects: 127 | was_not_deleted = type(obj).objects.filter(id=obj.id).exists() 128 | assert was_not_deleted 129 | 130 | 131 
| @pytest.mark.django_db 132 | def test_clean_command_failsafe(subparsers, organization, person): 133 | stale_people = [person.build() for i in range(20)] 134 | for p in stale_people: 135 | p.memberships.create(organization=organization) 136 | 137 | cmd = CleanCommand(subparsers) 138 | 139 | a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) 140 | with freeze_time(a_week_from_now): 141 | with pytest.raises(SystemExit): 142 | # Should trigger failsafe exist when deleting more than 10 objects 143 | cmd.handle( 144 | argparse.Namespace(report=False, window=7, yes=False, max=10), [] 145 | ) 146 | 147 | with pytest.raises(SystemExit): 148 | # Should trigger failsafe exist when deleting more than 10 objects, 149 | # even when yes is specified 150 | cmd.handle( 151 | argparse.Namespace(report=False, window=7, yes=True, max=10), [] 152 | ) 153 | 154 | # Should proceed without error, since max is increased (1 organization, 155 | # 20 people, 20 memberships) 156 | cmd.handle( 157 | argparse.Namespace(report=False, window=7, max=41, yes=True), [] 158 | ) 159 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # pupa changelog 2 | 3 | ## 0.11.0 - April 3 2023 4 | 5 | Improvements: 6 | 7 | * Add `pupa clean` command to delete database objects that haven't been seen in recent scrapes 8 | 9 | ## 0.10.2 - March 18 2021 10 | 11 | Improvements: 12 | 13 | * allow null event locations 14 | * resolve memberships based on start date, falling back to end date if no start date is available 15 | 16 | Fixes: 17 | 18 | * raise exception for whitespace in urls 19 | 20 | ## 0.10.1 - March 23 2020 21 | 22 | Improvements: 23 | 24 | * fire post-save hook on imports 25 | * check family_name when resolving names 26 | 27 | Fixes: 28 | 29 | * if scrapers are omitted on command line, use all scrapers again 30 | 31 | ## 0.10.0 - December 11 2019 32 | 33 | 
Improvements: 34 | 35 | * add data quality reports that update after each import (requires migration) 36 | * add flags to disable person, bill, vote, event import 37 | * let date be set on event media & document 38 | 39 | Fixes: 40 | 41 | * Fix warnings from obsolete usage of importlib & jsonschema 42 | * remove parties from tests 43 | * don't call check_session_list if running import only 44 | * add support for Post.maximum_memberships 45 | * add support for Person given & family names 46 | * stop testing on Postgres 9.x 47 | 48 | ## 0.9.1 - October 23 2018 49 | 50 | Fixes: 51 | 52 | * minor packaging fixes & dependency pinning tweaks 53 | 54 | 55 | ## 0.9.0 - February 14 2018 56 | 57 | Backwards-incompatible changes: 58 | 59 | * fix_bill_id is no longer called on bill identifiers 60 | 61 | Improvements: 62 | 63 | * django 2.0 compatibility fixes (on_delete on models) 64 | * require python-opencivicdata 2.1 fixes 65 | * drop validictory for jsonschema 66 | * add 'pupa party' command for atomic addition of parties, deprecate Jurisdiction.parties 67 | * add IMPORT_TRANSFORMERS setting allowing alterations of data on import 68 | 69 | Fixes: 70 | 71 | * bugfix for OrganizationImporter other_names 72 | * bugfix for VoteEvent bill resolution 73 | * bugfix for VoteEvent bill action resolution (#307) 74 | 75 | 76 | ## 0.8.0 - July 19 2017 77 | 78 | Backwards-incompatible changes: 79 | 80 | * role no longer defaults to 'member' and is now optional in Person constructor 81 | when used w/ primary_org. 
If primary_org alone is unambiguous, scrapers can set primary_org alone and role will be set automatically
* Accept an organization name in `Person.add_membership` for the second parameter #233 131 | * Accept `datetime` dates wherever string dates are accepted #218 132 | * Improve error reporting #214, #230, #231 133 | * Compatible with Django 1.10 134 | 135 | Fixes: 136 | 137 | * Allow people to hold multiple posts in an organization #244, #247 138 | * Add a `primary_org_name` parameter to `Person.add_term`, to disambiguate organizations with the same classification #223 139 | * Update an object if the explicit order of its related objects has changed #242 140 | * Touch an object's `updated_at` whenever its related objects are updated #226 141 | * Correctly resolve a new person with the same name #232 142 | * Don't raise a resolution error due to multiple matches in cases where zero matches are acceptable 143 | 144 | ## 0.5.2 - November 18 2015 145 | 146 | * show run logs in the admin 147 | * start tracking failed runs 148 | 149 | ## 0.5.1 - November 13 2015 150 | 151 | * use other\_names for psuedo\_id resolution on people 152 | * fix for postgis:// on Heroku 153 | * remove dump command that required imago 154 | * require py-ocd-django 0.8.0 models 155 | 156 | ## 0.5.0 - October 8 2015 157 | 158 | * fix major bug causing deadlock on party import 159 | * fix major bug where legislative\_session changes would wipe the database 160 | * update from Django 1.7 to Django 1.9 161 | * now uses Django's ArrayField, JSONField, etc. 
instead of external deps 162 | * also now requires Postgres 9.4 163 | * changes to be consistent with Popolo in naming of legislative\_session and vote\_event 164 | * some speedups on import by changing how we use bulk\_create 165 | * experimental Kafka support 166 | * actually use other\_names for person import 167 | * allow delayed resolution of people 168 | * respect locked\_fields during import 169 | * renamed make\_psuedo\_id() to discourage use 170 | * lots of other bugfixes 171 | 172 | ## 0.4.1 - August 13 2014 173 | 174 | * bugfix release for packaging issue w/ 0.4.0 175 | 176 | ## 0.4.0 - August 13 2014 177 | 178 | * near-complete rewrite from MongoDB to Postgres dependency 179 | 180 | ## 0.3.0 - March 27 2014 181 | 182 | * Initial PyPI release, MongoDB version heavily based on billy 183 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_topsort.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.utils.topsort import Network, CyclicGraphError 3 | 4 | 5 | def chash(cycles): 6 | """ 7 | Hash a cycle, useful for comparing sets of cycles. 8 | 9 | This checks the sorted set of each of the nodes in the cycle. This 10 | is *not* a perfect check, but it's useful so that we can create a set 11 | of these hashes, and check that they all match. 12 | 13 | It's not perfect, since D -> A -> B will be the same as B -> A -> D, 14 | but since this is only used in the testing logic, we can ensure 15 | that we handle it correctly in the testcases. 16 | 17 | (Implicit warning: Don't use this anywhere important.) 
18 | """ 19 | return {"".join(sorted(set(x))) for x in cycles} 20 | 21 | 22 | def test_sort_order_basic(): 23 | network = Network() 24 | network.add_node("A") 25 | network.add_node("B") 26 | network.add_node("C") 27 | 28 | network.add_edge("A", "B") 29 | network.add_edge("B", "C") 30 | 31 | assert (list(network.sort())) == ["A", "B", "C"] 32 | 33 | 34 | def test_sort_order_double(): 35 | network = Network() 36 | network.add_node("A") 37 | network.add_node("B") 38 | network.add_node("C") 39 | 40 | network.add_edge("A", "B") 41 | network.add_edge("A", "C") 42 | network.add_edge("C", "B") 43 | 44 | # A => B 45 | # / 46 | # A => C 47 | 48 | assert (list(network.sort())) == ["A", "C", "B"] 49 | 50 | 51 | def test_sort_order_staged(): 52 | network = Network() 53 | 54 | network.add_node("A1") 55 | network.add_node("A2") 56 | network.add_node("A3") 57 | 58 | network.add_edge("A1", "A2") 59 | network.add_edge("A1", "A3") 60 | network.add_edge("A2", "A3") 61 | 62 | network.add_node("B1") 63 | network.add_node("B2") 64 | network.add_node("B3") 65 | 66 | network.add_edge("B1", "B2") 67 | network.add_edge("B1", "B3") 68 | network.add_edge("B2", "B3") 69 | 70 | network.add_edge("B1", "A1") 71 | 72 | network.add_node("C1") 73 | network.add_node("C2") 74 | network.add_node("C3") 75 | 76 | network.add_edge("C1", "C2") 77 | network.add_edge("C1", "C3") 78 | network.add_edge("C2", "C3") 79 | 80 | network.add_edge("C1", "A1") 81 | network.add_edge("C1", "B1") 82 | 83 | network.add_edge("C1", "B1") 84 | network.add_edge("B1", "A1") 85 | network.add_edge("A1", "C2") 86 | network.add_edge("A1", "C3") 87 | 88 | # with open("/home/tag/debug.dot", 'w') as fd: 89 | # fd.write(network.dot()) 90 | 91 | sorted_order = list(network.sort()) 92 | 93 | assert sorted_order.pop(0) == "C1" 94 | assert sorted_order.pop(0) == "B1" 95 | assert sorted_order.pop(0) in ("A1", "B2") 96 | # ^^ This makes more sense after you dot debug it 97 | assert sorted_order.pop(0) in ("A1", "B2") 98 | 99 | 100 | def 
test_cyclic_graph_error_simple(): 101 | network = Network() 102 | network.add_node("A") 103 | network.add_node("B") 104 | network.add_edge("A", "B") 105 | network.add_edge("B", "A") 106 | 107 | with pytest.raises(CyclicGraphError): 108 | list(network.sort()) 109 | 110 | 111 | def test_cyclic_graph_error_indirect(): 112 | network = Network() 113 | network.add_node("A") 114 | network.add_node("B") 115 | network.add_node("C") 116 | 117 | network.add_edge("A", "B") 118 | network.add_edge("B", "C") 119 | network.add_edge("C", "A") 120 | 121 | with pytest.raises(CyclicGraphError): 122 | list(network.sort()) 123 | 124 | 125 | def test_cyclic_graph_error_massive(): 126 | network = Network() 127 | 128 | entries = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "A"] 129 | for i, e in enumerate(entries[:-1]): 130 | network.add_node(e) 131 | network.add_edge(e, entries[1 + i]) 132 | 133 | with pytest.raises(CyclicGraphError): 134 | list(network.sort()) 135 | 136 | 137 | def test_link_before_nodes(): 138 | network = Network() 139 | 140 | network.add_edge("A", "B") 141 | network.add_edge("B", "C") 142 | network.add_edge("C", "D") 143 | 144 | network.add_node("A") 145 | network.add_node("B") 146 | network.add_node("C") 147 | network.add_node("D") 148 | 149 | assert list(network.sort()) == ["A", "B", "C", "D"] 150 | 151 | 152 | def test_internal_node_removal(): 153 | network = Network() 154 | 155 | network.add_node("A") 156 | network.add_node("B") 157 | network.add_node("C") 158 | network.add_node("D") 159 | 160 | network.add_edge("A", "B") 161 | network.add_edge("B", "C") 162 | network.add_edge("C", "D") 163 | network.add_edge("A", "C") # Useful for ensuring the ending list 164 | # is deterministic. 165 | 166 | # Ensure that we can't remove an internal node without a ValueError 167 | # by default. 168 | with pytest.raises(ValueError): 169 | network.prune_node("B") 170 | 171 | # OK. Now that we know that works, let's prune it harder. 
172 | network.prune_node("B", remove_backrefs=True) 173 | 174 | # And make sure "B" is gone. 175 | assert list(network.sort()) == ["A", "C", "D"] 176 | 177 | 178 | def test_dot_debug(): 179 | network = Network() 180 | 181 | network.add_node("A") 182 | network.add_node("B") 183 | network.add_edge("A", "B") 184 | 185 | dot = network.dot() 186 | assert dot == "digraph graphname {A -> B;}" 187 | 188 | 189 | def test_cycles_simple(): 190 | network = Network() 191 | network.add_node("A") 192 | network.add_node("B") 193 | network.add_edge("A", "B") 194 | network.add_edge("B", "A") 195 | assert chash(network.cycles()) == chash([("A", "B", "A")]) 196 | 197 | 198 | def test_cycles_complex(): 199 | network = Network() 200 | network.add_node("A") 201 | network.add_node("B") 202 | network.add_node("C") 203 | network.add_node("D") 204 | 205 | network.add_edge("A", "B") 206 | network.add_edge("B", "C") 207 | network.add_edge("C", "D") 208 | network.add_edge("D", "A") 209 | 210 | network.add_edge("D", "C") 211 | network.add_edge("C", "B") 212 | network.add_edge("B", "D") 213 | 214 | # with open("/home/tag/debug.dot", 'w') as fd: 215 | # fd.write(network.dot()) 216 | 217 | assert chash(network.cycles()) == chash( 218 | [("B", "C", "B"), ("C", "D", "C"), ("A", "B", "D", "A")] 219 | ) 220 | -------------------------------------------------------------------------------- /pupa/scrape/schemas/event.py: -------------------------------------------------------------------------------- 1 | """ 2 | Schema for event objects. 
3 | """ 4 | 5 | from .common import ( 6 | sources, 7 | extras, 8 | fuzzy_date_blank, 9 | fuzzy_datetime, 10 | fuzzy_datetime_blank, 11 | ) 12 | 13 | media_schema = { 14 | "items": { 15 | "properties": { 16 | "name": {"type": "string", "minLength": 1}, 17 | "type": {"type": "string", "minLength": 1}, 18 | "date": fuzzy_date_blank, 19 | "offset": {"type": ["number", "null"]}, 20 | "links": { 21 | "items": { 22 | "properties": { 23 | "media_type": {"type": "string"}, 24 | "url": {"type": "string", "format": "uri"}, 25 | }, 26 | "type": "object", 27 | }, 28 | "type": "array", 29 | }, 30 | }, 31 | "type": "object", 32 | }, 33 | "type": "array", 34 | } 35 | 36 | schema = { 37 | "properties": { 38 | "name": {"type": "string", "minLength": 1}, 39 | "all_day": {"type": "boolean"}, 40 | "start_date": fuzzy_datetime, 41 | "end_date": fuzzy_datetime_blank, 42 | "status": { 43 | "type": "string", 44 | "enum": ["cancelled", "tentative", "confirmed", "passed"], 45 | }, 46 | "classification": {"type": "string", "minLength": 1}, # TODO: enum 47 | "description": {"type": "string"}, 48 | "location": { 49 | "type": ["object", "null"], 50 | "properties": { 51 | "name": {"type": "string", "minLength": 1}, 52 | "note": { 53 | "type": "string", 54 | }, 55 | "url": { 56 | "type": ["string", "null"], 57 | "format": "uri", 58 | }, 59 | "coordinates": { 60 | "type": ["object", "null"], 61 | "properties": { 62 | "latitude": { 63 | "type": "string", 64 | "minLength": 1, 65 | }, 66 | "longitude": { 67 | "type": "string", 68 | "minLength": 1, 69 | }, 70 | }, 71 | }, 72 | }, 73 | }, 74 | "media": media_schema, 75 | "documents": { 76 | "items": { 77 | "properties": { 78 | "note": {"type": "string", "minLength": 1}, 79 | "url": {"type": "string", "minLength": 1}, 80 | "media_type": {"type": "string", "minLength": 1}, 81 | "date": fuzzy_date_blank, 82 | }, 83 | "type": "object", 84 | }, 85 | "type": "array", 86 | }, 87 | "links": { 88 | "items": { 89 | "properties": { 90 | "note": { 91 | "type": 
"string", 92 | }, 93 | "url": {"format": "uri", "type": "string"}, 94 | }, 95 | "type": "object", 96 | }, 97 | "type": "array", 98 | }, 99 | "participants": { 100 | "items": { 101 | "properties": { 102 | "name": { 103 | "type": "string", 104 | "minLength": 1, 105 | }, 106 | "type": { 107 | "enum": ["organization", "person"], 108 | "type": "string", 109 | }, 110 | "note": { 111 | "type": "string", 112 | "minLength": 1, 113 | }, 114 | }, 115 | "type": "object", 116 | }, 117 | "type": "array", 118 | }, 119 | "agenda": { 120 | "items": { 121 | "properties": { 122 | "description": {"type": "string", "minLength": 1}, 123 | "classification": { 124 | "items": {"type": "string", "minLength": 1}, 125 | "type": "array", 126 | }, 127 | "order": { 128 | "type": ["string", "null"], 129 | }, 130 | "subjects": { 131 | "items": {"type": "string", "minLength": 1}, 132 | "type": "array", 133 | }, 134 | "media": media_schema, 135 | "notes": { 136 | "items": { 137 | "type": "string", 138 | "minLength": 1, 139 | }, 140 | "type": "array", 141 | }, 142 | "related_entities": { 143 | "items": { 144 | "properties": { 145 | "entity_type": { 146 | "type": "string", 147 | "minLength": 1, 148 | }, 149 | "name": { 150 | "type": "string", 151 | "minLength": 1, 152 | }, 153 | "note": { 154 | "type": [ 155 | "string", 156 | "null", 157 | ], 158 | "minLength": 1, 159 | }, 160 | }, 161 | "type": "object", 162 | }, 163 | "minItems": 0, 164 | "type": "array", 165 | }, 166 | }, 167 | "type": "object", 168 | }, 169 | "minItems": 0, 170 | "type": "array", 171 | }, 172 | "sources": sources, 173 | "extras": extras, 174 | "pupa_id": { 175 | "type": ["string", "null"], 176 | "minLength": 1, 177 | }, 178 | }, 179 | "type": "object", 180 | } 181 | -------------------------------------------------------------------------------- /pupa/scrape/vote_event.py: -------------------------------------------------------------------------------- 1 | from ..utils import _make_pseudo_id 2 | from .base import BaseModel, 
cleanup_list, SourceMixin
from .bill import Bill
from .popolo import pseudo_organization
from .schemas.vote_event import schema
from pupa.exceptions import ScrapeValueError
import re


class VoteEvent(BaseModel, SourceMixin):
    """A scraped record of a single vote taken by a legislative body."""

    _type = "vote_event"
    _schema = schema

    def __init__(
        self,
        *,
        motion_text,
        start_date,
        classification,
        result,
        legislative_session=None,
        identifier="",
        bill=None,
        bill_chamber=None,
        bill_action=None,
        organization=None,
        chamber=None
    ):
        # NOTE: either legislative_session or bill must be provided;
        # a ScrapeValueError is raised below when neither is.
        super(VoteEvent, self).__init__()

        self.legislative_session = legislative_session
        self.motion_text = motion_text
        self.motion_classification = cleanup_list(classification, [])
        self.start_date = start_date
        self.result = result
        self.identifier = identifier
        self.bill_action = bill_action

        self.set_bill(bill, chamber=bill_chamber)

        # Fall back to the bill's session when one wasn't given explicitly.
        if isinstance(bill, Bill) and not self.legislative_session:
            self.legislative_session = bill.legislative_session

        if not self.legislative_session:
            raise ScrapeValueError("must set legislative_session or bill")

        self.organization = pseudo_organization(organization, chamber, "legislature")
        self.votes = []
        self.counts = []

    def __str__(self):
        return "{0} - {1} - {2}".format(
            self.legislative_session, self.start_date, self.motion_text
        )

    def set_bill(self, bill_or_identifier, *, chamber=None):
        """Associate this vote with a bill.

        Accepts either a `Bill` object (in which case `chamber` must not be
        passed) or a bill identifier string, which is resolved at import time
        via a pseudo-id.
        """
        if not bill_or_identifier:
            self.bill = None
        elif isinstance(bill_or_identifier, Bill):
            if chamber:
                raise ScrapeValueError(
                    "set_bill takes no arguments when using a `Bill` object"
                )
            self.bill = bill_or_identifier._id
        else:
            if chamber is None:
                chamber = "legislature"
            kwargs = {
                "identifier": bill_or_identifier,
                "from_organization__classification": chamber,
                "legislative_session__identifier": self.legislative_session,
            }
            self.bill = _make_pseudo_id(**kwargs)

    def vote(self, option, voter, *, note=""):
        """Record an individual voter's choice (e.g. "yes"/"no"/"other")."""
        self.votes.append(
            {
                "option": option,
                "voter_name": voter,
                # voter is resolved to a person at import time via pseudo-id
                "voter_id": _make_pseudo_id(name=voter),
                "note": note,
            }
        )

    def yes(self, name, *, id=None, note=""):
        """Shorthand for a "yes" vote (`id` is accepted but unused)."""
        return self.vote("yes", name, note=note)

    def no(self, name, *, id=None, note=""):
        """Shorthand for a "no" vote (`id` is accepted but unused)."""
        return self.vote("no", name, note=note)

    def set_count(self, option, value):
        """Set the tally for `option`, replacing any existing count."""
        for co in self.counts:
            if co["option"] == option:
                co["value"] = value
                break
        else:
            self.counts.append({"option": option, "value": value})


class OrderVoteEvent:
    """A functor for applying order to voteEvents.
    A single OrderVoteEvent instance should be used for all bills in a scrape.
    The vote events of each bill must be processed in chronological order,
    but the processing of bills may be interleaved (needed in e.g. NH).
    Currently, it only fudges midnight dates (start_date and end_date)
    by adding the event sequence number in seconds
    to the start_date and end_date (if they are well-formed string dates)
    In the future, when there is an 'order' field on voteEvents,
    it should fill that as well.
    This fails softly and silently;
    if a valid string date is not found in start_date or end_date,
    the date is not touched.
    This assumes that times are reported as local time, not UTC.
    A UTC time that is local midnight will not be touched.
    Sometimes one chamber reports the time of a vote,
    but the other chamber reports only the date. This is handled.
    See the unit tests for examples and more behavior.
118 | """ 119 | 120 | _midnight = r"\d\d\d\d-\d\d-\d\dT00:00:00.*" 121 | _timeless = r"\d\d\d\d-\d\d-\d\d" 122 | 123 | class OrderBillVoteEvent: 124 | """Order VoteEvents for a single bill""" 125 | 126 | def __init__(self): 127 | self.order = 0 # voteEvent sequence number. 1st voteEvent is 1. 128 | 129 | def __call__(self, voteEvent): 130 | 131 | self.order += 1 132 | voteEvent.start_date = self._adjust_date(voteEvent.start_date) 133 | if hasattr(voteEvent, "end_date"): 134 | voteEvent.end_date = self._adjust_date(voteEvent.end_date) 135 | 136 | def _adjust_date(self, date): 137 | 138 | if not isinstance(date, str): 139 | return date 140 | 141 | if re.fullmatch(OrderVoteEvent._timeless, date): 142 | d2 = date + "T00:00:00" 143 | elif re.fullmatch(OrderVoteEvent._midnight, date): 144 | d2 = date 145 | else: 146 | return date 147 | 148 | assert self.order <= 60 * 60 149 | mins = "{:02d}".format(self.order // 60) 150 | secs = "{:02d}".format(self.order % 60) 151 | 152 | # yyyy-mm-ddThh:mm:dd+05:00 153 | # 0123456789012345678 154 | return d2[:14] + mins + ":" + secs + d2[19:] 155 | 156 | def __init__(self): 157 | self.orderers = {} 158 | 159 | def __call__(self, session_id, bill_id, voteEvent): 160 | """ 161 | Record order of voteEvent within bill. 162 | 163 | The "order" field is not yet implemented; this fudges voteEvent 164 | start_date and end_date. 165 | See OrderVoteEvent docstring for details. 166 | 167 | :param session_id: session id 168 | :param bill_id: an identifier for the vote's bill 169 | that is at least unique within the session. 
170 | :param voteEvent: 171 | :return: None 172 | """ 173 | bill_orderer = self.orderers.get((session_id, bill_id)) 174 | 175 | if not bill_orderer: 176 | bill_orderer = self.OrderBillVoteEvent() 177 | self.orderers[(session_id, bill_id)] = bill_orderer 178 | 179 | bill_orderer(voteEvent) 180 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_people_org_scrape.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import pytest 3 | from pupa.scrape import Person, Organization, Membership, Post 4 | from pupa.utils import get_pseudo_id 5 | from pupa.exceptions import ScrapeValueError 6 | 7 | 8 | def test_basic_post(): 9 | post = Post(label="1", role="Representative", organization_id="fake_org") 10 | assert "1" in str(post) 11 | post.validate() 12 | 13 | 14 | def test_basic_invalid_post(): 15 | post = Post(label=1, role="Representative", organization_id="fake_org") 16 | with pytest.raises(ValueError): 17 | post.validate() 18 | 19 | 20 | def test_basic_membership(): 21 | m = Membership(person_id="person", organization_id="org") 22 | assert "person" in str(m) and "org" in str(m) 23 | 24 | 25 | def test_basic_invalid_membership(): 26 | membership = Membership(person_id=33, organization_id="orga_id") 27 | with pytest.raises(ValueError): 28 | membership.validate() 29 | 30 | 31 | def test_basic_invalid_person(): 32 | bob = Person("Bob B. Johnson") 33 | bob.add_source(url="http://example.com") 34 | bob.validate() 35 | 36 | bob.name = None 37 | 38 | with pytest.raises(ScrapeValueError): 39 | bob.validate() 40 | 41 | 42 | def test_basic_person(): 43 | p = Person("Bob B. Bear") 44 | p.add_source("http://example.com") 45 | assert p.name in str(p) 46 | p.validate() 47 | 48 | 49 | def test_person_add_membership_org(): 50 | p = Person("Bob B. 
Bear") 51 | p.add_source("http://example.com") 52 | o = Organization("test org", classification="unknown") 53 | p.add_membership( 54 | o, role="member", start_date="2007", end_date=datetime.date(2015, 5, 8) 55 | ) 56 | assert len(p._related) == 1 57 | p._related[0].validate() 58 | assert p._related[0].person_id == p._id 59 | assert p._related[0].organization_id == o._id 60 | assert p._related[0].start_date == "2007" 61 | assert p._related[0].end_date == datetime.date(2015, 5, 8) 62 | 63 | 64 | def test_basic_organization(): 65 | org = Organization("some org", classification="committee") 66 | org.add_source("http://example.com") 67 | assert org.name in str(org) 68 | org.validate() 69 | 70 | 71 | def test_no_source_on_party_org(): 72 | org = Organization("Hat", classification="party") 73 | # no source? no problem because classification = party 74 | org.validate() 75 | 76 | 77 | def test_basic_invalid_organization(): 78 | orga = Organization("name") 79 | 80 | # no source 81 | with pytest.raises(ScrapeValueError): 82 | orga.validate() 83 | 84 | 85 | def test_org_add_post(): 86 | """Test that we can hack posts in on the fly'""" 87 | orga = Organization("name", classification="committee") 88 | orga.add_source(url="http://example.com") 89 | orga.validate() 90 | 91 | orga.add_post("Human Readable Name", "Chef") 92 | 93 | assert orga._related[0].role == "Chef" 94 | assert orga._related[0].label == "Human Readable Name" 95 | 96 | 97 | def test_legislator_related_district(): 98 | leg = Person("John Adams", district="1", primary_org="legislature") 99 | leg.pre_save("jurisdiction-id") 100 | 101 | assert len(leg._related) == 1 102 | assert leg._related[0].person_id == leg._id 103 | assert get_pseudo_id(leg._related[0].organization_id) == { 104 | "classification": "legislature" 105 | } 106 | assert get_pseudo_id(leg._related[0].post_id) == { 107 | "organization__classification": "legislature", 108 | "label": "1", 109 | } 110 | 111 | 112 | def 
test_legislator_related_chamber_district(): 113 | leg = Person("John Adams", district="1", primary_org="upper") 114 | leg.pre_save("jurisdiction-id") 115 | 116 | assert len(leg._related) == 1 117 | assert leg._related[0].person_id == leg._id 118 | assert get_pseudo_id(leg._related[0].organization_id) == {"classification": "upper"} 119 | assert get_pseudo_id(leg._related[0].post_id) == { 120 | "organization__classification": "upper", 121 | "label": "1", 122 | } 123 | 124 | 125 | def test_legislator_related_chamber_district_role(): 126 | leg = Person("John Adams", district="1", primary_org="lower", role="Speaker") 127 | leg.pre_save("jurisdiction-id") 128 | 129 | assert len(leg._related) == 1 130 | assert leg._related[0].person_id == leg._id 131 | assert get_pseudo_id(leg._related[0].organization_id) == {"classification": "lower"} 132 | assert get_pseudo_id(leg._related[0].post_id) == { 133 | "organization__classification": "lower", 134 | "label": "1", 135 | "role": "Speaker", 136 | } 137 | assert leg._related[0].role == "Speaker" 138 | 139 | 140 | def test_legislator_related_party(): 141 | leg = Person("John Adams", party="Democratic-Republican") 142 | leg.pre_save("jurisdiction-id") 143 | 144 | # a party membership 145 | assert len(leg._related) == 1 146 | assert leg._related[0].person_id == leg._id 147 | assert get_pseudo_id(leg._related[0].organization_id) == { 148 | "classification": "party", 149 | "name": "Democratic-Republican", 150 | } 151 | assert leg._related[0].role == "member" 152 | 153 | 154 | def test_committee_add_member_person(): 155 | c = Organization("Defense", classification="committee") 156 | p = Person("John Adams") 157 | c.add_member(p, role="chairman") 158 | assert c._related[0].person_id == p._id 159 | assert c._related[0].organization_id == c._id 160 | assert c._related[0].role == "chairman" 161 | 162 | 163 | def test_committee_add_member_name(): 164 | c = Organization("Defense", classification="committee") 165 | c.add_member("John Adams") 
166 | assert get_pseudo_id(c._related[0].person_id) == {"name": "John Adams"} 167 | assert c._related[0].organization_id == c._id 168 | assert c._related[0].role == "member" 169 | 170 | 171 | def test_person_add_membership_name(): 172 | p = Person("Leonardo DiCaprio") 173 | p.add_membership( 174 | "Academy of Motion Picture Arts and Sciences", role="winner", start_date="2016" 175 | ) 176 | p._related[0].validate() 177 | assert get_pseudo_id(p._related[0].organization_id) == { 178 | "name": "Academy of Motion Picture Arts and Sciences" 179 | } 180 | assert p._related[0].person_id == p._id 181 | assert p._related[0].role == "winner" 182 | assert p._related[0].start_date == "2016" 183 | 184 | 185 | def test_person_add_party(): 186 | p = Person("Groot") 187 | p.add_party("Green") 188 | p._related[0].validate() 189 | assert get_pseudo_id(p._related[0].organization_id) == { 190 | "name": "Green", 191 | "classification": "party", 192 | } 193 | 194 | 195 | def test_person_add_term(): 196 | p = Person("Eternal") 197 | p.add_term("eternal", "council", start_date="0001", end_date="9999") 198 | p._related[0].validate() 199 | assert get_pseudo_id(p._related[0].organization_id) == { 200 | "classification": "council", 201 | } 202 | assert p._related[0].start_date == "0001" 203 | assert p._related[0].end_date == "9999" 204 | -------------------------------------------------------------------------------- /pupa/tests/scrape/test_bill_scrape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pupa.scrape import Bill 3 | from pupa.utils.generic import get_pseudo_id 4 | from pupa.exceptions import ScrapeValueError 5 | 6 | 7 | def toy_bill(): 8 | b = Bill( 9 | identifier="HB 2017", 10 | legislative_session="2012A", 11 | title="A bill for an act to raise the cookie budget by 200%", 12 | from_organization="Foo Senate", 13 | classification="bill", 14 | ) 15 | b.add_source("http://uri.example.com/", note="foo") 16 | return b 17 | 
18 | 19 | def test_basic_valid_bill(): 20 | b = toy_bill() 21 | b.validate() 22 | assert "we got here" 23 | 24 | 25 | def test_bill_type_setting(): 26 | # default 27 | b = Bill(identifier="some bill", legislative_session="session", title="the title") 28 | assert b.classification == ["bill"] 29 | 30 | # string -> list 31 | b = Bill( 32 | identifier="some bill", 33 | legislative_session="session", 34 | title="the title", 35 | classification="string", 36 | ) 37 | assert b.classification == ["string"] 38 | 39 | # list unmodified 40 | b = Bill( 41 | identifier="some bill", 42 | legislative_session="session", 43 | title="the title", 44 | classification=["two", "items"], 45 | ) 46 | assert b.classification == ["two", "items"] 47 | 48 | # tuple -> list 49 | b = Bill( 50 | identifier="some bill", 51 | legislative_session="session", 52 | title="the title", 53 | classification=("two", "items"), 54 | ) 55 | assert b.classification == ["two", "items"] 56 | 57 | 58 | def test_basic_invalid_bill(): 59 | """Test that we can create an invalid bill, and validation will fail""" 60 | b = toy_bill() 61 | b.identifier = None 62 | with pytest.raises(ValueError): 63 | b.validate() 64 | 65 | 66 | def test_from_organization(): 67 | # none set 68 | assert get_pseudo_id(Bill("HB 1", "2014", "Some Bill").from_organization) == { 69 | "classification": "legislature" 70 | } 71 | 72 | # chamber set 73 | assert get_pseudo_id( 74 | Bill("SB 1", "2014", "Some Bill", chamber="upper").from_organization 75 | ) == {"classification": "upper"} 76 | # org direct set 77 | assert ( 78 | Bill("HB 1", "2014", "Some Bill", from_organization="test").from_organization 79 | == "test" 80 | ) 81 | 82 | # can't set both 83 | with pytest.raises(ValueError): 84 | Bill("HB 1", "2014", "Some Bill", from_organization="upper", chamber="upper") 85 | 86 | 87 | def test_add_action(): 88 | """Make sure actions work""" 89 | b = toy_bill() 90 | b.add_action("Some dude liked it.", "2013-04-29T20:00Z", chamber="lower") 91 | assert 
len(b.actions) == 1 92 | assert b.actions[0]["description"] == "Some dude liked it." 93 | assert get_pseudo_id(b.actions[0]["organization_id"]) == {"classification": "lower"} 94 | assert b.actions[0]["date"] == "2013-04-29T20:00Z" 95 | b.validate() 96 | 97 | 98 | def test_action_extra(): 99 | b = toy_bill() 100 | b.add_action( 101 | "an action with some extra information", 102 | "2017-01-01", 103 | extras=dict(sitting_chair="Adams"), 104 | ) 105 | assert b.actions[0]["extras"] == {"sitting_chair": "Adams"} 106 | 107 | 108 | def test_add_related_bill(): 109 | """Make sure related bills work""" 110 | b = toy_bill() 111 | b.add_related_bill( 112 | identifier="HB 2020", legislative_session="2011A", relation_type="companion" 113 | ) 114 | assert len(b.related_bills) == 1 115 | assert b.related_bills[0] == { 116 | "identifier": "HB 2020", 117 | "legislative_session": "2011A", 118 | "relation_type": "companion", 119 | } 120 | b.validate() 121 | 122 | 123 | def test_add_sponsor(): 124 | b = toy_bill() 125 | b.add_sponsorship( 126 | name="Joe Bleu", 127 | classification="Author", 128 | entity_type="person", 129 | primary=True, 130 | chamber="upper", 131 | ) 132 | assert len(b.sponsorships) == 1 133 | assert b.sponsorships[0] == { 134 | "person_id": '~{"name": "Joe Bleu"}', 135 | "name": "Joe Bleu", 136 | "classification": "Author", 137 | "entity_type": "person", 138 | "primary": True, 139 | "organization_id": None, 140 | } 141 | b.validate() 142 | 143 | 144 | def test_subjects(): 145 | b = toy_bill() 146 | b.add_subject("Foo") 147 | b.add_subject("Bar") 148 | assert b.subject == ["Foo", "Bar"] 149 | b.validate() 150 | 151 | 152 | def test_abstract(): 153 | b = toy_bill() 154 | b.add_abstract("this bill is stupid", "K-5", "1969-10-20") 155 | b.add_abstract("this legislative document is ignorant", "6-12", "2010-10-10") 156 | assert b.abstracts == [ 157 | {"note": "K-5", "abstract": "this bill is stupid", "date": "1969-10-20"}, 158 | { 159 | "note": "6-12", 160 | "abstract": 
"this legislative document is ignorant", 161 | "date": "2010-10-10", 162 | }, 163 | ] 164 | 165 | 166 | def test_add_documents(): 167 | b = toy_bill() 168 | 169 | # should only add one document since they all have same note 170 | b.add_document_link( 171 | note="Fiscal Impact", 172 | date="2013-04", 173 | url="http://hi.example.com/foo#bar", 174 | media_type="text/html", 175 | ) 176 | b.add_document_link(note="Fiscal Impact", date="2013-04", url="http://foobar.baz") 177 | assert len(b.documents) == 1 178 | 179 | # should now be two documents 180 | b.add_document_link( 181 | note="Other Document", date="2013-04", url="http://foobar.baz/other" 182 | ) 183 | assert len(b.documents) == 2 184 | 185 | # valid documents so far 186 | b.validate() 187 | 188 | # an invalid document 189 | b.add_document_link( 190 | note="Fiscal Impact", date="2013-04", url=None, media_type="foo" 191 | ) 192 | with pytest.raises(ScrapeValueError): 193 | b.validate() 194 | 195 | 196 | def test_versions(): 197 | b = toy_bill() 198 | 199 | # only one document, multiple links 200 | b.add_version_link(url="http://pault.ag/", note="Final Version", date="2013-04") 201 | b.add_version_link(url="http://pault.ag/foo", note="Final Version", date="2013-04") 202 | b.validate() 203 | assert len(b.versions) == 1 204 | assert len(b.versions[0]["links"]) == 2 205 | 206 | # duplicate! 
207 | with pytest.raises(ValueError): 208 | b.add_version_link( 209 | url="http://pault.ag/foo", note="Final Version", date="2013-04" 210 | ) 211 | 212 | # ignore duplicate - nothing should change 213 | b.add_version_link( 214 | url="http://pault.ag/foo", 215 | note="Final Version", 216 | date="2013-04", 217 | on_duplicate="ignore", 218 | ) 219 | assert len(b.versions) == 1 220 | assert len(b.versions[0]["links"]) == 2 221 | 222 | # duplicate URL 223 | with pytest.raises(ValueError): 224 | b.add_version_link( 225 | url="http://pault.ag/foo", note="Finals Versions", date="2013-04" 226 | ) 227 | assert len(b.versions) == 1 228 | assert len(b.versions[0]["links"]) == 2 229 | 230 | # a new doc, numbers go up 231 | b.add_version_link( 232 | url="http://pault.ag/foovbar", note="Finals Versions", date="2013-04" 233 | ) 234 | assert len(b.versions) == 2 235 | assert len(b.versions[1]["links"]) == 1 236 | 237 | # still validates 238 | b.validate() 239 | 240 | 241 | def test_str(): 242 | b = toy_bill() 243 | assert b.identifier in str(b) 244 | 245 | 246 | def test_no_whitespace_in_uri(): 247 | b = Bill( 248 | identifier="HB 2017", 249 | legislative_session="2012A", 250 | title="A bill for an act to raise the cookie budget by 200%", 251 | from_organization="Foo Senate", 252 | classification="bill", 253 | ) 254 | b.add_source("http://uri.example.com/fail here", note="foo") 255 | with pytest.raises(ScrapeValueError): 256 | b.validate() 257 | -------------------------------------------------------------------------------- /pupa/tests/importers/test_base_importer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | import tempfile 5 | import mock 6 | import pytest 7 | from opencivicdata.core.models import Person, Organization, Jurisdiction, Division 8 | from pupa.scrape import Person as ScrapePerson 9 | from pupa.scrape import Organization as ScrapeOrganization 10 | from pupa.importers.base import 
omnihash, BaseImporter
from pupa.importers import PersonImporter, OrganizationImporter
from pupa.exceptions import UnresolvedIdError, DataImportError


def create_jurisdiction():
    """Create the Division + Jurisdiction rows the importers expect."""
    Division.objects.create(id="ocd-division/country:us", name="USA")
    Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us")


class FakeImporter(BaseImporter):
    """Minimal concrete importer used to exercise BaseImporter plumbing."""

    _type = "test"


def test_omnihash_python_types():
    # string
    assert omnihash("test") == omnihash("test")
    # list
    assert omnihash(["this", "is", "a", "list"]) == omnihash(
        ["this", "is", "a", "list"]
    )
    # set
    assert omnihash({"and", "a", "set"}) == omnihash({"set", "set", "and", "a"})
    # dict w/ set and tuple as well
    assert omnihash({"a": {("fancy", "nested"): {"dict"}}}) == omnihash(
        {"a": {("fancy", "nested"): {"dict"}}}
    )


def test_import_directory():
    # write out some temp data to filesystem
    datadir = tempfile.mkdtemp()
    dicta = {"test": "A"}
    dictb = {"test": "B"}
    # use context managers so the files are closed (and flushed) before
    # import_directory reads them; the old open(...).write(...) leaked the
    # handles and relied on refcounting to flush
    with open(os.path.join(datadir, "test_a.json"), "w") as f:
        json.dump(dicta, f)
    with open(os.path.join(datadir, "test_b.json"), "w") as f:
        json.dump(dictb, f)

    # simply ensure that import directory calls import_data with all dicts
    ti = FakeImporter("jurisdiction-id")
    with mock.patch.object(ti, attribute="import_data") as mockobj:
        ti.import_directory(datadir)

    # import_data should be called once
    assert mockobj.call_count == 1
    # kind of hacky, get the total list of args passed in
    arg_objs = list(mockobj.call_args[0][0])

    # 2 args only, make sure a and b are in there
    assert len(arg_objs) == 2
    assert dicta in arg_objs
    assert dictb in arg_objs

    # clean up datadir
    shutil.rmtree(datadir)


def test_apply_transformers():
    transformers = {
        "capitalize": lambda x: x.upper(),
        "cap_and_reverse": [lambda x: x.upper(), lambda y: y[::-1]],
        "never_used": lambda x: 1 / 0,
        "nested": {"replace": lambda x: "replaced"},
    }
    data = {
        "capitalize": "words",
        "cap_and_reverse": "simple",
        "nested": {"replace": None},
    }
    ti = FakeImporter("jid")
    ti.cached_transformers = transformers
    output = ti.apply_transformers(data)
    assert output["capitalize"] == "WORDS"
    assert output["cap_and_reverse"] == "ELPMIS"
    assert output["nested"]["replace"] == "replaced"


# doing these next few tests just on a Person because it is the same
# code that handles it but for completeness maybe it is better to do
# these on each type?


@pytest.mark.django_db
def test_last_seen_updates_on_scrape():
    create_jurisdiction()
    o = Organization.objects.create(name="WWE", jurisdiction_id="jid")

    p = Person.objects.create(name="George Washington", family_name="Washington")
    p.memberships.create(organization=o)

    expected_updated_at = p.updated_at
    last_seen_before_scrape = p.last_seen

    # Simulate no-op scrape
    scraped_p = ScrapePerson("George Washington").as_dict()
    PersonImporter("jid").import_data([scraped_p])

    p.refresh_from_db()

    assert p.updated_at < p.last_seen, "Should refresh last_seen but not updated_at"
    assert (
        p.updated_at == expected_updated_at
    ), "Should not refresh updated_at when there's no update"

    assert (
        p.last_seen > last_seen_before_scrape
    ), "Should refresh last_seen even when there's no update"


@pytest.mark.django_db
def test_deduplication_identical_object():
    p1 = ScrapePerson("Dwayne").as_dict()
    p2 = ScrapePerson("Dwayne").as_dict()
    PersonImporter("jid").import_data([p1, p2])

    assert Person.objects.count() == 1


@pytest.mark.django_db
def test_exception_on_identical_objects_in_import_stream():
    create_jurisdiction()
    # these two objects aren't identical, but refer to the same thing
    # at the moment we consider this an error (but there may be a better
    # way to handle this?)
    o1 = ScrapeOrganization("X-Men", classification="unknown").as_dict()
    o2 = ScrapeOrganization(
        "X-Men", founding_date="1970", classification="unknown"
    ).as_dict()

    with pytest.raises(Exception):
        OrganizationImporter("jid").import_data([o1, o2])


@pytest.mark.django_db
def test_resolve_json_id():
    p1 = ScrapePerson("Dwayne").as_dict()
    p2 = ScrapePerson("Dwayne").as_dict()
    pi = PersonImporter("jid")

    # do import and get database id
    p1_id = p1["_id"]
    p2_id = p2["_id"]
    pi.import_data([p1, p2])
    db_id = Person.objects.get().id

    # simplest case
    assert pi.resolve_json_id(p1_id) == db_id
    # duplicate should resolve to same id
    assert pi.resolve_json_id(p2_id) == db_id
    # a null id should map to None
    assert pi.resolve_json_id(None) is None
    # no such id
    with pytest.raises(UnresolvedIdError):
        pi.resolve_json_id("this-is-invalid")


@pytest.mark.django_db
def test_invalid_fields():
    p1 = ScrapePerson("Dwayne").as_dict()
    p1["newfield"] = "shouldn't happen"

    with pytest.raises(DataImportError):
        PersonImporter("jid").import_data([p1])


@pytest.mark.django_db
def test_invalid_fields_related_item():
    p1 = ScrapePerson("Dwayne")
    p1.add_link("http://example.com")
    p1 = p1.as_dict()
    p1["links"][0]["test"] = 3

    with pytest.raises(DataImportError):
        PersonImporter("jid").import_data([p1])


@pytest.mark.django_db
def test_locked_field():
    create_jurisdiction()
    org = ScrapeOrganization("SHIELD").as_dict()
    oi = OrganizationImporter("jid")
    oi.import_data([org])
192 | # set date and lock field 193 | o = Organization.objects.get() 194 | o.dissolution_date = "2015" 195 | o.locked_fields = ["dissolution_date"] 196 | o.save() 197 | 198 | # reimport 199 | org = ScrapeOrganization("SHIELD").as_dict() 200 | oi = OrganizationImporter("jid") 201 | oi.import_data([org]) 202 | 203 | o = Organization.objects.get() 204 | assert o.dissolution_date == "2015" 205 | assert o.locked_fields == ["dissolution_date"] 206 | 207 | # do it a third time to check for the locked_fields reversion issue 208 | org = ScrapeOrganization("SHIELD").as_dict() 209 | oi = OrganizationImporter("jid") 210 | oi.import_data([org]) 211 | 212 | o = Organization.objects.get() 213 | assert o.dissolution_date == "2015" 214 | assert o.locked_fields == ["dissolution_date"] 215 | 216 | 217 | @pytest.mark.django_db 218 | def test_locked_field_subitem(): 219 | create_jurisdiction() 220 | org = ScrapeOrganization("SHIELD") 221 | org.add_name("S.H.I.E.L.D.") 222 | oi = OrganizationImporter("jid") 223 | oi.import_data([org.as_dict()]) 224 | 225 | # lock the field 226 | o = Organization.objects.get() 227 | o.locked_fields = ["other_names"] 228 | o.save() 229 | 230 | # reimport 231 | org = ScrapeOrganization("SHIELD").as_dict() 232 | oi = OrganizationImporter("jid") 233 | oi.import_data([org]) 234 | 235 | o = Organization.objects.get() 236 | assert o.other_names.get().name == "S.H.I.E.L.D." 237 | --------------------------------------------------------------------------------