├── scraper ├── __init__.py ├── tests │ ├── __init__.py │ └── test_utils.py ├── pipelines.py ├── extensions.py ├── spiders │ ├── __init__.py │ └── utils.py ├── items.py ├── settings.py ├── monitors.py └── validators.py ├── web ├── home │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── test_templates.py │ ├── migrations │ │ └── __init__.py │ ├── static │ │ └── home │ │ │ ├── style.css │ │ │ ├── favicon.ico │ │ │ ├── imagem-apresentacao-dadosdefeira.png │ │ │ ├── bulma.js │ │ │ └── hero.css │ ├── apps.py │ ├── urls.py │ ├── views.py │ ├── templates │ │ ├── admin │ │ │ └── base_site.html │ │ └── snippets │ │ │ └── google-analytics.html │ └── context_processors.py ├── api │ ├── tests │ │ ├── __init__.py │ │ ├── test_health_check.py │ │ ├── constants.py │ │ ├── conftest.py │ │ └── test_serializers.py │ ├── filters.py │ ├── routes.py │ ├── constants.py │ ├── serializers.py │ └── views.py ├── datasets │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── fixtures │ │ │ ├── empty-response.json │ │ │ └── response-22042021.json │ │ ├── test_signals.py │ │ ├── management │ │ │ └── commands │ │ │ │ ├── test_search_vector.py │ │ │ │ └── test_citycouncil.py │ │ ├── test_parsers.py │ │ └── test_services.py │ ├── migrations │ │ ├── __init__.py │ │ ├── 0029_file_local_path.py │ │ ├── 0023_auto_20201124_0458.py │ │ ├── 0027_auto_20210501_0839.py │ │ ├── 0013_file_search_vector.py │ │ ├── 0018_file_external_code.py │ │ ├── 0015_drop_gazette_file_trigger.py │ │ ├── 0026_auto_20210410_0548.py │ │ ├── 0006_gazette_search_vector.py │ │ ├── 0004_auto_20200321_0817.py │ │ ├── 0001_initial.py │ │ ├── 0016_auto_20200522_0647.py │ │ ├── 0003_citycouncilattendancelist.py │ │ ├── 0028_auto_20210703_0457.py │ │ ├── 0005_auto_20200327_1348.py │ │ ├── 0010_auto_20200515_0959.py │ │ ├── 0007_citycouncilexpense.py │ │ ├── 0030_alter_historicalcitycouncilattendancelist_options_and_more.py │ │ ├── 0014_citycouncilbid.py │ │ ├── 
0021_historicalcitycouncilattendancelist.py │ │ ├── 0009_auto_20200514_1350.py │ │ ├── 0002_auto_20200316_1905.py │ │ ├── 0024_auto_20210326_1704.py │ │ ├── 0019_auto_20200704_1132.py │ │ ├── 0012_auto_20200520_1050.py │ │ ├── 0025_auto_20210327_1144.py │ │ ├── 0017_citycouncilrevenue.py │ │ └── 0008_cityhallbid_cityhallbidevent.py │ ├── apps.py │ ├── management │ │ └── commands │ │ │ ├── _file.py │ │ │ ├── searchvector.py │ │ │ ├── _tcmba.py │ │ │ ├── citycouncil_sync.py │ │ │ ├── _cityhall.py │ │ │ ├── _citycouncil.py │ │ │ ├── crawl_tcmba.py │ │ │ ├── import.py │ │ │ ├── _gazette.py │ │ │ ├── load_tcmba_documents.py │ │ │ └── crawl.py │ ├── signals.py │ ├── baker_recipes.py │ ├── parsers.py │ ├── services.py │ └── adapters.py ├── __init__.py ├── asgi.py ├── celery.py ├── wsgi.py └── urls.py ├── runtime.txt ├── DOKKU_SCALE ├── CHECKS ├── pytest.ini ├── scrapy.cfg ├── .github ├── dependabot.yml └── workflows │ └── cicd.yml ├── Procfile ├── bin └── release.sh ├── .dockerignore ├── dev_requirements.txt ├── dependabot.yml ├── .gitignore ├── setup.cfg ├── Dockerfile ├── .pre-commit-config.yaml ├── manage.py ├── .env.example ├── requirements.txt ├── Makefile ├── LICENSE ├── docker-compose.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md └── README.md /scraper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/home/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.8.6 2 | -------------------------------------------------------------------------------- /scraper/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /web/api/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/home/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/datasets/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/home/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /DOKKU_SCALE: -------------------------------------------------------------------------------- 1 | web=1 2 | worker=1 3 | -------------------------------------------------------------------------------- /web/datasets/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CHECKS: -------------------------------------------------------------------------------- 1 | /api/?format=json "status":"available" 2 | -------------------------------------------------------------------------------- /web/home/static/home/style.css: -------------------------------------------------------------------------------- 1 | .footer-link { 2 | color: rgb(41, 92, 173); 3 | } 4 | -------------------------------------------------------------------------------- /pytest.ini: 
-------------------------------------------------------------------------------- 1 | [pytest] 2 | DJANGO_SETTINGS_MODULE=web.settings 3 | DJANGO_CONFIGURATION=Test 4 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | default = scraper.settings 3 | 4 | [deploy] 5 | project = scraper 6 | -------------------------------------------------------------------------------- /web/home/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class HomeConfig(AppConfig): 5 | name = "web.home" 6 | -------------------------------------------------------------------------------- /web/__init__.py: -------------------------------------------------------------------------------- 1 | """Inicializa Django web app.""" 2 | from .celery import app as celery_app 3 | 4 | __all__ = ("celery_app",) 5 | -------------------------------------------------------------------------------- /web/home/static/home/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DadosAbertosDeFeira/maria-quiteria/HEAD/web/home/static/home/favicon.ico -------------------------------------------------------------------------------- /web/home/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import path 2 | 3 | from . 
import views 4 | 5 | urlpatterns = [path("", views.index, name="index")] 6 | -------------------------------------------------------------------------------- /web/home/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | 4 | def index(request): 5 | return render(request, "home/index.html", {}) 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | open-pull-requests-limit: 10 8 | -------------------------------------------------------------------------------- /web/home/templates/admin/base_site.html: -------------------------------------------------------------------------------- 1 | {% extends 'admin/base_site.html' %} 2 | {% block extrahead %} 3 | {% include 'snippets/google-analytics.html' %} 4 | {% endblock %} 5 | -------------------------------------------------------------------------------- /web/datasets/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture 5 | def mock_backup_file(mocker): 6 | return mocker.patch("web.datasets.tasks.backup_file.apply_async") 7 | -------------------------------------------------------------------------------- /web/home/context_processors.py: -------------------------------------------------------------------------------- 1 | from django.conf import settings 2 | 3 | 4 | def google_analytics_key(request): 5 | return {"GOOGLE_ANALYTICS_KEY": settings.GOOGLE_ANALYTICS_KEY} 6 | -------------------------------------------------------------------------------- /web/home/static/home/imagem-apresentacao-dadosdefeira.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DadosAbertosDeFeira/maria-quiteria/HEAD/web/home/static/home/imagem-apresentacao-dadosdefeira.png -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | release: bin/release.sh 2 | web: gunicorn web.wsgi:application --preload --log-file - 3 | worker: celery -A web worker -l INFO --without-heartbeat --without-gossip --without-mingle 4 | -------------------------------------------------------------------------------- /web/asgi.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from django.core.asgi import get_asgi_application 4 | 5 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "web.settings") 6 | 7 | application = get_asgi_application() 8 | -------------------------------------------------------------------------------- /bin/release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eo pipefail 4 | 5 | PYTHON=$(which python3) 6 | 7 | echo "Running migrations" 8 | ${PYTHON} manage.py migrate --no-input 9 | 10 | echo "Done!" 
11 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | __pycache__ 3 | .idea 4 | .scrapy/ 5 | .vscode 6 | .env 7 | .pytest_cache 8 | *.log 9 | 10 | # data 11 | *.json 12 | *.csv 13 | *.xls 14 | *.zip 15 | **/data/ 16 | *.sqlite* 17 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | django-debug-toolbar==4.2.0 3 | model-bakery==1.15.0 4 | pre-commit==3.3.3 5 | pytest==7.4.0 6 | pytest-django==4.5.2 7 | pytest-dotenv==0.5.2 8 | pytest-mock==3.11.1 9 | -------------------------------------------------------------------------------- /scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | from scraper.spiders.utils import get_git_commit 2 | 3 | 4 | class DefaultValuesPipeline(object): 5 | def process_item(self, item, spider): 6 | item.setdefault("git_commit", get_git_commit()) 7 | return item 8 | -------------------------------------------------------------------------------- /dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | ignore: 8 | - dependency-name: "*" 9 | update-types: ["version-update:semver-patch"] 10 | -------------------------------------------------------------------------------- /web/datasets/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class DatasetsConfig(AppConfig): 5 | name = "web.datasets" 6 | verbose_name = "Bases de dados" 7 | 8 | def ready(self): 9 | import web.datasets.signals # noqa 10 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .venv/ 3 | __pycache__ 4 | .idea 5 | .scrapy/ 6 | .vscode 7 | .env 8 | .pytest_cache 9 | *.log 10 | 11 | # data 12 | *.json 13 | !**/fixtures/*.json 14 | *.csv 15 | *.xls 16 | *.zip 17 | **/data/ 18 | *.sqlite* 19 | 20 | # django 21 | /static/ 22 | 23 | # scrapy 24 | files/ 25 | -------------------------------------------------------------------------------- /web/datasets/management/commands/_file.py: -------------------------------------------------------------------------------- 1 | from web.datasets.models import File 2 | 3 | 4 | def save_file(url, content_type, object_id, checksum=None): 5 | File.objects.get_or_create( 6 | url=url, 7 | content_type=content_type, 8 | object_id=object_id, 9 | checksum=checksum, 10 | ) 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | exclude = .git,*migrations* 4 | extend-ignore = E203 5 | 6 | [isort] 7 | multi_line_output = 3 8 | include_trailing_comma = True 9 | force_grid_wrap = 0 10 | use_parentheses = True 11 | line_length = 88 12 | 13 | [tool:pytest] 14 | DJANGO_SETTINGS_MODULE = web.settings 15 | DJANGO_CONFIGURATION = Test 16 | -------------------------------------------------------------------------------- /web/datasets/tests/fixtures/empty-response.json: -------------------------------------------------------------------------------- 1 | { 2 | "inclusoesContrato": [], 3 | "alteracoesContrato": [], 4 | "exclusoesContrato": [], 5 | "inclusoesLicitacao": [], 6 | "alteracoesLicitacao": [], 7 | "exclusoesLicitacao": [], 8 | "inclusoesReceita": [], 9 | "alteracoesReceita": [], 10 | "exclusoesReceita": [], 11 | "inclusoesDespesa": [], 12 | "alteracoesDespesa": [], 13 | 
"exclusoesDespesa": [] 14 | } 15 | -------------------------------------------------------------------------------- /web/celery.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import configurations 4 | from celery import Celery 5 | from django.apps import apps 6 | 7 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "web.settings") 8 | os.environ.setdefault("DJANGO_CONFIGURATION", "Dev") 9 | 10 | configurations.setup() 11 | 12 | app = Celery("web") 13 | app.config_from_object("django.conf:settings", namespace="CELERY") 14 | app.autodiscover_tasks(lambda: [n.name for n in apps.get_app_configs()]) 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim 2 | 3 | ENV PYTHONUNBUFFERED 1 4 | 5 | WORKDIR /code 6 | 7 | COPY requirements.txt . 8 | COPY dev_requirements.txt . 9 | 10 | RUN apt-get update && \ 11 | apt-get install -y netcat-openbsd gcc && \ 12 | apt-get clean && \ 13 | pip install -r dev_requirements.txt && \ 14 | apt purge -y gcc && \ 15 | apt autoremove -y && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | COPY . . 
19 | 20 | RUN python manage.py collectstatic --no-input 21 | -------------------------------------------------------------------------------- /web/home/static/home/bulma.js: -------------------------------------------------------------------------------- 1 | // The following code is based off a toggle menu by @Bradcomp 2 | // source: https://gist.github.com/Bradcomp/a9ef2ef322a8e8017443b626208999c1 3 | (function() { 4 | var burger = document.querySelector('.burger'); 5 | var menu = document.querySelector('#'+burger.dataset.target); 6 | burger.addEventListener('click', function() { 7 | burger.classList.toggle('is-active'); 8 | menu.classList.toggle('is-active'); 9 | }); 10 | })(); 11 | -------------------------------------------------------------------------------- /web/wsgi.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from configurations.wsgi import get_wsgi_application 4 | from django.conf import settings 5 | 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "web.settings") 7 | os.environ.setdefault("DJANGO_CONFIGURATION", "Dev") 8 | 9 | application = get_wsgi_application() 10 | 11 | if settings.ENABLE_NEW_RELIC: 12 | import newrelic.agent 13 | 14 | newrelic.agent.initialize(settings.NEW_RELIC_CONFIG_FILE) 15 | application = newrelic.agent.WSGIApplicationWrapper(application) 16 | -------------------------------------------------------------------------------- /web/datasets/migrations/0029_file_local_path.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.2.7 on 2021-09-23 08:38 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0028_auto_20210703_0457"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AddField( 13 | model_name="file", 14 | name="local_path", 15 | field=models.CharField(blank=True, max_length=350, null=True), 16 | ), 17 | ] 18 | 
-------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.5.0 4 | hooks: 5 | - id: check-added-large-files 6 | - id: debug-statements 7 | - id: end-of-file-fixer 8 | - id: requirements-txt-fixer 9 | - id: trailing-whitespace 10 | - repo: https://github.com/pycqa/flake8 11 | rev: 6.1.0 12 | hooks: 13 | - id: flake8 14 | - repo: https://github.com/pycqa/isort 15 | rev: 5.12.0 16 | hooks: 17 | - id: isort 18 | - repo: https://github.com/ambv/black 19 | rev: 23.7.0 20 | hooks: 21 | - id: black 22 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | from dotenv import find_dotenv, load_dotenv 7 | 8 | 9 | def main(): 10 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "web.settings") 11 | os.environ.setdefault("DJANGO_CONFIGURATION", "Dev") 12 | 13 | load_dotenv(find_dotenv()) 14 | 15 | from configurations.management import execute_from_command_line 16 | 17 | execute_from_command_line(sys.argv) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /scraper/extensions.py: -------------------------------------------------------------------------------- 1 | import sentry_sdk 2 | from scrapy.exceptions import NotConfigured 3 | 4 | 5 | class SentryLogging(object): 6 | """ 7 | Envia exceções e erros para o Sentry. 
8 | 9 | Copiado de: https://stackoverflow.com/a/54964660/1344295 10 | """ 11 | 12 | @classmethod 13 | def from_crawler(cls, crawler): 14 | sentry_dsn = crawler.settings.get("SENTRY_DSN", None) 15 | if sentry_dsn is None: 16 | raise NotConfigured 17 | ext = cls() 18 | sentry_sdk.init(sentry_dsn) 19 | return ext 20 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | SENTRY_DSN= 2 | SPIDERMON_TELEGRAM_FAKE=True 3 | SPIDERMON_SENTRY_FAKE=True 4 | DJANGO_SETTINGS_MODULE=web.settings 5 | DJANGO_CONFIGURATION=Dev 6 | DJANGO_SECRET_KEY=dont-tell-anybody 7 | ACCESS_TOKEN_LIFETIME_IN_MINUTES=60 8 | REFRESH_TOKEN_LIFETIME_IN_MINUTES=60 9 | AWS_ACCESS_KEY_ID= 10 | AWS_SECRET_ACCESS_KEY= 11 | AWS_S3_BUCKET= 12 | AWS_S3_BUCKET_FOLDER= 13 | AWS_S3_REGION= 14 | # A variável abaixo aponta para o arquivo de configuração do NewRelic, se preciso colocar, tb, o path 15 | NEW_RELIC_CONFIG_FILE=newrelic.ini 16 | NEW_RELIC_LICENSE_KEY= 17 | NEW_RELIC_APP_NAME= 18 | -------------------------------------------------------------------------------- /web/datasets/migrations/0023_auto_20201124_0458.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.2 on 2020-11-24 07:58 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0022_historical_citycouncil"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AlterField( 13 | model_name="file", 14 | name="s3_url", 15 | field=models.URLField( 16 | blank=True, max_length=400, null=True, verbose_name="URL externa" 17 | ), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /web/datasets/migrations/0027_auto_20210501_0839.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 
3.1.8 on 2021-05-01 11:39 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0026_auto_20210410_0548"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AlterModelOptions( 13 | name="syncinformation", 14 | options={ 15 | "ordering": ["-created_at"], 16 | "verbose_name": "Sincronização", 17 | "verbose_name_plural": "Sincronizações", 18 | }, 19 | ), 20 | ] 21 | -------------------------------------------------------------------------------- /scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from dateutil.parser import parse 3 | 4 | 5 | class BaseSpider(scrapy.Spider): 6 | start_from_date = None 7 | 8 | @property 9 | def start_date(self): 10 | picked_date = None 11 | if self.start_from_date: 12 | if isinstance(self.start_from_date, str): 13 | picked_date = parse(self.start_from_date, dayfirst=True) 14 | picked_date = picked_date.date() 15 | else: 16 | picked_date = self.start_from_date 17 | elif hasattr(self, "initial_date"): 18 | picked_date = self.initial_date 19 | 20 | return picked_date 21 | -------------------------------------------------------------------------------- /web/datasets/migrations/0013_file_search_vector.py: -------------------------------------------------------------------------------- 1 | from django.db import migrations 2 | 3 | 4 | class Migration(migrations.Migration): 5 | dependencies = [ 6 | ("datasets", "0012_auto_20200520_1050"), 7 | ] 8 | 9 | operations = [ 10 | migrations.RunSQL( 11 | sql=""" 12 | CREATE TRIGGER search_vector_file_update BEFORE INSERT OR UPDATE 13 | ON datasets_file FOR EACH ROW EXECUTE PROCEDURE 14 | tsvector_update_trigger(search_vector, 'pg_catalog.portuguese', content); 15 | """, 16 | reverse_sql="DROP TRIGGER IF EXISTS search_vector_file_update ON datasets_file;", 17 | ), 18 | ] 19 | 
-------------------------------------------------------------------------------- /web/datasets/migrations/0018_file_external_code.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.6 on 2020-06-14 05:36 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0017_citycouncilrevenue"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AddField( 13 | model_name="file", 14 | name="external_code", 15 | field=models.CharField( 16 | blank=True, 17 | db_index=True, 18 | max_length=10, 19 | null=True, 20 | verbose_name="Código externo", 21 | ), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /web/datasets/migrations/0015_drop_gazette_file_trigger.py: -------------------------------------------------------------------------------- 1 | from django.db import migrations 2 | 3 | 4 | class Migration(migrations.Migration): 5 | dependencies = [ 6 | ("datasets", "0014_citycouncilbid"), 7 | ] 8 | 9 | operations = [ 10 | migrations.RunSQL( 11 | sql="DROP TRIGGER IF EXISTS search_vector_update ON datasets_gazette;", 12 | reverse_sql=""" 13 | CREATE TRIGGER search_vector_update BEFORE INSERT OR UPDATE 14 | ON datasets_gazette FOR EACH ROW EXECUTE PROCEDURE 15 | tsvector_update_trigger(search_vector, 'pg_catalog.portuguese', file_content); 16 | """, 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /web/datasets/signals.py: -------------------------------------------------------------------------------- 1 | from django.db.models.signals import post_save 2 | from django.dispatch import receiver 3 | 4 | from .models import File 5 | 6 | 7 | @receiver(post_save, sender=File) 8 | def backup_and_extract_content(sender, instance, **kwargs): 9 | """Faz backup e extrai conteúdo de um arquivo após sua criação.""" 10 | from .tasks import backup_file, 
content_from_file 11 | 12 | if instance.s3_url is None: 13 | backup_file.apply_async( 14 | (instance.pk,), 15 | link=content_from_file.si( 16 | instance.pk, 17 | ), 18 | ) 19 | elif instance.content is None: 20 | content_from_file.delay(instance.pk) 21 | -------------------------------------------------------------------------------- /web/home/templates/snippets/google-analytics.html: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.28.17 2 | celery==5.3.1 3 | dj-database-url==2.0.0 4 | django==4.1.10 5 | django-configurations==2.4.1 6 | django-extensions==3.2.3 7 | django-filter==23.2 8 | django-public-admin==0.0.5 9 | django-simple-history==3.3.0 10 | djangorestframework==3.14.0 11 | djangorestframework-simplejwt==5.3.0 12 | drf-yasg==1.21.7 13 | gunicorn==21.2.0 14 | https://github.com/DadosAbertosDeFeira/tcm-ba/releases/download/0.2.0/documentos_tcmba-0.2.0-py3-none-any.whl 15 | jinja2==3.1.2 16 | newrelic==8.8.1 17 | notifiers==1.2.1 18 | psycopg2-binary==2.9.7 19 | PyJWT==2.8.0 20 | python-dateutil==2.8.2 21 | python-dotenv==1.0.0 22 | schematics==2.1.1 23 | scrapy==2.10.1 24 | sentry-sdk==1.30.0 25 | spidermon==1.19.0 26 | tika==2.6.0 27 | whitenoise==6.5.0 28 | xlrd==2.0.1 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | bash: 2 | docker-compose run --rm web bash 3 | 4 | build: 5 | docker-compose build 6 | 7 | collectstatic: 8 | docker-compose run --rm web python manage.py collectstatic 9 | 10 | crawl: 11 | docker-compose run --rm web python manage.py crawl 12 | 13 | createsuperuser: 14 | docker-compose run --rm web python manage.py createsuperuser 15 | 16 | makemigrations: 17 | 
docker-compose run --rm web python manage.py makemigrations 18 | 19 | migrate: 20 | docker-compose run --rm web python manage.py migrate 21 | 22 | run: 23 | docker-compose up -d 24 | 25 | stop: 26 | docker-compose stop 27 | 28 | runspider: 29 | docker-compose run --rm web scrapy crawl $(SPIDER) -a start_from_date=$(START_DATE) 30 | 31 | shell: 32 | docker-compose run --rm web python manage.py shell_plus 33 | 34 | tests: 35 | docker-compose run --rm web pytest --dc Test 36 | -------------------------------------------------------------------------------- /web/home/tests/test_templates.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | class TestHome: 5 | def test_append_google_analytics_key(self, settings, client): 6 | settings.GOOGLE_ANALYTICS_KEY = "UA-000000000-1" 7 | response = client.get("/") 8 | assert "UA-000000000-1" in str(response.content) 9 | 10 | 11 | @pytest.mark.django_db 12 | class TestAdmin: 13 | def test_append_google_analytics_key(self, settings, admin_client): 14 | settings.GOOGLE_ANALYTICS_KEY = "UA-000000000-1" 15 | response = admin_client.get("/admin/") 16 | assert "UA-000000000-1" in str(response.content) 17 | 18 | 19 | @pytest.mark.django_db 20 | class TestPanel: 21 | def test_append_google_analytics_key(self, settings, client): 22 | settings.GOOGLE_ANALYTICS_KEY = "UA-000000000-1" 23 | response = client.get("/painel/") 24 | assert "UA-000000000-1" in str(response.content) 25 | -------------------------------------------------------------------------------- /web/datasets/tests/test_signals.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from model_bakery import baker 3 | 4 | 5 | @pytest.mark.django_db 6 | def test_backup_and_extract_content_when_file_is_saved(mock_backup_file): 7 | expected_link_task = "web.datasets.tasks.content_from_file" 8 | baker.make("datasets.File", s3_url=None, content=None) 9 | 10 | assert 
mock_backup_file.called is True 11 | assert mock_backup_file.call_count == 1 12 | assert expected_link_task in str(mock_backup_file.call_args_list[0][1]["link"]) 13 | 14 | 15 | @pytest.mark.django_db 16 | def test_extract_content_when_file_with_backup_is_saved(mocker, mock_backup_file): 17 | mock_content_from_file = mocker.patch("web.datasets.tasks.content_from_file.delay") 18 | baker.make("datasets.File", s3_url="https://www.pdf.com/test.pdf", content=None) 19 | 20 | assert mock_backup_file.called is False 21 | assert mock_content_from_file.called is True 22 | -------------------------------------------------------------------------------- /web/datasets/management/commands/searchvector.py: -------------------------------------------------------------------------------- 1 | from django.contrib.postgres.search import SearchVector 2 | from django.core.management.base import BaseCommand 3 | 4 | from web.datasets.models import File 5 | 6 | 7 | class Command(BaseCommand): 8 | help = """Remonta os indices de busca em caso de problemas 9 | com a geração de índice via trigger.""" 10 | 11 | def echo(self, text, style=None): 12 | self.stdout.write(style(text) if style else text) 13 | 14 | def handle(self, *args, **options): 15 | file_count = File.objects.count() 16 | self.echo( 17 | f"Criando um vetor de busca para os arquivos. 
" 18 | f"Total de itens: {file_count:,}", 19 | self.style.SUCCESS, 20 | ) 21 | self.echo("Aguarde...", self.style.SUCCESS) 22 | 23 | search_vector = SearchVector("content", config="portuguese") 24 | 25 | File.objects.update(search_vector=search_vector) 26 | 27 | self.echo("Pronto!", self.style.SUCCESS) 28 | -------------------------------------------------------------------------------- /web/api/tests/test_health_check.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from django.urls import reverse 3 | 4 | 5 | class TestHealthCheck: 6 | def test_return_success_when_accessing_health_check(self, api_client, url): 7 | response = api_client.get(url, format="json") 8 | assert response.status_code == 200 9 | assert list(response.json().keys()) == ["status", "time"] 10 | assert response.json().get("status") == "available" 11 | 12 | def test_return_forbidden_when_trying_to_anonymously_access_a_restricted_route( 13 | self, api_client 14 | ): 15 | url = reverse("gazettes-list") 16 | response = api_client.get(url) 17 | assert response.status_code == 403 18 | 19 | @pytest.mark.django_db 20 | def test_return_success_when_accessing_a_restricted_route_with_credentials( 21 | self, api_client_authenticated 22 | ): 23 | url = reverse("gazettes-list") 24 | response = api_client_authenticated.get(url) 25 | assert response.status_code == 200 26 | -------------------------------------------------------------------------------- /web/api/filters.py: -------------------------------------------------------------------------------- 1 | from django_filters import rest_framework as filters 2 | 3 | from web.datasets.models import CityHallBid, Gazette 4 | 5 | 6 | class GazetteFilter(filters.FilterSet): 7 | start_date = filters.DateFilter(field_name="date", lookup_expr="gte") 8 | end_date = filters.DateFilter(field_name="date", lookup_expr="lte") 9 | 10 | class Meta: 11 | model = Gazette 12 | fields = [ 13 | "power", 14 | "start_date", 15 
| "end_date", 16 | "events__title", 17 | "events__secretariat", 18 | "events__summary", 19 | "year_and_edition", 20 | ] 21 | 22 | 23 | class CityHallBidFilter(filters.FilterSet): 24 | start_date = filters.DateFilter(field_name="session_at", lookup_expr="gte") 25 | end_date = filters.DateFilter(field_name="session_at", lookup_expr="lte") 26 | description = filters.CharFilter(field_name="description", lookup_expr="icontains") 27 | 28 | class Meta: 29 | model = CityHallBid 30 | fields = ["public_agency", "description", "modality", "start_date", "end_date"] 31 | -------------------------------------------------------------------------------- /web/datasets/migrations/0026_auto_20210410_0548.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.8 on 2021-04-10 08:48 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("contenttypes", "0002_remove_content_type_name"), 9 | ("datasets", "0025_auto_20210327_1144"), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name="file", 15 | name="original_filename", 16 | field=models.CharField( 17 | blank=True, 18 | db_index=True, 19 | max_length=200, 20 | null=True, 21 | verbose_name="Nome do arquivo", 22 | ), 23 | ), 24 | migrations.AlterField( 25 | model_name="file", 26 | name="url", 27 | field=models.URLField(db_index=True, verbose_name="URL do arquivo"), 28 | ), 29 | migrations.AlterUniqueTogether( 30 | name="file", 31 | unique_together={("url", "content_type", "object_id", "original_filename")}, 32 | ), 33 | ] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dados Abertos de Feira 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /web/datasets/tests/management/commands/test_search_vector.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | from model_bakery import baker 5 | 6 | from web.datasets.management.commands.searchvector import Command 7 | 8 | 9 | @pytest.mark.django_db 10 | class TestCommandHandler: 11 | @pytest.mark.parametrize( 12 | "text,answer", 13 | [ 14 | ("O Prefeito Municipal de Feira...", "'feir':5 'municipal':3 'prefeit':2"), 15 | ( 16 | "Mussum Ipsum, cacilds vidis litro abertis.", 17 | "'abert':6 'cacilds':3 'ipsum':2 'litr':5 'mussum':1 'vid':4", 18 | ), 19 | ], 20 | ) 21 | def test_handler(self, text, answer, capsys): 22 | gazette = baker.make("datasets.File", content=text) 23 | assert not gazette.search_vector 24 | 25 | command = Command() 26 | command.handle() 27 | 28 | gazette.refresh_from_db() 29 | 30 | captured = capsys.readouterr() 31 | assert 
re.search(r"Criando um vetor .* Total de itens: 1", captured.out) 32 | assert "Pronto!" in captured.out 33 | 34 | assert gazette.search_vector == answer 35 | -------------------------------------------------------------------------------- /web/api/routes.py: -------------------------------------------------------------------------------- 1 | from django.urls import include, path 2 | from rest_framework import routers 3 | 4 | from web.api.views import ( 5 | CityCouncilAgendaView, 6 | CityCouncilAttendanceListView, 7 | CityCouncilMinuteView, 8 | CityHallBidView, 9 | FrontendEndpoint, 10 | GazetteView, 11 | HealthCheckView, 12 | ) 13 | 14 | router = routers.DefaultRouter() 15 | router.register("", HealthCheckView, basename="root") 16 | router.register("datasets/gazettes", GazetteView, basename="gazettes") 17 | 18 | 19 | urlpatterns = [ 20 | path("", include(router.urls)), 21 | path( 22 | "datasets/city-council/agenda/", 23 | CityCouncilAgendaView.as_view(), 24 | name="city-council-agenda", 25 | ), 26 | path( 27 | "datasets/city-council/attendance-list/", 28 | CityCouncilAttendanceListView.as_view(), 29 | name="city-council-attendance-list", 30 | ), 31 | path( 32 | "datasets/city-council/minute/", 33 | CityCouncilMinuteView.as_view(), 34 | name="city-council-minute", 35 | ), 36 | path("datasets/city-hall/bids/", CityHallBidView.as_view(), name="city-hall-bids"), 37 | path("datasets/endpoints", FrontendEndpoint.as_view(), name="frontend-endpoints"), 38 | ] 39 | -------------------------------------------------------------------------------- /web/datasets/management/commands/_tcmba.py: -------------------------------------------------------------------------------- 1 | from django.contrib.admin.options import get_content_type_for_model 2 | 3 | from web.datasets.models import File, TCMBADocument 4 | from web.datasets.parsers import from_str_to_date 5 | 6 | 7 | def save_document(item): 8 | public_view_url = "https://e.tcm.ba.gov.br/epp/ConsultaPublica/listView.seam" 9 | 
document, created = TCMBADocument.objects.get_or_create( 10 | year=item["year"], 11 | month=item["month"], 12 | period=item["period"].lower(), 13 | category=item["category"], 14 | unit=item["unit"], 15 | inserted_at=from_str_to_date(item["inserted_at"]), 16 | inserted_by=item["inserted_by"], 17 | original_filename=item["original_filename"], 18 | crawled_from=public_view_url, 19 | defaults={ 20 | "crawled_at": item["crawled_at"], 21 | }, 22 | ) 23 | content_type = get_content_type_for_model(document) 24 | if created: 25 | _, file_created = File.objects.get_or_create( 26 | url=public_view_url, 27 | content_type=content_type, 28 | object_id=document.pk, 29 | local_path=f"{item['filepath']}{item['filename']}", 30 | original_filename=item["original_filename"], 31 | ) 32 | -------------------------------------------------------------------------------- /web/datasets/management/commands/citycouncil_sync.py: -------------------------------------------------------------------------------- 1 | from datetime import date, timedelta 2 | 3 | from celery import chain 4 | from dateutil.parser import parse 5 | from django.core.management.base import BaseCommand 6 | 7 | from web.datasets.tasks import ( 8 | distribute_city_council_objects_to_sync, 9 | get_city_council_updates, 10 | ) 11 | 12 | 13 | class Command(BaseCommand): 14 | help = "Dispara sincronização com o webservice da Câmara de Vereadores." 
15 | 16 | def add_arguments(self, parser): 17 | parser.add_argument("--date", help="Data no formato aaaa-mm-dd") 18 | 19 | def handle(self, *args, **options): 20 | if options.get("date"): 21 | # parse the argument to validate its format, keeping only the date part 22 | target_date = parse(options.get("date"), yearfirst=True).date() 23 | else: 24 | # default to yesterday 25 | target_date = date.today() - timedelta(days=1) 26 | 27 | chain( 28 | get_city_council_updates.s(target_date.strftime("%Y-%m-%d")), 29 | distribute_city_council_objects_to_sync.s(), 30 | )() 31 | 32 | self.stdout.write( 33 | f"Syncronização com a Câmara iniciada (data alvo: {target_date})." 34 | ) 35 | -------------------------------------------------------------------------------- /web/datasets/migrations/0006_gazette_search_vector.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.5 on 2020-04-03 22:53 2 | 3 | import django.contrib.postgres.indexes 4 | import django.contrib.postgres.search 5 | from django.db import migrations 6 | 7 | 8 | class Migration(migrations.Migration): 9 | dependencies = [ 10 | ("datasets", "0005_auto_20200327_1348"), 11 | ] 12 | 13 | operations = [ 14 | migrations.AddField( 15 | model_name="gazette", 16 | name="search_vector", 17 | field=django.contrib.postgres.search.SearchVectorField( 18 | editable=False, null=True 19 | ), 20 | ), 21 | migrations.AddIndex( 22 | model_name="gazette", 23 | index=django.contrib.postgres.indexes.GinIndex( 24 | fields=["search_vector"], name="datasets_ga_search__1d3d09_gin" 25 | ), 26 | ), 27 | migrations.RunSQL( 28 | sql=""" 29 | CREATE TRIGGER search_vector_update BEFORE INSERT OR UPDATE 30 | ON datasets_gazette FOR EACH ROW EXECUTE PROCEDURE 31 | tsvector_update_trigger(search_vector, 'pg_catalog.portuguese', file_content); 32 | """, 33 | reverse_sql="DROP TRIGGER IF EXISTS search_vector_update ON datasets_gazette;", 34 | ), 35 | ] 36 |
-------------------------------------------------------------------------------- /web/api/constants.py: -------------------------------------------------------------------------------- 1 | GAZETTES_API = "api/datasets/gazettes" 2 | CITY_HALL_API = "api/datasets/city-hall" 3 | CITY_COUNCIL_API = "api/datasets/city-council" 4 | 5 | AVAILABLE_ENDPOINTS_BY_PUBLIC_AGENCY = { 6 | "city-council": { 7 | "public_agency": "Câmara Municipal", 8 | "endpoints": [ 9 | { 10 | "friendly_name": "Agenda dos vereadores", 11 | "endpoint": f"{CITY_COUNCIL_API}/agenda/", 12 | }, 13 | { 14 | "friendly_name": "Atas das sessões", 15 | "endpoint": f"{CITY_COUNCIL_API}/minute/", 16 | }, 17 | { 18 | "friendly_name": "Diário Oficial - Legislativo", 19 | "endpoint": f"{GAZETTES_API}/?power=legislative", 20 | }, 21 | { 22 | "friendly_name": "Lista de presença dos vereadores", 23 | "endpoint": f"{CITY_COUNCIL_API}/attendance-list/", 24 | }, 25 | ], 26 | }, 27 | "city-hall": { 28 | "public_agency": "Prefeitura", 29 | "endpoints": [ 30 | { 31 | "friendly_name": "Diário Oficial - Executivo", 32 | "endpoint": f"{GAZETTES_API}/?power=executive", 33 | }, 34 | { 35 | "friendly_name": "Licitações", 36 | "endpoint": f"{CITY_HALL_API}/bids/", 37 | }, 38 | ], 39 | }, 40 | } 41 | -------------------------------------------------------------------------------- /web/datasets/management/commands/_cityhall.py: -------------------------------------------------------------------------------- 1 | from django.contrib.admin.options import get_content_type_for_model 2 | 3 | from web.datasets.models import CityHallBid, CityHallBidEvent 4 | 5 | from ._file import save_file 6 | 7 | 8 | def save_bid(item): 9 | bid, created = CityHallBid.objects.update_or_create( 10 | session_at=item["session_at"], 11 | public_agency=item["public_agency"], 12 | codes=item["codes"], 13 | defaults={ 14 | "crawled_from": item["crawled_from"], 15 | "crawled_at": item["crawled_at"], 16 | "description": item["description"], 17 | "modality": 
item["modality"], 18 | }, 19 | ) 20 | 21 | if created and item.get("files"): 22 | content_type = get_content_type_for_model(bid) 23 | for file_ in item["files"]: 24 | save_file(file_, content_type, bid.pk) 25 | 26 | content_type = get_content_type_for_model(CityHallBidEvent) 27 | for event in item["history"]: 28 | event_obj, created = CityHallBidEvent.objects.get_or_create( 29 | crawled_from=item["crawled_from"], 30 | bid=bid, 31 | published_at=event["published_at"], 32 | summary=event["event"], 33 | defaults={"crawled_at": item["crawled_at"]}, 34 | ) 35 | if created and event.get("url"): 36 | save_file(event.get("url"), content_type, event_obj.pk) 37 | return bid 38 | -------------------------------------------------------------------------------- /web/datasets/migrations/0004_auto_20200321_0817.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0 on 2020-03-21 11:17 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0003_citycouncilattendancelist"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AlterModelOptions( 13 | name="citycouncilagenda", 14 | options={ 15 | "verbose_name": "Câmara de Vereadores - Agenda", 16 | "verbose_name_plural": "Câmara de Vereadores - Agendas", 17 | }, 18 | ), 19 | migrations.AlterModelOptions( 20 | name="citycouncilattendancelist", 21 | options={ 22 | "verbose_name": "Câmara de Vereadores - Lista de Presença", 23 | "verbose_name_plural": "Câmara de Vereadores - Listas de Presença", 24 | }, 25 | ), 26 | migrations.AlterModelOptions( 27 | name="gazette", 28 | options={ 29 | "verbose_name": "Diário Oficial", 30 | "verbose_name_plural": "Diários Oficiais", 31 | }, 32 | ), 33 | migrations.AlterModelOptions( 34 | name="gazetteevent", 35 | options={ 36 | "verbose_name": "Diário Oficial - Evento", 37 | "verbose_name_plural": "Diário Oficial - Eventos", 38 | }, 39 | ), 40 | ] 41 | 
-------------------------------------------------------------------------------- /web/api/tests/constants.py: -------------------------------------------------------------------------------- 1 | GAZZETES_API = "api/datasets/gazettes" 2 | CITY_HALL_API = "api/datasets/city-hall" 3 | CITY_COUNCIL_API = "api/datasets/city-council" 4 | 5 | AVAILABLE_ENDPOINTS_BY_PUBLIC_AGENCY = { 6 | "city-council": { 7 | "public_agency": "Câmara Municipal", 8 | "endpoints": [ 9 | { 10 | "friendly_name": "Agenda dos vereadores", 11 | "endpoint": f"{CITY_COUNCIL_API}/agenda/", 12 | }, 13 | { 14 | "friendly_name": "Atas das sessões", 15 | "endpoint": f"{CITY_COUNCIL_API}/minute/", 16 | }, 17 | { 18 | "friendly_name": "Diário Oficial - Legislativo", 19 | "endpoint": f"{GAZZETES_API}/?power=legislative", 20 | }, 21 | { 22 | "friendly_name": "Lista de presença dos vereadores", 23 | "endpoint": f"{CITY_COUNCIL_API}/attendance-list/", 24 | }, 25 | ], 26 | }, 27 | "city-hall": { 28 | "public_agency": "Prefeitura", 29 | "endpoints": [ 30 | { 31 | "friendly_name": "Diário Oficial - Executivo", 32 | "endpoint": f"{GAZZETES_API}/?power=executive", 33 | }, 34 | { 35 | "friendly_name": "Licitações", 36 | "endpoint": f"{CITY_HALL_API}/bids/", 37 | }, 38 | ], 39 | }, 40 | } 41 | -------------------------------------------------------------------------------- /web/api/tests/conftest.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | import pytest 4 | from django.contrib.auth.models import User 5 | from model_bakery import baker 6 | from rest_framework.test import APIClient 7 | 8 | 9 | @pytest.fixture 10 | def api_client(): 11 | return APIClient() 12 | 13 | 14 | @pytest.fixture 15 | def user(): 16 | return User(username="marvin", password="paranoidandroid") 17 | 18 | 19 | @pytest.fixture 20 | def api_client_authenticated(api_client, user): 21 | api_client.force_authenticate(user) 22 | return api_client 23 | 24 | 25 | @pytest.fixture 
26 | def url(): 27 | return "/api/" 28 | 29 | 30 | @pytest.fixture 31 | def one_gazette(): 32 | return baker.make_recipe("datasets.Gazette", date=date(2021, 4, 21)) 33 | 34 | 35 | @pytest.fixture 36 | def last_of_two_gazettes(): 37 | baker.make_recipe("datasets.Gazette", date=date(2021, 3, 5)) 38 | return baker.make_recipe("datasets.Gazette", date=date(2021, 4, 21)) 39 | 40 | 41 | @pytest.fixture 42 | def last_of_three_gazettes(): 43 | baker.make_recipe("datasets.Gazette", date=date(2021, 1, 1), power="executivo") 44 | baker.make_recipe( 45 | "datasets.GazetteEvent", 46 | summary="Life? Don't talk to me about life.", 47 | gazette__date=date(2021, 3, 5), 48 | gazette__power="legislativo", 49 | ) 50 | return baker.make_recipe( 51 | "datasets.Gazette", date=date(2021, 4, 21), power="executivo" 52 | ) 53 | -------------------------------------------------------------------------------- /web/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf import settings 2 | from django.contrib import admin 3 | from django.urls import include, path, re_path 4 | from drf_yasg import openapi 5 | from drf_yasg.views import get_schema_view 6 | from rest_framework import permissions 7 | from rest_framework_simplejwt.views import ( 8 | TokenObtainPairView, 9 | TokenRefreshView, 10 | TokenVerifyView, 11 | ) 12 | 13 | from web.datasets.admin import public_admin 14 | 15 | schema_view = get_schema_view( 16 | openapi.Info( 17 | title="Maria Quitéria API", 18 | default_version="v1", 19 | contact=openapi.Contact(email="dadosabertosdefeira+api@gmail.com"), 20 | license=openapi.License(name="MIT"), 21 | ), 22 | public=True, 23 | permission_classes=(permissions.AllowAny,), 24 | ) 25 | 26 | urlpatterns = [ 27 | path("admin/", admin.site.urls), 28 | path("", include("web.home.urls")), 29 | path("painel/", public_admin.urls), 30 | path("api/", include("web.api.routes")), 31 | path("api/token/", TokenObtainPairView.as_view(), 
name="token_obtain_pair"), 32 | path("api/token/refresh/", TokenRefreshView.as_view(), name="token_refresh"), 33 | path("api/token/verify/", TokenVerifyView.as_view(), name="token_verify"), 34 | re_path( 35 | r"^api/docs/$", 36 | schema_view.with_ui("swagger"), 37 | name="schema-swagger-ui", 38 | ), 39 | ] 40 | 41 | 42 | if settings.DEBUG: 43 | import debug_toolbar 44 | 45 | urlpatterns = [path("__debug__/", include(debug_toolbar.urls))] + urlpatterns 46 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | db: 5 | image: library/postgres:11-alpine 6 | environment: 7 | POSTGRES_DB: mariaquiteria 8 | POSTGRES_USER: postgres 9 | POSTGRES_PASSWORD: postgres 10 | volumes: 11 | - ./db:/var/lib/postgresql 12 | 13 | rabbitmq: 14 | image: rabbitmq 15 | ports: 16 | - "5672:5672" 17 | - "15672:15672" 18 | healthcheck: 19 | test: [ "CMD", "nc", "-z", "localhost", "5672" ] 20 | interval: 5s 21 | timeout: 15s 22 | retries: 1 23 | 24 | tika: 25 | image: apache/tika 26 | ports: 27 | - "9998:9998" 28 | 29 | web: 30 | build: . 31 | command: ["python", "manage.py", "runserver", "0.0.0.0:8000"] 32 | volumes: 33 | - .:/code 34 | ports: 35 | - "8000:8000" 36 | environment: 37 | DATABASE_HOST: db 38 | env_file: .env 39 | depends_on: 40 | - db 41 | - worker 42 | 43 | worker: 44 | build: . 
45 | command: ["celery", "-A", "web", "worker", "-l", "INFO", "--without-heartbeat", "--without-gossip", "--without-mingle"] 46 | environment: 47 | DATABASE_HOST: db 48 | TIKA_CLIENT_ONLY: 1 49 | TIKA_SERVER_ENDPOINT: http://tika:9998 50 | env_file: .env 51 | restart: on-failure 52 | depends_on: 53 | - tika 54 | - db 55 | - rabbitmq 56 | -------------------------------------------------------------------------------- /web/datasets/baker_recipes.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | 3 | from model_bakery.recipe import Recipe, foreign_key 4 | 5 | from web.datasets.models import ( 6 | CityCouncilAgenda, 7 | CityCouncilAttendanceList, 8 | CityCouncilBid, 9 | CityCouncilContract, 10 | CityCouncilExpense, 11 | CityCouncilMinute, 12 | CityCouncilRevenue, 13 | CityHallBid, 14 | File, 15 | Gazette, 16 | GazetteEvent, 17 | SyncInformation, 18 | ) 19 | 20 | CityCouncilAgenda = Recipe( 21 | CityCouncilAgenda, 22 | date=date(2020, 3, 18), 23 | details="PROJETOS DE LEI ORDINÁRIA EM 2ª DISCUSSÃO 017/20", 24 | event_type="sessao_ordinaria", 25 | title="ORDEM DO DIA - 18 DE MARÇO DE 2020", 26 | ) 27 | 28 | 29 | CityCouncilAttendanceList = Recipe( 30 | CityCouncilAttendanceList, 31 | date=date(2020, 2, 3), 32 | description="Abertura da 1ª etapa do 4º período da 18ª legislatura", 33 | council_member="Competente da Silva", 34 | status="presente", 35 | ) 36 | 37 | 38 | CityCouncilBid = Recipe(CityCouncilBid) 39 | 40 | 41 | CityCouncilContract = Recipe(CityCouncilContract) 42 | 43 | 44 | CityCouncilExpense = Recipe(CityCouncilExpense) 45 | 46 | 47 | CityCouncilMinute = Recipe(CityCouncilMinute) 48 | 49 | 50 | CityCouncilRevenue = Recipe(CityCouncilRevenue) 51 | 52 | 53 | CityHallBid = Recipe(CityHallBid) 54 | 55 | 56 | Gazette = Recipe( 57 | Gazette, 58 | ) 59 | 60 | 61 | GazetteEvent = Recipe(GazetteEvent, gazette=foreign_key(Gazette)) 62 | 63 | 64 | File = Recipe( 65 | File, 66 | ) 67 | 68 | 69 | 
SyncInformation = Recipe( 70 | SyncInformation, 71 | ) 72 | -------------------------------------------------------------------------------- /web/datasets/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0 on 2020-02-02 02:00 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | initial = True 8 | 9 | dependencies = [] 10 | 11 | operations = [ 12 | migrations.CreateModel( 13 | name="CityCouncilAgenda", 14 | fields=[ 15 | ( 16 | "id", 17 | models.AutoField( 18 | auto_created=True, 19 | primary_key=True, 20 | serialize=False, 21 | verbose_name="ID", 22 | ), 23 | ), 24 | ("crawled_at", models.DateTimeField(auto_now_add=True)), 25 | ("updated_at", models.DateTimeField(auto_now=True)), 26 | ("crawled_from", models.URLField(blank=True, null=True)), 27 | ("notes", models.TextField(blank=True, null=True)), 28 | ("date", models.DateField()), 29 | ("details", models.TextField(blank=True, null=True)), 30 | ( 31 | "event_type", 32 | models.CharField( 33 | choices=[ 34 | ("ordem_do_dia", "Ordem do Dia"), 35 | ("sessao_solene", "Sessão Solene"), 36 | ("sessao_especial", "Sessão Especial"), 37 | ("audiencia_publica", "Audiência Pública"), 38 | ], 39 | max_length=20, 40 | ), 41 | ), 42 | ("title", models.CharField(blank=True, max_length=100, null=True)), 43 | ], 44 | ), 45 | ] 46 | -------------------------------------------------------------------------------- /web/datasets/migrations/0016_auto_20200522_0647.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.5 on 2020-05-22 09:47 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | dependencies = [ 9 | ("datasets", "0015_drop_gazette_file_trigger"), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveIndex( 14 | model_name="gazette", 15 | 
name="datasets_ga_search__1d3d09_gin", 16 | ), 17 | migrations.RemoveField( 18 | model_name="citycouncilminute", 19 | name="file_content", 20 | ), 21 | migrations.RemoveField( 22 | model_name="citycouncilminute", 23 | name="file_url", 24 | ), 25 | migrations.RemoveField( 26 | model_name="cityhallbid", 27 | name="file_content", 28 | ), 29 | migrations.RemoveField( 30 | model_name="cityhallbid", 31 | name="file_url", 32 | ), 33 | migrations.RemoveField( 34 | model_name="cityhallbidevent", 35 | name="file_content", 36 | ), 37 | migrations.RemoveField( 38 | model_name="cityhallbidevent", 39 | name="file_url", 40 | ), 41 | migrations.RemoveField( 42 | model_name="gazette", 43 | name="file_content", 44 | ), 45 | migrations.RemoveField( 46 | model_name="gazette", 47 | name="file_url", 48 | ), 49 | migrations.RemoveField( 50 | model_name="gazette", 51 | name="search_vector", 52 | ), 53 | migrations.AlterField( 54 | model_name="gazetteevent", 55 | name="gazette", 56 | field=models.ForeignKey( 57 | on_delete=django.db.models.deletion.CASCADE, 58 | related_name="events", 59 | to="datasets.Gazette", 60 | ), 61 | ), 62 | ] 63 | -------------------------------------------------------------------------------- /web/datasets/migrations/0003_citycouncilattendancelist.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0 on 2020-03-21 09:51 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0002_auto_20200316_1905"), 9 | ] 10 | 11 | operations = [ 12 | migrations.CreateModel( 13 | name="CityCouncilAttendanceList", 14 | fields=[ 15 | ( 16 | "id", 17 | models.AutoField( 18 | auto_created=True, 19 | primary_key=True, 20 | serialize=False, 21 | verbose_name="ID", 22 | ), 23 | ), 24 | ("created_at", models.DateTimeField(auto_now_add=True)), 25 | ("updated_at", models.DateTimeField(auto_now=True)), 26 | ("crawled_at", models.DateTimeField()), 
27 | ("crawled_from", models.URLField()), 28 | ("notes", models.TextField(blank=True, null=True)), 29 | ("date", models.DateField()), 30 | ( 31 | "description", 32 | models.CharField(blank=True, max_length=200, null=True), 33 | ), 34 | ("council_member", models.CharField(max_length=200)), 35 | ( 36 | "status", 37 | models.CharField( 38 | choices=[ 39 | ("presente", "Presente"), 40 | ("falta_justificada", "Falta Justificada"), 41 | ("licenca_justificada", "Licença Justificada"), 42 | ("ausente", "Ausente"), 43 | ], 44 | max_length=20, 45 | ), 46 | ), 47 | ], 48 | options={ 49 | "abstract": False, 50 | }, 51 | ), 52 | ] 53 | -------------------------------------------------------------------------------- /web/home/static/home/hero.css: -------------------------------------------------------------------------------- 1 | html, body { 2 | background: #EFF3F4; 3 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif; 4 | } 5 | 6 | .hero-body .container { 7 | max-width: 700px; 8 | } 9 | 10 | .hero-body .title { 11 | color: hsl(192, 17%, 99%) !important; 12 | } 13 | 14 | .hero-body .subtitle { 15 | color: hsl(192, 17%, 99%) !important; 16 | padding-top: 2rem; 17 | line-height: 1.5; 18 | } 19 | 20 | .background { 21 | background-image: url('./imagem-apresentacao-dadosdefeira.png'); 22 | background-size:100% 100%; 23 | background-repeat: no-repeat; 24 | background-position: center; 25 | background-origin: border-box; 26 | position: relative; 27 | z-index: 99999999; 28 | } 29 | 30 | .features { 31 | padding: 5rem 0; 32 | } 33 | 34 | .box.cta { 35 | border-radius: 0; 36 | border-left: none; 37 | border-right: none; 38 | } 39 | 40 | .card-image>.fa { 41 | font-size: 8rem; 42 | padding-top: 2rem; 43 | padding-bottom: 2rem; 44 | color: #ffcc00; 45 | } 46 | 47 | .card-content .content { 48 | font-size: 14px; 49 | margin: 1rem 1rem; 50 | } 51 | 52 | .card-content .content h4 { 53 | font-size: 16px; 54 | 
font-weight: 700; 55 | } 56 | 57 | .card { 58 | box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.18); 59 | margin-bottom: 2rem; 60 | } 61 | 62 | .intro { 63 | padding: 5rem 0; 64 | text-align: center; 65 | } 66 | 67 | .sandbox { 68 | padding: 5rem 0; 69 | } 70 | 71 | .tile.notification { 72 | display: flex; 73 | justify-content: center; 74 | flex-direction: column; 75 | } 76 | 77 | .is-shady { 78 | animation: flyintoright .4s backwards; 79 | background: #fff; 80 | box-shadow: rgba(0, 0, 0, .1) 0 1px 0; 81 | border-radius: 4px; 82 | display: inline-block; 83 | margin: 10px; 84 | position: relative; 85 | transition: all .2s ease-in-out; 86 | } 87 | 88 | .is-shady:hover { 89 | box-shadow: 0 10px 16px rgba(0, 0, 0, .13), 0 6px 6px rgba(0, 0, 0, .19); 90 | } 91 | 92 | .hide { 93 | display: none; 94 | } 95 | -------------------------------------------------------------------------------- /web/api/serializers.py: -------------------------------------------------------------------------------- 1 | from rest_framework import serializers 2 | 3 | from web.datasets.models import ( 4 | CityCouncilAgenda, 5 | CityCouncilAttendanceList, 6 | CityCouncilMinute, 7 | CityHallBid, 8 | CityHallBidEvent, 9 | File, 10 | Gazette, 11 | GazetteEvent, 12 | ) 13 | 14 | 15 | class CityCouncilAgendaSerializer(serializers.ModelSerializer): 16 | class Meta: 17 | model = CityCouncilAgenda 18 | fields = "__all__" 19 | 20 | 21 | class CityCouncilAttendanceListSerializer(serializers.ModelSerializer): 22 | class Meta: 23 | model = CityCouncilAttendanceList 24 | fields = "__all__" 25 | 26 | 27 | class FileSerializer(serializers.ModelSerializer): 28 | class Meta: 29 | model = File 30 | fields = ["url"] 31 | 32 | 33 | class GazetteEventSerializer(serializers.ModelSerializer): 34 | class Meta: 35 | model = GazetteEvent 36 | fields = ["title", "secretariat", "summary", "published_on"] 37 | 38 | 39 | class GazetteSerializer(serializers.ModelSerializer): 40 | events = GazetteEventSerializer(many=True) 41 | files = 
FileSerializer(many=True, required=False) 42 | 43 | class Meta: 44 | model = Gazette 45 | fields = [ 46 | "crawled_from", 47 | "date", 48 | "power", 49 | "year_and_edition", 50 | "events", 51 | "files", 52 | ] 53 | 54 | 55 | class CityCouncilMinuteSerializer(serializers.ModelSerializer): 56 | files = FileSerializer(many=True) 57 | 58 | class Meta: 59 | model = CityCouncilMinute 60 | fields = "__all__" 61 | 62 | 63 | class CityHallBidEventSerializer(serializers.ModelSerializer): 64 | class Meta: 65 | model = CityHallBidEvent 66 | fields = "__all__" 67 | 68 | 69 | class CityHallBidSerializer(serializers.ModelSerializer): 70 | events = CityHallBidEventSerializer(many=True, read_only=True) 71 | files = FileSerializer(many=True, read_only=True) 72 | 73 | class Meta: 74 | model = CityHallBid 75 | fields = "__all__" 76 | -------------------------------------------------------------------------------- /web/datasets/migrations/0028_auto_20210703_0457.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.2.4 on 2021-07-03 07:57 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0027_auto_20210501_0839"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AlterField( 13 | model_name="citycouncilagenda", 14 | name="event_type", 15 | field=models.CharField( 16 | blank=True, 17 | choices=[ 18 | ("sessao_ordinaria", "Sessão Ordinária"), 19 | ("ordem_do_dia", "Ordem do Dia"), 20 | ("sessao_solene", "Sessão Solene"), 21 | ("sessao_especial", "Sessão Especial"), 22 | ("audiencia_publica", "Audiência Pública"), 23 | ("sessao_extraordinaria", "Sessão Extraordinária"), 24 | ("termo_de_encerramento", "Termo de Encerramento"), 25 | ], 26 | db_index=True, 27 | max_length=30, 28 | null=True, 29 | verbose_name="Tipo do evento", 30 | ), 31 | ), 32 | migrations.AlterField( 33 | model_name="citycouncilminute", 34 | name="event_type", 35 | 
field=models.CharField( 36 | blank=True, 37 | choices=[ 38 | ("sessao_ordinaria", "Sessão Ordinária"), 39 | ("ordem_do_dia", "Ordem do Dia"), 40 | ("sessao_solene", "Sessão Solene"), 41 | ("sessao_especial", "Sessão Especial"), 42 | ("audiencia_publica", "Audiência Pública"), 43 | ("sessao_extraordinaria", "Sessão Extraordinária"), 44 | ("termo_de_encerramento", "Termo de Encerramento"), 45 | ], 46 | db_index=True, 47 | max_length=30, 48 | null=True, 49 | verbose_name="Tipo de evento", 50 | ), 51 | ), 52 | ] 53 | -------------------------------------------------------------------------------- /.github/workflows/cicd.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [ push ] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | - name: Set up Python 11 | uses: actions/setup-python@v2 12 | with: 13 | python-version: 3.8 14 | - name: Install Dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install -r dev_requirements.txt 18 | - name: Lint 19 | run: | 20 | pre-commit run --all-files 21 | - name: Check migrations 22 | env: 23 | DJANGO_SETTINGS_MODULE: "web.settings" 24 | DATABASE_URL: "postgres://postgres:postgres@localhost:5432/mariaquiteria" 25 | run: python manage.py makemigrations --check 26 | - name: Run Tests 27 | env: 28 | DJANGO_SETTINGS_MODULE: "web.settings" 29 | DJANGO_CONFIGURATION: "Test" 30 | DATABASE_URL: "postgres://postgres:postgres@localhost:5432/mariaquiteria" 31 | run: | 32 | python manage.py collectstatic 33 | pytest 34 | services: 35 | postgres: 36 | image: library/postgres:11-alpine 37 | env: 38 | POSTGRES_PASSWORD: postgres 39 | POSTGRES_DB: mariaquiteria 40 | ports: 41 | - 5432:5432 42 | options: >- 43 | --health-cmd pg_isready 44 | --health-interval 10s 45 | --health-timeout 5s 46 | --health-retries 5 47 | rabbitmq: 48 | image: rabbitmq 49 | env: 50 | RABBITMQ_DEFAULT_USER: guest 51 | RABBITMQ_DEFAULT_PASS: guest 
class BaseItem(scrapy.Item):
    """Crawl metadata fields inherited by the other item classes in this module."""

    crawled_at = scrapy.Field()
    crawled_from = scrapy.Field()
    git_commit = scrapy.Field()
    # "errors" is the field name configured as
    # SPIDERMON_VALIDATION_ERRORS_FIELD in scraper/settings.py.
    errors = scrapy.Field()


class LegacyGazetteItem(BaseItem):
    """Scrapy item holding the fields of a legacy gazette record."""

    title = scrapy.Field()
    published_on = scrapy.Field()
    date = scrapy.Field()
    details = scrapy.Field()
    files = scrapy.Field()


class GazetteItem(BaseItem):
    """Scrapy item holding the fields of a gazette record."""

    date = scrapy.Field()
    power = scrapy.Field()
    year_and_edition = scrapy.Field()
    events = scrapy.Field()
    files = scrapy.Field()


class CityCouncilAgendaItem(BaseItem):
    """Scrapy item holding the fields of a city council agenda entry."""

    date = scrapy.Field()
    details = scrapy.Field()
    title = scrapy.Field()
    event_type = scrapy.Field()


class CityCouncilAttendanceListItem(BaseItem):
    """Scrapy item holding one council member's attendance record."""

    date = scrapy.Field()
    description = scrapy.Field()
    council_member = scrapy.Field()
    status = scrapy.Field()


class CityCouncilMinuteItem(BaseItem):
    """Scrapy item holding the fields of a city council minute."""

    date = scrapy.Field()
    title = scrapy.Field()
    event_type = scrapy.Field()
    files = scrapy.Field()
def save_agenda(item):
    """Create or update the agenda matching the item and return it.

    The agenda is identified by date, title, event type and source URL;
    ``crawled_at`` and ``details`` are overwritten on every call.
    """
    identity = {
        field: item[field]
        for field in ("date", "title", "event_type", "crawled_from")
    }
    changes = {"crawled_at": item["crawled_at"], "details": item["details"]}
    record, _created = CityCouncilAgenda.objects.update_or_create(
        defaults=changes, **identity
    )
    return record


def save_attendance_list(item):
    """Create or update one council member's attendance for a date.

    The record is identified by date and council member; crawl metadata
    and ``status`` (``None`` when absent from the item) are overwritten.
    """
    identity = {field: item[field] for field in ("date", "council_member")}
    changes = {
        "crawled_at": item["crawled_at"],
        "crawled_from": item["crawled_from"],
        "status": item.get("status"),
    }
    record, _created = CityCouncilAttendanceList.objects.update_or_create(
        defaults=changes, **identity
    )
    return record


def save_expense(item):
    """Return the expense matching every data field, creating it if needed.

    All data fields take part in the lookup; crawl metadata is only
    stored when a new row is created.
    """
    lookup_fields = (
        "published_at",
        "phase",
        "company_or_person",
        "value",
        "number",
        "document",
        "date",
        "process_number",
        "summary",
        "legal_status",
        "function",
        "subfunction",
        "type_of_process",
        "resource",
        "subgroup",
        "group",
    )
    identity = {field: item[field] for field in lookup_fields}
    crawl_metadata = {
        "crawled_at": item["crawled_at"],
        "crawled_from": item["crawled_from"],
    }
    expense, _created = CityCouncilExpense.objects.get_or_create(
        defaults=crawl_metadata, **identity
    )
    return expense


def save_minute(item):
    """Return the minute for the item's date and URL, creating it if needed.

    Files listed in the item are saved only when the minute is new.
    """
    initial_values = {
        "title": item["title"],
        "event_type": item["event_type"],
        "crawled_at": item["crawled_at"],
    }
    minute, is_new = CityCouncilMinute.objects.get_or_create(
        date=item["date"],
        crawled_from=item["crawled_from"],
        defaults=initial_values,
    )
    if not is_new:
        return minute

    attachments = item.get("files")
    if attachments:
        content_type = get_content_type_for_model(minute)
        for attachment in attachments:
            save_file(attachment, content_type, minute.pk)
    return minute
18 | auto_created=True, 19 | primary_key=True, 20 | serialize=False, 21 | verbose_name="ID", 22 | ), 23 | ), 24 | ("created_at", models.DateTimeField(auto_now_add=True)), 25 | ("updated_at", models.DateTimeField(auto_now=True)), 26 | ("crawled_at", models.DateTimeField()), 27 | ("crawled_from", models.URLField()), 28 | ("notes", models.TextField(blank=True, null=True)), 29 | ("date", models.DateField()), 30 | ("title", models.CharField(blank=True, max_length=300, null=True)), 31 | ( 32 | "event_type", 33 | models.CharField( 34 | choices=[ 35 | ("sessao_ordinaria", "Sessão Ordinária"), 36 | ("ordem_do_dia", "Ordem do Dia"), 37 | ("sessao_solene", "Sessão Solene"), 38 | ("sessao_especial", "Sessão Especial"), 39 | ("audiencia_publica", "Audiência Pública"), 40 | ], 41 | max_length=20, 42 | ), 43 | ), 44 | ("file_url", models.URLField(blank=True, null=True)), 45 | ("file_content", models.TextField(blank=True, null=True)), 46 | ], 47 | options={ 48 | "verbose_name": "Câmara de Vereadores - Atas", 49 | "verbose_name_plural": "Câmara de Vereadores - Atas", 50 | }, 51 | ), 52 | migrations.AlterField( 53 | model_name="citycouncilagenda", 54 | name="event_type", 55 | field=models.CharField( 56 | choices=[ 57 | ("sessao_ordinaria", "Sessão Ordinária"), 58 | ("ordem_do_dia", "Ordem do Dia"), 59 | ("sessao_solene", "Sessão Solene"), 60 | ("sessao_especial", "Sessão Especial"), 61 | ("audiencia_publica", "Audiência Pública"), 62 | ], 63 | max_length=20, 64 | ), 65 | ), 66 | ] 67 | -------------------------------------------------------------------------------- /scraper/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from tcmba.items import DocumentItem as TCMBADocumentItem 5 | 6 | from .items import ( 7 | CityCouncilAgendaItem, 8 | CityCouncilAttendanceListItem, 9 | CityCouncilMinuteItem, 10 | CityHallBidItem, 11 | CityHallContractItem, 12 | CityHallPaymentsItem, 13 | GazetteItem, 
14 | LegacyGazetteItem, 15 | ) 16 | 17 | # general 18 | BOT_NAME = "maria-quiteria" 19 | SPIDER_MODULES = ["scraper.spiders"] 20 | NEWSPIDER_MODULE = "scraper.spiders" 21 | ROBOTSTXT_OBEY = True 22 | COOKIES_ENABLED = False 23 | EXTENSIONS = { 24 | "scraper.extensions.SentryLogging": -1, 25 | "spidermon.contrib.scrapy.extensions.Spidermon": 500, 26 | } 27 | SENTRY_DSN = os.getenv("SENTRY_DSN", "") 28 | 29 | # pipelines 30 | ITEM_PIPELINES = { 31 | "spidermon.contrib.scrapy.pipelines.ItemValidationPipeline": 200, 32 | "scraper.pipelines.DefaultValuesPipeline": 300, 33 | } 34 | 35 | # http cache 36 | HTTPCACHE_ENABLED = True 37 | HTTPCACHE_EXPIRATION_SECS = 86400 # 24 horas 38 | 39 | # testing 40 | SPIDERMON_ENABLED = True 41 | SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS = True 42 | SPIDERMON_VALIDATION_ERRORS_FIELD = "errors" 43 | SPIDERMON_VALIDATION_MODELS = { 44 | LegacyGazetteItem: "scraper.validators.LegacyGazetteItem", 45 | GazetteItem: "scraper.validators.GazetteItem", 46 | CityCouncilAgendaItem: "scraper.validators.CityCouncilAgendaItem", 47 | CityCouncilMinuteItem: "scraper.validators.CityCouncilMinuteItem", 48 | CityHallContractItem: "scraper.validators.CityHallContractItem", 49 | CityHallBidItem: "scraper.validators.CityHallBidItem", 50 | CityHallPaymentsItem: "scraper.validators.CityHallPaymentsItem", 51 | CityCouncilAttendanceListItem: "scraper.validators.CityCouncilAttendanceListItem", 52 | TCMBADocumentItem: "scraper.validators.TCMBADocumentItem", 53 | } 54 | 55 | # monitoring 56 | SPIDERMON_SPIDER_CLOSE_MONITORS = ("scraper.monitors.SpiderCloseMonitorSuite",) 57 | 58 | # bot 59 | SPIDERMON_TELEGRAM_SENDER_TOKEN = os.getenv("TELEGRAM_SENDER_TOKEN", "fake") 60 | SPIDERMON_TELEGRAM_RECIPIENTS = [os.getenv("TELEGRAM_CHANNEL", None)] 61 | SPIDERMON_TELEGRAM_FAKE = os.getenv("SPIDERMON_TELEGRAM_FAKE", False) 62 | SPIDERMON_DISCORD_WEBHOOK_URL = os.getenv("SPIDERMON_DISCORD_WEBHOOK_URL", "fake") 63 | 64 | # sentry 65 | SPIDERMON_SENTRY_DSN = SENTRY_DSN 66 | 
def find_exceptions(stats):
    """Return one human-readable line per problem counter found in ``stats``.

    Keys starting with ``spider_exceptions`` are reported by name; every
    key starting with ``downloader/response_status_count/4`` (any 4xx
    status) is reported with the "page not found" label.
    """
    exceptions = []
    for key, value in stats.items():
        if key.startswith("spider_exceptions"):
            exceptions.append(f"`{key}` ({value})")
        elif key.startswith("downloader/response_status_count/4"):
            exceptions.append(f"Página não encontrada ({value})")
    return exceptions


def _build_report_message(data, result):
    """Build the spider-close report shared by the notification actions.

    ``data`` must expose ``stats`` and ``spider``; ``result`` must expose
    ``failures``. The run is flagged as failed when any monitor failed or
    when there are more exception entries than scraped items.
    """
    stats = data.stats
    n_scraped_items = stats.get("item_scraped_count", 0)

    exceptions = find_exceptions(stats)
    # Joining an empty list yields "", same as the previous explicit default.
    exceptions_message = "\n".join(exceptions)

    number_of_failures = len(result.failures)
    number_of_exceptions = len(exceptions)
    failed = number_of_failures > 0 or (n_scraped_items - number_of_exceptions) < 0
    emoji = "💀" if failed else "🎉"

    return "\n".join(
        [
            f"{emoji} Spider `{data.spider.name}` {stats['finish_reason']}",
            f"- Duração em segundos: {round(stats['elapsed_time_seconds'], 1)}",
            f"- Itens raspados: {n_scraped_items}",
            f"- Erros: {number_of_failures}",
            f"- Exceções: {number_of_exceptions}\n{exceptions_message}",
        ]
    )


class CustomSendTelegramMessage(SendTelegramMessage):
    """Telegram action that sends the spider-close report."""

    def get_message(self):
        """Return the report text for the finished spider."""
        return _build_report_message(self.data, self.result)


class CustomSendDiscordMessage(SendDiscordMessage):
    """Discord action that sends the spider-close report."""

    def get_message(self):
        """Return the report text for the finished spider."""
        return _build_report_message(self.data, self.result)


class SpiderCloseMonitorSuite(MonitorSuite):
    """Monitor suite that notifies Telegram and Discord once monitors finish."""

    monitors_finished_actions = [CustomSendTelegramMessage, CustomSendDiscordMessage]
def to_boolean(value):
    """Interpret a yes/no flag as a boolean.

    ``"y"``, ``"s"`` and ``"1"`` (case-insensitive, surrounding whitespace
    ignored) are true, as is the integer ``1``; real booleans are returned
    unchanged. Anything else is false.
    """
    if isinstance(value, bool):
        return value
    # The previous list held the int 1, which a lowered string never equals.
    return str(value).strip().lower() in {"y", "s", "1"}


def from_str_to_datetime(date_str, supported_formats=None):
    """Parse a day-first date string into a ``datetime``.

    Returns ``None`` when ``date_str`` is ``None``, cannot be parsed, or is
    earlier than 1833-09-18. ``supported_formats`` is accepted for
    backward compatibility and is not used by the parser.
    """
    if date_str is None:
        return None
    try:
        converted_date = parse(date_str, dayfirst=True)
    except (ParserError, OverflowError):
        # parse() documents OverflowError for out-of-range values.
        return None

    reference_date = datetime(1833, 9, 18)
    # Compare wall-clock values: a timezone-aware result cannot be compared
    # with the naive reference date and would raise TypeError.
    if converted_date.replace(tzinfo=None) >= reference_date:
        return converted_date
    return None


def from_str_to_date(date_str, supported_formats=("%d/%m/%Y", "%d/%m/%y", "%Y-%m-%d")):
    """Parse a day-first date string into a ``date``.

    Returns ``None`` under the same conditions as ``from_str_to_datetime``.
    The default for ``supported_formats`` is an immutable tuple so it
    cannot be shared and mutated across calls.
    """
    if date_str is None:
        return None
    datetime_obj = from_str_to_datetime(date_str, supported_formats)
    return datetime_obj.date() if datetime_obj else None


def lower(value):
    """Return ``value`` lower-cased, or ``None`` when it is empty or ``None``."""
    if value:
        return value.lower()
    return None


def lower_without_spaces(value):
    """Return ``value`` lower-cased, without accents and with ``_`` for spaces.

    Returns ``None`` when ``value`` is empty or ``None``.
    """
    if value:
        return strip_accents(value.lower()).replace(" ", "_")
    return None


def city_council_bid_modality_mapping(code):
    """Map a bid modality code (``"1"``..``"9"``) to its slug.

    Unknown codes are logged as a warning and yield ``None``.
    """
    mapping = {
        "1": "pregao_eletronico",
        "2": "convite",
        "3": "concorrencia",
        "4": "tomada_de_precos",
        "5": "concurso",
        "6": "leilao",
        "7": "pregao_presencial",
        "8": "dispensada",
        "9": "inexigibilidade",
    }
    found = mapping.get(code)
    if not found:
        logger.warning("Código da modalidade não encontrado: %s", code)
    return found


def city_council_revenue_type_mapping(code):
    """Map a revenue type code (``ORC``, ``NORC``, ``TRANSF``) to its slug.

    Unknown codes are logged as a warning and yield ``None``.
    """
    mapping = {
        "ORC": "orcamentaria",
        "NORC": "nao_orcamentaria",
        "TRANSF": "transferencia",
    }
    found = mapping.get(code)
    if not found:
        logger.warning("Código da tipo de receita não encontrado: %s", code)
    return found


def strip_accents(string):
    """Return ``string`` without combining accent marks (``None`` stays ``None``)."""
    if string is None:
        return None
    # NFD splits accented characters into base + combining mark ("Mn").
    return "".join(
        char
        for char in unicodedata.normalize("NFD", string)
        if unicodedata.category(char) != "Mn"
    )
32 | 33 | def add_arguments(self, parser): 34 | parser.add_argument("--period") 35 | parser.add_argument("--period-type", default="mensal") 36 | parser.add_argument( 37 | "--unit", default="Prefeitura Municipal de FEIRA DE SANTANA" 38 | ) 39 | parser.add_argument("--scrapy-args") 40 | 41 | def echo(self, text, style=None): 42 | self.stdout.write(style(text) if style else text) 43 | 44 | def warn(self, text): 45 | return self.echo(text, self.style.WARNING) 46 | 47 | def success(self, text): 48 | return self.echo(text, self.style.SUCCESS) 49 | 50 | def save(self, signal, sender, item, response, spider): 51 | if isinstance(item, DocumentItem): 52 | save_document(item) 53 | 54 | def handle(self, *args, **options): 55 | if not options.get("period"): 56 | target_date = date.today() + relativedelta(months=-2) 57 | target_date = target_date.strftime("%m/%Y") 58 | else: 59 | target_date = options.get("period") 60 | 61 | dispatcher.connect(self.save, signal=signals.item_passed) 62 | os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings" 63 | settings = get_project_settings() 64 | settings["COOKIES_ENABLED"] = True 65 | 66 | if options.get("scrapy_args"): 67 | scrapy_args = json.loads(options.get("scrapy_args")) 68 | settings.update(scrapy_args) 69 | 70 | process = CrawlerProcess(settings=settings) 71 | 72 | args = { 73 | "unidade": options.get("unit"), 74 | "competencia": target_date, 75 | "cidade": "feira de santana", 76 | "periodicidade": options.get("period_type"), 77 | } 78 | self.warn(str(args)) 79 | process.crawl(ConsultaPublicaSpider, **args) 80 | self.warn("Iniciando a coleta dos documentos do TCM-BA...") 81 | process.start() 82 | self.success("Pronto!") 83 | -------------------------------------------------------------------------------- /web/datasets/migrations/0010_auto_20200515_0959.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 2.2.12 on 2020-05-15 12:59 2 | 3 | from django.db import migrations, 
models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0009_auto_20200514_1350"), 9 | ] 10 | 11 | operations = [ 12 | migrations.CreateModel( 13 | name="CityCouncilContract", 14 | fields=[ 15 | ( 16 | "id", 17 | models.AutoField( 18 | auto_created=True, 19 | primary_key=True, 20 | serialize=False, 21 | verbose_name="ID", 22 | ), 23 | ), 24 | ("created_at", models.DateTimeField(auto_now_add=True)), 25 | ("updated_at", models.DateTimeField(auto_now=True)), 26 | ("crawled_at", models.DateTimeField()), 27 | ("crawled_from", models.URLField()), 28 | ("notes", models.TextField(blank=True, null=True)), 29 | ( 30 | "external_code", 31 | models.PositiveIntegerField(verbose_name="Código externo"), 32 | ), 33 | ( 34 | "description", 35 | models.TextField(blank=True, null=True, verbose_name="Descrição"), 36 | ), 37 | ( 38 | "details", 39 | models.TextField( 40 | blank=True, null=True, verbose_name="Objeto do contrato" 41 | ), 42 | ), 43 | ( 44 | "company_or_person_document", 45 | models.CharField( 46 | blank=True, max_length=50, null=True, verbose_name="CNPJ ou CPF" 47 | ), 48 | ), 49 | ( 50 | "company_or_person", 51 | models.TextField( 52 | blank=True, null=True, verbose_name="Empresa ou pessoa" 53 | ), 54 | ), 55 | ( 56 | "value", 57 | models.DecimalField( 58 | decimal_places=2, max_digits=10, verbose_name="Valor" 59 | ), 60 | ), 61 | ("start_date", models.DateField(verbose_name="Data de início")), 62 | ("end_date", models.DateField(verbose_name="Data final")), 63 | ("excluded", models.BooleanField(default=False)), 64 | ], 65 | options={ 66 | "verbose_name": "Câmara de Vereadores - Contrato", 67 | "verbose_name_plural": "Câmara de Vereadores - Contratos", 68 | "get_latest_by": "start_date", 69 | }, 70 | ), 71 | migrations.AddField( 72 | model_name="citycouncilexpense", 73 | name="external_file_code", 74 | field=models.CharField(blank=True, max_length=50, null=True), 75 | ), 76 | migrations.AddField( 77 | 
model_name="citycouncilexpense", 78 | name="external_file_line", 79 | field=models.CharField(blank=True, max_length=50, null=True), 80 | ), 81 | ] 82 | -------------------------------------------------------------------------------- /web/datasets/migrations/0007_citycouncilexpense.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.0.5 on 2020-04-10 18:47 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0006_gazette_search_vector"), 9 | ] 10 | 11 | operations = [ 12 | migrations.CreateModel( 13 | name="CityCouncilExpense", 14 | fields=[ 15 | ( 16 | "id", 17 | models.AutoField( 18 | auto_created=True, 19 | primary_key=True, 20 | serialize=False, 21 | verbose_name="ID", 22 | ), 23 | ), 24 | ("created_at", models.DateTimeField(auto_now_add=True)), 25 | ("updated_at", models.DateTimeField(auto_now=True)), 26 | ("crawled_at", models.DateTimeField()), 27 | ("crawled_from", models.URLField()), 28 | ("notes", models.TextField(blank=True, null=True)), 29 | ("published_at", models.DateField()), 30 | ( 31 | "phase", 32 | models.CharField( 33 | choices=[ 34 | ("empenho", "Empenho"), 35 | ("liquidacao", "Liquidação"), 36 | ("pagamento", "Pagamento"), 37 | ], 38 | max_length=20, 39 | ), 40 | ), 41 | ("company_or_person", models.TextField(blank=True, null=True)), 42 | ( 43 | "value", 44 | models.DecimalField( 45 | decimal_places=2, max_digits=10, verbose_name="Valor" 46 | ), 47 | ), 48 | ("number", models.CharField(blank=True, max_length=50, null=True)), 49 | ("document", models.CharField(blank=True, max_length=50, null=True)), 50 | ("date", models.DateField()), 51 | ( 52 | "process_number", 53 | models.CharField(blank=True, max_length=50, null=True), 54 | ), 55 | ("summary", models.TextField(blank=True, null=True)), 56 | ( 57 | "legal_status", 58 | models.CharField(blank=True, max_length=200, null=True), 59 | ), 60 | ("function", 
@pytest.mark.parametrize(
    "original_value,expected_value",
    [
        ("R$ 69.848,70", 69848.70),
        ("69.848,70", 69848.70),
        ("R$ -69.848,70", -69848.70),
        ("1,70", 1.70),
        ("00,00", 0),
        ("Random", None),
        ("37500.36", 37500.36),
        ("37500", 37500.00),
        ("'37500.36", 37500.36),
        ("R$ 37.500,36", 37500.36),
    ],
)
def test_currency_to_float(original_value, expected_value):
    """Brazilian and plain numeric currency strings convert to float or None."""
    assert currency_to_float(original_value) == expected_value


@pytest.mark.parametrize(
    "datetime_str,expected_obj",
    [
        ("26/02/2020 19:28", datetime(2020, 2, 26, 19, 28)),
        ("26/2/2014 09:00", datetime(2014, 2, 26, 9, 0)),
        ("26/02/2020 19:28:00", datetime(2020, 2, 26, 19, 28, 0)),
        ("26/02/2020", datetime(2020, 2, 26)),
        ("26.02.20", datetime(2020, 2, 26)),
        ("05/02/23", datetime(2023, 2, 5)),
        (None, None),
        ("", None),
    ],
)
def test_possible_datetime(datetime_str, expected_obj):
    """Day-first strings parse to datetime; None and empty input give None."""
    assert from_str_to_datetime(datetime_str) == expected_obj


@pytest.mark.parametrize(
    "date_str,expected_obj",
    [
        ("26/02/2020 19:28", date(2020, 2, 26)),
        ("26/2/2014 09:00", date(2014, 2, 26)),
        ("26/02/2020 19:28:00", date(2020, 2, 26)),
        ("26/02/2020", date(2020, 2, 26)),
        ("26/02/20", date(2020, 2, 26)),
        ("26.02.20", date(2020, 2, 26)),
        (None, None),
        ("", None),
    ],
)
def test_possible_date(date_str, expected_obj):
    """Day-first strings parse to date, discarding any time component."""
    assert from_str_to_date(date_str) == expected_obj


@pytest.mark.parametrize(
    "datetime_str,expected_obj",
    [
        ("18/05/2020", datetime(2020, 5, 18)),
        ("18/09/1833", datetime(1833, 9, 18)),
        ("17/09/1833", None),
        ("01/01/0001", None),
    ],
)
def test_dates_older_than_city_creation(datetime_str, expected_obj):
    """Dates before 1833-09-18 are rejected; the boundary date is accepted."""
    assert from_str_to_datetime(datetime_str) == expected_obj


@pytest.mark.parametrize(
    "value,expected_modality",
    [
        ("1", "pregao_eletronico"),
        ("2", "convite"),
        ("3", "concorrencia"),
        ("4", "tomada_de_precos"),
        ("5", "concurso"),
        ("6", "leilao"),
        ("7", "pregao_presencial"),
        ("8", "dispensada"),
        ("9", "inexigibilidade"),
    ],
)
def test_modality_mapping_from_city_council_db(value, expected_modality):
    """Each modality code from "1" to "9" maps to its slug."""
    assert city_council_bid_modality_mapping(value) == expected_modality


@pytest.mark.parametrize(
    "value,expected",
    [
        ("Tomada de Preço", "tomada_de_preco"),
        ("concorrencia", "concorrencia"),
        ("", None),
        (None, None),
    ],
)
def test_lower_without_spaces(value, expected):
    """Text is lower-cased, unaccented and underscored; empty input gives None."""
    assert lower_without_spaces(value) == expected
-------------------------------------------------------------------------------- 1 | # Código de Conduta para Colaboradores 2 | 3 | ## Nossa promessa 4 | 5 | Com o interesse de fomentar uma comunidade aberta e acolhedora, 6 | nós, como colaboradores e administradores deste projeto, comprometemo-nos 7 | a fazer a participação deste projeto uma experiência livre de assédio 8 | para todos, independentemente da aparência pessoal, deficiência, 9 | etnia, gênero, idade, identidade ou expressão de gênero, identidade 10 | ou orientação sexual, nacionalidade, nível de experiência, porte físico, 11 | raça ou religião. 12 | 13 | ## Nossos padrões 14 | 15 | Exemplos de comportamentos que contribuem a criar um ambiente positivo incluem: 16 | 17 | * Usar linguagem acolhedora e inclusiva 18 | * Respeitar pontos de vista e experiências diferentes 19 | * Aceitar crítica construtiva com graça 20 | * Focar no que é melhor para a comunidade 21 | * Mostrar empatia com outros membros da comunidade 22 | 23 | Exemplos de comportamentos inaceitáveis por parte dos participantes incluem: 24 | 25 | * Uso de linguagem ou imagens sexuais e atenção ou avanço sexual indesejada 26 | * Comentários insultuosos e/ou depreciativos e ataques pessoais ou políticos (*Trolling*) 27 | * Assédio público ou privado 28 | * Publicar informação pessoal de outros sem permissão explícita, como, por exemplo, um endereço eletrônico ou residencial 29 | * Qualquer outra forma de conduta que pode ser razoavelmente considerada inapropriada num ambiente profissional 30 | 31 | ## Nossas responsabilidades 32 | 33 | Os administradores do projeto são responsáveis por esclarecer os padrões de 34 | comportamento e deverão tomar ação corretiva apropriada e justa em resposta 35 | a qualquer instância de comportamento inaceitável.
36 | 37 | Os administradores do projeto têm o direito e a responsabilidade de 38 | remover, editar ou rejeitar comentários, commits, código, edições 39 | na wiki, erros ou outras formas de contribuição que não estejam de 40 | acordo com este Código de Conduta, bem como banir temporariamente ou 41 | permanentemente qualquer colaborador por qualquer outro comportamento 42 | que se considere impróprio, perigoso, ofensivo ou problemático. 43 | 44 | ## Escopo 45 | 46 | Este Código de Conduta aplica-se dentro dos espaços do projeto ou 47 | qualquer espaço público onde alguém represente o mesmo ou a sua 48 | comunidade. Exemplos de representação do projeto ou comunidade incluem 49 | usar um endereço de email oficial do projeto, postar por uma conta de 50 | mídia social oficial, ou agir como um representante designado num evento 51 | online ou offline. A representação de um projeto pode ser ainda definida e 52 | esclarecida pelos administradores do projeto. 53 | 54 | ## Aplicação 55 | 56 | Comportamento abusivo, de assédio ou de outros tipos pode ser 57 | comunicado contatando a equipe do projeto no dadosabertosdefeira@gmail.com. 58 | Todas as queixas serão revistas e investigadas e 59 | resultarão numa resposta necessária e apropriada à situação. 60 | A equipe é obrigada a manter a confidencialidade em relação 61 | ao elemento que reportou o incidente. Demais detalhes de 62 | políticas de aplicação podem ser postadas separadamente. 63 | 64 | Administradores do projeto que não sigam ou não mantenham o Código 65 | de Conduta em boa fé podem enfrentar repercussões temporárias ou permanentes 66 | determinadas por outros membros da liderança do projeto. 
# Maps each accepted ``source`` argument to the model that stores its rows and
# to the adapter that turns one CSV row into keyword arguments for that model.
mapping = {
    "citycouncil_expenses": {
        "model": CityCouncilExpense,
        "adapter": to_citycouncil_expense,
    },
    "citycouncil_contracts": {
        "model": CityCouncilContract,
        "adapter": to_citycouncil_contract,
    },
    "citycouncil_bids": {"model": CityCouncilBid, "adapter": to_citycouncil_bid},
    "citycouncil_revenues": {
        "model": CityCouncilRevenue,
        "adapter": to_citycouncil_revenue,
    },
    "citycouncil_contract_files": {
        "model": File,
        "adapter": to_citycouncil_contract_file,
    },
    "citycouncil_bid_files": {"model": File, "adapter": to_citycouncil_bid_file},
}


class Command(BaseCommand):
    """Import the rows of a CSV file into the model selected by ``source``.

    Positional arguments:
        source: one of the keys of ``mapping``.
        file: path of the CSV file to read.

    Options:
        --drop-all: after an interactive confirmation, delete every row of
            the target model before importing.
    """

    help = "Importa dados de um arquivo CSV."

    def add_arguments(self, parser):
        """Register the ``source`` and ``file`` arguments and ``--drop-all``."""
        parser.add_argument("source")
        parser.add_argument("file")
        parser.add_argument("--drop-all", action="store_true")

    def echo(self, text, style=None):
        """Write ``text`` to stdout, passing it through ``style`` when given."""
        self.stdout.write(style(text) if style else text)

    def warn(self, text):
        """Write ``text`` using the WARNING style."""
        return self.echo(text, self.style.WARNING)

    def success(self, text):
        """Write ``text`` using the SUCCESS style."""
        return self.echo(text, self.style.SUCCESS)

    def handle(self, *args, **options):
        """Run the import and print how many rows were saved or failed.

        Raises:
            CommandError: when ``source`` is not a key of ``mapping``.
        """
        # Imported here because the module header only imports BaseCommand.
        from django.core.management.base import CommandError

        source = options.get("source")
        self.echo(source)
        self.echo(options.get("file"))

        source_map = mapping.get(source)
        if source_map is None:
            # Previously this fell through to a TypeError on ``None[...]``.
            valid_sources = ", ".join(sorted(mapping))
            raise CommandError(
                f"Fonte desconhecida: {source}. Opções válidas: {valid_sources}"
            )
        adapter = source_map["adapter"]
        model = source_map["model"]
        is_file_source = source.endswith("_files")

        if options.get("drop_all"):
            # NOTE(review): nesting reconstructed from a flattened listing.
            # The production warning is conditional; the confirmation prompt
            # is assumed to run for every --drop-all. Confirm against the repo.
            if os.getenv("DJANGO_CONFIGURATION") == "Prod" and not is_file_source:
                self.warn(
                    "VOCÊ ESTÁ EM AMBIENTE DE PRODUÇÃO E TODOS OS DADOS SERÃO APAGADOS."
                )
            confirmation = input("Tem certeza? s/n ")
            if confirmation.lower() in ["s", "y"]:
                model.objects.all().delete()

        saved = 0
        errors = 0
        with open(options.get("file"), newline="") as csv_file:
            reader = csv.DictReader(csv_file)

            for row in reader:
                item = adapter(row)
                if not is_file_source:
                    # Non-file sources get their crawl metadata stamped here.
                    item["crawled_at"] = datetime.now()
                    item["crawled_from"] = settings.CITY_COUNCIL_WEBSERVICE
                try:
                    model.objects.create(**item)
                    saved += 1
                except Exception as e:
                    # Best effort: report the failing row and keep importing.
                    errors += 1
                    self.warn(f"{e}\n{str(row)}")

        self.success(f"Concluído!\nSalvos: {saved} Erros: {errors}")
Migration(migrations.Migration): 7 | dependencies = [ 8 | ("datasets", "0029_file_local_path"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AlterModelOptions( 13 | name="historicalcitycouncilattendancelist", 14 | options={ 15 | "get_latest_by": ("history_date", "history_id"), 16 | "ordering": ("-history_date", "-history_id"), 17 | "verbose_name": "historical Câmara de Vereadores - Lista de Presença", 18 | "verbose_name_plural": "historical Câmara de Vereadores - Listas de Presença", 19 | }, 20 | ), 21 | migrations.AlterModelOptions( 22 | name="historicalcitycouncilbid", 23 | options={ 24 | "get_latest_by": ("history_date", "history_id"), 25 | "ordering": ("-history_date", "-history_id"), 26 | "verbose_name": "historical Câmara de Vereadores - Licitação", 27 | "verbose_name_plural": "historical Câmara de Vereadores - Licitações", 28 | }, 29 | ), 30 | migrations.AlterModelOptions( 31 | name="historicalcitycouncilcontract", 32 | options={ 33 | "get_latest_by": ("history_date", "history_id"), 34 | "ordering": ("-history_date", "-history_id"), 35 | "verbose_name": "historical Câmara de Vereadores - Contrato", 36 | "verbose_name_plural": "historical Câmara de Vereadores - Contratos", 37 | }, 38 | ), 39 | migrations.AlterModelOptions( 40 | name="historicalcitycouncilexpense", 41 | options={ 42 | "get_latest_by": ("history_date", "history_id"), 43 | "ordering": ("-history_date", "-history_id"), 44 | "verbose_name": "historical Câmara de Vereadores - Despesa", 45 | "verbose_name_plural": "historical Câmara de Vereadores - Despesas", 46 | }, 47 | ), 48 | migrations.AlterModelOptions( 49 | name="historicalcitycouncilrevenue", 50 | options={ 51 | "get_latest_by": ("history_date", "history_id"), 52 | "ordering": ("-history_date", "-history_id"), 53 | "verbose_name": "historical Câmara de Vereadores - Receita", 54 | "verbose_name_plural": "historical Câmara de Vereadores - Receitas", 55 | }, 56 | ), 57 | migrations.AlterField( 58 | 
# Portuguese month names (lower case) mapped to month numbers; "marco" covers
# titles written without the cedilla.
_MONTHS = {
    "janeiro": 1,
    "fevereiro": 2,
    "março": 3,
    "marco": 3,
    "abril": 4,
    "maio": 5,
    "junho": 6,
    "julho": 7,
    "agosto": 8,
    "setembro": 9,
    "outubro": 10,
    "novembro": 11,
    "dezembro": 12,
}

# Matches "<day> DE <word> DE <4-digit year>", case-insensitively.
_DATE_IN_TITLE_PATTERN = re.compile(r"(\d+) DE (\w+) DE (\d{4})", re.IGNORECASE)


def save_gazette(item):
    """Save official gazettes of the executive branch from 2015 onwards.

    The gazette is looked up by date, power and year/edition and is created
    or updated with the crawl metadata. Files are stored only when the
    gazette was just created. Each event of ``item["events"]`` is fetched or
    created. Returns the gazette.
    """
    gazette, created = Gazette.objects.update_or_create(
        date=item["date"],
        power=item["power"],
        year_and_edition=item["year_and_edition"],
        defaults={
            "crawled_at": item["crawled_at"],
            "crawled_from": item["crawled_from"],
        },
    )

    if created and item.get("files"):
        content_type = get_content_type_for_model(gazette)
        for file_ in item["files"]:
            save_file(file_, content_type, gazette.pk)

    for event in item["events"]:
        GazetteEvent.objects.get_or_create(
            gazette=gazette,
            title=event["title"],
            secretariat=event["secretariat"],
            crawled_from=item["crawled_from"],
            summary=event["summary"],
            defaults={"crawled_at": item["crawled_at"]},
        )
    return gazette


def save_legacy_gazette(item):
    """Save official gazettes of the executive branch from before 2015.

    Those gazettes were published on a different website than the current
    one and also in newspapers. They also had a different format, with one
    file per event (decree, law etc.). Some have no date (especially those
    from 2010), hence the attempt to extract the date from the title.
    """

    notes = ""
    if item["date"] is None:
        extracted_date = _extract_date(item["title"])
        if extracted_date:
            item["date"] = extracted_date
            notes = "Data extraída do título."

    gazette, created = Gazette.objects.get_or_create(
        date=item["date"],
        power="executivo",
        crawled_from=item["crawled_from"],
        is_legacy=True,
        defaults={"crawled_at": item["crawled_at"], "notes": notes},
    )

    if created and item.get("files"):
        content_type = get_content_type_for_model(gazette)
        for file_ in item["files"]:
            save_file(file_, content_type, gazette.pk)

    GazetteEvent.objects.create(
        gazette=gazette,
        title=item["title"],
        crawled_from=item["crawled_from"],
        summary=item["details"],
        published_on=item["published_on"],
        crawled_at=item["crawled_at"],
    )
    return gazette


def _extract_date(str_date):
    """Return the first valid "<day> DE <month> DE <year>" date in a string.

    Candidates whose middle word is not a known month name, or whose numbers
    do not form a real calendar date, are skipped instead of raising
    ``KeyError`` / ``ValueError``. Returns ``None`` when ``str_date`` is
    ``None`` or contains no usable date.
    """
    if str_date is None:
        return None
    for match in _DATE_IN_TITLE_PATTERN.finditer(str_date):
        month = _MONTHS.get(match.group(2).lower())
        if month is None:
            # e.g. "2 DE LEI DE 2010": the pattern matched, but not a date.
            continue
        try:
            return date(int(match.group(3)), month, int(match.group(1)))
        except ValueError:
            # e.g. "31 DE FEVEREIRO DE 2010" or a year of "0000".
            continue
    return None
# Module-level S3 client used by the command below.
client = get_s3_client(settings)


def build_path(s3_filepath, unit, category, filename):
    """Build a document path next to the JSON listing it was read from.

    The last segment of ``s3_filepath`` (the JSON file name) is dropped and
    ``unit``, ``category`` and ``filename`` are appended, all joined by "/".
    """
    parts = s3_filepath.split("/")
    parts.pop()  # drop the JSON file name from the list
    parts.extend([unit, category, filename])
    return "/".join(parts)


class Command(BaseCommand):
    """Load TCM-BA documents described by a JSON file stored in S3.

    Positional arguments:
        s3_path: path of the JSON file inside the bucket.

    Options:
        --drop-all: after an interactive confirmation, delete every
            ``TCMBADocument`` before loading.
    """

    help = "Importa documentos do TCM-BA em um bucket S3."

    def add_arguments(self, parser):
        """Register the ``s3_path`` argument and the ``--drop-all`` flag."""
        parser.add_argument("s3_path")
        parser.add_argument("--drop-all", action="store_true")

    def echo(self, text, style=None):
        """Write ``text`` to stdout, passing it through ``style`` when given."""
        self.stdout.write(style(text) if style else text)

    def warn(self, text):
        """Write ``text`` using the WARNING style."""
        return self.echo(text, self.style.WARNING)

    def success(self, text):
        """Write ``text`` using the SUCCESS style."""
        return self.echo(text, self.style.SUCCESS)

    def handle(self, *args, **options):
        """Download the JSON listing and create one document per entry.

        A ``File`` is created only for documents that did not exist yet.
        Documents that already exist are reported and counted; the count is
        printed at the end.
        """
        self.echo(f"Caminho no S3: {options.get('s3_path')}")

        file_items = client.download_file(options.get("s3_path"))
        # The original read the file without ever closing it.
        with open(file_items) as json_file:
            json_items = json.load(json_file)

        public_view_url = "https://e.tcm.ba.gov.br/epp/ConsultaPublica/listView.seam"

        if options.get("drop_all"):
            confirmation = input("Apagar todos os arquivos do TCM-BA? s/n ")
            if confirmation.lower() in ["s", "y"]:
                TCMBADocument.objects.all().delete()

        failed = 0
        for item in json_items:
            path = build_path(
                options.get("s3_path"), item["unit"], item["category"], item["filename"]
            )
            s3_url = f"https://dadosabertosdefeira.s3.eu-central-1.amazonaws.com/{path}"
            s3_file_path = f"s3://dadosabertosdefeira/{path}"

            document, created = TCMBADocument.objects.get_or_create(
                year=item["year"],
                month=item["month"],
                period=item["period"].lower(),
                category=item["category"],
                unit=item["unit"],
                inserted_at=from_str_to_date(item["inserted_at"]),
                inserted_by=item["inserted_by"],
                original_filename=item["original_filename"],
                crawled_from=public_view_url,
                defaults={
                    # The stored timestamp is tagged as UTC.
                    "crawled_at": datetime.fromisoformat(item["crawled_at"]).replace(
                        tzinfo=timezone.utc
                    ),
                },
            )
            content_type = get_content_type_for_model(document)
            if created:
                _, file_created = File.objects.get_or_create(
                    url=public_view_url,
                    content_type=content_type,
                    object_id=document.pk,
                    s3_url=s3_url,
                    s3_file_path=s3_file_path,
                    original_filename=item["original_filename"],
                )
                if not file_created:
                    self.warn(f"Arquivo já existe: {document.pk} - {item}")
            else:
                self.warn(f"Documento já existe: {document.pk} - {item}")
                failed += 1
        self.warn(f"Warnings: {failed}")
class CityHallContractItem(BaseModel):
    """Validation schema for a city hall contract item.

    Only ``contract_id`` is required. Both dates accept ``dd/mm/YYYY`` or
    ``dd/mm/yy``.
    """

    contract_id = StringType(required=True)
    starts_at = DateType(formats=("%d/%m/%Y", "%d/%m/%y"))
    summary = StringType()
    contractor_document = StringType()
    contractor_name = StringType()
    value = StringType()
    ends_at = DateType(formats=("%d/%m/%Y", "%d/%m/%y"))
    files = ListType(StringType)


class CityHallBidHistoryType(BaseType):
    """Type used for the entries of ``CityHallBidItem.history``."""

    # NOTE(review): these are declared on a BaseType subclass, not a Model;
    # confirm whether schematics validates them as fields.
    event = StringType()
    published_at = DateTimeType()
    url = URLType()


class CityHallBidItem(BaseModel):
    """Validation schema for a city hall bid item; no field is required here."""

    public_agency = StringType()
    month = IntType(min_value=1, max_value=12)
    year = IntType(min_value=1873)  # when Feira became a city :)
    description = StringType()
    history = ListType(DictType(CityHallBidHistoryType))
    codes = StringType()
    modality = StringType()
    session_at = DateTimeType()
    files = ListType(StringType)


class CityHallPaymentsItem(BaseModel):
    """Validation schema for a city hall payment item.

    ``company_or_person``, ``value`` and ``document`` are required. Both
    dates accept ``dd/mm/YYYY`` or ``dd/mm/yy``.
    """

    published_at = DateType(formats=("%d/%m/%Y", "%d/%m/%y"))
    phase = StringType()
    company_or_person = StringType(required=True)
    value = StringType(required=True)
    number = StringType()
    document = StringType(required=True)
    date = DateType(formats=("%d/%m/%Y", "%d/%m/%y"))
    process_number = StringType()
    summary = StringType()
    group = StringType()
    action = StringType()
    function = StringType()
    subfunction = StringType()
    type_of_process = StringType()
    resource = StringType()


class TCMBADocumentItem(Model):
    """Validation schema for a TCM-BA document item.

    Derives from ``Model`` directly and declares its own required
    ``crawled_at``; it has no ``crawled_from`` or ``git_commit`` field.
    """

    crawled_at = DateTimeType(required=True)
    category = StringType()
    filename = StringType(required=True)
    original_filename = StringType(required=True)
    filepath = StringType(required=True)
    inserted_by = StringType()
    inserted_at = DateType(formats=("%d/%m/%Y", "%d/%m/%y"))
    unit = StringType(required=True)
    month = StringType()
    year = StringType()
    period = StringType()
class S3Client:
    """Small wrapper around an S3 client bound to one bucket and folder.

    Attributes:
        client: object exposing ``put_object`` and ``download_fileobj``.
        bucket: bucket name.
        bucket_folder: first path segment of every uploaded key.
        bucket_region: region used to build the public URL.
    """

    __slots__ = ("client", "bucket", "bucket_folder", "bucket_region")

    def __init__(self, client, bucket, bucket_folder, bucket_region):
        self.client = client
        self.bucket = bucket
        self.bucket_folder = bucket_folder
        self.bucket_region = bucket_region

    def _upload_to_s3(self, temp_file_path, bucket_file_path):
        """Send the local file at ``temp_file_path`` to ``bucket_file_path``."""
        with open(temp_file_path, "rb") as body_file:
            self.client.put_object(
                Bucket=self.bucket,
                Key=bucket_file_path,
                Body=body_file,
            )

    def upload_file(self, location_or_url, relative_file_path, prefix=""):
        """Upload a local file or the content of a URL to the bucket.

        Args:
            location_or_url: path of an existing local file; anything that
                is not an existing path is treated as a URL and downloaded.
            relative_file_path: key prefix below ``<bucket_folder>/files/``.
                It is concatenated as-is with the file name, so it should
                end with "/".
            prefix: optional prefix for the name of a downloaded file.

        Returns:
            Tuple ``(url, bucket_file_path)``.

        The local file is removed after the upload, whether it was
        downloaded or already existed.
        """
        location = Path(location_or_url)
        if not location.exists():
            # Not a local file, so assume it is a URL.
            file_name, temp_file_path = self.create_temp_file(
                location_or_url, relative_file_path, prefix
            )
        else:
            file_name, temp_file_path = location.name, str(location)

        bucket_file_path = f"{self.bucket_folder}/files/{relative_file_path}"
        bucket_file_path = f"{bucket_file_path}{file_name}"
        url = (
            f"https://{self.bucket}.s3.{self.bucket_region}.amazonaws.com/"
            f"{bucket_file_path}"
        )
        self._upload_to_s3(temp_file_path, bucket_file_path)
        self.delete_temp_file(temp_file_path)

        return url, bucket_file_path

    @staticmethod
    def create_temp_file(url, relative_file_path="", prefix="", timeout=60):
        """Download ``url`` into ``<cwd>/data/tmp/<relative_file_path>``.

        Args:
            url: address to download; the text after its last "/" becomes
                the file name.
            relative_file_path: sub-directory of the temporary folder.
            prefix: when given, the file name becomes ``<prefix>-<name>``.
            timeout: seconds to wait for the server, so a stalled server
                cannot hang the caller forever.

        Returns:
            Tuple ``(temp_file_name, temp_file_path)``, both strings.

        Raises:
            requests.HTTPError: when the server answers with an error
                status. Previously the error page was saved as the file.
        """
        temporary_directory = Path(f"{Path.cwd()}/data/tmp/{relative_file_path}")
        temporary_directory.mkdir(parents=True, exist_ok=True)

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        start_index = url.rfind("/") + 1
        temp_file_name = f"{url[start_index:]}"
        if prefix:
            temp_file_name = f"{prefix}-{temp_file_name}"
        # Joining with Path keeps the file inside the directory created
        # above even when relative_file_path has no trailing slash.
        temp_file_path = str(temporary_directory / temp_file_name)
        with open(temp_file_path, "wb") as tmp_file:
            tmp_file.write(response.content)
        return temp_file_name, temp_file_path

    def download_file(self, s3_file_path):
        """Download ``s3_file_path`` into ``<cwd>/data/tmp/``; return the path."""
        temporary_directory = f"{Path.cwd()}/data/tmp/"
        Path(temporary_directory).mkdir(parents=True, exist_ok=True)

        start_index = s3_file_path.rfind("/") + 1
        file_name = s3_file_path[start_index:]

        local_path = f"{temporary_directory}{file_name}"
        with open(local_path, "wb") as file_:
            self.client.download_fileobj(self.bucket, s3_file_path, file_)

        return local_path

    @staticmethod
    def delete_temp_file(temp_file_path):
        """Remove the file at ``temp_file_path``."""
        Path(temp_file_path).unlink()


class FakeS3Client(S3Client):
    """Stand-in client that never talks to S3."""

    def _upload_to_s3(self, temp_file_path, bucket_file_path):
        """Do nothing instead of uploading."""
        pass

    def download_file(self, s3_file_path):
        """Return the expected local path without downloading anything."""
        return f"{Path.cwd()}/data/tmp/{s3_file_path}"


def get_s3_client(settings):
    """Return the S3 client for the current environment.

    Outside production (``DJANGO_CONFIGURATION`` different from "Prod") a
    ``FakeS3Client`` backed by a mock is returned. In production a real
    client is built from the AWS values in ``settings``.
    """
    if os.getenv("DJANGO_CONFIGURATION") != "Prod":
        from unittest.mock import Mock

        client = Mock()
        return FakeS3Client(client, "teste", "maria-quiteria-local", "brasil")

    client = boto3.client(
        service_name="s3",
        region_name=settings.AWS_S3_REGION,
        aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
    )
    return S3Client(
        client,
        settings.AWS_S3_BUCKET,
        settings.AWS_S3_BUCKET_FOLDER,
        settings.AWS_S3_REGION,
    )
class Command(BaseCommand):
    """Run the spiders listed in ``handle`` and store each item they emit.

    Options:
        --drop-all: delete the stored records before crawling.
        --scrapy-args: JSON object merged into the Scrapy settings.
    """

    help = "Executa todos os coletores e salva os itens recentes no banco."

    def add_arguments(self, parser):
        """Register the ``--drop-all`` and ``--scrapy-args`` options."""
        drop_all_help = "Limpa o banco antes de iniciar a coleta."
        parser.add_argument("--drop-all", action="store_true", help=drop_all_help)
        parser.add_argument("--scrapy-args")

    def echo(self, text, style=None):
        """Write ``text`` to stdout, passing it through ``style`` when given."""
        self.stdout.write(style(text) if style else text)

    def warn(self, text):
        """Write ``text`` using the WARNING style."""
        return self.echo(text, self.style.WARNING)

    def success(self, text):
        """Write ``text`` using the SUCCESS style."""
        return self.echo(text, self.style.SUCCESS)

    def save(self, signal, sender, item, response, spider):
        """Signal handler: pass ``item`` to the save helper for its type.

        Items of any other type are ignored.
        """
        if isinstance(item, CityCouncilAttendanceListItem):
            save_attendance_list(item)
        if isinstance(item, CityCouncilMinuteItem):
            save_minute(item)
        if isinstance(item, CityHallBidItem):
            save_bid(item)
        if isinstance(item, LegacyGazetteItem):
            save_legacy_gazette(item)
        if isinstance(item, GazetteItem):
            save_gazette(item)

    def handle(self, *args, **options):
        """Optionally wipe stored data, then schedule and run the spiders."""
        if options.get("drop_all"):
            self.warn("Apagando registros...")
            CityCouncilAttendanceList.objects.all().delete()
            CityCouncilMinute.objects.all().delete()
            CityHallBid.objects.all().delete()
            Gazette.objects.all().delete()
            GazetteEvent.objects.all().delete()
            File.objects.all().delete()

        # ``save`` is connected before any crawler is created.
        dispatcher.connect(self.save, signal=signals.item_passed)
        os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings"
        settings = get_project_settings()

        if options.get("scrapy_args"):
            # Values given on the command line override the project settings.
            scrapy_args = json.loads(options.get("scrapy_args"))
            settings.update(scrapy_args)

        process = CrawlerProcess(settings=settings)
        # Each spider receives the date of the last item already stored.
        process.crawl(
            AttendanceListSpider,
            start_from_date=CityCouncilAttendanceList.last_collected_item_date(),
        )
        process.crawl(
            MinuteSpider, start_from_date=CityCouncilMinute.last_collected_item_date()
        )
        process.crawl(
            BidsSpider, start_from_date=CityHallBid.last_collected_item_date()
        )

        last_collected_gazette = Gazette.last_collected_item_date()
        # NOTE(review): nesting reconstructed from a flattened listing; the
        # legacy spider is assumed to be the only conditional call. Confirm.
        if last_collected_gazette is None:
            # No gazette stored yet, so the legacy gazettes are crawled too.
            process.crawl(LegacyGazetteSpider)
        process.crawl(
            ExecutiveAndLegislativeGazetteSpider,
            start_from_date=last_collected_gazette,
        )

        self.warn("Iniciando a coleta...")
        process.start()
        self.success("Pronto!")
# Validates the host part of a URL: an optional "user:password@" prefix, then
# a dotted domain name or "localhost", then an optional ":port".
DOMAIN_FORMAT = re.compile(
    r"(?:^(\w{1,255}):(.{1,255})@|^)"
    r"(?:(?:(?=\S{0,253}(?:$|:))"
    r"((?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+"
    r"(?:[a-z0-9]{1,63})))"
    r"|localhost)"
    r"(:\d{1,5})?",
    re.IGNORECASE,
)

# Compiled once at import time instead of on every call.
_CONTRACT_NUMBER_PATTERN = re.compile(r"\d+[-|\/]\d{4}?[-|\/]\d+C|\d+-\d{4}|\d+")
_DATE_PATTERN = re.compile(r"\d+\/\d+\/\d+")


logger = logging.getLogger(__name__)


def replace_query_param(url, field, value):
    """Replace the numeric value of ``field=<digits>`` in ``url`` by ``value``.

    ``field`` is matched literally (regex metacharacters are escaped) and
    ``value`` is inserted verbatim, so neither is interpreted as regex
    syntax. Every occurrence is replaced; ``url`` is returned unchanged when
    there is none.
    """
    pattern = r"{}=\d+".format(re.escape(field))
    replacement = "{}={}".format(field, value)
    # A callable replacement keeps backslashes in ``value`` literal.
    return re.sub(pattern, lambda _match: replacement, url)


def identify_contract_id(text):
    """Return the first contract-number-like token in ``text``, or ``None``.

    Accepted shapes, in order of preference at each position:
    ``<digits>-<4 digits>-<digits>C`` (also with "/" or "|" as separators),
    ``<digits>-<4 digits>`` and plain ``<digits>``.
    """
    result = _CONTRACT_NUMBER_PATTERN.findall(text)
    if result:
        return result[0]


def extract_param(url, param):
    """Return the first value of query parameter ``param``, or ``None``."""
    parsed = urlparse(url)
    try:
        value = parse_qs(parsed.query)[param]
        return value[0]
    except KeyError:
        return


def months_and_years(start_date, end_date):
    """List the ``(month, year)`` pairs after ``start_date`` up to ``end_date``.

    The month of ``start_date`` is excluded and the month of ``end_date`` is
    included. The single exception: when both dates fall in the same month,
    that month is returned. An empty list is returned when ``end_date`` is
    in an earlier month than ``start_date``.
    """
    last = (end_date.year, end_date.month)
    if (start_date.year, start_date.month) == last:
        return [(start_date.month, start_date.year)]

    pairs = []
    year, month = start_date.year, start_date.month
    while True:
        # Step to the next calendar month, rolling over the year.
        month += 1
        if month > 12:
            year, month = year + 1, 1
        if (year, month) > last:
            break
        pairs.append((month, year))
    return pairs


def extract_date(str_with_date):
    """Return the first ``d/m/y``-looking date in the string, or ``None``.

    The matched text is converted by ``from_str_to_date``.
    """
    result = _DATE_PATTERN.search(str_with_date)
    if result:
        return from_str_to_date(result.group(0))
    return


def is_url(url):
    """Tell whether ``url`` looks like a valid URL.

    Empty values, strings longer than 2048 characters, and URLs without a
    domain or with a domain that does not match ``DOMAIN_FORMAT`` are
    rejected. A value without a scheme is retried with ``http://`` in front
    of it. Every rejection except the empty one is logged as a warning.
    """
    if not url:
        return False

    url = url.strip()

    if len(url) > 2048:
        logger.warning(
            f"URL ultrapassa limite de 2048 caracteres (tamanho = {len(url)})"
        )
        return False

    result = urlparse(url)
    scheme = result.scheme
    domain = result.netloc

    if not scheme:
        logger.warning("Nenhum URL scheme especificado")
        return is_url(f"http://{url}")

    if not domain:
        logger.warning("Nenhum domínio especificado")
        return False

    if not re.fullmatch(DOMAIN_FORMAT, domain):
        logger.warning(f"Domínio inválido ({domain})")
        return False

    return True


def strip_accents(string):
    """Return ``string`` without combining accent marks (``None`` stays ``None``)."""
    if string is None:
        return
    return "".join(
        char
        for char in unicodedata.normalize("NFD", string)
        if unicodedata.category(char) != "Mn"
    )


def datetime_utcnow_aware() -> datetime:
    """Return the current UTC date and time, with timezone information."""
    # datetime.utcnow() is deprecated; this yields the same aware value.
    return datetime.now(timezone.utc)


def get_git_commit() -> str:
    """Return the hash ID of the current commit, read from ``GIT_REV``.

    An unset variable or the literal text "None" yields an empty string.
    """
    git_rev = os.environ.get("GIT_REV")
    if git_rev in [None, "None"]:
        return ""
    return git_rev


def get_status(status):
    """Return the label for a status.

    The text is trimmed, stripped of accents, lower-cased and has its spaces
    replaced by underscores; an empty or missing status yields "".
    Original note: consulted on 01/01/2022.
    """
    if not status:
        return ""
    status = strip_accents(status.strip())
    return status.lower().replace(" ", "_")
atendendo as caracteristicas da Camara Municipal de Feira de Santana ? Bahia, conforme especificacoes, quantitativos e condicoes estabelecidas no Edital e seus Anexos", 12 | "dtLic": "2021-05-05 09:00:00", 13 | "arquivos": [ 14 | { 15 | "codArqLic": "1588", 16 | "codLic": "229", 17 | "dsArqLic": "Aviso PP 001-2021 - Publicacao.doc", 18 | "caminhoArqLic": "https:\/\/www.transparencia.feiradesantana.ba.leg.br\/adm\/upload\/licitacao\/Aviso PP 001-2021 - Publicacao.doc" 19 | }, 20 | { 21 | "codArqLic": "1590", 22 | "codLic": "229", 23 | "dsArqLic": "Edital Lic 001-2021 - Sist Gerenc Legislativo - PP 001-2021.doc", 24 | "caminhoArqLic": "https:\/\/www.transparencia.feiradesantana.ba.leg.br\/adm\/upload\/licitacao\/Edital Lic 001-2021 - Sist Gerenc Legislativo - PP 001-2021.doc" 25 | } 26 | ] 27 | } 28 | ], 29 | "alteracoesLicitacao": [ 30 | { 31 | "codLic": "229", 32 | "codTipoLic": "7", 33 | "numLic": "001\/2021", 34 | "numTipoLic": "001\/2021", 35 | "objetoLic": "Contratacao de empresa especializada na prestacao de servicos tecnicos na area de Solucoes Integradas em Tecnologia, para fornecimento de licenca de uso de Software de Gerenciamento de Processo Legislativo (sistema de protocolo legislativo WEB e DESK, sistema de tramitacao legislativa WEB e DESK, sistema de bancos de leis WEB, sistema inibidor de multiplicidade de materias, sistema de transparencia legislativa, sistema de painel eletronico, sistema em plataforma mobile de votacao em plenario e sistema em plataforma mobile de consulta de projetos e seus tramites), incluindo instalacao, configuracao, treinamento e parametrizacao, atendendo as caracteristicas da Camara Municipal de Feira de Santana ? 
class TestS3Client:
    """Exercise the client returned by ``get_s3_client(settings)``."""

    def test_upload_file(self):
        """Uploading from a URL returns the S3 URL and the bucket path."""
        relative_path = "TestModel/2020/10/23/"
        s3_url, bucket_file_path = client.upload_file(
            "https://www.google.com/robots.txt", relative_path
        )

        expected_file_path = f"maria-quiteria-local/files/{relative_path}robots.txt"
        # Build the expectation from the expected path, not from the value
        # returned by the code under test, so both assertions are independent.
        expected_s3_url = f"https://teste.s3.brasil.amazonaws.com/{expected_file_path}"
        real_path = f"{os.getcwd()}/data/tmp/{expected_file_path}"

        assert s3_url == expected_s3_url
        assert bucket_file_path == expected_file_path
        assert Path(real_path).exists() is False

    def test_create_temp_file(self):
        """A temp file keeps the URL's file name and can be deleted."""
        url = (
            "http://www.feiradesantana.ba.gov.br/licitacoes/"
            "respostas/4924SUSPENS%C3%83O.pdf"
        )
        temp_file_name, temp_file_path = client.create_temp_file(url)

        assert temp_file_name == "4924SUSPENS%C3%83O.pdf"
        assert Path(temp_file_path).is_file() is True

        client.delete_temp_file(temp_file_path)
        assert Path(temp_file_path).is_file() is False

    def test_create_temp_file_with_prefix(self):
        """The optional prefix is joined to the file name with a dash."""
        url = (
            "http://www.feiradesantana.ba.gov.br/licitacoes/"
            "respostas/4924SUSPENS%C3%83O.pdf"
        )
        prefix = "eu-sou-um-checksum"
        expected_file_name = f"{prefix}-4924SUSPENS%C3%83O.pdf"
        temp_file_name, temp_file_path = client.create_temp_file(url, prefix=prefix)

        assert temp_file_name == expected_file_name
        assert Path(temp_file_path).is_file() is True

        client.delete_temp_file(temp_file_path)
        assert Path(temp_file_path).is_file() is False

    def test_create_temp_file_with_relative_file_path(self):
        """A relative file path does not change the returned file name."""
        url = (
            "http://www.feiradesantana.ba.gov.br/licitacoes/"
            "respostas/4924SUSPENS%C3%83O.pdf"
        )
        relative_file_path = "extra/"
        temp_file_name, temp_file_path = client.create_temp_file(
            url, relative_file_path=relative_file_path
        )

        assert temp_file_name == "4924SUSPENS%C3%83O.pdf"
        assert Path(temp_file_path).is_file() is True

        client.delete_temp_file(temp_file_path)

        assert Path(temp_file_path).is_file() is False

    def test_download_file(self):
        """A previously uploaded file can be downloaded to the temp folder."""
        relative_path = "TestModel/2020/10/23/"
        s3_url, relative_file_path = client.upload_file(
            "https://www.google.com/robots.txt", relative_path
        )

        expected_file_path = f"maria-quiteria-local/files/{relative_path}robots.txt"
        expected_s3_url = f"https://teste.s3.brasil.amazonaws.com/{expected_file_path}"
        real_path = f"{os.getcwd()}/data/tmp/{expected_file_path}"

        assert s3_url == expected_s3_url
        assert relative_file_path == expected_file_path
        assert Path(real_path).exists() is False

        absolute_file_path = client.download_file(relative_file_path)

        assert absolute_file_path == real_path

    def test_upload_file_from_local_path(self):
        """Uploading a local file removes the local copy afterwards."""
        local_path = Path("conteudo.txt")
        local_path.write_text("Testando")
        relative_path = "TestModel/2021/06/23/"
        s3_url, bucket_file_path = client.upload_file(str(local_path), relative_path)

        expected_file_path = f"maria-quiteria-local/files/{relative_path}conteudo.txt"
        # Same independence rule as in test_upload_file.
        expected_s3_url = f"https://teste.s3.brasil.amazonaws.com/{expected_file_path}"

        assert s3_url == expected_s3_url
        assert bucket_file_path == expected_file_path
        assert Path(local_path).exists() is False
class CityCouncilAgendaView(ListAPIView):
    """List city council agenda entries.

    Optional query parameters: ``query`` (case-insensitive match on
    ``details``), ``start_date`` and ``end_date`` (inclusive bounds on
    ``date``).
    """

    queryset = CityCouncilAgenda.objects.all()
    serializer_class = CityCouncilAgendaSerializer

    def get_queryset(self):
        params = self.request.query_params
        candidates = {
            "details__icontains": params.get("query"),
            "date__gte": params.get("start_date"),
            "date__lte": params.get("end_date"),
        }
        # Only parameters that were actually supplied become filters.
        lookups = {lookup: value for lookup, value in candidates.items() if value}
        return self.queryset.filter(**lookups)


class CityCouncilAttendanceListView(ListAPIView):
    """List city council attendance records.

    Optional query parameters: ``query`` (case-insensitive match on
    ``council_member``), ``status`` (exact match), ``start_date`` and
    ``end_date`` (inclusive bounds on ``date``).
    """

    queryset = CityCouncilAttendanceList.objects.all()
    serializer_class = CityCouncilAttendanceListSerializer

    def get_queryset(self):
        params = self.request.query_params
        candidates = {
            "council_member__icontains": params.get("query"),
            "status": params.get("status"),
            "date__gte": params.get("start_date"),
            "date__lte": params.get("end_date"),
        }
        # Only parameters that were actually supplied become filters.
        lookups = {lookup: value for lookup, value in candidates.items() if value}
        return self.queryset.filter(**lookups)


class CityCouncilMinuteView(ListAPIView):
    """List city council minutes.

    Optional query parameters: ``query`` (case-insensitive match on
    ``title``), ``start_date`` and ``end_date`` (inclusive bounds on
    ``date``).
    """

    queryset = CityCouncilMinute.objects.all()
    serializer_class = CityCouncilMinuteSerializer

    def get_queryset(self):
        params = self.request.query_params
        candidates = {
            "title__icontains": params.get("query"),
            "date__gte": params.get("start_date"),
            "date__lte": params.get("end_date"),
        }
        # Only parameters that were actually supplied become filters.
        lookups = {lookup: value for lookup, value in candidates.items() if value}
        return self.queryset.filter(**lookups)


class GazetteView(ReadOnlyModelViewSet):
    """Read-only endpoints for gazettes, filtered through ``GazetteFilter``."""

    queryset = Gazette.objects.all()
    serializer_class = GazetteSerializer
    filterset_class = GazetteFilter
    filter_backends = [SearchFilter, DjangoFilterBackend]


class CityHallBidView(ListAPIView):
    """List city hall bids with their events and files prefetched."""

    queryset = CityHallBid.objects.prefetch_related("events", "files")
    serializer_class = CityHallBidSerializer
    filterset_class = CityHallBidFilter
    filter_backends = [SearchFilter, DjangoFilterBackend]


class FrontendEndpoint(APIView):
    """Serve the ``AVAILABLE_ENDPOINTS_BY_PUBLIC_AGENCY`` constant as JSON."""

    renderer_classes = [JSONRenderer]

    def get(self, request, format=None):
        payload = AVAILABLE_ENDPOINTS_BY_PUBLIC_AGENCY
        return Response(payload)
30 | name="citycouncilexpense", 31 | options={ 32 | "get_latest_by": "date", 33 | "verbose_name": "Câmara de Vereadores - Despesa", 34 | "verbose_name_plural": "Câmara de Vereadores - Despesas", 35 | }, 36 | ), 37 | migrations.AlterModelOptions( 38 | name="citycouncilminute", 39 | options={ 40 | "get_latest_by": "date", 41 | "verbose_name": "Câmara de Vereadores - Atas", 42 | "verbose_name_plural": "Câmara de Vereadores - Atas", 43 | }, 44 | ), 45 | migrations.AlterModelOptions( 46 | name="cityhallbid", 47 | options={ 48 | "get_latest_by": "session_at", 49 | "verbose_name": "Prefeitura - Licitação", 50 | "verbose_name_plural": "Prefeitura - Licitações", 51 | }, 52 | ), 53 | migrations.AlterModelOptions( 54 | name="gazette", 55 | options={ 56 | "get_latest_by": "date", 57 | "ordering": [ 58 | django.db.models.expressions.OrderBy( 59 | django.db.models.expressions.F("date"), 60 | descending=True, 61 | nulls_last=True, 62 | ) 63 | ], 64 | "verbose_name": "Diário Oficial", 65 | "verbose_name_plural": "Diários Oficiais", 66 | }, 67 | ), 68 | migrations.RemoveField( 69 | model_name="citycouncilexpense", 70 | name="type_of_process", 71 | ), 72 | migrations.AddField( 73 | model_name="citycouncilexpense", 74 | name="budget_unit", 75 | field=models.PositiveIntegerField(default=101), 76 | ), 77 | migrations.AddField( 78 | model_name="citycouncilexpense", 79 | name="excluded", 80 | field=models.BooleanField(default=False), 81 | ), 82 | migrations.AddField( 83 | model_name="citycouncilexpense", 84 | name="modality", 85 | field=models.CharField( 86 | blank=True, 87 | choices=[ 88 | ("convenio", "Convênio"), 89 | ("tomada_de_precos", "Tomada de Preço"), 90 | ("pregao", "Pregão"), 91 | ("inexigibilidade", "Inexigibilidade"), 92 | ("convite", "Convite"), 93 | ("concorrencia", "Concorrência"), 94 | ("dispensa", "Dispensa"), 95 | ("isento", "Isento"), 96 | ], 97 | max_length=50, 98 | null=True, 99 | ), 100 | ), 101 | migrations.AddField( 102 | model_name="citycouncilexpense", 103 | 
class Migration(migrations.Migration):
    """Create ``Gazette`` and ``GazetteEvent`` and adjust ``CityCouncilAgenda``.

    Besides the two new models, this migration adds ``created_at`` to
    ``citycouncilagenda`` and alters its ``crawled_at`` and ``crawled_from``
    fields.
    """

    # Must run after the initial schema of the ``datasets`` app.
    dependencies = [
        ("datasets", "0001_initial"),
    ]

    operations = [
        migrations.CreateModel(
            name="Gazette",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                ("crawled_at", models.DateTimeField()),
                ("crawled_from", models.URLField()),
                ("notes", models.TextField(blank=True, null=True)),
                ("date", models.DateField(null=True)),
                (
                    "power",
                    models.CharField(
                        choices=[
                            ("executivo", "Poder Executivo"),
                            ("legislativo", "Poder Legislativo"),
                        ],
                        max_length=25,
                    ),
                ),
                ("year_and_edition", models.CharField(max_length=100)),
                ("is_legacy", models.BooleanField(default=False)),
                ("file_url", models.URLField(blank=True, null=True)),
                ("file_content", models.TextField(blank=True, null=True)),
            ],
            options={
                "abstract": False,
            },
        ),
        # preserve_default=False: the default is used only while this
        # migration runs and is not kept on the field afterwards.
        migrations.AddField(
            model_name="citycouncilagenda",
            name="created_at",
            field=models.DateTimeField(
                auto_now_add=True, default=django.utils.timezone.now
            ),
            preserve_default=False,
        ),
        migrations.AlterField(
            model_name="citycouncilagenda",
            name="crawled_at",
            field=models.DateTimeField(),
        ),
        # NOTE(review): the one-off default of this URLField is a timestamp
        # callable, not a URL. Presumably an artifact of the makemigrations
        # prompt; confirm no existing row was backfilled with a date string.
        # Applied migrations should not be edited, so this is only flagged.
        migrations.AlterField(
            model_name="citycouncilagenda",
            name="crawled_from",
            field=models.URLField(default=django.utils.timezone.now),
            preserve_default=False,
        ),
        migrations.CreateModel(
            name="GazetteEvent",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                ("crawled_at", models.DateTimeField()),
                ("crawled_from", models.URLField()),
                ("notes", models.TextField(blank=True, null=True)),
                ("title", models.CharField(blank=True, max_length=300, null=True)),
                (
                    "secretariat",
                    models.CharField(blank=True, max_length=100, null=True),
                ),
                ("summary", models.TextField(blank=True, null=True)),
                (
                    "published_on",
                    models.CharField(blank=True, max_length=100, null=True),
                ),
                # Deleting a Gazette also deletes its events (CASCADE).
                (
                    "gazette",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="datasets.Gazette",
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
        ),
    ]
primary_key=True, 20 | serialize=False, 21 | verbose_name="ID", 22 | ), 23 | ), 24 | ( 25 | "created_at", 26 | models.DateTimeField(auto_now_add=True, verbose_name="Criado em"), 27 | ), 28 | ( 29 | "updated_at", 30 | models.DateTimeField(auto_now=True, verbose_name="Atualizado em"), 31 | ), 32 | ("date", models.DateField(verbose_name="Data alvo")), 33 | ( 34 | "source", 35 | models.CharField( 36 | choices=[ 37 | ("camara", "Câmara Municipal"), 38 | ("prefeitura", "Prefeitura"), 39 | ], 40 | db_index=True, 41 | max_length=20, 42 | verbose_name="Fonte", 43 | ), 44 | ), 45 | ( 46 | "succeed", 47 | models.BooleanField( 48 | null=True, verbose_name="Concluída com sucesso?" 49 | ), 50 | ), 51 | ("response", models.JSONField(null=True, verbose_name="Resposta")), 52 | ], 53 | ), 54 | migrations.AlterField( 55 | model_name="citycouncilcontract", 56 | name="external_code", 57 | field=models.PositiveIntegerField( 58 | db_index=True, unique=True, verbose_name="Código externo" 59 | ), 60 | ), 61 | migrations.AlterField( 62 | model_name="citycouncilcontract", 63 | name="value", 64 | field=models.DecimalField( 65 | decimal_places=2, max_digits=20, verbose_name="Valor" 66 | ), 67 | ), 68 | migrations.AlterField( 69 | model_name="citycouncilexpense", 70 | name="value", 71 | field=models.DecimalField( 72 | decimal_places=2, max_digits=20, verbose_name="Valor" 73 | ), 74 | ), 75 | migrations.AlterField( 76 | model_name="citycouncilrevenue", 77 | name="external_code", 78 | field=models.PositiveIntegerField( 79 | db_index=True, unique=True, verbose_name="Código externo" 80 | ), 81 | ), 82 | migrations.AlterField( 83 | model_name="citycouncilrevenue", 84 | name="value", 85 | field=models.DecimalField( 86 | decimal_places=2, max_digits=20, verbose_name="Valor" 87 | ), 88 | ), 89 | migrations.AlterField( 90 | model_name="historicalcitycouncilcontract", 91 | name="value", 92 | field=models.DecimalField( 93 | decimal_places=2, max_digits=20, verbose_name="Valor" 94 | ), 95 | ), 96 | 
class Migration(migrations.Migration):
    """Set the default ordering of the ``datasets`` models.

    Only model options change here (``ordering`` together with the existing
    ``get_latest_by`` and verbose names); no field is touched.
    """

    dependencies = [
        ("datasets", "0018_file_external_code"),
    ]

    operations = [
        migrations.AlterModelOptions(
            name="citycouncilagenda",
            options={
                "get_latest_by": "date",
                "ordering": ["-date"],
                "verbose_name": "Câmara de Vereadores - Agenda",
                "verbose_name_plural": "Câmara de Vereadores - Agendas",
            },
        ),
        migrations.AlterModelOptions(
            name="citycouncilattendancelist",
            options={
                "get_latest_by": "date",
                "ordering": ["-date"],
                "verbose_name": "Câmara de Vereadores - Lista de Presença",
                "verbose_name_plural": "Câmara de Vereadores - Listas de Presença",
            },
        ),
        # Newest session first; rows without a session date go last.
        migrations.AlterModelOptions(
            name="citycouncilbid",
            options={
                "get_latest_by": "session_at",
                "ordering": [
                    django.db.models.expressions.OrderBy(
                        django.db.models.expressions.F("session_at"),
                        descending=True,
                        nulls_last=True,
                    )
                ],
                "verbose_name": "Câmara de Vereadores - Licitação",
                "verbose_name_plural": "Câmara de Vereadores - Licitações",
            },
        ),
        migrations.AlterModelOptions(
            name="citycouncilcontract",
            options={
                "get_latest_by": "start_date",
                "ordering": ["-start_date"],
                "verbose_name": "Câmara de Vereadores - Contrato",
                "verbose_name_plural": "Câmara de Vereadores - Contratos",
            },
        ),
        migrations.AlterModelOptions(
            name="citycouncilexpense",
            options={
                "get_latest_by": "date",
                "ordering": ["-date"],
                "verbose_name": "Câmara de Vereadores - Despesa",
                "verbose_name_plural": "Câmara de Vereadores - Despesas",
            },
        ),
        migrations.AlterModelOptions(
            name="citycouncilminute",
            options={
                "get_latest_by": "date",
                "ordering": ["-date"],
                "verbose_name": "Câmara de Vereadores - Atas",
                "verbose_name_plural": "Câmara de Vereadores - Atas",
            },
        ),
        # Newest publication first; rows without a publication date go last.
        migrations.AlterModelOptions(
            name="citycouncilrevenue",
            options={
                "get_latest_by": "published_at",
                "ordering": [
                    django.db.models.expressions.OrderBy(
                        django.db.models.expressions.F("published_at"),
                        descending=True,
                        nulls_last=True,
                    )
                ],
                "verbose_name": "Câmara de Vereadores - Receita",
                "verbose_name_plural": "Câmara de Vereadores - Receitas",
            },
        ),
        # Newest session first; rows without a session date go last.
        migrations.AlterModelOptions(
            name="cityhallbid",
            options={
                "get_latest_by": "session_at",
                "ordering": [
                    django.db.models.expressions.OrderBy(
                        django.db.models.expressions.F("session_at"),
                        descending=True,
                        nulls_last=True,
                    )
                ],
                "verbose_name": "Prefeitura - Licitação",
                "verbose_name_plural": "Prefeitura - Licitações",
            },
        ),
        migrations.AlterModelOptions(
            name="file",
            options={
                "ordering": ["-created_at"],
                "verbose_name": "Arquivo",
                "verbose_name_plural": "Arquivos",
            },
        ),
    ]
class Migration(migrations.Migration):
    """Create ``TCMBADocument`` and widen the S3 columns of ``File``.

    ``file.s3_file_path`` becomes a 400-character field and ``file.s3_url``
    a 600-character field.
    """

    dependencies = [
        ("datasets", "0024_auto_20210326_1704"),
    ]

    operations = [
        migrations.CreateModel(
            name="TCMBADocument",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "created_at",
                    models.DateTimeField(auto_now_add=True, verbose_name="Criado em"),
                ),
                (
                    "updated_at",
                    models.DateTimeField(auto_now=True, verbose_name="Atualizado em"),
                ),
                ("crawled_at", models.DateTimeField(verbose_name="Coletado em")),
                ("crawled_from", models.URLField(verbose_name="Fonte")),
                (
                    "notes",
                    models.TextField(blank=True, null=True, verbose_name="Anotações"),
                ),
                (
                    "year",
                    models.PositiveIntegerField(db_index=True, verbose_name="Ano"),
                ),
                # Nullable, unlike ``year``.
                (
                    "month",
                    models.PositiveIntegerField(
                        db_index=True, null=True, verbose_name="Mês"
                    ),
                ),
                (
                    "period",
                    models.CharField(
                        choices=[("mensal", "Mensal"), ("anual", "Anual")],
                        db_index=True,
                        max_length=10,
                        verbose_name="Periodicidade",
                    ),
                ),
                (
                    "category",
                    models.CharField(
                        db_index=True, max_length=200, verbose_name="Categoria"
                    ),
                ),
                (
                    "unit",
                    models.CharField(
                        db_index=True, max_length=100, verbose_name="Unidade"
                    ),
                ),
                (
                    "inserted_at",
                    models.DateField(null=True, verbose_name="Inserido em"),
                ),
                (
                    "inserted_by",
                    models.CharField(
                        blank=True,
                        max_length=50,
                        null=True,
                        verbose_name="Inserido por",
                    ),
                ),
                (
                    "original_filename",
                    models.CharField(max_length=200, verbose_name="Nome do arquivo"),
                ),
            ],
            options={
                "verbose_name": "TCM-BA - Documento",
                "verbose_name_plural": "TCM-BA - Documentos",
                # Default ordering: year descending, then month descending.
                "ordering": [
                    django.db.models.expressions.OrderBy(
                        django.db.models.expressions.F("year"), descending=True
                    ),
                    django.db.models.expressions.OrderBy(
                        django.db.models.expressions.F("month"), descending=True
                    ),
                ],
                "get_latest_by": "inserted_at",
            },
        ),
        migrations.AlterField(
            model_name="file",
            name="s3_file_path",
            field=models.CharField(
                blank=True, max_length=400, null=True, verbose_name="Caminho interno"
            ),
        ),
        migrations.AlterField(
            model_name="file",
            name="s3_url",
            field=models.URLField(
                blank=True, max_length=600, null=True, verbose_name="URL externa"
            ),
        ),
    ]
("orcamentaria", "Orçamentária"), 67 | ("nao_orcamentaria", "Não-orçamentária"), 68 | ("transferencia", "Transferência"), 69 | ], 70 | db_index=True, 71 | max_length=20, 72 | verbose_name="Tipo da receita", 73 | ), 74 | ), 75 | ( 76 | "modality", 77 | models.CharField( 78 | blank=True, max_length=60, null=True, verbose_name="Modalidade" 79 | ), 80 | ), 81 | ("description", models.TextField(verbose_name="Descrição")), 82 | ( 83 | "value", 84 | models.DecimalField( 85 | decimal_places=2, max_digits=10, verbose_name="Valor" 86 | ), 87 | ), 88 | ( 89 | "resource", 90 | models.CharField( 91 | blank=True, 92 | default="prefeitura", 93 | max_length=200, 94 | null=True, 95 | verbose_name="Fonte", 96 | ), 97 | ), 98 | ( 99 | "legal_status", 100 | models.CharField( 101 | blank=True, 102 | db_index=True, 103 | max_length=200, 104 | null=True, 105 | verbose_name="Natureza", 106 | ), 107 | ), 108 | ( 109 | "destination", 110 | models.CharField( 111 | blank=True, max_length=200, null=True, verbose_name="Destinação" 112 | ), 113 | ), 114 | ( 115 | "excluded", 116 | models.BooleanField(default=False, verbose_name="Excluído?"), 117 | ), 118 | ], 119 | options={ 120 | "verbose_name": "Câmara de Vereadores - Receita", 121 | "verbose_name_plural": "Câmara de Vereadores - Receitas", 122 | "get_latest_by": "published_at", 123 | }, 124 | ), 125 | ] 126 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Guia de contribuição 2 | 3 | Ficamos muito felizes que você está lendo este guia de contribuição, sempre precisamos 4 | de pessoas voluntárias que acreditem na ideia e queiram contribuir com o projeto. 5 | 6 | Se você ainda não fez isso, junte-se a nós no [nosso grupo aberto do Telegram](https://t.me/dadosabertosdefeira) 7 | ou no nosso [Discord](https://discord.gg/bPZ9TSjdUn) e participe das discussões. 
8 | Não hesite em nos procurar para tirar todas as suas dúvidas e conhecer mais sobre o projeto. 9 | 10 | [![Convite Discord](https://invidget.switchblade.xyz/bPZ9TSjdUn?language=pt)](https://discord.gg/bPZ9TSjdUn) 11 | 12 | ## Antes de começar 13 | 14 | Aqui estão alguns recursos importantes que você deve estar ciente antes de começar: 15 | 16 | - [Manual de dados abertos para desenvolvedores](https://www.w3c.br/pub/Materiais/PublicacoesW3C/manual_dados_abertos_desenvolvedores_web.pdf) 17 | te explicará um pouco sobre o que são e os principais conceitos por trás dos dados abertos. 18 | 19 | - Nossos [projetos](https://github.com/DadosAbertosDeFeira/maria-quiteria/projects) 20 | são um conjunto de funcionalidades e melhorias que queremos desenvolver nesse repositório. 21 | Caso não tenha nada que seja a sua praia, você pode dar uma olhada nos 22 | [projetos gerais](https://github.com/orgs/DadosAbertosDeFeira/projects) do projeto. 23 | 24 | - No [nosso Trello](https://trello.com/b/E8v20MFs/dados-abertos-de-feira) você pode 25 | acompanhar o que a comunidade em geral vem trabalhando. Lá você encontrará coisas desde 26 | fotografia até pedidos de acesso à informação. 27 | 28 | Os detalhes de como instalar e executar este projeto podem ser encontrados no 29 | [`README.md`](https://github.com/DadosAbertosDeFeira/maria-quiteria/blob/main/README.md). 30 | 31 | ## Reportando bugs 32 | 33 | Você encontrou um bug? 34 | 35 | * Sugestões de melhoria são rastreadas através de [_issues_](https://guides.github.com/features/issues/) 36 | e [_pull requests_](https://guides.github.com/activities/hello-world/#pr) no GitHub. 37 | Verifique se nenhuma _issue_ ou _pull request_ foi criada por outra pessoa com o mesmo bug. 38 | * Se não, [crie uma _issue_](https://github.com/DadosAbertosDeFeira/maria-quiteria/issues/new) 39 | explicando o problema e adicionando novas informações detalhadas que ajudem 40 | a reproduzir o problema.
41 | 42 | ## Sugerindo melhorias 43 | 44 | Você é mais que bem-vinda(o) a sugerir melhorias a MQ. Pedimos apenas que tente incluir o 45 | máximo de detalhes possíveis e que verifique se nenhuma _issue_ ou _pull request_ já foi 46 | criado por outra pessoa com a mesma sugestão. 47 | 48 | Caso seja algo novo, você tem duas alternativas: 49 | 50 | - Criar uma nova _issue_ 51 | - Compartilhar a sua sugestão com outros participantes e mantenedores do projeto em nosso [Discord](https://discord.gg/BS4GNf) 52 | 53 | Em ambos, tente usar uma linguagem clara, e com o máximo de detalhes. Qual a motivação, 54 | qual problema resolveria e possíveis desafios, por exemplo, são importantes para entender 55 | o que você precisa. Esse é um projeto de código aberto, mantido por voluntários. 56 | Frequentemente precisamos escolher bem o que vamos fazer com os recursos que temos. :) 57 | 58 | ## Criando _pull requests_ 59 | 60 | Você decidiu contribuir para o projeto! Yay! 61 | 62 | Faça um _fork_ do projeto e crie uma nova _branch_. 63 | Mais detalhes [aqui](https://help.github.com/pt/enterprise/2.17/user/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork). 64 | 65 | Aqui algumas dicas: 66 | 67 | * Caso decida trabalhar em alguma _issue_, comente na _issue_ escolhida. Dessa forma, 68 | outras pessoas saberão que tem alguém trabalhando nela. Caso tenha ficado perdido ou com 69 | dúvidas, peça ajuda. 70 | 71 | * Caso tenha visto algo pontual, como um _typo_ ou algo que pode ser corrigido e testado 72 | rapidamente e não envolva mudanças estruturais, você é bem-vindo a abrir um novo PR também. 73 | 74 | * Antes de qualquer coisa, tente rodar o projeto localmente. 75 | 76 | * Instale o `pre-commit` localmente. Dessa forma, o código que você _commitar_ já estará 77 | formatado, com os _imports_ ordenados e mais arrumado. 78 | 79 | * Rode os testes localmente. Além de ser uma boa prática, previne idas e vindas nas 80 | revisões.
class Migration(migrations.Migration):
    """Create the ``CityHallBid`` and ``CityHallBidEvent`` tables.

    ``CityHallBidEvent`` rows point to a ``CityHallBid`` through the ``bid``
    foreign key and are reachable from a bid as ``events``.
    """

    # Must be applied after the previous ``datasets`` migration.
    dependencies = [
        ("datasets", "0007_citycouncilexpense"),
    ]

    operations = [
        # "Prefeitura - Licitação" (City Hall - Bid).
        migrations.CreateModel(
            name="CityHallBid",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Bookkeeping timestamps and crawl provenance.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                ("crawled_at", models.DateTimeField()),
                ("crawled_from", models.URLField()),
                ("notes", models.TextField(blank=True, null=True)),
                (
                    "session_at",
                    models.DateTimeField(
                        null=True, verbose_name="Sessão Data / Horário"
                    ),
                ),
                (
                    "public_agency",
                    models.CharField(max_length=200, verbose_name="Órgão"),
                ),
                (
                    "description",
                    models.TextField(blank=True, null=True, verbose_name="Descrição"),
                ),
                # Closed set of bid modalities; the column itself is optional.
                (
                    "modality",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("tomada_de_precos", "Tomada de Preço"),
                            ("pregao_presencial", "Pregão Presencial"),
                            ("pregao_eletronico", "Pregão Eletrônico"),
                            ("leilao", "Leilão"),
                            ("inexigibilidade", "Inexigibilidade"),
                            ("dispensada", "Dispensada"),
                            ("convite", "Convite"),
                            ("concurso", "Concurso"),
                            ("concorrencia", "Concorrência"),
                            ("chamada_publica", "Chamada Pública"),
                        ],
                        max_length=60,
                        null=True,
                        verbose_name="Modalidade",
                    ),
                ),
                ("codes", models.CharField(max_length=300, verbose_name="Códigos")),
                # Attached file: its URL and a text column for its content.
                (
                    "file_url",
                    models.URLField(blank=True, null=True, verbose_name="Arquivo"),
                ),
                (
                    "file_content",
                    models.TextField(blank=True, null=True, verbose_name="Conteúdo"),
                ),
            ],
            options={
                "verbose_name": "Prefeitura - Licitação",
                "verbose_name_plural": "Prefeitura - Licitações",
            },
        ),
        # "Prefeitura - Licitação - Histórico" (City Hall - Bid - History).
        migrations.CreateModel(
            name="CityHallBidEvent",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Bookkeeping timestamps and crawl provenance.
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
                ("crawled_at", models.DateTimeField()),
                ("crawled_from", models.URLField()),
                ("notes", models.TextField(blank=True, null=True)),
                (
                    "published_at",
                    models.DateTimeField(null=True, verbose_name="Publicado em"),
                ),
                (
                    "summary",
                    models.TextField(blank=True, null=True, verbose_name="Descrição"),
                ),
                (
                    "file_url",
                    models.URLField(blank=True, null=True, verbose_name="Arquivo"),
                ),
                (
                    "file_content",
                    models.TextField(blank=True, null=True, verbose_name="Conteúdo"),
                ),
                # Deleting a bid cascades to its events.
                (
                    "bid",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="events",
                        to="datasets.CityHallBid",
                    ),
                ),
            ],
            options={
                "verbose_name": "Prefeitura - Licitação - Histórico",
                "verbose_name_plural": "Prefeitura - Licitações - Históricos",
            },
        ),
    ]
| :heavy_check_mark: | :heavy_check_mark: | [Kaggle](https://www.kaggle.com/dadosabertosdefeira/assiduidade-dos-vereadores) | 19 | | Despesas (`citycouncil.py`) | Câmara Municipal | Gastos realizados pela Câmara Municipal. | :heavy_check_mark: | :heavy_check_mark: | 🔜 | 20 | | Contratos (`cityhall.py`) | Prefeitura | Contratos realizados pela prefeitura entre 2016 e 2017. | 🔜 | 🔜 | 🔜 | 21 | | Diário Oficial (`gazette.py`) | Prefeitura/Câmara de Vereadores | Diário oficial do executivo e legislativo. | :heavy_check_mark: | :heavy_check_mark: | [Kaggle](https://www.kaggle.com/dadosabertosdefeira/dirios-oficiais-do-executivo-e-do-legislativo) | 22 | | Licitações (`cityhall.py`) | Prefeitura | Licitações realizadas pela prefeitura desde 2015. | :heavy_check_mark: | :heavy_check_mark: | [Kaggle](https://www.kaggle.com/dadosabertosdefeira/licitaes-da-prefeitura-de-feira-de-santana) | 23 | | Pagamentos (`cityhall.py`) | Prefeitura | Pagamentos realizados pela prefeitura desde 2010. | 🔜 | 🔜 | 🔜 | 24 | 25 | ## Contribuindo para o projeto 26 | 27 | Contribuições são muito bem-vindas. Veja como contribuir no nosso [Guia de Contribuição](CONTRIBUTING.md). 28 | 29 | Toda a comunicação e demais interações do Dados Abertos de Feira estão sujeitas 30 | ao nosso [Código de Conduta](CODE_OF_CONDUCT.md). 31 | 32 | ### Configurando seu ambiente 33 | 34 | Você precisará do [Docker](https://docs.docker.com/install/) 35 | e do [Docker-Compose](https://docs.docker.com/compose/install/) para rodar o projeto. 36 | 37 | #### Carregue as variáveis de ambiente 38 | 39 | Um exemplo das configurações pode ser encontrado no arquivo `.env.example`, 40 | que deve ser copiado para um arquivo `.env` na raiz do projeto. 41 | 42 | Caso queira utilizar um banco de dados diferente basta configurar a variável 43 | de ambiente `DATABASE_URL` em seu `.env`. 
44 | 45 | #### Instale as dependências e prepare os serviços 46 | 47 | ```bash 48 | make build 49 | ``` 50 | 51 | O passo anterior vai criar um banco de dados postgres. 52 | Agora, basta aplicar as `migrations` e executar o `collectstatic`: 53 | 54 | ``` 55 | make migrate 56 | make collectstatic 57 | ``` 58 | 59 | ### Executando os testes 60 | 61 | ``` 62 | make tests 63 | ``` 64 | 65 | ### Acessando o site 66 | 67 | Rode o servidor com: 68 | ``` 69 | make run 70 | ``` 71 | 72 | Com as configurações padrão o painel de controle estará acessível pela URL: 73 | [`localhost:8000`](http://localhost:8000). Veja as bases de dados disponíveis 74 | no nosso painel público [`localhost:8000/painel`](http://localhost:8000/painel). 75 | 76 | Para navegar no admin, primeiro crie um super administrador: 77 | ``` 78 | make createsuperuser 79 | ``` 80 | 81 | ### Coletando os dados 82 | 83 | Boa parte dos dados que temos vem da raspagem de dados feita por _spiders_. 84 | O comando abaixo vai executar todos os _spiders_ e salvar os itens raspados 85 | no banco de dados: 86 | 87 | ``` 88 | make crawl 89 | ``` 90 | 91 | Durante a coleta e adição ao banco, vamos também tentar extrair o conteúdo 92 | dos arquivos encontrados. 93 | 94 | ### Rodando os spiders individualmente 95 | 96 | No diretório `scraper` você poderá encontrar os _spiders_ responsáveis pela 97 | coleta dos dados. Para entender melhor como eles funcionam, dê uma olhada 98 | na documentação do [scrapy](https://docs.scrapy.org/). 99 | 100 | Para rodar um _spider_, execute: 101 | 102 | ``` 103 | SPIDER=citycouncil_agenda make runspider 104 | # ou 105 | SPIDER=citycouncil_agenda START_DATE=03/01/2020 make runspider 106 | ``` 107 | 108 | Para salvar os dados de um _spider_ em um arquivo: 109 | 110 | ``` 111 | docker-compose run --rm web scrapy crawl citycouncil_agenda -o citycouncil_agenda.json 112 | ``` 113 | 114 | Você pode substituir `json` por outros formatos como `csv`.
# Each ``*_FIELDS_MAPPING`` maps an upper-case source key to the target field
# name. A ``None`` target means the key is known but dropped by
# ``map_to_fields``. Each ``*_FUNCTIONS`` maps a target field name to the
# converter applied to its stripped raw value; fields without an entry keep
# the raw value.

# --- City council bids ---
CITYCOUNCIL_BID_FIELDS_MAPPING = {
    "CODLIC": "external_code",
    "CODTIPOLIC": "modality",
    "NUMLIC": "code",
    "NUMTIPOLIC": "code_type",
    "OBJETOLIC": "description",
    "DTLIC": "session_at",
    "EXCLUIDO": "excluded",
    "ARQUIVOS": None,
}


CITYCOUNCIL_BID_FUNCTIONS = {
    "excluded": to_boolean,
    "session_at": from_str_to_datetime,
    "modality": city_council_bid_modality_mapping,
}


# --- City council contracts ---
CITYCOUNCIL_CONTRACT_FIELDS_MAPPING = {
    "CODCON": "external_code",
    "DSCON": "description",
    "OBJETOCON": "details",
    "CPFCNPJCON": "company_or_person_document",
    "NMCON": "company_or_person",
    "VALORCON": "value",
    "DTCON": "start_date",
    "DTCONFIM": "end_date",
    "EXCLUIDO": "excluded",
    "ARQUIVOS": None,
}


CITYCOUNCIL_CONTRACT_FUNCTIONS = {
    "value": currency_to_float,
    "excluded": to_boolean,
    "start_date": from_str_to_date,
    "end_date": from_str_to_date,
}


# --- City council revenues ---
CITYCOUNCIL_REVENUE_FIELDS_MAPPING = {
    "CODLINHA": "external_code",
    "CODUNIDGESTORA": "budget_unit",
    "DTPUBLICACAO": "published_at",
    "DTREGISTRO": "registered_at",
    "TIPOREC": "revenue_type",
    "MODALIDADE": "modality",
    "DSRECEITA": "description",
    "VALOR": "value",
    "FONTE": "resource",
    "DSNATUREZA": "legal_status",  # TODO: TCM-BA "natureza" (nature) classification
    "DESTINACAO": "destination",
    "EXCLUIDO": "excluded",
}


CITYCOUNCIL_REVENUE_FUNCTIONS = {
    "excluded": to_boolean,
    "published_at": from_str_to_date,
    "registered_at": from_str_to_date,
    "value": currency_to_float,
    "modality": lower,
    "revenue_type": city_council_revenue_type_mapping,
    "resource": lower,
    "legal_status": lower,
    "destination": lower,
}


# --- City council expenses ---
CITYCOUNCIL_EXPENSE_FIELDS_MAPPING = {
    "CODARQUIVO": "external_file_code",
    "CODLINHA": "external_file_line",
    "CODUNIDORCAM": "budget_unit",
    "DTPUBLICACAO": "published_at",
    "DTREGISTRO": "date",
    "CODETAPA": "phase",
    "NUMPROCADM": "number",
    "NUMPROCLIC": "process_number",
    "DSDESPESA": "summary",
    "NMCREDOR": "company_or_person",
    "NUCPFCNPJ": "document",
    "VALOR": "value",
    "DSFUNCAO": "function",
    "DSSUBFUNCAO": "subfunction",
    "DSNATUREZA": "legal_status",  # TODO: TCM-BA "natureza" (nature) classification
    "DSFONTEREC": "resource",
    "NUMETAPA": "phase_code",
    "MODALIDADE": "modality",
    "EXCLUIDO": "excluded",
}


CITYCOUNCIL_EXPENSE_FUNCTIONS = {
    "value": currency_to_float,
    "excluded": to_boolean,
    "published_at": from_str_to_date,
    "date": from_str_to_date,
    "phase": get_phase,
    "modality": lower_without_spaces,
}
def map_to_fields(item, fields_mapping, functions):
    """Translate a raw source record into a dict keyed by target field names.

    Args:
        item: Mapping of source keys to raw values. Keys are matched
            case-insensitively (they are upper-cased before the lookup).
        fields_mapping: Mapping of upper-case source keys to target field
            names. A falsy target (e.g. ``None``) drops the key.
        functions: Mapping of target field names to one-argument converters.
            Fields without a converter keep the (stripped) raw value.

    Returns:
        A new dict ``{field: converted_value}``; ``item`` is not modified.

    Raises:
        KeyError: If a key of ``item`` is not present in ``fields_mapping``.
    """
    new_item = {}
    for key, value in item.items():
        field = fields_mapping[key.upper()]
        if not field:
            continue
        # Only text can be stripped. ``None`` or numeric values used to crash
        # here with AttributeError before the converter could handle them.
        if isinstance(value, str):
            value = value.strip()
        new_item[field] = functions.get(field, lambda raw: raw)(value)
    return new_item


def to_citycouncil_expense(item):
    """Convert a raw city council expense record into model field values."""
    return map_to_fields(
        item, CITYCOUNCIL_EXPENSE_FIELDS_MAPPING, CITYCOUNCIL_EXPENSE_FUNCTIONS
    )


def to_citycouncil_contract(item):
    """Convert a raw city council contract record into model field values."""
    return map_to_fields(
        item, CITYCOUNCIL_CONTRACT_FIELDS_MAPPING, CITYCOUNCIL_CONTRACT_FUNCTIONS
    )


def to_citycouncil_bid(item):
    """Convert a raw city council bid record into model field values."""
    return map_to_fields(
        item, CITYCOUNCIL_BID_FIELDS_MAPPING, CITYCOUNCIL_BID_FUNCTIONS
    )


def to_citycouncil_revenue(item):
    """Convert a raw city council revenue record into model field values."""
    return map_to_fields(
        item, CITYCOUNCIL_REVENUE_FIELDS_MAPPING, CITYCOUNCIL_REVENUE_FUNCTIONS
    )


def to_citycouncil_contract_file(item):
    """Build the file attributes for a file attached to a contract.

    The contract is looked up by ``item["CODCON"]``.

    Returns:
        A dict with ``url``, ``content_type``, ``object_id`` and
        ``external_code``, or ``None`` (after logging an error) when no
        contract has that external code.
    """
    try:
        contract = CityCouncilContract.objects.get(external_code=item["CODCON"])
    except CityCouncilContract.DoesNotExist:
        logger.error(f"Contrato não encontrado: {item}")
        return None

    content_type = get_content_type_for_model(contract)
    return {
        "url": item["CAMINHO"],
        "content_type": content_type,
        "object_id": contract.pk,
        "external_code": item["CODARQCON"],
    }


def to_citycouncil_bid_file(item):
    """Build the file attributes for a file attached to a bid.

    The bid is looked up by ``item["CODLIC"]``.

    Returns:
        A dict with ``url``, ``content_type``, ``object_id`` and
        ``external_code``, or ``None`` (after logging an error) when no bid
        has that external code.
    """
    try:
        bid = CityCouncilBid.objects.get(external_code=item["CODLIC"])
    except CityCouncilBid.DoesNotExist:
        logger.error(f"Licitação não encontrada: {item}")
        return None

    content_type = get_content_type_for_model(bid)
    return {
        "url": item["CAMINHOARQLIC"],
        "content_type": content_type,
        "object_id": bid.pk,
        "external_code": item["CODARQLIC"],
    }
@pytest.mark.django_db
class TestSaveAgenda:
    """Tests for ``save_agenda``."""

    # Renamed from ``test_save_gazette``: the test exercises ``save_agenda``.
    def test_save_agenda(self):
        """A saved agenda carries every value of the crawled item."""
        item = {
            "crawled_at": make_aware(datetime(2020, 3, 21, 7, 15, 17, 908831)),
            "crawled_from": "https://www.feiradesantana.ba.leg.br/agenda",
            "date": date(2019, 8, 29),
            "details": "- Especial , dia 29 (quinta-feira), às 09 horas,"
            " para apresentar a sociedade\r\n"
            "civil e aos órgãos competentes e afins, os resultados dos "
            "trabalhos\r\n"
            "desenvolvidos pela Fundação Municipal de Tecnologia da "
            "informação,\r\n"
            "Telecomunicações e Cultura Egberto Tavares Costa- FUNTITEC, "
            "atendendo ao\r\n"
            "Requerimento nº 142/2019.",
            "event_type": "sessao_especial",
            "title": "SESSÃO ESPECIAL 29 DE AGOSTO",
        }

        agenda = save_agenda(item)
        assert agenda.date == item["date"]
        assert agenda.details == item["details"]
        assert agenda.event_type == item["event_type"]
        assert agenda.title == item["title"]
        assert agenda.crawled_at == item["crawled_at"]
        assert agenda.crawled_from == item["crawled_from"]

    def test_handle_with_changed_agenda(self):
        """Saving a changed item again updates the same row."""
        item = {
            "crawled_at": make_aware(datetime(2020, 3, 21, 7, 15, 17, 908831)),
            "crawled_from": "https://www.feiradesantana.ba.leg.br/agenda",
            "date": date(2019, 8, 29),
            "details": "- Especial , dia 29 (quinta-feira), às 09 horas,"
            " para apresentar a sociedade\r\n"
            "civil e aos órgãos competentes e afins, os resultados dos "
            "trabalhos\r\n"
            "desenvolvidos pela Fundação Municipal de Tecnologia da "
            "informação,\r\n"
            "Telecomunicações e Cultura Egberto Tavares Costa- FUNTITEC, "
            "atendendo ao\r\n"
            "Requerimento nº 142/2019.",
            "event_type": "sessao_especial",
            "title": "SESSÃO ESPECIAL 29 DE AGOSTO",
        }

        agenda = save_agenda(item)
        item["details"] = "Festa na cidade bla bla bla"
        item["crawled_at"] = make_aware(datetime(2020, 3, 22, 7, 15, 17, 908831))

        updated_agenda = save_agenda(item)

        # Same primary key, but the changed values were stored.
        assert agenda.pk == updated_agenda.pk
        assert agenda.details != updated_agenda.details
        assert agenda.crawled_at != updated_agenda.crawled_at


@pytest.mark.django_db
class TestSaveAttendanceList:
    """Tests for ``save_attendance_list``."""

    def test_save_attendance_list(self):
        """A saved attendance record carries every value of the crawled item."""
        item = {
            "date": date(2020, 2, 3),
            "council_member": "Roberto Luis da Silva Tourinho",
            "status": "presente",
            "crawled_at": make_aware(datetime(2020, 3, 21, 7, 15, 17, 276019)),
            "crawled_from": "https://www.feiradesantana.ba.leg.br/lista/7/03-02-2020",
        }

        attendance = save_attendance_list(item)
        assert attendance.date == item["date"]
        assert attendance.council_member == item["council_member"]
        assert attendance.status == item["status"]
        assert attendance.crawled_at == item["crawled_at"]
        assert attendance.crawled_from == item["crawled_from"]

    def test_handle_with_changed_attendance_list(self):
        """Saving a changed item again updates status and crawl time only."""
        item = {
            "date": date(2020, 2, 3),
            "description": "Abertura da 1ª etapa do 4º período da 18ª legislatura",
            "council_member": "Roberto Luis da Silva Tourinho",
            "status": "ausente",
            "crawled_at": make_aware(datetime(2020, 3, 21, 7, 15, 17, 276019)),
            "crawled_from": "https://www.feiradesantana.ba.leg.br/lista/7/03-02-2020",
        }

        attendance = save_attendance_list(item)
        item["status"] = "falta_justificada"
        item["crawled_at"] = make_aware(datetime(2020, 3, 22, 7, 15, 17, 908831))

        updated_attendance = save_attendance_list(item)

        # Same row, with untouched values preserved...
        assert attendance.pk == updated_attendance.pk
        assert attendance.council_member == updated_attendance.council_member
        assert attendance.description == updated_attendance.description
        assert attendance.crawled_from == updated_attendance.crawled_from
        # ...and the changed values replaced.
        assert attendance.status != updated_attendance.status
        assert attendance.crawled_at != updated_attendance.crawled_at


@pytest.mark.django_db
class TestSaveMinute:
    """Tests for ``save_minute``."""

    def test_save_minute(self, mock_backup_file):
        """A saved minute carries the date, title, type and source of the item.

        ``mock_backup_file`` is a fixture defined outside this module
        (presumably in a ``conftest.py`` - TODO confirm).
        """
        item = {
            "crawled_at": make_aware(datetime(2020, 4, 30, 18, 18, 56, 173788)),
            "crawled_from": "https://www.feiradesantana.ba.leg.br/atas?"
            "mes=9&ano=2018&Acessar=OK",
            "date": date(2018, 9, 11),
            "event_type": None,
            "files": [
                {
                    "url": "https://www.feiradesantana.ba.leg.br/5eaabb5e91088.pd",
                    "checksum": "checksum",
                    "content": None,
                }
            ],
            "title": "Ata da 4ª Reunião para Instalação da Comissão Especial",
        }

        minute = save_minute(item)
        assert minute.date == item["date"]
        assert minute.title == item["title"]
        assert minute.event_type == item["event_type"]
        assert minute.crawled_from == item["crawled_from"]
@pytest.mark.parametrize(
    "old_url,field,value,new_url",
    [
        # Integer replacement value on an absolute URL.
        (
            "http://www.diariooficial.feiradesantana.ba.gov.br/"
            "abrir.asp?edi=590&p=1",
            "p",
            999,
            "http://www.diariooficial.feiradesantana.ba.gov.br/"
            "abrir.asp?edi=590&p=999",
        ),
        # String replacement value; empty parameters and the fragment are kept.
        (
            "http://www.diariooficial.feiradesantana.ba.gov.br/"
            "detalhes.asp?acao=&p=1116&menu=&idsec=1&tipo=&publicacao"
            "=1&st=&rad=&txtlei=''&dtlei=''&dtlei1=''"
            "&edicao=&hom=&ini=&fim=&meshom=#links>",
            "publicacao",
            "88",
            "http://www.diariooficial.feiradesantana.ba.gov.br/"
            "detalhes.asp?acao=&p=1116&menu=&idsec=1&tipo="
            "&publicacao=88&st=&rad=&txtlei=''&dtlei=''&dtlei1=''"
            "&edicao=&hom=&ini=&fim=&meshom=#links>",
        ),
        # Relative URL, parameter replaced by an empty value.
        (
            "detalhes.asp?acao=&p=991&menu=&idsec=1&tipo=&publicacao=1&st=&rad="
            "&txtlei=''&dtlei=''&dtlei1=''&edicao=&hom=&ini=&fim=&meshom=#links",
            "p",
            "",
            "detalhes.asp?acao=&p=&menu=&idsec=1&tipo=&publicacao=1&st=&rad="
            "&txtlei=''&dtlei=''&dtlei1=''&edicao=&hom=&ini=&fim=&meshom=#links",
        ),
    ],
)
def test_replace_query_parameter_from_a_url(old_url, field, value, new_url):
    """``replace_query_param`` swaps one parameter value, leaving the rest as is."""
    assert replace_query_param(old_url, field, value) == new_url


@pytest.mark.parametrize(
    "text, expected_contract_id",
    [
        # NOTE(review): the "N��" inputs contain U+FFFD replacement characters;
        # presumably they mirror badly encoded source pages - confirm before
        # "fixing" them.
        (" CONTRATO N�� 295-2017-10C ", "295-2017-10C"),
        ("CONTRATO N° 11-2017-10C", "11-2017-10C"),
        ("4/2016/09C", "4/2016/09C"),
        ("860/2015/05C", "860/2015/05C"),
        ("3-2017-1926C", "3-2017-1926C"),
        ("CONTRATO N�� 23820161111 ", "23820161111"),
        ("CONTRATO N° 05820171111 ", "05820171111"),
        ("CONTRATO N° 010521004-2017", "010521004-2017"),
    ],
)
def test_identify_contract_ids(text, expected_contract_id):
    """``identify_contract_id`` extracts the bare contract id from noisy text."""
    assert identify_contract_id(text) == expected_contract_id


@pytest.mark.parametrize(
    "url, param, value",
    [
        (
            "http://www.feiradesantana.ba.gov.br/seadm/servicos.asp?"
            "id=2&s=a&link=seadm/licitacoes_pm.asp&cat=PMFS&dt=01-2019#links",
            "dt",
            "01-2019",
        ),
        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "dt", "01-2019"),
        # A parameter that is not in the URL yields None.
        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "invalid", None),
    ],
)
def test_extract_param(url, param, value):
    """``extract_param`` returns the value of one query-string parameter."""
    assert extract_param(url, param) == value


@pytest.mark.parametrize(
    "start_date,end_date,expected_month_and_year",
    [
        # The expected pairs are (month, year). In these cases the start month
        # is excluded unless start and end fall in the same month.
        (datetime(2020, 1, 10), datetime(2020, 3, 1), [(2, 2020), (3, 2020)]),
        (
            datetime(2019, 10, 1),
            datetime(2020, 3, 1),
            [(11, 2019), (12, 2019), (1, 2020), (2, 2020), (3, 2020)],
        ),
        (datetime(2020, 2, 10), datetime(2020, 3, 1), [(3, 2020)]),
        # A start after the end yields nothing.
        (datetime(2020, 6, 1), datetime(2020, 3, 1), []),
        (
            datetime(2008, 10, 11),
            datetime(2012, 3, 29),
            [(11, 2008), (12, 2008)]
            + [(m, y) for y in range(2009, 2012) for m in range(1, 13)]
            + [(1, 2012), (2, 2012), (3, 2012)],
        ),
        (datetime(2020, 4, 14), datetime(2020, 4, 23), [(4, 2020)]),
    ],
)
def test_months_and_years(start_date, end_date, expected_month_and_year):
    """``months_and_years`` lists the (month, year) pairs between two dates."""
    assert months_and_years(start_date, end_date) == expected_month_and_year


@pytest.mark.parametrize(
    "str_with_date,expected_obj",
    [
        ("26/02/2020", date(2020, 2, 26)),
        # A trailing time is ignored.
        ("26/02/2020 19:28", date(2020, 2, 26)),
        # Two-digit years are accepted.
        ("26/02/20", date(2020, 2, 26)),
        # Dot-separated dates and free text yield None.
        ("26.02.20", None),
        ("Random", None),
    ],
)
def test_extract_date(str_with_date, expected_obj):
    """``extract_date`` parses day/month/year text into a ``date`` or None."""
    assert extract_date(str_with_date) == expected_obj


@pytest.mark.parametrize(
    "original_value,expected_value",
    [
        ("tomada", "tomada"),
        ("pregão presencial", "pregao presencial"),
        ("pregão eletrônico", "pregao eletronico"),
        ("concorrência", "concorrencia"),
        ("çãôéà", "caoea"),
        # None passes through unchanged.
        (None, None),
    ],
)
def test_strip_accents(original_value, expected_value):
    """``strip_accents`` replaces accented characters with their plain form."""
    assert strip_accents(original_value) == expected_value


@pytest.mark.parametrize(
    "original_value,expected_value",
    [
        ("google.com", True),
        ("www.google", True),
        ("feiraeh.top", True),
        ("http://feiradesantana.com.br", True),
        # NOTE(review): the next two cases are identical; one looks redundant.
        ("https://feiradesantana.com.br", True),
        ("https://feiradesantana.com.br", True),
        ("http://www.feiradesantana.com.br", True),
        ("https://www.feiradesantana.com.br", True),
        ("https://monitor.dadosabertosdefeira.com.br", True),
        # Spaces in the path are still accepted.
        ("http://www.feiradesantana.ba.gov.br/Word - Port20130001.pdf", True),
        ("tel:42384248", False),
        ("bobagem", False),
        ("#", False),
        (None, False),
    ],
)
def test_is_url(original_value, expected_value):
    """``is_url`` returns a real bool (checked with ``is``), even for None."""
    assert is_url(original_value) is expected_value


def test_get_git_commit(monkeypatch):
    """``get_git_commit`` returns the value of the ``GIT_REV`` variable."""
    expected_git_commit = "43fb0339d3758204cef63d3bc3ffadfda9b8dd3b"
    monkeypatch.setenv("GIT_REV", expected_git_commit)

    git_commit = get_git_commit()

    assert len(git_commit) == 40
    assert git_commit == expected_git_commit


def test_get_git_commit_when_git_rev_is_none(monkeypatch):
    """``get_git_commit`` returns an empty string when no revision is set."""
    # NOTE(review): ``monkeypatch.setenv`` expects a str; pytest coerces None
    # to the string "None" and emits a PytestWarning. If the intent is "the
    # variable is unset", ``monkeypatch.delenv("GIT_REV", raising=False)``
    # is presumably what is meant - confirm against ``get_git_commit``.
    monkeypatch.setenv("GIT_REV", None)

    assert get_git_commit() == ""
class TestCityCouncilAgendaSerializer:
    """Validation tests for ``CityCouncilAgendaSerializer``."""

    def test_city_council_agenda_serializer(self):
        """Valid agenda input is accepted and parsed into Python values."""
        data = {
            "date": "2020-03-18",
            "details": "PROJETOS DE LEI ORDINÁRIA EM 2ª DISCUSSÃO 017/20",
            "event_type": "sessao_ordinaria",
            "title": "ORDEM DO DIA - 18 DE MARÇO DE 2020",
            "crawled_at": "2020-01-01T04:16:13-04:00",
            "crawled_from": "http://www.pudim.com.br/",
        }
        serializer = CityCouncilAgendaSerializer(data=data)

        assert serializer.is_valid()
        # The input is ISO (YYYY-MM-DD). ``dayfirst=True`` would swap month
        # and day for any day <= 12, so the expected date uses the default.
        assert serializer.validated_data["date"] == parse(data["date"]).date()
        assert serializer.validated_data["details"] == data["details"]
        assert serializer.validated_data["event_type"] == data["event_type"]
        assert serializer.validated_data["title"] == data["title"]
        assert serializer.validated_data["crawled_at"] == datetime.fromisoformat(
            data["crawled_at"]
        )
        assert serializer.validated_data["crawled_from"] == data["crawled_from"]


class TestCityCouncilAttendanceList:
    """Validation tests for ``CityCouncilAttendanceListSerializer``."""

    def test_city_council_attendance_list(self):
        """Valid attendance input (with a null description) is accepted."""
        data = {
            "date": date(2020, 12, 14),
            "description": None,
            "council_member": "Zé Curuca",
            "status": "ausente",
            "crawled_at": "2020-01-01T04:16:13-03:00",
            "crawled_from": (
                "https://www.feiradesantana.ba.leg.br/"
                "lista-presenca-vereadores/107/14-12-2020"
            ),
            "notes": "-",
        }

        serializer = CityCouncilAttendanceListSerializer(data=data)
        assert serializer.is_valid()
        assert serializer.validated_data["date"] == data["date"]
        assert serializer.validated_data["description"] == data["description"]
        assert serializer.validated_data["council_member"] == data["council_member"]
        assert serializer.validated_data["status"] == data["status"]
        assert serializer.validated_data["crawled_at"] == datetime.fromisoformat(
            data["crawled_at"]
        )
        assert serializer.validated_data["crawled_from"] == data["crawled_from"]
        assert serializer.validated_data["notes"] == data["notes"]


class TestCityCouncilMinuteSerializer:
    """Validation tests for ``CityCouncilMinuteSerializer``."""

    def test_city_council_minute_serializer(self):
        """Valid minute input with one nested file is accepted."""
        data = {
            "date": "2020-03-18",
            "event_type": "sessao_ordinaria",
            "title": "ORDEM DO DIA - 18 DE MARÇO DE 2020",
            "crawled_at": "2020-01-01T04:16:13-04:00",
            "crawled_from": "http://www.pudim.com.br/",
            "files": [
                {
                    "url": "https://www.feiradesantana.ba.leg.br/5eaabb5e91088.pd",
                    "checksum": "checksum",
                    "content": None,
                },
            ],
        }
        serializer = CityCouncilMinuteSerializer(data=data)

        assert serializer.is_valid()
        # ISO input: parse with the default day/month order (see agenda test).
        assert serializer.validated_data["date"] == parse(data["date"]).date()
        assert serializer.validated_data["event_type"] == data["event_type"]
        assert serializer.validated_data["title"] == data["title"]


class TestCityHallBidEventSerializer:
    """Validation tests for ``CityHallBidEventSerializer``."""

    def test_city_hall_bid_event_serializer(self):
        """A bid primary key in the input is resolved to the bid instance."""
        bid = baker.make_recipe("datasets.CityHallBid")

        data = {
            "published_at": "2020-07-21T11:49:00-03:00",
            "summary": "Julgamento do recurso administrativo",
            "bid": bid.pk,
            "crawled_at": datetime.now(),
            "crawled_from": "https://www.example.com",
        }

        serializer = CityHallBidEventSerializer(data=data)
        assert serializer.is_valid()

        assert serializer.validated_data["published_at"] == datetime.fromisoformat(
            data["published_at"]
        )
        assert serializer.validated_data["summary"] == data["summary"]
        assert serializer.validated_data["bid"] == bid


class TestFileSerializer:
    """Validation tests for ``FileSerializer``."""

    def test_file_serializer(self):
        """A payload holding only a URL is valid."""
        data = {"url": "https://www.example.com/file.pdf"}

        serializer = FileSerializer(data=data)
        assert serializer.is_valid()
        assert serializer.validated_data["url"] == data["url"]


class TestCityHallBidSerializer:
    """Validation tests for ``CityHallBidSerializer``."""

    def test_city_hall_bid_serializer(self):
        """Valid bid input with nested events and files is accepted."""
        data = {
            "session_at": "2021-01-06T08:30:00-03:00",
            "public_agency": "PMFS",
            "description": "Contratação de empresa de engenharia",
            "modality": "convite",
            "codes": "LICITAÇÃO Nº 150-2020 TOMADA DE PREÇO Nº 038-2020",
            "crawled_at": "2020-01-01T04:16:13-04:00",
            "crawled_from": "http://www.pudim.com.br/",
            "events": [
                {
                    "id": 243,
                    "created_at": "2021-01-01T20:00:32.209476-03:00",
                    "updated_at": "2021-01-01T20:00:32.209508-03:00",
                    "crawled_at": "2021-01-01T20:00:32.185236-03:00",
                    "crawled_from": "http://www.dadosdafeira.br/teste",
                    "notes": "",
                    "published_at": "2020-07-21T11:49:00-03:00",
                    "summary": "Julgamento do recurso administrativo",
                    "bid": 315,
                },
            ],
            "files": [{"url": "http://www.dadosdafeira.br/licitacoes/testes.pdf"}],
        }

        serializer = CityHallBidSerializer(data=data)
        assert serializer.is_valid()

        assert serializer.validated_data["session_at"] == datetime.fromisoformat(
            data["session_at"]
        )
        assert serializer.validated_data["public_agency"] == data["public_agency"]
        assert serializer.validated_data["description"] == data["description"]
        assert serializer.validated_data["modality"] == data["modality"]
        assert serializer.validated_data["codes"] == data["codes"]
        assert serializer.validated_data["crawled_at"] == datetime.fromisoformat(
            data["crawled_at"]
        )
        assert serializer.validated_data["crawled_from"] == data["crawled_from"]