├── storage
    ├── raw
    │   └── .gitkeep
    ├── processed
    │   └── .gitkeep
    └── structured
    │   └── .gitkeep
├── src
    └── datapub
    │   ├── entities
    │       ├── __init__.py
    │       ├── al_ac
    │       │   ├── __init__.py
    │       │   └── extractor.py
    │       ├── al_ce
    │       │   ├── __init__.py
    │       │   └── extractor.py
    │       ├── al_go
    │       │   ├── __init__.py
    │       │   └── extractor.py
    │       ├── al_ms
    │       │   ├── __init__.py
    │       │   └── extractor.py
    │       └── al_pa
    │       │   ├── __init__.py
    │       │   └── extractor.py
    │   ├── shared
    │       ├── __init__.py
    │       ├── utils
    │       │   ├── __init__.py
    │       │   └── extractor_base.py
    │       └── contracts
    │       │   ├── __init__.py
    │       │   └── extractor_contract.py
    │   ├── __init__.py
    │   ├── skeleton.py
    │   └── cli.py
├── docs
    ├── _static
    │   └── .gitignore
    ├── contributing.rst
    ├── readme.rst
    ├── authors.rst
    ├── changelog.rst
    ├── license.rst
    ├── requirements.txt
    ├── Makefile
    ├── index.rst
    └── conf.py
├── AUTHORS.rst
├── CHANGELOG.rst
├── local.txt
├── tests
    ├── conftest.py
    └── test_skeleton.py
├── pyproject.toml
├── docker
    └── postgres
    │   └── Dockerfile
├── .readthedocs.yml
├── .coveragerc
├── .github
    ├── workflows
    │   └── sync-issues.yml
    ├── FUNDING.yml
    └── scripts
    │   └── sync_issues.py
├── setup.py
├── database.sql
├── docker-compose.yml
├── LICENSE.txt
├── .gitignore
├── README.rst
├── tox.ini
├── requirements.txt
├── README.md
├── setup.cfg
├── CONTRIBUTING.rst
└── sources.json


/storage/raw/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/storage/processed/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/storage/structured/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/entities/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/shared/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_ac/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_ce/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_go/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_ms/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_pa/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/shared/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/datapub/shared/contracts/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/_static/.gitignore:
--------------------------------------------------------------------------------
1 | # Empty directory
2 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 


--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. _readme:
2 | .. include:: ../README.rst
3 | 


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. _authors:
2 | .. include:: ../AUTHORS.rst
3 | 


--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | .. _changes:
2 | .. include:: ../CHANGELOG.rst
3 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Contributors
3 | ============
4 | 
5 | * a21ns1g4ts <atila.danvi@outlook.com>
6 | 


--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
1 | .. _license:
2 | 
3 | =======
4 | License
5 | =======
6 | 
7 | .. include:: ../LICENSE.txt
8 | 


--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
 1 | =========
 2 | Changelog
 3 | =========
 4 | 
 5 | Version 0.1
 6 | ===========
 7 | 
 8 | - Feature A added
 9 | - FIX: nasty bug #1729 fixed
10 | - add your changes here!
11 | 


--------------------------------------------------------------------------------
/local.txt:
--------------------------------------------------------------------------------
1 | python -m venv venv
2 | source venv/bin/activate
3 | pip install selenium webdriver-manager
4 | pip install hcaptcha-solver
5 | pip install pyperclip
6 | pip install pyautogui
7 | pip install pdfplumber
8 | pip install dateparser


--------------------------------------------------------------------------------
/src/datapub/shared/contracts/extractor_contract.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | class ExtractorContract(ABC):
4 |     @abstractmethod
5 |     def download(self, *args, **kwargs):
6 |         """Realiza o download dos dados brutos."""
7 |         pass
8 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Requirements file for ReadTheDocs, check .readthedocs.yml.
2 | # To build the module reference correctly, make sure every external package
3 | # under `install_requires` in `setup.cfg` is also listed here!
4 | sphinx>=3.2.1
5 | # sphinx_rtd_theme
6 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | """
 2 |     Dummy conftest.py for datapub.
 3 | 
 4 |     If you don't know what this is for, just leave it empty.
 5 |     Read more about conftest.py under:
 6 |     - https://docs.pytest.org/en/stable/fixture.html
 7 |     - https://docs.pytest.org/en/stable/writing_plugins.html
 8 | """
 9 | 
10 | # import pytest
11 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | # AVOID CHANGING REQUIRES: IT WILL BE UPDATED BY PYSCAFFOLD!
 3 | requires = ["setuptools>=46.1.0", "setuptools_scm[toml]>=5"]
 4 | build-backend = "setuptools.build_meta"
 5 | 
 6 | [tool.setuptools_scm]
 7 | # For smarter version schemes and other configuration options,
 8 | # check out https://github.com/pypa/setuptools_scm
 9 | version_scheme = "no-guess-dev"
10 | 


--------------------------------------------------------------------------------
/docker/postgres/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM postgres:15
 2 | 
 3 | # Instala a extensão pgvector
 4 | RUN apt-get update && \
 5 |     apt-get install -y git build-essential postgresql-server-dev-15 && \
 6 |     git clone --branch v0.5.1 https://github.com/pgvector/pgvector.git /tmp/pgvector && \
 7 |     cd /tmp/pgvector && \
 8 |     make && \
 9 |     make install && \
10 |     rm -rf /tmp/pgvector && \
11 |     apt-get remove -y git build-essential postgresql-server-dev-15 && \
12 |     apt-get autoremove -y && \
13 |     rm -rf /var/lib/apt/lists/*
14 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | # Required
 5 | version: 2
 6 | 
 7 | # Build documentation in the docs/ directory with Sphinx
 8 | sphinx:
 9 |   configuration: docs/conf.py
10 | 
11 | # Build documentation with MkDocs
12 | #mkdocs:
13 | #  configuration: mkdocs.yml
14 | 
15 | # Optionally build your docs in additional formats such as PDF
16 | formats:
17 |   - pdf
18 | 
19 | build:
20 |   os: ubuntu-22.04
21 |   tools:
22 |     python: "3.11"
23 | 
24 | python:
25 |   install:
26 |     - requirements: docs/requirements.txt
27 |     - {path: ., method: pip}
28 | 


--------------------------------------------------------------------------------
/src/datapub/__init__.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | if sys.version_info[:2] >= (3, 8):
 4 |     # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
 5 |     from importlib.metadata import PackageNotFoundError, version  # pragma: no cover
 6 | else:
 7 |     from importlib_metadata import PackageNotFoundError, version  # pragma: no cover
 8 | 
 9 | try:
10 |     # Change here if project is renamed and does not equal the package name
11 |     dist_name = __name__
12 |     __version__ = version(dist_name)
13 | except PackageNotFoundError:  # pragma: no cover
14 |     __version__ = "unknown"
15 | finally:
16 |     del version, PackageNotFoundError
17 | 


--------------------------------------------------------------------------------
/tests/test_skeleton.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from datapub.skeleton import fib, main
 4 | 
 5 | __author__ = "a21ns1g4ts"
 6 | __copyright__ = "a21ns1g4ts"
 7 | __license__ = "MIT"
 8 | 
 9 | 
10 | def test_fib():
11 |     """API Tests"""
12 |     assert fib(1) == 1
13 |     assert fib(2) == 1
14 |     assert fib(7) == 13
15 |     with pytest.raises(AssertionError):
16 |         fib(-10)
17 | 
18 | 
19 | def test_main(capsys):
20 |     """CLI Tests"""
21 |     # capsys is a pytest fixture that allows asserts against stdout/stderr
22 |     # https://docs.pytest.org/en/stable/capture.html
23 |     main(["7"])
24 |     captured = capsys.readouterr()
25 |     assert "The 7-th Fibonacci number is 13" in captured.out
26 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | # .coveragerc to control coverage.py
 2 | [run]
 3 | branch = True
 4 | source = datapub
 5 | # omit = bad_file.py
 6 | 
 7 | [paths]
 8 | source =
 9 |     src/
10 |     */site-packages/
11 | 
12 | [report]
13 | # Regexes for lines to exclude from consideration
14 | exclude_lines =
15 |     # Have to re-enable the standard pragma
16 |     pragma: no cover
17 | 
18 |     # Don't complain about missing debug-only code:
19 |     def __repr__
20 |     if self\.debug
21 | 
22 |     # Don't complain if tests don't hit defensive assertion code:
23 |     raise AssertionError
24 |     raise NotImplementedError
25 | 
26 |     # Don't complain if non-runnable code isn't run:
27 |     if 0:
28 |     if __name__ == .__main__.:
29 | 


--------------------------------------------------------------------------------
/.github/workflows/sync-issues.yml:
--------------------------------------------------------------------------------
 1 | name: Sync Issues
 2 | on:
 3 |   workflow_dispatch: 
 4 | 
 5 | permissions:
 6 |   issues: write
 7 | 
 8 | jobs:
 9 |   sync_issues:
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |       - name: Checkout repo
14 |         uses: actions/checkout@v3
15 | 
16 |       - name: Config Python
17 |         uses: actions/setup-python@v4
18 |         with:
19 |           python-version: '3.x'
20 | 
21 |       - name: Install dependencies
22 |         run: pip install requests
23 | 
24 |       - name: Sync issues
25 |         env:
26 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
27 |           GITHUB_REPOSITORY: ${{ github.repository }}
28 |         run: |
29 |           python .github/scripts/sync_issues.py
30 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | """
 2 |     Setup file for datapub.
 3 |     Use setup.cfg to configure your project.
 4 | 
 5 |     This file was generated with PyScaffold 4.6.
 6 |     PyScaffold helps you to put up the scaffold of your new Python project.
 7 |     Learn more under: https://pyscaffold.org/
 8 | """
 9 | 
10 | from setuptools import setup
11 | 
12 | if __name__ == "__main__":
13 |     try:
14 |         setup(use_scm_version={"version_scheme": "no-guess-dev"})
15 |     except:  # noqa
16 |         print(
17 |             "\n\nAn error occurred while building the project, "
18 |             "please ensure you have the most updated version of setuptools, "
19 |             "setuptools_scm and wheel with:\n"
20 |             "   pip install -U setuptools setuptools_scm wheel\n\n"
21 |         )
22 |         raise
23 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | github: [a21ns1g4ts]
 3 | #patreon: # Replace with a single Patreon username
 4 | #open_collective: # Replace with a single Open Collective username
 5 | #ko_fi: # Replace with a single Ko-fi username
 6 | #tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 7 | #community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 8 | #liberapay: # Replace with a single Liberapay username
 9 | #issuehunt: # Replace with a single IssueHunt username
10 | #lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
11 | #polar: # Replace with a single Polar username
12 | #buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
13 | #thanks_dev: # Replace with a single thanks.dev username
14 | #custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
15 | 


--------------------------------------------------------------------------------
/database.sql:
--------------------------------------------------------------------------------
 1 | -- Estrutura do banco de dados
 2 | CREATE TABLE documentos (
 3 |     id SERIAL PRIMARY KEY,
 4 |     orgao VARCHAR(255) NOT NULL,
 5 |     data_publicacao DATE NOT NULL,
 6 |     caminho_arquivo VARCHAR(512) NOT NULL,
 7 |     hash_arquivo VARCHAR(64) UNIQUE NOT NULL,
 8 |     metadata JSONB
 9 | );
10 | 
11 | CREATE TABLE entidades (
12 |     id SERIAL PRIMARY KEY,
13 |     documento_id INTEGER REFERENCES documentos(id),
14 |     tipo_entidade VARCHAR(50) NOT NULL,  -- 'PESSOA', 'ORGAO', 'LOCAL', etc.
15 |     valor TEXT NOT NULL,
16 |     contexto TEXT,
17 |     inicio_pos INTEGER,
18 |     fim_pos INTEGER
19 | );
20 | 
21 | CREATE TABLE gastos (
22 |     id SERIAL PRIMARY KEY,
23 |     documento_id INTEGER REFERENCES documentos(id),
24 |     orgao TEXT NOT NULL,
25 |     valor NUMERIC(15,2) NOT NULL,
26 |     descricao TEXT,
27 |     data DATE,
28 |     categoria TEXT
29 | );
30 | 
31 | -- Extensão para armazenar embeddings
32 | CREATE EXTENSION vector;
33 | CREATE TABLE document_embeddings (
34 |     document_id INTEGER PRIMARY KEY REFERENCES documentos(id),
35 |     embedding vector(768)  -- Dimensão do embedding
36 | );


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   postgres:
 3 |     image: postgres:15
 4 |     build: ./docker/postgres  
 5 |     environment:
 6 |       POSTGRES_PASSWORD: diarios
 7 |       POSTGRES_DB: diarios
 8 |     volumes:
 9 |       - pg_data:/var/lib/postgresql/data
10 |       - ./database.sql:/docker-entrypoint-initdb.d/database.sql 
11 |     ports:
12 |       - "5432:5432"
13 |   
14 |   elasticsearch:
15 |     image: docker.elastic.co/elasticsearch/elasticsearch:8.6.2
16 |     environment:
17 |       - discovery.type=single-node
18 |       - ES_JAVA_OPTS=-Xms1g -Xmx1g
19 |     volumes:
20 |       - es_data:/usr/share/elasticsearch/data
21 |     ports:
22 |       - "9200:9200"
23 |   
24 |   api:
25 |     build: ./src/api
26 |     ports:
27 |       - "8000:8000"
28 |     depends_on:
29 |       - postgres
30 |       - elasticsearch
31 |     environment:
32 |       DB_URL: postgresql://postgres:diarios@postgres/diarios
33 |       ES_URL: http://elasticsearch:9200
34 |   
35 |   frontend:
36 |     build: ./src/frontend
37 |     ports:
38 |       - "8501:8501"
39 |     depends_on:
40 |       - api
41 | 
42 | volumes:
43 |   pg_data:
44 |   es_data:
45 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2025 a21ns1g4ts
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Temporary and binary files
 2 | *~
 3 | *.py[cod]
 4 | *.so
 5 | *.cfg
 6 | !.isort.cfg
 7 | !setup.cfg
 8 | *.orig
 9 | *.log
10 | *.pot
11 | __pycache__/*
12 | .cache/*
13 | .*.swp
14 | */.ipynb_checkpoints/*
15 | .DS_Store
16 | 
17 | # Project files
18 | .ropeproject
19 | .project
20 | .pydevproject
21 | .settings
22 | .idea
23 | .vscode
24 | tags
25 | 
26 | # Package files
27 | *.egg
28 | *.eggs/
29 | .installed.cfg
30 | *.egg-info
31 | 
32 | # Unittest and coverage
33 | htmlcov/*
34 | .coverage
35 | .coverage.*
36 | .tox
37 | junit*.xml
38 | coverage.xml
39 | .pytest_cache/
40 | 
41 | # Build and docs folder/files
42 | build/*
43 | dist/*
44 | sdist/*
45 | docs/api/*
46 | docs/_rst/*
47 | docs/_build/*
48 | cover/*
49 | MANIFEST
50 | 
51 | # Per-project virtualenvs
52 | .venv*/
53 | .conda*/
54 | .python-version
55 | 
56 | # Byte-compiled / optimized / DLL files
57 | __pycache__/
58 | **/__pycache__/
59 | *.py[cod]
60 | *.pyo
61 | storage/raw/*/
62 | storage/processed/*/
63 | storage/structured/*/
64 | 
65 | # Distribution / packaging
66 | build/
67 | dist/
68 | *.egg-info/
69 | .eggs/
70 | 
71 | # Virtual environments
72 | .env/
73 | .venv/
74 | venv/
75 | ENV/
76 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | AUTODOCDIR    = api
11 | 
12 | # User-friendly check for sphinx-build
13 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $?), 1)
14 | $(error "The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from https://sphinx-doc.org/")
15 | endif
16 | 
17 | .PHONY: help clean Makefile
18 | 
19 | # Put it first so that "make" without argument is like "make help".
20 | help:
21 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 | 
23 | clean:
24 | 	rm -rf $(BUILDDIR)/* $(AUTODOCDIR)
25 | 
26 | # Catch-all target: route all unknown targets to Sphinx using the new
27 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
28 | %: Makefile
29 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
30 | 


--------------------------------------------------------------------------------
/src/datapub/shared/utils/extractor_base.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import hashlib
 4 | from pathlib import Path
 5 | from datetime import datetime
 6 | from datapub.shared.contracts.extractor_contract import ExtractorContract
 7 | 
 8 | 
 9 | class ExtractorBase(ExtractorContract):
10 |     def __init__(self, entity: str, base_dir: str, headless=True):
11 |         self.entity = entity
12 |         self.headless = headless
13 |         self.base_dir = Path(base_dir)
14 |         self.downloads_dir = self.base_dir / "downloads"
15 |         self.metadata_dir = self.base_dir / "metadata"
16 | 
17 |         self.downloads_dir.mkdir(parents=True, exist_ok=True)
18 |         self.metadata_dir.mkdir(parents=True, exist_ok=True)
19 | 
20 |     def _format_date(self, date: datetime, fmt: str = "%Y-%m-%d") -> str:
21 |         return date.strftime(fmt)
22 | 
23 |     def _save_metadata(self, date, filename, url, path, hash):
24 |         metadata = {
25 |             "entity": self.entity,
26 |             "data_publicacao": date.isoformat(),
27 |             "url_origem": url,
28 |             "caminho_local": str(path),
29 |             "data_download": datetime.now().isoformat(),
30 |             "tamanho_bytes": os.path.getsize(path),
31 |             "hash_md5": hash,
32 |             "status": "sucesso"
33 |         }
34 | 
35 |         nome_metadata = f"metadata_{filename}.json"
36 |         with open(self.metadata_dir / nome_metadata, "w", encoding="utf-8") as f:
37 |             json.dump(metadata, f, ensure_ascii=False, indent=2)
38 | 
39 |     def _generate_file_hash(self, content: bytes) -> str:
40 |         return hashlib.md5(content).hexdigest()
41 | 
42 |     def download(self, *args, **kwargs):
43 |         raise NotImplementedError("Você deve implementar o método `download`.")
44 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | .. These are examples of badges you might want to add to your README:
 2 |    please update the URLs accordingly
 3 | 
 4 |     .. image:: https://api.cirrus-ci.com/github/<USER>/datapub.svg?branch=main
 5 |         :alt: Built Status
 6 |         :target: https://cirrus-ci.com/github/<USER>/datapub
 7 |     .. image:: https://readthedocs.org/projects/datapub/badge/?version=latest
 8 |         :alt: ReadTheDocs
 9 |         :target: https://datapub.readthedocs.io/en/stable/
10 |     .. image:: https://img.shields.io/coveralls/github/<USER>/datapub/main.svg
11 |         :alt: Coveralls
12 |         :target: https://coveralls.io/r/<USER>/datapub
13 |     .. image:: https://img.shields.io/pypi/v/datapub.svg
14 |         :alt: PyPI-Server
15 |         :target: https://pypi.org/project/datapub/
16 |     .. image:: https://img.shields.io/conda/vn/conda-forge/datapub.svg
17 |         :alt: Conda-Forge
18 |         :target: https://anaconda.org/conda-forge/datapub
19 |     .. image:: https://pepy.tech/badge/datapub/month
20 |         :alt: Monthly Downloads
21 |         :target: https://pepy.tech/project/datapub
22 |     .. image:: https://img.shields.io/twitter/url/http/shields.io.svg?style=social&label=Twitter
23 |         :alt: Twitter
24 |         :target: https://twitter.com/datapub
25 | 
26 | .. image:: https://img.shields.io/badge/-PyScaffold-005CA0?logo=pyscaffold
27 |     :alt: Project generated with PyScaffold
28 |     :target: https://pyscaffold.org/
29 | 
30 | |
31 | 
32 | =======
33 | datapub
34 | =======
35 | 
36 | 
37 |     Add a short description here!
38 | 
39 | 
40 | A longer description of your project goes here...
41 | 
42 | 
43 | .. _pyscaffold-notes:
44 | 
45 | Note
46 | ====
47 | 
48 | This project has been set up using PyScaffold 4.6. For details and usage
49 | information on PyScaffold see https://pyscaffold.org/.
50 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | datapub
 3 | =======
 4 | 
 5 | This is the documentation of **datapub**.
 6 | 
 7 | .. note::
 8 | 
 9 |     This is the main page of your project's `Sphinx`_ documentation.
10 |     It is formatted in `reStructuredText`_. Add additional pages
11 |     by creating rst-files in ``docs`` and adding them to the `toctree`_ below.
12 |     Use then `references`_ in order to link them from this page, e.g.
13 |     :ref:`authors` and :ref:`changes`.
14 | 
15 |     It is also possible to refer to the documentation of other Python packages
16 |     with the `Python domain syntax`_. By default you can reference the
17 |     documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_,
18 |     `Pandas`_, `Scikit-Learn`_. You can add more by extending the
19 |     ``intersphinx_mapping`` in your Sphinx's ``conf.py``.
20 | 
21 |     The pretty useful extension `autodoc`_ is activated by default and lets
22 |     you include documentation from docstrings. Docstrings can be written in
23 |     `Google style`_ (recommended!), `NumPy style`_ and `classical style`_.
24 | 
25 | 
26 | Contents
27 | ========
28 | 
29 | .. toctree::
30 |    :maxdepth: 2
31 | 
32 |    Overview <readme>
33 |    Contributions & Help <contributing>
34 |    License <license>
35 |    Authors <authors>
36 |    Changelog <changelog>
37 |    Module Reference <api/modules>
38 | 
39 | 
40 | Indices and tables
41 | ==================
42 | 
43 | * :ref:`genindex`
44 | * :ref:`modindex`
45 | * :ref:`search`
46 | 
47 | .. _toctree: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html
48 | .. _reStructuredText: https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
49 | .. _references: https://www.sphinx-doc.org/en/stable/markup/inline.html
50 | .. _Python domain syntax: https://www.sphinx-doc.org/en/master/usage/restructuredtext/domains.html#the-python-domain
51 | .. _Sphinx: https://www.sphinx-doc.org/
52 | .. _Python: https://docs.python.org/
53 | .. _Numpy: https://numpy.org/doc/stable
54 | .. _SciPy: https://docs.scipy.org/doc/scipy/reference/
55 | .. _matplotlib: https://matplotlib.org/contents.html#
56 | .. _Pandas: https://pandas.pydata.org/pandas-docs/stable
57 | .. _Scikit-Learn: https://scikit-learn.org/stable
58 | .. _autodoc: https://www.sphinx-doc.org/en/master/ext/autodoc.html
59 | .. _Google style: https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings
60 | .. _NumPy style: https://numpydoc.readthedocs.io/en/latest/format.html
61 | .. _classical style: https://www.sphinx-doc.org/en/master/domains.html#info-field-lists
62 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | # Tox configuration file
 2 | # Read more under https://tox.wiki/
 3 | # THIS SCRIPT IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS!
 4 | 
 5 | [tox]
 6 | minversion = 3.24
 7 | envlist = default
 8 | isolated_build = True
 9 | 
10 | 
11 | [testenv]
12 | description = Invoke pytest to run automated tests
13 | setenv =
14 |     TOXINIDIR = {toxinidir}
15 | passenv =
16 |     HOME
17 |     SETUPTOOLS_*
18 | extras =
19 |     testing
20 | commands =
21 |     pytest {posargs}
22 | 
23 | 
24 | # # To run `tox -e lint` you need to make sure you have a
25 | # # `.pre-commit-config.yaml` file. See https://pre-commit.com
26 | # [testenv:lint]
27 | # description = Perform static analysis and style checks
28 | # skip_install = True
29 | # deps = pre-commit
30 | # passenv =
31 | #     HOMEPATH
32 | #     PROGRAMDATA
33 | #     SETUPTOOLS_*
34 | # commands =
35 | #     pre-commit run --all-files {posargs:--show-diff-on-failure}
36 | 
37 | 
38 | [testenv:{build,clean}]
39 | description =
40 |     build: Build the package in isolation according to PEP517, see https://github.com/pypa/build
41 |     clean: Remove old distribution files and temporary build artifacts (./build and ./dist)
42 | # https://setuptools.pypa.io/en/stable/build_meta.html#how-to-use-it
43 | skip_install = True
44 | changedir = {toxinidir}
45 | deps =
46 |     build: build[virtualenv]
47 | passenv =
48 |     SETUPTOOLS_*
49 | commands =
50 |     clean: python -c 'import shutil; [shutil.rmtree(p, True) for p in ("build", "dist", "docs/_build")]'
51 |     clean: python -c 'import pathlib, shutil; [shutil.rmtree(p, True) for p in pathlib.Path("src").glob("*.egg-info")]'
52 |     build: python -m build {posargs}
53 | # By default, both `sdist` and `wheel` are built. If your sdist is too big or you don't want
54 | # to make it available, consider running: `tox -e build -- --wheel`
55 | 
56 | 
57 | [testenv:{docs,doctests,linkcheck}]
58 | description =
59 |     docs: Invoke sphinx-build to build the docs
60 |     doctests: Invoke sphinx-build to run doctests
61 |     linkcheck: Check for broken links in the documentation
62 | passenv =
63 |     SETUPTOOLS_*
64 | setenv =
65 |     DOCSDIR = {toxinidir}/docs
66 |     BUILDDIR = {toxinidir}/docs/_build
67 |     docs: BUILD = html
68 |     doctests: BUILD = doctest
69 |     linkcheck: BUILD = linkcheck
70 | deps =
71 |     -r {toxinidir}/docs/requirements.txt
72 |     # ^  requirements.txt shared with Read The Docs
73 | commands =
74 |     sphinx-build --color -b {env:BUILD} -d "{env:BUILDDIR}/doctrees" "{env:DOCSDIR}" "{env:BUILDDIR}/{env:BUILD}" {posargs}
75 | 
76 | 
77 | [testenv:publish]
78 | description =
79 |     Publish the package you have been developing to a package index server.
80 |     By default, it uses testpypi. If you really want to publish your package
81 |     to be publicly accessible in PyPI, use the `-- --repository pypi` option.
82 | skip_install = True
83 | changedir = {toxinidir}
84 | passenv =
85 |     # See: https://twine.readthedocs.io/en/latest/
86 |     TWINE_USERNAME
87 |     TWINE_PASSWORD
88 |     TWINE_REPOSITORY
89 |     TWINE_REPOSITORY_URL
90 | deps = twine
91 | commands =
92 |     python -m twine check dist/*
93 |     python -m twine upload {posargs:--repository {env:TWINE_REPOSITORY:testpypi}} dist/*
94 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | altair==5.5.0
  2 | annotated-types==0.7.0
  3 | anyio==4.9.0
  4 | attrs==25.3.0
  5 | blinker==1.9.0
  6 | blis==1.3.0
  7 | cachetools==5.5.2
  8 | catalogue==2.0.10
  9 | certifi==2025.4.26
 10 | cffi==1.17.1
 11 | charset-normalizer==3.4.2
 12 | click==8.2.1
 13 | cloudpathlib==0.21.1
 14 | confection==0.1.5
 15 | contourpy==1.3.2
 16 | cryptography==45.0.3
 17 | cycler==0.12.1
 18 | cymem==2.0.11
 19 | dateparser==1.2.1
 20 | elastic-transport==8.17.1
 21 | elasticsearch==9.0.1
 22 | fastapi==0.115.12
 23 | filelock==3.18.0
 24 | fonttools==4.58.1
 25 | fsspec==2025.5.1
 26 | gitdb==4.0.12
 27 | GitPython==3.1.44
 28 | h11==0.16.0
 29 | hcaptcha-solver==0.2.14
 30 | hf-xet==1.1.2
 31 | huggingface-hub==0.32.3
 32 | idna==3.10
 33 | Jinja2==3.1.6
 34 | jsonschema==4.24.0
 35 | jsonschema-specifications==2025.4.1
 36 | kiwisolver==1.4.8
 37 | langcodes==3.5.0
 38 | language_data==1.3.0
 39 | marisa-trie==1.2.1
 40 | markdown-it-py==3.0.0
 41 | MarkupSafe==3.0.2
 42 | matplotlib==3.10.3
 43 | mdurl==0.1.2
 44 | MouseInfo==0.1.3
 45 | mpmath==1.3.0
 46 | murmurhash==1.0.13
 47 | narwhals==1.41.0
 48 | networkx==3.5
 49 | numpy==2.2.6
 50 | nvidia-cublas-cu12==12.6.4.1
 51 | nvidia-cuda-cupti-cu12==12.6.80
 52 | nvidia-cuda-nvrtc-cu12==12.6.77
 53 | nvidia-cuda-runtime-cu12==12.6.77
 54 | nvidia-cudnn-cu12==9.5.1.17
 55 | nvidia-cufft-cu12==11.3.0.4
 56 | nvidia-cufile-cu12==1.11.1.6
 57 | nvidia-curand-cu12==10.3.7.77
 58 | nvidia-cusolver-cu12==11.7.1.2
 59 | nvidia-cusparse-cu12==12.5.4.2
 60 | nvidia-cusparselt-cu12==0.6.3
 61 | nvidia-nccl-cu12==2.26.2
 62 | nvidia-nvjitlink-cu12==12.6.85
 63 | nvidia-nvtx-cu12==12.6.77
 64 | outcome==1.3.0.post0
 65 | packaging==24.2
 66 | pandas==2.2.3
 67 | pdfminer.six==20250327
 68 | pdfplumber==0.11.6
 69 | pgvector==0.4.1
 70 | pillow==11.2.1
 71 | preshed==3.0.10
 72 | protobuf==6.31.1
 73 | psycopg2-binary==2.9.10
 74 | pyarrow==20.0.0
 75 | PyAutoGUI==0.9.54
 76 | pycparser==2.22
 77 | pydantic==2.11.5
 78 | pydantic_core==2.33.2
 79 | pydeck==0.9.1
 80 | PyGetWindow==0.0.9
 81 | Pygments==2.19.1
 82 | PyMsgBox==1.0.9
 83 | pyparsing==3.2.3
 84 | PyPDF2==3.0.1
 85 | pypdfium2==4.30.1
 86 | pyperclip==1.9.0
 87 | PyRect==0.2.0
 88 | PyScreeze==1.0.1
 89 | PySocks==1.7.1
 90 | python-dateutil==2.9.0.post0
 91 | python-dotenv==1.1.0
 92 | python-multipart==0.0.20
 93 | python3-xlib==0.15
 94 | pytweening==1.2.0
 95 | pytz==2025.2
 96 | PyYAML==6.0.2
 97 | referencing==0.36.2
 98 | regex==2024.11.6
 99 | requests==2.32.3
100 | rich==14.0.0
101 | rpds-py==0.25.1
102 | safetensors==0.5.3
103 | selenium==4.33.0
104 | setuptools==80.9.0
105 | shellingham==1.5.4
106 | six==1.17.0
107 | smart-open==7.1.0
108 | smmap==5.0.2
109 | sniffio==1.3.1
110 | sortedcontainers==2.4.0
111 | spacy==3.8.7
112 | spacy-legacy==3.0.12
113 | spacy-loggers==1.0.5
114 | srsly==2.5.1
115 | starlette==0.46.2
116 | streamlit==1.45.1
117 | sympy==1.14.0
118 | tenacity==9.1.2
119 | thinc==8.3.6
120 | tokenizers==0.21.1
121 | toml==0.10.2
122 | torch==2.7.0
123 | tornado==6.5.1
124 | tqdm==4.67.1
125 | transformers==4.52.4
126 | trio==0.30.0
127 | trio-websocket==0.12.2
128 | triton==3.3.0
129 | typer==0.16.0
130 | typing-inspection==0.4.1
131 | typing_extensions==4.13.2
132 | tzdata==2025.2
133 | tzlocal==5.3.1
134 | Unidecode==1.4.0
135 | urllib3==2.4.0
136 | uvicorn==0.34.2
137 | wasabi==1.1.3
138 | watchdog==6.0.0
139 | weasel==0.4.1
140 | webdriver-manager==4.0.2
141 | websocket-client==1.8.0
142 | wrapt==1.17.2
143 | wsproto==1.2.0
144 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_ce/extractor.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import hashlib
 4 | import requests
 5 | from pathlib import Path
 6 | from datetime import datetime, timedelta
 7 | 
 8 | 
 9 | class Extractor:
10 |     def __init__(self, base_dir="storage/raw/alce"):
11 |         self.base_api = "https://doalece.al.ce.gov.br/api/publico/ultimas-edicoes"
12 |         self.base_url = "https://doalece.al.ce.gov.br"
13 |         self.base_dir = Path(base_dir)
14 |         self.downloads_dir = self.base_dir / "downloads"
15 |         self.metadata_dir = self.base_dir / "metadata"
16 | 
17 |         self.downloads_dir.mkdir(parents=True, exist_ok=True)
18 |         self.metadata_dir.mkdir(parents=True, exist_ok=True)
19 | 
20 |     def _format_date(self, date: datetime):
21 |         return date.strftime("%Y-%m-%d")
22 | 
23 |     def _build_api_url(self, start_date: datetime, end_date: datetime):
24 |         date_range = {
25 |             "data_de": self._format_date(start_date),
26 |             "data_ate": self._format_date(end_date),
27 |         }
28 |         return f"{self.base_api}?buscarData={json.dumps(date_range)}"
29 | 
30 |     def download(self, start_date: datetime, end_date: datetime):
31 |         print(f"📡 Buscando edições de {start_date} até {end_date}")
32 |         api_url = self._build_api_url(start_date, end_date)
33 |         response = requests.get(api_url)
34 | 
35 |         if response.status_code != 200:
36 |             print("❌ Erro ao acessar a API:", response.status_code)
37 |             return
38 | 
39 |         data = response.json()
40 |         edicoes = data.get("dados", [])
41 | 
42 |         if not edicoes:
43 |             print("⚠️ Nenhuma edição encontrada nesse intervalo.")
44 |             return
45 | 
46 |         for edicao in edicoes:
47 |             try:
48 |                 self._baixar_edicao(edicao)
49 |             except Exception as e:
50 |                 print(f"❌ Erro ao baixar edição {edicao.get('id')}: {e}")
51 | 
52 |     def _baixar_edicao(self, edicao: dict):
53 |         data_pub = edicao["data_publicacao"][:10]
54 |         nome_arquivo = f"diario-alce-{data_pub}.pdf"
55 |         caminho = edicao["caminho_documento_pdf"]
56 |         url_pdf = self.base_url + caminho
57 | 
58 |         print(f"📄 Baixando edição de {data_pub}: {nome_arquivo}")
59 | 
60 |         response = requests.get(url_pdf, timeout=15)
61 |         if response.status_code == 200 and b"%PDF" in response.content[:10]:
62 |             local_path = self.downloads_dir / nome_arquivo
63 |             with open(local_path, "wb") as f:
64 |                 f.write(response.content)
65 | 
66 |             file_hash = hashlib.md5(response.content).hexdigest()
67 |             self._salvar_metadata(edicao, url_pdf, local_path, file_hash)
68 |             print(f"✅ Salvo: {nome_arquivo} | Hash: {file_hash[:8]}")
69 |         else:
70 |             print(f"⚠️ Arquivo inválido ou não encontrado: {url_pdf}")
71 | 
72 |     def _salvar_metadata(self, edicao: dict, url_pdf: str, local_path: Path, file_hash: str):
73 |         metadata = {
74 |             "orgao": "ALCE",
75 |             "data_publicacao": edicao["data_publicacao"],
76 |             "url_origem": url_pdf,
77 |             "caminho_local": str(local_path),
78 |             "data_download": datetime.now().isoformat(),
79 |             "tamanho_bytes": os.path.getsize(local_path),
80 |             "hash_md5": file_hash,
81 |             "status": "sucesso"
82 |         }
83 | 
84 |         nome_metadata = f"metadata_{edicao['data_publicacao'][:10]}.json"
85 |         with open(self.metadata_dir / nome_metadata, "w", encoding="utf-8") as f:
86 |             json.dump(metadata, f, ensure_ascii=False, indent=2)
87 | 
88 | 
89 | if __name__ == "__main__":
90 |     extractor = Extractor()
91 |     try:
92 |         inicio = datetime(2025, 5, 22)
93 |         fim = datetime(2025, 6, 1)
94 |         extractor.download(inicio, fim)
95 |     except Exception as e:
96 |         print("Erro durante execução:", e)
97 | 


--------------------------------------------------------------------------------
/.github/scripts/sync_issues.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import requests
  4 | import time
  5 | 
  6 | GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
  7 | REPO = os.getenv("GITHUB_REPOSITORY")
  8 | 
  9 | HEADERS = {
 10 |     "Authorization": f"token {GITHUB_TOKEN}",
 11 |     "Accept": "application/vnd.github.v3+json"
 12 | }
 13 | 
 14 | def load_json():
 15 |     with open('./sources.json', 'r', encoding='utf-8') as f:
 16 |         return json.load(f)
 17 | 
 18 | def load_all_open_issue_titles():
 19 |     titles = set()
 20 |     page = 1
 21 |     while True:
 22 |         url = f"https://api.github.com/repos/{REPO}/issues?state=open&per_page=100&page={page}"
 23 |         response = requests.get(url, headers=HEADERS)
 24 |         if response.status_code != 200:
 25 |             print(f"Erro ao buscar issues na página {page}: {response.text}")
 26 |             break
 27 |         issues = response.json()
 28 |         if not issues:
 29 |             break
 30 |         for issue in issues:
 31 |             titles.add(issue["title"].strip().lower())
 32 |         page += 1
 33 |     return titles
 34 | 
 35 | def create_issue(title, body, existing_titles):
 36 |     if title.strip().lower() in existing_titles:
 37 |         print(f"Issue already exists: {title}")
 38 |         return False
 39 | 
 40 |     url = f"https://api.github.com/repos/{REPO}/issues"
 41 |     data = {
 42 |         "title": title,
 43 |         "body": body
 44 |     }
 45 |     response = requests.post(url, headers=HEADERS, json=data)
 46 |     if response.status_code == 201:
 47 |         print(f"Issue created: {title}")
 48 |         # Adiciona o título criado para evitar criação duplicada na mesma execução
 49 |         existing_titles.add(title.strip().lower())
 50 |         return True
 51 |     else:
 52 |         print(f"Error creating issue {title}: {response.text}")
 53 |         return False
 54 | 
 55 | def main():
 56 |     data = load_json()
 57 |     existing_titles = load_all_open_issue_titles()
 58 |     created_count = 0
 59 |     max_create = 20
 60 | 
 61 |     for item in data:
 62 |         if created_count >= max_create:
 63 |             print(f"Limite de {max_create} issues criadas atingido nesta execução.")
 64 |             break
 65 | 
 66 |         title = f"Implementar pipeline para {item['nome']} ({item['sigla']})"
 67 |         body = f"""
 68 | # 🚀 Implementar pipeline para a fonte {item['nome']} ({item['sigla']})
 69 | 
 70 | ---
 71 | 
 72 | ## 📋 Detalhes da Fonte
 73 | 
 74 | - **Nome:** {item['nome']}
 75 | - **Sigla:** {item['sigla']}
 76 | - **URL:** {item['url']}
 77 | 
 78 | ---
 79 | 
 80 | ## 📊 Status Atual do Pipeline
 81 | 
 82 | - [ ] **Extracted:** {item['extracted']}  
 83 | - [ ] **Processed:** {item['processed']}  
 84 | - [ ] **Structured:** {item['structured']}  
 85 | 
 86 | ---
 87 | 
 88 | ## 🎯 Objetivo
 89 | 
 90 | Implementar a arquitetura do projeto DataPub para esta fonte, garantindo que os dados sejam extraídos, processados e estruturados corretamente para análise.
 91 | 
 92 | ---
 93 | 
 94 | ## 🔧 Próximas Etapas
 95 | 
 96 | - [ ] Validar conexão e acesso à fonte
 97 | - [ ] Implementar extração dos dados (ETL)
 98 | - [ ] Processar dados para limpeza e transformação
 99 | - [ ] Estruturar dados para consumo downstream
100 | - [ ] Testar pipeline end-to-end
101 | 
102 | ---
103 | 
104 | ## 📚 Referências
105 | 
106 | Leia mais no README do projeto:  
107 | https://github.com/{REPO}#readme
108 | 
109 | ---
110 | 
111 | ## 🛠️ Notas para o Bot de Automação
112 | 
113 | - Fonte: `{item['sigla']}`  
114 | - Status Extracted: `{item['extracted']}`  
115 | - Status Processed: `{item['processed']}`  
116 | - Status Structured: `{item['structured']}`  
117 | - URL Fonte: `{item['url']}`
118 | 
119 | ---
120 | 
121 | _💡 Este template foi criado para facilitar a colaboração e automação no projeto DataPub._
122 | """
123 |         created = create_issue(title, body, existing_titles)
124 |         if created:
125 |             created_count += 1
126 |             time.sleep(1.5)  # para respeitar limite da API
127 | 
128 | if __name__ == "__main__":
129 |     main()
130 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 📂 **DataPub – Sistema de Análise de Documentos Públicos**
  2 | 
  3 | ## 📌 Visão Geral
  4 | 
  5 | **DataPub** é uma plataforma para **coleta, processamento, estruturação e análise de documentos públicos brasileiros**, incluindo **Diários Oficiais, contratos, portarias, atos administrativos e demais publicações governamentais**.
  6 | 
  7 | Nosso objetivo é **tornar mais acessíveis e analisáveis informações que estão dispersas em portais públicos**, promovendo **transparência, accountability e inteligência institucional**.
  8 | 
  9 | > 🧭 **Por que isso importa?**
 10 | > Documentos públicos revelam o funcionamento real do Estado. Ao reunir e estruturar essas fontes:
 11 | >
 12 | > - Permitimos o **monitoramento da saúde política e institucional do país**
 13 | > - Fortalecemos o **controle social e o jornalismo investigativo**
 14 | > - Geramos dados úteis para **pesquisadores, ONGs, órgãos de controle e a sociedade civil organizada**
 15 | 
 16 | ---
 17 | 
 18 | ## Bucket Público
 19 | 
 20 | Os dados deste projeto estão disponíveis em um bucket da AWS com acesso público. Isso permite que qualquer pessoa acesse os arquivos diretamente, sem necessidade de autenticação.
 21 | 
 22 | Você pode acessar os dados por meio do seguinte endpoint (via CloudFront):
 23 | 
 24 | 🔗 [https://d23ollh9dwoi10.cloudfront.net/](https://d23ollh9dwoi10.cloudfront.net/)
 25 | 
 26 | > **Nota:** Certifique-se de usar URLs completas e corretas ao referenciar arquivos específicos no bucket. Exemplo:
 27 | >
 28 | > ```
 29 | > https://d23ollh9dwoi10.cloudfront.net/pasta/arquivo.json
 30 | > ```
 31 | 
 32 | ---
 33 | 
 34 | ## 🗂️ Estrutura do Projeto
 35 | 
 36 | ```
 37 | /datapub
 38 | │
 39 | ├── src/                  
 40 | │   └── databub/
 41 | │       ├── __init__.py
 42 | │       ├── /entities
 43 | │       │   ├── /al_go
 44 | │       │   │   ├── extractor.py
 45 | │       │   │   ├── processing.py
 46 | │       │   │   ├── models.py
 47 | │       │   │   └── config.yaml
 48 | │       │   ├── /al_ms
 49 | │       │   │   └── ...
 50 | │       ├── /shared
 51 | │       │   ├── /utils  
 52 | │       │   ├── /processing
 53 | │       │   ├── /models
 54 | │       │   └── /config
 55 | │       ├── config.py
 56 | │       ├── cli.py
 57 | │       └── factory.py
 58 | │
 59 | ├── /storage
 60 | │   ├── /raw               # Documentos públicos originais (PDF, HTML, etc.)
 61 | │   ├── /processed         # Textos extraídos, limpos e enriquecidos
 62 | │   └── /structured        # Dados estruturados (JSON, CSV, banco de dados)
 63 | │
 64 | ├── tests/                 
 65 | │   ├── __init__.py
 66 | │   ├── test_diario_alpa.py
 67 | │   └── test_relatorios_gestao_alpa.py
 68 | │
 69 | ├── docs/
 70 | │
 71 | ├── .gitignore
 72 | ├── LICENSE
 73 | ├── pyproject.toml           # Configurações do projeto (PEP 518)
 74 | ├── setup.cfg                # Configurações do setuptools, lint, pytest, etc
 75 | ├── setup.py                 # Script de instalação
 76 | ├── requirements.txt         # Dependências
 77 | └── README.rst               # Documentação
 78 | 
 79 | ```
 80 | 
 81 | ---
 82 | 
 83 | ## ⚙️ Como Executar
 84 | 
 85 | 1. **Instale as dependências**:
 86 | 
 87 |    ```bash
 88 |    pip install -e . 
 89 |    ```
 90 | 
 91 | 2. **Execute o coletor de arquivos**:
 92 | 
 93 |    ```bash
 94 |    extractor al_pa --start 2021-01-1 --end 2025-06-1
 95 |    ```
 96 | 
 97 | 3. **Execute o pipeline de processamento**:
 98 | 
 99 |    // TODO
100 | 
101 | ---
102 | 
103 | ## 🔍 Casos de Uso
104 | 
105 | - Monitoramento de nomeações, exonerações e licitações
106 | - Extração de padrões temáticos de portarias e contratos
107 | - Análise de linguagem em atos administrativos
108 | - Detecção de eventos políticos importantes em diferentes esferas (municipal, estadual, federal)
109 | 
110 | ---
111 | 
112 | ## 🤝 Contribuições
113 | 
114 | Contribuições são muito bem-vindas!
115 | Abra uma **issue**, envie um **pull request** ou compartilhe fontes/documentos de interesse público que deseja ver monitorados aqui.
116 | 
117 | ---
118 | 
119 | ## 📄 Licença
120 | 
121 | Este projeto é de código aberto sob a [MIT License](LICENSE).
122 | 
123 | ---
124 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
  1 | # This file is used to configure your project.
  2 | # Read more about the various options under:
  3 | # https://setuptools.pypa.io/en/latest/userguide/declarative_config.html
  4 | # https://setuptools.pypa.io/en/latest/references/keywords.html
  5 | 
  6 | [metadata]
  7 | name = datapub
  8 | description = Add a short description here!
  9 | author = a21ns1g4ts
 10 | author_email = atila.danvi@outlook.com
 11 | license = MIT
 12 | license_files = LICENSE.txt
 13 | long_description = file: README.rst
 14 | long_description_content_type = text/x-rst; charset=UTF-8
 15 | url = https://github.com/pyscaffold/pyscaffold/
 16 | # Add here related links, for example:
 17 | project_urls =
 18 |     Documentation = https://pyscaffold.org/
 19 | #    Source = https://github.com/pyscaffold/pyscaffold/
 20 | #    Changelog = https://pyscaffold.org/en/latest/changelog.html
 21 | #    Tracker = https://github.com/pyscaffold/pyscaffold/issues
 22 | #    Conda-Forge = https://anaconda.org/conda-forge/pyscaffold
 23 | #    Download = https://pypi.org/project/PyScaffold/#files
 24 | #    Twitter = https://twitter.com/PyScaffold
 25 | 
 26 | # Change if running only on Windows, Mac or Linux (comma-separated)
 27 | platforms = any
 28 | 
 29 | # Add here all kinds of additional classifiers as defined under
 30 | # https://pypi.org/classifiers/
 31 | classifiers =
 32 |     Development Status :: 4 - Beta
 33 |     Programming Language :: Python
 34 | 
 35 | 
 36 | [options]
 37 | zip_safe = False
 38 | packages = find:
 39 | include_package_data = True
 40 | package_dir =
 41 |     =src
 42 | 
 43 | # Require a min/specific Python version (comma-separated conditions)
 44 | # python_requires = >=3.8
 45 | 
 46 | # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0.
 47 | # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in
 48 | # new major versions. This works if the required packages follow Semantic Versioning.
 49 | # For more information, check out https://semver.org/.
 50 | install_requires =
 51 |     importlib-metadata; python_version<"3.8"
 52 | 
 53 | 
 54 | [options.packages.find]
 55 | where = src
 56 | exclude =
 57 |     tests
 58 | 
 59 | [options.extras_require]
 60 | # Add here additional requirements for extra features, to install with:
 61 | # `pip install datapub[PDF]` like:
 62 | # PDF = ReportLab; RXP
 63 | 
 64 | # Add here test requirements (semicolon/line-separated)
 65 | testing =
 66 |     setuptools
 67 |     pytest
 68 |     pytest-cov
 69 | 
 70 | [options.entry_points]
 71 | console_scripts =
 72 |     extractor = datapub.cli:main
 73 | # Add here console scripts like:
 74 | # console_scripts =
 75 | #     script_name = datapub.module:function
 76 | # For example:
 77 | # console_scripts =
 78 | #     fibonacci = datapub.skeleton:run
 79 | # And any other entry points, for example:
 80 | # pyscaffold.cli =
 81 | #     awesome = pyscaffoldext.awesome.extension:AwesomeExtension
 82 | 
 83 | [tool:pytest]
 84 | # Specify command line options as you would do when invoking pytest directly.
 85 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml
 86 | # in order to write a coverage file that can be read by Jenkins.
 87 | # CAUTION: --cov flags may prohibit setting breakpoints while debugging.
 88 | #          Comment those flags to avoid this pytest issue.
 89 | addopts =
 90 |     --cov datapub --cov-report term-missing
 91 |     --verbose
 92 | norecursedirs =
 93 |     dist
 94 |     build
 95 |     .tox
 96 | testpaths = tests
 97 | # Use pytest markers to select/deselect specific tests
 98 | # markers =
 99 | #     slow: mark tests as slow (deselect with '-m "not slow"')
100 | #     system: mark end-to-end system tests
101 | 
102 | [devpi:upload]
103 | # Options for the devpi: PyPI server and packaging tool
104 | # VCS export must be deactivated since we are using setuptools-scm
105 | no_vcs = 1
106 | formats = bdist_wheel
107 | 
108 | [flake8]
109 | # Some sane defaults for the code style checker flake8
110 | max_line_length = 88
111 | extend_ignore = E203, W503
112 | # ^  Black-compatible
113 | #    E203 and W503 have edge cases handled by black
114 | exclude =
115 |     .tox
116 |     build
117 |     dist
118 |     .eggs
119 |     docs/conf.py
120 | 
121 | [pyscaffold]
122 | # PyScaffold's parameters when the project was created.
123 | # This will be used when updating. Do not change!
124 | version = 4.6
125 | package = datapub
126 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_go/extractor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import json
  4 | import time
  5 | import hashlib
  6 | from pathlib import Path
  7 | from datetime import datetime, date
  8 | 
  9 | import requests
 10 | from selenium import webdriver
 11 | from selenium.webdriver.chrome.options import Options
 12 | from selenium.webdriver.common.by import By
 13 | 
 14 | from datapub.shared.utils.extractor_base import ExtractorBase
 15 | 
 16 | class ALGOExtractor(ExtractorBase):
 17 |     def __init__(self):
 18 |         super().__init__(entity="ALGO", base_dir="storage/raw/al_go")
 19 | 
 20 |         self.page_url_template = "https://transparencia.al.go.leg.br/gestao-parlamentar/diario?ano={}&mes={}"
 21 | 
 22 |         chrome_options = Options()
 23 |         if self.headless:
 24 |             chrome_options.add_argument("--headless=new") 
 25 |             chrome_options.add_argument("--disable-gpu")
 26 |             chrome_options.add_argument("--no-sandbox")
 27 |         
 28 |         self.driver = webdriver.Chrome(options=chrome_options)
 29 | 
 30 |     def close(self):
 31 |         self.driver.quit()
 32 | 
 33 |     def download(self, start_date=None, end_date=None):
 34 |         if end_date is None:
 35 |             end_date = date.today()
 36 |         if start_date is None:
 37 |             start_date = date(2007, 8, 1)
 38 | 
 39 |         current_date = start_date
 40 |         while current_date <= end_date:
 41 |             year = current_date.year
 42 |             month = current_date.month
 43 |             print(f"🔍 Processando {year}-{month:02d}")
 44 |             links = self._get_pdf_links_for_month(year, month)
 45 | 
 46 |             for date_str, url in links.items():
 47 |                 self._download_single_url(date_str, url)
 48 |                 time.sleep(1)
 49 | 
 50 |             year = current_date.year + (current_date.month // 12)
 51 |             month = current_date.month % 12 + 1
 52 |             current_date = date(year, month, 1)
 53 | 
 54 |     def _get_pdf_links_for_month(self, year, month):
 55 |         url = self.page_url_template.format(year, month)
 56 |         print(f"  - Carregando página: {url}")
 57 |         self.driver.get(url)
 58 | 
 59 |         time.sleep(3)
 60 | 
 61 |         links = {}
 62 | 
 63 |         elements = self.driver.find_elements(By.CSS_SELECTOR, "a.fc-day-grid-event")
 64 |         for el in elements:
 65 |             href = el.get_attribute("href")
 66 |             if href and href.endswith(".pdf"):
 67 |                 filename = href.split("/")[-1]
 68 |                 parts = filename.split("-")
 69 |                 if len(parts) >= 4:
 70 |                     date_str = parts[-1].replace(".pdf", "")
 71 |                     links[date_str] = href
 72 | 
 73 |         print(f"  - Encontrados {len(links)} links")
 74 |         return links
 75 | 
 76 |     def _download_single_url(self, date_str, url):
 77 |         match = re.search(r"diario-alego-(\d{4}-\d{2}-\d{2})\.pdf", url)
 78 |         date = match.group(1)
 79 | 
 80 |         filename = f"diario-alego-{date}.pdf"
 81 |         filepath = self.downloads_dir / filename
 82 | 
 83 |         if filepath.exists():
 84 |             print(f"⏭️ [{date}] Já existe, pulando.")
 85 |             return True
 86 | 
 87 |         try:
 88 |             response = requests.get(url, timeout=30)
 89 |             if response.status_code == 200 and b"%PDF" in response.content[:10]:
 90 |                 with open(filepath, "wb") as f:
 91 |                     f.write(response.content)
 92 | 
 93 |                 file_hash = hashlib.md5(response.content).hexdigest()
 94 |                 date = datetime.strptime(date, "%Y-%m-%d")
 95 |                 self._save_metadata(date, filename, url, filepath, file_hash)
 96 | 
 97 |                 print(f"✅ [{date_str}] Baixado com sucesso | Hash: {file_hash[:8]}")
 98 |                 return True
 99 |             else:
100 |                 print(f"⚠️ [{date_str}] Documento não encontrado ou inválido (HTTP {response.status_code})")
101 |                 return False
102 |         except Exception as e:
103 |             print(f"❌ [{date_str}] Erro ao baixar: {e}")
104 |             return False
105 | 
106 | if __name__ == "__main__":
107 |     extractor = ALGOExtractor()
108 | 
109 |     start_date = datetime.date(2007, 8, 1)
110 |     end_date = datetime.now().date()
111 | 
112 |     print(f"🚀 Iniciando download de diários oficiais da AL-GO de {start_date} a {end_date}")
113 | 
114 |     extractor.download(start_date, end_date)
115 | 
116 |     extractor.close()
117 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_ac/extractor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import hashlib
  4 | import requests
  5 | from bs4 import BeautifulSoup
  6 | from datetime import datetime, timedelta
  7 | from pathlib import Path
  8 | 
  9 | 
 10 | class Extractor:
 11 |     def __init__(self, base_dir="storage/raw/alac"):
 12 |         self.session = requests.Session()
 13 |         # Em transição para: https://www.al.ac.leg.br/
 14 |         self.base_url = "https://aleac.tceac.tc.br/faces/paginas/publico/dec/visualizarDOE.xhtml"
 15 |         self.base_dir = Path(base_dir)
 16 |         self.downloads_dir = self.base_dir / "downloads"
 17 |         self.metadata_dir = self.base_dir / "metadata"
 18 | 
 19 |         self.downloads_dir.mkdir(parents=True, exist_ok=True)
 20 |         self.metadata_dir.mkdir(parents=True, exist_ok=True)
 21 | 
 22 |     def _format_date(self, date: datetime):
 23 |         return date.strftime("%d-%m-%Y")
 24 | 
 25 |     def download(self, start_date=None, end_date=None, delay=0.5):
 26 |         if end_date is None:
 27 |             end_date = datetime.date.today()
 28 |         if start_date is None:
 29 |             start_date = datetime.date(2007, 1, 1)
 30 | 
 31 |         current_date = start_date
 32 | 
 33 |         while current_date <= end_date:
 34 |             try:
 35 |                 self._download_single(current_date)
 36 |             except Exception as e:
 37 |                 print(f"❌ Erro em {current_date}: {e}")
 38 |             current_date += timedelta(days=1)
 39 | 
 40 |     def _download_single(self, target_date: datetime):
 41 |         print(f"📡 Consultando ALAC para {target_date}")
 42 | 
 43 |         params = {
 44 |             "faces-redirect": "true",
 45 |             "includeViewParams": "true",
 46 |             "dataDEC": self._format_date(target_date)
 47 |         }
 48 | 
 49 |         response = self.session.get(self.base_url, params=params)
 50 |         if response.status_code != 200:
 51 |             print("❌ Não foi possível carregar a página inicial:", response.status_code)
 52 |             return
 53 | 
 54 |         soup = BeautifulSoup(response.text, "html.parser")
 55 |         view_state = soup.find("input", {"name": "javax.faces.ViewState"})
 56 |         if not view_state:
 57 |             print("❌ ViewState não encontrado.")
 58 |             return
 59 | 
 60 |         view_state_value = view_state["value"]
 61 |         form_id = "visualizarDoe"
 62 | 
 63 |         post_data = {
 64 |             f"{form_id}": form_id,
 65 |             f"{form_id}:botaoDownloadLink": f"{form_id}:botaoDownloadLink",
 66 |             "javax.faces.ViewState": view_state_value,
 67 |         }
 68 | 
 69 |         headers = {
 70 |             "Content-Type": "application/x-www-form-urlencoded",
 71 |             "Referer": response.url,
 72 |         }
 73 | 
 74 |         print("⏬ Enviando requisição de download...")
 75 |         download_response = self.session.post(self.base_url, data=post_data, headers=headers)
 76 | 
 77 |         content_type = download_response.headers.get("Content-Type", "")
 78 |         if "application/pdf" not in content_type:
 79 |             print("⚠️ Nenhum PDF disponível em", target_date.strftime('%Y-%m-%d'))
 80 |             return
 81 | 
 82 |         nome_arquivo = f"diario-alac-{target_date.strftime('%Y-%m-%d')}.pdf"
 83 |         local_path = self.downloads_dir / nome_arquivo
 84 | 
 85 |         with open(local_path, "wb") as f:
 86 |             f.write(download_response.content)
 87 | 
 88 |         file_hash = hashlib.md5(download_response.content).hexdigest()
 89 |         self._salvar_metadata(target_date, local_path, file_hash)
 90 |         print(f"✅ PDF salvo: {nome_arquivo} | Hash: {file_hash[:8]}")
 91 | 
 92 |     def _salvar_metadata(self, target_date: datetime, local_path: Path, file_hash: str):
 93 |         metadata = {
 94 |             "orgao": "ALAC",
 95 |             "data_publicacao": target_date.strftime("%Y-%m-%d"),
 96 |             "url_origem": self.base_url,
 97 |             "caminho_local": str(local_path),
 98 |             "data_download": datetime.now().isoformat(),
 99 |             "tamanho_bytes": os.path.getsize(local_path),
100 |             "hash_md5": file_hash,
101 |             "status": "sucesso"
102 |         }
103 | 
104 |         nome_metadata = f"metadata_{target_date.strftime('%Y-%m-%d')}.json"
105 |         with open(self.metadata_dir / nome_metadata, "w", encoding="utf-8") as f:
106 |             json.dump(metadata, f, ensure_ascii=False, indent=2)
107 | 
108 | 
109 | if __name__ == "__main__":
110 |     extractor = Extractor()
111 |     try:
112 |         extractor.download(
113 |             start_date=datetime(2015, 1, 1),
114 |             end_date=datetime(2025, 6, 1)
115 |         )
116 |     except Exception as e:
117 |         print("Erro durante execução:", e)
118 | 


--------------------------------------------------------------------------------
/src/datapub/skeleton.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This is a skeleton file that can serve as a starting point for a Python
  3 | console script. To run this script uncomment the following lines in the
  4 | ``[options.entry_points]`` section in ``setup.cfg``::
  5 | 
  6 |     console_scripts =
  7 |          fibonacci = datapub.skeleton:run
  8 | 
  9 | Then run ``pip install .`` (or ``pip install -e .`` for editable mode)
 10 | which will install the command ``fibonacci`` inside your current environment.
 11 | 
 12 | Besides console scripts, the header (i.e. until ``_logger``...) of this file can
 13 | also be used as template for Python modules.
 14 | 
 15 | Note:
 16 |     This file can be renamed depending on your needs or safely removed if not needed.
 17 | 
 18 | References:
 19 |     - https://setuptools.pypa.io/en/latest/userguide/entry_point.html
 20 |     - https://pip.pypa.io/en/stable/reference/pip_install
 21 | """
 22 | 
 23 | import argparse
 24 | import logging
 25 | import sys
 26 | 
 27 | from datapub import __version__
 28 | 
 29 | __author__ = "a21ns1g4ts"
 30 | __copyright__ = "a21ns1g4ts"
 31 | __license__ = "MIT"
 32 | 
 33 | _logger = logging.getLogger(__name__)
 34 | 
 35 | 
 36 | # ---- Python API ----
 37 | # The functions defined in this section can be imported by users in their
 38 | # Python scripts/interactive interpreter, e.g. via
 39 | # `from datapub.skeleton import fib`,
 40 | # when using this Python module as a library.
 41 | 
 42 | 
 43 | def fib(n):
 44 |     """Fibonacci example function
 45 | 
 46 |     Args:
 47 |       n (int): integer
 48 | 
 49 |     Returns:
 50 |       int: n-th Fibonacci number
 51 |     """
 52 |     assert n > 0
 53 |     a, b = 1, 1
 54 |     for _i in range(n - 1):
 55 |         a, b = b, a + b
 56 |     return a
 57 | 
 58 | 
 59 | # ---- CLI ----
 60 | # The functions defined in this section are wrappers around the main Python
 61 | # API allowing them to be called directly from the terminal as a CLI
 62 | # executable/script.
 63 | 
 64 | 
 65 | def parse_args(args):
 66 |     """Parse command line parameters
 67 | 
 68 |     Args:
 69 |       args (List[str]): command line parameters as list of strings
 70 |           (for example  ``["--help"]``).
 71 | 
 72 |     Returns:
 73 |       :obj:`argparse.Namespace`: command line parameters namespace
 74 |     """
 75 |     parser = argparse.ArgumentParser(description="Just a Fibonacci demonstration")
 76 |     parser.add_argument(
 77 |         "--version",
 78 |         action="version",
 79 |         version=f"datapub {__version__}",
 80 |     )
 81 |     parser.add_argument(dest="n", help="n-th Fibonacci number", type=int, metavar="INT")
 82 |     parser.add_argument(
 83 |         "-v",
 84 |         "--verbose",
 85 |         dest="loglevel",
 86 |         help="set loglevel to INFO",
 87 |         action="store_const",
 88 |         const=logging.INFO,
 89 |     )
 90 |     parser.add_argument(
 91 |         "-vv",
 92 |         "--very-verbose",
 93 |         dest="loglevel",
 94 |         help="set loglevel to DEBUG",
 95 |         action="store_const",
 96 |         const=logging.DEBUG,
 97 |     )
 98 |     return parser.parse_args(args)
 99 | 
100 | 
101 | def setup_logging(loglevel):
102 |     """Setup basic logging
103 | 
104 |     Args:
105 |       loglevel (int): minimum loglevel for emitting messages
106 |     """
107 |     logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
108 |     logging.basicConfig(
109 |         level=loglevel, stream=sys.stdout, format=logformat, datefmt="%Y-%m-%d %H:%M:%S"
110 |     )
111 | 
112 | 
113 | def main(args):
114 |     """Wrapper allowing :func:`fib` to be called with string arguments in a CLI fashion
115 | 
116 |     Instead of returning the value from :func:`fib`, it prints the result to the
117 |     ``stdout`` in a nicely formatted message.
118 | 
119 |     Args:
120 |       args (List[str]): command line parameters as list of strings
121 |           (for example  ``["--verbose", "42"]``).
122 |     """
123 |     args = parse_args(args)
124 |     setup_logging(args.loglevel)
125 |     _logger.debug("Starting crazy calculations...")
126 |     print(f"The {args.n}-th Fibonacci number is {fib(args.n)}")
127 |     _logger.info("Script ends here")
128 | 
129 | 
130 | def run():
131 |     """Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv`
132 | 
133 |     This function can be used as entry point to create console scripts with setuptools.
134 |     """
135 |     main(sys.argv[1:])
136 | 
137 | 
138 | if __name__ == "__main__":
139 |     # ^  This is a guard statement that will prevent the following code from
140 |     #    being executed in the case someone imports this file instead of
141 |     #    executing it as a script.
142 |     #    https://docs.python.org/3/library/__main__.html
143 | 
144 |     # After installing your project with pip, users can also run your Python
145 |     # modules as scripts via the ``-m`` flag, as defined in PEP 338::
146 |     #
147 |     #     python -m datapub.skeleton 42
148 |     #
149 |     run()
150 | 


--------------------------------------------------------------------------------
/src/datapub/cli.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | CLI to run extractors for official government gazettes.
  4 | Adapted for PyScaffold project structure.
  5 | """
  6 | 
  7 | import argparse
  8 | import importlib
  9 | import sys
 10 | from datetime import datetime, date
 11 | from pathlib import Path
 12 | from datapub.shared.contracts.extractor_contract import ExtractorContract
 13 | from datapub.shared.utils.extractor_base import ExtractorBase
 14 | 
 15 | EXTRACTORS_PACKAGE = "datapub.entities"
 16 | 
 17 | def parse_date(date_str):
 18 |     """Parses a date string in YYYY-MM-DD format into a date object."""
 19 |     return datetime.strptime(date_str, "%Y-%m-%d").date()
 20 | 
 21 | def load_extractor(entity: str) -> ExtractorContract:
 22 |     """
 23 |     Dynamically loads and returns the Extractor class instance for a given entity.
 24 | 
 25 |     Args:
 26 |         entity (str): Name of the entity folder (e.g., 'al_go').
 27 | 
 28 |     Returns:
 29 |         ExtractorContract: An instance of the extractor class.
 30 | 
 31 |     Raises:
 32 |         ValueError: If the extractor module or class cannot be found.
 33 |         TypeError: If the loaded class does not inherit from ExtractorContract.
 34 |     """
 35 |     class_name = entity.upper().replace("_", "") + "Extractor"
 36 | 
 37 |     try:
 38 |         module = importlib.import_module(f"datapub.entities.{entity}.extractor")
 39 |     except ModuleNotFoundError as e:
 40 |         raise ValueError(f"Extractor module not found for entity '{entity}'") from e
 41 | 
 42 |     if not hasattr(module, class_name):
 43 |         raise AttributeError(f"The module '{entity}.extractor' must contain a class named '{class_name}'")
 44 | 
 45 |     extractor_cls = getattr(module, class_name)
 46 | 
 47 |     if not issubclass(extractor_cls, ExtractorContract):
 48 |         raise TypeError(f"Extractor class in '{entity}' must inherit from ExtractorContract")
 49 | 
 50 |     return extractor_cls()
 51 | 
 52 | def run_extractor(entity, args):
 53 |     """Initializes and runs the appropriate extractor with CLI arguments."""
 54 |     extractor = load_extractor(entity)
 55 | 
 56 |     params = {}
 57 | 
 58 |     if entity == "al_go":
 59 |         # ALE-GO extractor: uses start/end dates
 60 |         params["start_date"] = parse_date(args.start) if args.start else date(2007, 8, 1)
 61 |         params["end_date"] = parse_date(args.end) if args.end else date.today()
 62 |         print(f"🚀 Starting ALE-GO download from {params['start_date']} to {params['end_date']}")
 63 |     
 64 |     elif entity == "al_ms":
 65 |         # ALE-MS extractor: uses edition numbers
 66 |         params["start_num"] = int(args.start) if args.start else 1844
 67 |         params["end_num"] = int(args.end) if args.end else None
 68 |         print(f"🚀 Starting ALE-MS download from number {params['start_num']} to {'last available' if not params['end_num'] else params['end_num']}")
 69 | 
 70 |     elif entity == "al_pa":
 71 |         # ALE-PA extractor: uses start/end dates
 72 |         params["start_date"] = parse_date(args.start) if args.start else date(2021, 1, 1)
 73 |         params["end_date"] = parse_date(args.end) if args.end else date.today()
 74 |         print(f"🚀 Starting ALE-PA download from {params['start_date']} to {params['end_date']}")
 75 | 
 76 |     elif entity == "al_ce":
 77 |         # ALE-CE extractor: uses start/end dates
 78 |         params["start_date"] = parse_date(args.start) if args.start else date(2025, 5, 26)
 79 |         params["end_date"] = parse_date(args.end) if args.end else date.today()
 80 |         print(f"🚀 Starting ALE-CE download from {params['start_date']} to {params['end_date']}")
 81 | 
 82 |     elif entity == "al_ac":
 83 |         # ALE-AC extractor: uses start/end dates
 84 |         params["start_date"] = parse_date(args.start) if args.start else date(2015, 1, 1)
 85 |         params["end_date"] = parse_date(args.end) if args.end else date.today()
 86 |         print(f"🚀 Starting ALE-AC download from {params['start_date']} to {params['end_date']}")
 87 | 
 88 |     # Call the extractor's download method with collected parameters
 89 |     extractor.download(**params)
 90 | 
 91 | def main():
 92 |     """Entry point for CLI parsing and execution."""
 93 |     parser = argparse.ArgumentParser(description="Runner for official gazette extractors")
 94 |     subparsers = parser.add_subparsers(dest="entity", required=True)
 95 | 
 96 |     # Define subcommands and their arguments for each 'entity'
 97 |     parser_algo = subparsers.add_parser("al_go", help="ALE-GO gazettes")
 98 |     parser_algo.add_argument("--start")
 99 |     parser_algo.add_argument("--end")
100 | 
101 |     parser_alms = subparsers.add_parser("al_ms", help="ALE-MS gazettes")
102 |     parser_alms.add_argument("--start")
103 |     parser_alms.add_argument("--end")
104 | 
105 |     parser_alepa = subparsers.add_parser("al_pa", help="ALE-PA gazettes")
106 |     parser_alepa.add_argument("--start")
107 |     parser_alepa.add_argument("--end")
108 | 
109 |     parser_alece = subparsers.add_parser("al_ce", help="ALE-CE gazettes")
110 |     parser_alece.add_argument("--start")
111 |     parser_alece.add_argument("--end")
112 | 
113 |     parser_aleac = subparsers.add_parser("al_ac", help="ALE-AC gazettes")
114 |     parser_aleac.add_argument("--start")
115 |     parser_aleac.add_argument("--end")
116 | 
117 |     # Parse arguments
118 |     args = parser.parse_args()
119 | 
120 |     try:
121 |         run_extractor(args.entity, args)
122 |     except KeyboardInterrupt:
123 |         print("\n⏹️ Execution interrupted by user")
124 |         sys.exit(0)
125 |     except Exception as e:
126 |         print(f"❌ Error during execution: {e}", file=sys.stderr)
127 |         sys.exit(1)
128 | 
129 | # Main script execution guard
130 | if __name__ == "__main__":
131 |     main()
132 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_pa/extractor.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import os
  3 | import json
  4 | import hashlib
  5 | import random
  6 | from pathlib import Path
  7 | from datetime import datetime, timedelta, date
  8 | import re
  9 | import dateparser
 10 | import pdfplumber
 11 | 
 12 | import requests
 13 | from selenium import webdriver
 14 | from selenium.webdriver.common.by import By
 15 | from selenium.webdriver.support.ui import WebDriverWait
 16 | from selenium.webdriver.support import expected_conditions as EC
 17 | from selenium.webdriver.chrome.options import Options
 18 | import pyperclip
 19 | from selenium.webdriver import ActionChains
 20 | from selenium.webdriver.common.keys import Keys
 21 | from datapub.shared.utils.extractor_base import ExtractorBase
 22 | 
 23 | class ALPAExtractor(ExtractorBase):
 24 |     def __init__(self, base_dir="storage/raw/alpa", headless=True):
 25 |         super().__init__(entity="ALPA", base_dir=base_dir)
 26 | 
 27 |         self.base_url = "https://www.alepa.pa.gov.br/Comunicacao/Diarios"   
 28 | 
 29 |         chrome_options = Options()
 30 |         
 31 |         if self.headless:
 32 |             print("🚀 Headless mode enabled")
 33 |             chrome_options.add_argument("--headless=new")  # headless modo
 34 |             chrome_options.add_argument("--disable-gpu")
 35 |             chrome_options.add_argument("--no-sandbox")
 36 |         else:
 37 |             print("🚀 Headless mode disabled")
 38 | 
 39 |         chrome_options.add_argument("--log-level=3")
 40 |         self.driver = webdriver.Chrome(options=chrome_options)
 41 |         self.wait = WebDriverWait(self.driver, 20)
 42 | 
 43 |     def download(self, start_date, end_date):
 44 |         self.download_range(start_date, end_date)
 45 |         print("✅ Download concluído")
 46 | 
 47 |     def download_range(self, start_date, end_date):
 48 |         current_date = start_date
 49 |         while current_date <= end_date:
 50 |             try:
 51 |                 self._download_single(current_date)
 52 |             except Exception as e:
 53 |                 print(f"⚠️ Erro ao processar {current_date.strftime('%d/%m/%Y')}: {e}")
 54 |             current_date += timedelta(days=1)
 55 |             time.sleep(random.uniform(0, 0.2))
 56 | 
 57 |     def _download_single(self, day: datetime):
 58 |         day_str = day.isoformat() 
 59 | 
 60 |         for f in self.downloads_dir.iterdir():
 61 |             name = f.name
 62 | 
 63 |             if name == f"diario-alpa-{day_str}.pdf":
 64 |                 print(f"⏭️ Já existe (exato): {name}")
 65 |                 return
 66 | 
 67 |             match = re.match(r"diario-alpa-(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.pdf", name)
 68 |             if match:
 69 |                 start_str, end_str = match.groups()
 70 |                 try:
 71 |                     start_date = datetime.fromisoformat(start_str).date()
 72 |                     end_date = datetime.fromisoformat(end_str).date()
 73 |                     if start_date <= day <= end_date:
 74 |                         print(f"⏭️ Já existe (intervalo): {name}")
 75 |                         return
 76 |                 except ValueError                   :
 77 |                     continue 
 78 | 
 79 |         print(f"📅 Buscando: {day.strftime('%d/%m/%Y')}")
 80 |         self.driver.get(self.base_url)
 81 | 
 82 |         sleep_time = random.uniform(0.1, 0.3)
 83 |         time.sleep(sleep_time)
 84 | 
 85 |         try:
 86 |             date_str = day.strftime("%d/%m/%Y")
 87 | 
 88 |             input_field = self.wait.until(EC.presence_of_element_located((By.ID, "dateEdit_I")))
 89 |             input_field.clear()
 90 | 
 91 |             calendar_button = self.driver.find_element(By.ID, "dateEdit_B-1")
 92 |             calendar_button.click()
 93 | 
 94 |             time.sleep(random.uniform(0.1, 0.3))
 95 | 
 96 |             input_field.send_keys(date_str)
 97 | 
 98 |             ActionChains(self.driver).send_keys(Keys.TAB).perform()
 99 | 
100 |             time.sleep(random.uniform(0.1, 0.3))
101 | 
102 |             button = self.driver.find_elements(By.XPATH, "//button[contains(text(), 'Visualizar o arquivo')]")
103 |             if not button:
104 |                 print(f"⚠️ Nenhum diário para {day.strftime('%d/%m/%Y')}")
105 |                 return
106 | 
107 |             button[0].click()
108 |             time.sleep(4)
109 | 
110 |             if len(self.driver.window_handles) < 2:
111 |                 print(f"❌ PDF não abriu em nova aba para {day.strftime('%d/%m/%Y')}")
112 |                 return
113 | 
114 |             self.driver.switch_to.window(self.driver.window_handles[-1])
115 |             pdf_url = self.driver.current_url
116 |             self.driver.close()
117 |             self.driver.switch_to.window(self.driver.window_handles[0])
118 | 
119 |             self._download_pdf(pdf_url, day)
120 | 
121 |         except Exception as e:
122 |             print(f"❌ Erro em {day.strftime('%d/%m/%Y')}: {e}")
123 | 
124 |     def _download_pdf(self, url, day):
125 |         try:
126 |             response = requests.get(url, timeout=15)
127 |             if response.status_code == 200 and b"%PDF" in response.content[:10]:
128 |                 temp_filename = f"diario-alpa-{day.isoformat()}.pdf"
129 |                 temp_path = self.downloads_dir / temp_filename
130 | 
131 |                 with open(temp_path, "wb") as f:
132 |                     f.write(response.content)
133 | 
134 |                 path = temp_path  # Caminho final, pode ser alterado se houver range
135 |                 text = self._extract_text_from_pdf(temp_path)
136 | 
137 |                 if text:
138 |                     date_range = self._extract_date_range(text)
139 |                     if date_range[0] and date_range[1]:
140 |                         print(f"📋 Encontrado intervalo de datas: {date_range}")
141 |                         # Renomeia com intervalo de datas
142 |                         final_filename = f"diario-alpa-{date_range[0]}_{date_range[1]}.pdf"
143 |                         final_path = self.downloads_dir / final_filename
144 |                         temp_path.rename(final_path)
145 |                         path = final_path  # Atualiza o caminho final
146 | 
147 |                 file_hash = hashlib.md5(response.content).hexdigest()
148 |                 self._save_metadata(url, path, day, file_hash)
149 |                 print(f"✅ Salvo: {path.name} | Hash: {file_hash[:8]}")
150 | 
151 |             else:
152 |                 print(f"⚠️ Conteúdo não é PDF válido: {url}")
153 |         except Exception as e:
154 |             print(f"❌ Falha ao baixar PDF: {e}")
155 | 
156 | 
157 |     def _extract_text_from_pdf(self, pdf_path):
158 |         full_text = ""
159 |         with pdfplumber.open(pdf_path) as pdf:
160 |             # for page in pdf.pages:
161 |             #     full_text += page.extract_text() + "\n"
162 |             full_text += pdf.pages[0].extract_text()
163 |         return full_text
164 | 
165 |     def _extract_date_range(self, text):
166 |         patterns = [
167 |             # 29 de Janeiro a 05 de Fevereiro de 2021
168 |             r'(\d{1,2})\s*de\s*([a-zç]+)\s*a\s*(\d{1,2})\s*de\s*([a-zç]+)\s*de\s*(\d{4})',
169 |             
170 |             # 22 a 29 de Janeiro de 2021
171 |             r'(\d{1,2})\s*a\s*(\d{1,2})\s*de\s*([a-zç]+)\s*de\s*(\d{4})',
172 |         ]
173 | 
174 |         results = []
175 | 
176 |         for pattern in patterns:
177 |             for match in re.finditer(pattern, text.lower()):
178 |                 groups = match.groups()
179 | 
180 |                 try:
181 |                     if len(groups) == 5:
182 |                         # Ex: 29 de Janeiro a 05 de Fevereiro de 2021
183 |                         start_date = dateparser.parse(f"{groups[0]} de {groups[1]} de {groups[4]}", languages=['pt'])
184 |                         end_date = dateparser.parse(f"{groups[2]} de {groups[3]} de {groups[4]}", languages=['pt'])
185 |                     elif len(groups) == 4:
186 |                         # Ex: 22 a 29 de Janeiro de 2021
187 |                         start_date = dateparser.parse(f"{groups[0]} de {groups[2]} de {groups[3]}", languages=['pt'])
188 |                         end_date = dateparser.parse(f"{groups[1]} de {groups[2]} de {groups[3]}", languages=['pt'])
189 |                     else:
190 |                         continue
191 | 
192 |                     if start_date and end_date:
193 |                         results.append((start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')))
194 |                 except:
195 |                     continue
196 | 
197 |         return results[0]
198 | 
199 |     def _save_metadata(self, url, path, date, file_hash):
200 |         metadata = {
201 |             "orgao": "ALEPA",
202 |             "data_publicacao": date.isoformat(),
203 |             "url_origem": url,
204 |             "caminho_local": str(path),
205 |             "data_download": datetime.now().isoformat(),
206 |             "tamanho_bytes": os.path.getsize(path),
207 |             "hash_md5": file_hash,
208 |             "status": "sucesso"
209 |         }
210 |         
211 |         with open(self.metadata_dir / f"metadata_{date.isoformat()}.json", "w", encoding="utf-8") as f:
212 |             json.dump(metadata, f, ensure_ascii=False, indent=2)
213 | 
214 |     def close(self):
215 |         self.driver.quit()
216 | 
217 | 
218 | if __name__ == "__main__":
219 |     extractor = Extractor()
220 |     try:
221 |         start = datetime(2021, 1, 1)
222 |         end = datetime.today()
223 |         extractor.download_range(start, end)
224 |     finally:
225 |         extractor.close()
226 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # This file is execfile()d with the current directory set to its containing dir.
  2 | #
  3 | # This file only contains a selection of the most common options. For a full
  4 | # list see the documentation:
  5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
  6 | #
  7 | # All configuration values have a default; values that are commented out
  8 | # serve to show the default.
  9 | 
 10 | import os
 11 | import sys
 12 | import shutil
 13 | 
 14 | # -- Path setup --------------------------------------------------------------
 15 | 
 16 | __location__ = os.path.dirname(__file__)
 17 | 
 18 | # If extensions (or modules to document with autodoc) are in another directory,
 19 | # add these directories to sys.path here. If the directory is relative to the
 20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 21 | sys.path.insert(0, os.path.join(__location__, "../src"))
 22 | 
 23 | # -- Run sphinx-apidoc -------------------------------------------------------
 24 | # This hack is necessary since RTD does not issue `sphinx-apidoc` before running
 25 | # `sphinx-build -b html . _build/html`. See Issue:
 26 | # https://github.com/readthedocs/readthedocs.org/issues/1139
 27 | # DON'T FORGET: Check the box "Install your project inside a virtualenv using
 28 | # setup.py install" in the RTD Advanced Settings.
 29 | # Additionally it helps us to avoid running apidoc manually
 30 | 
 31 | try:  # for Sphinx >= 1.7
 32 |     from sphinx.ext import apidoc
 33 | except ImportError:
 34 |     from sphinx import apidoc
 35 | 
 36 | output_dir = os.path.join(__location__, "api")
 37 | module_dir = os.path.join(__location__, "../src/datapub")
 38 | try:
 39 |     shutil.rmtree(output_dir)
 40 | except FileNotFoundError:
 41 |     pass
 42 | 
 43 | try:
 44 |     import sphinx
 45 | 
 46 |     cmd_line = f"sphinx-apidoc --implicit-namespaces -f -o {output_dir} {module_dir}"
 47 | 
 48 |     args = cmd_line.split(" ")
 49 |     if tuple(sphinx.__version__.split(".")) >= ("1", "7"):
 50 |         # This is a rudimentary parse_version to avoid external dependencies
 51 |         args = args[1:]
 52 | 
 53 |     apidoc.main(args)
 54 | except Exception as e:
 55 |     print("Running `sphinx-apidoc` failed!\n{}".format(e))
 56 | 
 57 | # -- General configuration ---------------------------------------------------
 58 | 
 59 | # If your documentation needs a minimal Sphinx version, state it here.
 60 | # needs_sphinx = '1.0'
 61 | 
 62 | # Add any Sphinx extension module names here, as strings. They can be extensions
 63 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 64 | extensions = [
 65 |     "sphinx.ext.autodoc",
 66 |     "sphinx.ext.intersphinx",
 67 |     "sphinx.ext.todo",
 68 |     "sphinx.ext.autosummary",
 69 |     "sphinx.ext.viewcode",
 70 |     "sphinx.ext.coverage",
 71 |     "sphinx.ext.doctest",
 72 |     "sphinx.ext.ifconfig",
 73 |     "sphinx.ext.mathjax",
 74 |     "sphinx.ext.napoleon",
 75 | ]
 76 | 
 77 | # Add any paths that contain templates here, relative to this directory.
 78 | templates_path = ["_templates"]
 79 | 
 80 | # The suffix of source filenames.
 81 | source_suffix = ".rst"
 82 | 
 83 | # The encoding of source files.
 84 | # source_encoding = 'utf-8-sig'
 85 | 
 86 | # The master toctree document.
 87 | master_doc = "index"
 88 | 
 89 | # General information about the project.
 90 | project = "datapub"
 91 | copyright = "2025, a21ns1g4ts"
 92 | 
 93 | # The version info for the project you're documenting, acts as replacement for
 94 | # |version| and |release|, also used in various other places throughout the
 95 | # built documents.
 96 | #
 97 | # version: The short X.Y version.
 98 | # release: The full version, including alpha/beta/rc tags.
 99 | # If you don’t need the separation provided between version and release,
100 | # just set them both to the same value.
101 | try:
102 |     from datapub import __version__ as version
103 | except ImportError:
104 |     version = ""
105 | 
106 | if not version or version.lower() == "unknown":
107 |     version = os.getenv("READTHEDOCS_VERSION", "unknown")  # automatically set by RTD
108 | 
109 | release = version
110 | 
111 | # The language for content autogenerated by Sphinx. Refer to documentation
112 | # for a list of supported languages.
113 | # language = None
114 | 
115 | # There are two options for replacing |today|: either, you set today to some
116 | # non-false value, then it is used:
117 | # today = ''
118 | # Else, today_fmt is used as the format for a strftime call.
119 | # today_fmt = '%B %d, %Y'
120 | 
121 | # List of patterns, relative to source directory, that match files and
122 | # directories to ignore when looking for source files.
123 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", ".venv"]
124 | 
125 | # The reST default role (used for this markup: `text`) to use for all documents.
126 | # default_role = None
127 | 
128 | # If true, '()' will be appended to :func: etc. cross-reference text.
129 | # add_function_parentheses = True
130 | 
131 | # If true, the current module name will be prepended to all description
132 | # unit titles (such as .. function::).
133 | # add_module_names = True
134 | 
135 | # If true, sectionauthor and moduleauthor directives will be shown in the
136 | # output. They are ignored by default.
137 | # show_authors = False
138 | 
139 | # The name of the Pygments (syntax highlighting) style to use.
140 | pygments_style = "sphinx"
141 | 
142 | # A list of ignored prefixes for module index sorting.
143 | # modindex_common_prefix = []
144 | 
145 | # If true, keep warnings as "system message" paragraphs in the built documents.
146 | # keep_warnings = False
147 | 
148 | # If this is True, todo emits a warning for each TODO entries. The default is False.
149 | todo_emit_warnings = True
150 | 
151 | 
152 | # -- Options for HTML output -------------------------------------------------
153 | 
154 | # The theme to use for HTML and HTML Help pages.  See the documentation for
155 | # a list of builtin themes.
156 | html_theme = "alabaster"
157 | 
158 | # Theme options are theme-specific and customize the look and feel of a theme
159 | # further.  For a list of options available for each theme, see the
160 | # documentation.
161 | html_theme_options = {
162 |     "sidebar_width": "300px",
163 |     "page_width": "1200px"
164 | }
165 | 
166 | # Add any paths that contain custom themes here, relative to this directory.
167 | # html_theme_path = []
168 | 
169 | # The name for this set of Sphinx documents.  If None, it defaults to
170 | # "<project> v<release> documentation".
171 | # html_title = None
172 | 
173 | # A shorter title for the navigation bar.  Default is the same as html_title.
174 | # html_short_title = None
175 | 
176 | # The name of an image file (relative to this directory) to place at the top
177 | # of the sidebar.
178 | # html_logo = ""
179 | 
180 | # The name of an image file (within the static path) to use as favicon of the
181 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
182 | # pixels large.
183 | # html_favicon = None
184 | 
185 | # Add any paths that contain custom static files (such as style sheets) here,
186 | # relative to this directory. They are copied after the builtin static files,
187 | # so a file named "default.css" will overwrite the builtin "default.css".
188 | html_static_path = ["_static"]
189 | 
190 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
191 | # using the given strftime format.
192 | # html_last_updated_fmt = '%b %d, %Y'
193 | 
194 | # If true, SmartyPants will be used to convert quotes and dashes to
195 | # typographically correct entities.
196 | # html_use_smartypants = True
197 | 
198 | # Custom sidebar templates, maps document names to template names.
199 | # html_sidebars = {}
200 | 
201 | # Additional templates that should be rendered to pages, maps page names to
202 | # template names.
203 | # html_additional_pages = {}
204 | 
205 | # If false, no module index is generated.
206 | # html_domain_indices = True
207 | 
208 | # If false, no index is generated.
209 | # html_use_index = True
210 | 
211 | # If true, the index is split into individual pages for each letter.
212 | # html_split_index = False
213 | 
214 | # If true, links to the reST sources are added to the pages.
215 | # html_show_sourcelink = True
216 | 
217 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
218 | # html_show_sphinx = True
219 | 
220 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
221 | # html_show_copyright = True
222 | 
223 | # If true, an OpenSearch description file will be output, and all pages will
224 | # contain a <link> tag referring to it.  The value of this option must be the
225 | # base URL from which the finished HTML is served.
226 | # html_use_opensearch = ''
227 | 
228 | # This is the file name suffix for HTML files (e.g. ".xhtml").
229 | # html_file_suffix = None
230 | 
231 | # Output file base name for HTML help builder.
232 | htmlhelp_basename = "datapub-doc"
233 | 
234 | 
235 | # -- Options for LaTeX output ------------------------------------------------
236 | 
237 | latex_elements = {
238 |     # The paper size ("letterpaper" or "a4paper").
239 |     # "papersize": "letterpaper",
240 |     # The font size ("10pt", "11pt" or "12pt").
241 |     # "pointsize": "10pt",
242 |     # Additional stuff for the LaTeX preamble.
243 |     # "preamble": "",
244 | }
245 | 
246 | # Grouping the document tree into LaTeX files. List of tuples
247 | # (source start file, target name, title, author, documentclass [howto/manual]).
248 | latex_documents = [
249 |     ("index", "user_guide.tex", "datapub Documentation", "a21ns1g4ts", "manual")
250 | ]
251 | 
252 | # The name of an image file (relative to this directory) to place at the top of
253 | # the title page.
254 | # latex_logo = ""
255 | 
256 | # For "manual" documents, if this is true, then toplevel headings are parts,
257 | # not chapters.
258 | # latex_use_parts = False
259 | 
260 | # If true, show page references after internal links.
261 | # latex_show_pagerefs = False
262 | 
263 | # If true, show URL addresses after external links.
264 | # latex_show_urls = False
265 | 
266 | # Documents to append as an appendix to all manuals.
267 | # latex_appendices = []
268 | 
269 | # If false, no module index is generated.
270 | # latex_domain_indices = True
271 | 
272 | # -- External mapping --------------------------------------------------------
273 | python_version = ".".join(map(str, sys.version_info[0:2]))
274 | intersphinx_mapping = {
275 |     "sphinx": ("https://www.sphinx-doc.org/en/master", None),
276 |     "python": ("https://docs.python.org/" + python_version, None),
277 |     "matplotlib": ("https://matplotlib.org", None),
278 |     "numpy": ("https://numpy.org/doc/stable", None),
279 |     "sklearn": ("https://scikit-learn.org/stable", None),
280 |     "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
281 |     "scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
282 |     "setuptools": ("https://setuptools.pypa.io/en/stable/", None),
283 |     "pyscaffold": ("https://pyscaffold.org/en/stable", None),
284 | }
285 | 
286 | print(f"loading configurations for {project} {version} ...", file=sys.stderr)
287 | 


--------------------------------------------------------------------------------
/src/datapub/entities/al_ms/extractor.py:
--------------------------------------------------------------------------------
  1 | from selenium import webdriver
  2 | from selenium.webdriver.common.by import By
  3 | from selenium.webdriver.support.ui import WebDriverWait
  4 | from selenium.webdriver.support import expected_conditions as EC
  5 | from selenium.webdriver.chrome.service import Service
  6 | from webdriver_manager.chrome import ChromeDriverManager
  7 | import time
  8 | import os
  9 | import hashlib
 10 | import json
 11 | from pathlib import Path
 12 | from datetime import datetime
 13 | from pathlib import Path
 14 | from datetime import datetime, timedelta
 15 | 
 16 | class Extractor:
 17 |     def __init__(self, base_dir="storage/raw/alms", headless=True):
 18 |         self.base_dir = Path(base_dir)
 19 |         self.downloads_dir = (self.base_dir / "downloads").resolve()
 20 |         self.metadata_dir = self.base_dir / "metadata"
 21 |         self.logs_dir = self.base_dir / "logs"
 22 |         
 23 |         self.headless = headless
 24 |         
 25 |         # Cria os diretórios se não existirem
 26 |         # self.downloads_dir.mkdir(parents=True, exist_ok=True)
 27 |         os.makedirs(self.downloads_dir, exist_ok=True)
 28 |         self.metadata_dir.mkdir(parents=True, exist_ok=True)
 29 |         self.logs_dir.mkdir(parents=True, exist_ok=True)
 30 |         
 31 |         # Configurações do navegador
 32 |         self._setup_driver()
 33 |         
 34 |         # Configurações da busca
 35 |         self.start_number = 1  # Número inicial do diário
 36 |         self.max_attempts = 2  # Tentativas por diário
 37 |         self.delay_between = 1  # Delay entre requisições
 38 |         self.max_consecutive_failures = 5  # Limite de falhas consecutivas
 39 |     
 40 |     def _setup_driver(self):
 41 |         """Configura o WebDriver do Chrome"""
 42 |         chrome_options = webdriver.ChromeOptions()
 43 |         
 44 |         # Configurações de download
 45 |         chrome_options.add_experimental_option("prefs", {
 46 |             "download.default_directory": str(self.downloads_dir),
 47 |             "download.prompt_for_download": False,
 48 |             "plugins.always_open_pdf_externally": True,
 49 |             "download.directory_upgrade": True,
 50 |         })
 51 |         
 52 |         if self.headless:
 53 |             chrome_options.add_argument("--headless=new")  # headless modo
 54 |             chrome_options.add_argument("--disable-gpu")
 55 |             chrome_options.add_argument("--no-sandbox")
 56 |         
 57 |         # Configura o serviço do Chrome
 58 |         service = Service(ChromeDriverManager().install())
 59 |         
 60 |         # Inicia o navegador
 61 |         self.driver = webdriver.Chrome(service=service, options=chrome_options)
 62 |         self.wait = WebDriverWait(self.driver, 15)
 63 | 
 64 |     def download(self, start_num=None, end_num=None):
 65 |         """Baixa diários em um intervalo numérico"""
 66 |         self.download_range(start_num, end_num)
 67 |         print("✅ Download concluido")
 68 |     
 69 |     def download_range(self, start_num=None, end_num=None):
 70 |         """Baixa diários em um intervalo numérico"""
 71 |         current_num = start_num or self.start_number
 72 |         end_num = end_num or float('inf')
 73 |         consecutive_failures = 0
 74 |         
 75 |         self._log_start()
 76 |         
 77 |         try:
 78 |             self.driver.get("https://diariooficial.al.ms.gov.br/")
 79 |             
 80 |             while current_num <= end_num:
 81 |                 num_str = str(current_num).zfill(4)
 82 |                 success = self._process_diario(num_str)
 83 |                 
 84 |                 if not success:
 85 |                     consecutive_failures += 1
 86 |                     if consecutive_failures >= self.max_consecutive_failures:
 87 |                         print(f"🚧 {self.max_consecutive_failures} falhas consecutivas, encerrando...")
 88 |                         break
 89 |                 else:
 90 |                     consecutive_failures = 0
 91 |                 
 92 |                 current_num += 1
 93 |                 time.sleep(self.delay_between)
 94 |         
 95 |         finally:
 96 |             self._cleanup()
 97 |     
 98 |     def _process_diario(self, num_str):
 99 |         """Processa um diário específico"""
100 |         for attempt in range(self.max_attempts):
101 |             try:
102 |                 print(f"🔍 Tentando Diário nº {num_str} (tentativa {attempt + 1})")
103 |                 
104 |                 # Preenche e submete o formulário de busca
105 |                 input_field = self.wait.until(EC.presence_of_element_located((By.ID, "pesquisa")))
106 |                 input_field.clear()
107 |                 input_field.send_keys(num_str)
108 |                 
109 |                 search_btn = self.driver.find_element(By.ID, "filtro")
110 |                 search_btn.click()
111 | 
112 |                 # Aguarda a tabela de resultados atualizar
113 |                 time.sleep(1)
114 | 
115 |                 # Tenta localizar a tabela de resultados
116 |                 try:
117 |                     tabela = self.driver.find_element(By.CSS_SELECTOR, "table.table")
118 |                     linhas = tabela.find_elements(By.TAG_NAME, "tr")
119 | 
120 |                     if len(linhas) > 1:
121 |                         # Percorre todas as linhas a partir da segunda, tentando achar o link de download
122 |                         for linha in linhas[1:]:
123 |                             links = linha.find_elements(By.TAG_NAME, "a")
124 |                             if not links:
125 |                                 continue  # pula se não tiver link
126 | 
127 |                             link_download = links[-1]  # pega o último link (download)
128 |                             href = link_download.get_attribute("href")
129 |                             if href:
130 |                                 print(f"📥 Baixando PDF: {href}")
131 |                                 if self._download_pdf(num_str, href):
132 |                                     return True  # sucesso, retorna True
133 |                                 else:
134 |                                     print(f"⚠️ Falha no download do PDF para o número {num_str}")
135 |                             else:
136 |                                 print(f"⚠️ Link de download vazio para o número {num_str}")
137 |                         # Se não baixou nenhum, retorna False
138 |                         print(f"🚫 Nenhum PDF baixado para o número {num_str}")
139 |                         return False
140 |                     else:
141 |                         print(f"🚫 Diário nº {num_str} não encontrado.")
142 |                         return False
143 | 
144 |                 except Exception as e:
145 |                     print(f"❌ Nenhuma tabela encontrada para nº {num_str}: {str(e)}")
146 |                     return False
147 |                     
148 |             except Exception as e:
149 |                 print(f"⚠️ Erro na tentativa {attempt + 1} para {num_str}: {str(e)}")
150 |                 if attempt == self.max_attempts - 1:
151 |                     self._log_error(num_str, str(e))
152 |                 time.sleep(1)
153 |         
154 |         return False
155 |     
156 |     def _download_pdf(self, num_str, pdf_url):
157 |         """Baixa e processa um PDF individual com tratamento robusto de erros"""
158 |         try:
159 |             print(f"⏳ Iniciando download do Diário {num_str}...")
160 | 
161 |             # Abre o PDF em nova aba
162 |             self.driver.execute_script("window.open(arguments[0], '_blank');", pdf_url)
163 | 
164 |             # Aguarda o download com timeout
165 |             max_wait_time = 30  # segundos
166 |             start_time = time.time()
167 |             extracted = False
168 |             initial_files = set(self.downloads_dir.glob("*.pdf"))
169 | 
170 |             while (time.time() - start_time) < max_wait_time:
171 |                 current_files = set(f for f in self.downloads_dir.glob("*.pdf") if not f.name.endswith('.crdownload'))
172 |                 new_files = current_files - initial_files
173 | 
174 |                 if new_files:
175 |                     # Assume o arquivo mais recente como o novo
176 |                     latest_file = max(new_files, key=lambda f: f.stat().st_mtime)
177 | 
178 |                     # Verifica se o download está completo (tamanho estável por 2s)
179 |                     initial_size = latest_file.stat().st_size
180 |                     time.sleep(2)
181 |                     if latest_file.stat().st_size == initial_size:
182 |                         extracted = True
183 |                         break
184 |                 time.sleep(1)
185 | 
186 |             if not extracted:
187 |                 print(f"❌ Timeout ao baixar Diário {num_str}")
188 |                 self._log_error(num_str, "Timeout no download")
189 |                 return False
190 | 
191 |             # Verifica se é um PDF válido
192 |             try:
193 |                 with latest_file.open('rb') as f:
194 |                     if not f.read(4).startswith(b'%PDF'):
195 |                         print(f"❌ Arquivo inválido para Diário {num_str}")
196 |                         latest_file.unlink(missing_ok=True)
197 |                         self._log_error(num_str, "Arquivo PDF inválido")
198 |                         return False
199 |             except Exception as e:
200 |                 print(f"❌ Erro ao verificar PDF do Diário {num_str}: {str(e)}")
201 |                 self._log_error(num_str, f"Erro de leitura do arquivo: {str(e)}")
202 |                 return False
203 | 
204 |             # Gera nome único para o arquivo
205 |             try:
206 |                 timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
207 |                 new_filename = f"diario-alms-{num_str}_{timestamp}.pdf"
208 |                 new_path = self.downloads_dir / new_filename
209 | 
210 |                 if new_path.exists():
211 |                     new_filename = f"diario-alms-{num_str}_{timestamp}_{hashlib.md5(str(time.time()).encode()).hexdigest()[:4]}.pdf"
212 |                     new_path = self.downloads_dir / new_filename
213 | 
214 |                 latest_file.rename(new_path)
215 |             except Exception as e:
216 |                 print(f"❌ Erro ao renomear arquivo do Diário {num_str}: {str(e)}")
217 |                 self._log_error(num_str, f"Erro ao renomear arquivo: {str(e)}")
218 |                 return False
219 | 
220 |             # Salva metadados
221 |             try:
222 |                 file_size = new_path.stat().st_size
223 |                 if file_size == 0:
224 |                     print(f"❌ Arquivo vazio para Diário {num_str}")
225 |                     new_path.unlink(missing_ok=True)
226 |                     return False
227 | 
228 |                 self._save_metadata(num_str, pdf_url, new_path)
229 |                 print(f"✅ Diário {num_str} baixado com sucesso ({file_size/1024:.2f} KB): {new_filename}")
230 |                 return True
231 | 
232 |             except Exception as e:
233 |                 print(f"❌ Erro ao salvar metadados do Diário {num_str}: {str(e)}")
234 |                 new_path.unlink(missing_ok=True)
235 |                 return False
236 | 
237 |         except Exception as e:
238 |             print(f"❌ Erro ao baixar Diário {num_str}: {str(e)}")
239 |             self._log_error(num_str, str(e))
240 |             return False
241 |   
242 |     def _save_metadata(self, num_str, url, filepath):
243 |         """Salva metadados do download"""
244 |         metadata = {
245 |             "orgao": "MS",
246 |             "numero_diario": num_str,
247 |             "url_origem": url,
248 |             "caminho_local": str(filepath),
249 |             "data_download": datetime.now().isoformat(),
250 |             "tamanho_bytes": os.path.getsize(filepath),
251 |             "hash_md5": self._calculate_hash(filepath),
252 |             "status": "sucesso"
253 |         }
254 |         
255 |         metadata_path = self.metadata_dir / f"metadata_{num_str}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
256 |         with open(metadata_path, 'w', encoding='utf-8') as f:
257 |             json.dump(metadata, f, ensure_ascii=False, indent=2)
258 |     
259 |     def _calculate_hash(self, filepath):
260 |         """Calcula hash MD5 do arquivo"""
261 |         hash_md5 = hashlib.md5()
262 |         with open(filepath, "rb") as f:
263 |             for chunk in iter(lambda: f.read(4096), b""):
264 |                 hash_md5.update(chunk)
265 |         return hash_md5.hexdigest()
266 |     
267 |     def _log_start(self):
268 |         """Registra início do processo"""
269 |         log_entry = {
270 |             "inicio": datetime.now().isoformat(),
271 |             "start_number": self.start_number,
272 |             "config": {
273 |                 "max_attempts": self.max_attempts,
274 |                 "delay": self.delay_between,
275 |                 "max_consecutive_failures": self.max_consecutive_failures
276 |             }
277 |         }
278 |         
279 |         log_path = self.logs_dir / f"execucao_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
280 |         with open(log_path, 'w', encoding='utf-8') as f:
281 |             json.dump(log_entry, f, ensure_ascii=False, indent=2)
282 |     
283 |     def _log_error(self, num_str, error_msg):
284 |         """Registras erros ocorridos"""
285 |         error_log = {
286 |             "numero_diario": num_str,
287 |             "data_hora": datetime.now().isoformat(),
288 |             "erro": error_msg
289 |         }
290 |         
291 |         error_path = self.logs_dir / "erros.json"
292 |         with open(error_path, 'a', encoding='utf-8') as f:
293 |             f.write(json.dumps(error_log, ensure_ascii=False) + "\n")
294 |     
295 |     def _cleanup(self):
296 |         """Finaliza o navegador"""
297 |         if hasattr(self, 'driver'):
298 |             self.driver.quit()
299 | 
300 |     def get_recent_complete_pdfs(downloads_dir: Path, since: float = 60.0) -> set[Path]:
301 |         """Retorna arquivos PDF completos, excluindo temporários e antigos."""
302 |         now = datetime.now()
303 |         pdfs = set()
304 | 
305 |         for file in downloads_dir.iterdir():
306 |             if not file.name.lower().endswith(".pdf"):
307 |                 continue
308 |             if file.name.endswith(".crdownload") or file.name.startswith("~"):
309 |                 continue  # arquivo ainda em download ou temporário
310 |             try:
311 |                 mtime = datetime.fromtimestamp(file.stat().st_mtime)
312 |                 if (now - mtime).total_seconds() <= since:
313 |                     pdfs.add(file)
314 |             except Exception:
315 |                 continue  # em caso de falha ao acessar stats
316 | 
317 |         return pdfs
318 | 
319 | if __name__ == "__main__":
320 |     extractor = Extractor()
321 |     extractor.download_range()
322 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
  1 | .. todo:: THIS IS SUPPOSED TO BE AN EXAMPLE. MODIFY IT ACCORDING TO YOUR NEEDS!
  2 | 
  3 |    The document assumes you are using a source repository service that promotes a
  4 |    contribution model similar to `GitHub's fork and pull request workflow`_.
  5 |    While this is true for the majority of services (like GitHub, GitLab,
  6 |    BitBucket), it might not be the case for private repositories (e.g., when
  7 |    using Gerrit).
  8 | 
  9 |    Also notice that the code examples might refer to GitHub URLs or the text
 10 |    might use GitHub specific terminology (e.g., *Pull Request* instead of *Merge
 11 |    Request*).
 12 | 
 13 |    Please make sure to check the document having these assumptions in mind
 14 |    and update things accordingly.
 15 | 
 16 | .. todo:: Provide the correct links/replacements at the bottom of the document.
 17 | 
 18 | .. todo:: You might want to have a look on `PyScaffold's contributor's guide`_,
 19 | 
 20 |    especially if your project is open source. The text should be very similar to
 21 |    this template, but there are a few extra contents that you might decide to
 22 |    also include, like mentioning labels of your issue tracker or automated
 23 |    releases.
 24 | 
 25 | 
 26 | ============
 27 | Contributing
 28 | ============
 29 | 
 30 | Welcome to ``datapub`` contributor's guide.
 31 | 
 32 | This document focuses on getting any potential contributor familiarized
 33 | with the development processes, but `other kinds of contributions`_ are also
 34 | appreciated.
 35 | 
 36 | If you are new to using git_ or have never collaborated in a project previously,
 37 | please have a look at `contribution-guide.org`_. Other resources are also
 38 | listed in the excellent `guide created by FreeCodeCamp`_ [#contrib1]_.
 39 | 
 40 | Please notice, all users and contributors are expected to be **open,
 41 | considerate, reasonable, and respectful**. When in doubt, `Python Software
 42 | Foundation's Code of Conduct`_ is a good reference in terms of behavior
 43 | guidelines.
 44 | 
 45 | 
 46 | Issue Reports
 47 | =============
 48 | 
 49 | If you experience bugs or general issues with ``datapub``, please have a look
 50 | on the `issue tracker`_. If you don't see anything useful there, please feel
 51 | free to fire an issue report.
 52 | 
 53 | .. tip::
 54 |    Please don't forget to include the closed issues in your search.
 55 |    Sometimes a solution was already reported, and the problem is considered
 56 |    **solved**.
 57 | 
 58 | New issue reports should include information about your programming environment
 59 | (e.g., operating system, Python version) and steps to reproduce the problem.
 60 | Please try also to simplify the reproduction steps to a very minimal example
 61 | that still illustrates the problem you are facing. By removing other factors,
 62 | you help us to identify the root cause of the issue.
 63 | 
 64 | 
 65 | Documentation Improvements
 66 | ==========================
 67 | 
 68 | You can help improve ``datapub`` docs by making them more readable and coherent, or
 69 | by adding missing information and correcting mistakes.
 70 | 
 71 | ``datapub`` documentation uses Sphinx_ as its main documentation compiler.
 72 | This means that the docs are kept in the same repository as the project code, and
 73 | that any documentation update is done in the same way was a code contribution.
 74 | 
 75 | .. todo:: Don't forget to mention which markup language you are using.
 76 | 
 77 |     e.g.,  reStructuredText_ or CommonMark_ with MyST_ extensions.
 78 | 
 79 | .. todo:: If your project is hosted on GitHub, you can also mention the following tip:
 80 | 
 81 |    .. tip::
 82 |       Please notice that the `GitHub web interface`_ provides a quick way of
 83 |       propose changes in ``datapub``'s files. While this mechanism can
 84 |       be tricky for normal code contributions, it works perfectly fine for
 85 |       contributing to the docs, and can be quite handy.
 86 | 
 87 |       If you are interested in trying this method out, please navigate to
 88 |       the ``docs`` folder in the source repository_, find which file you
 89 |       would like to propose changes and click in the little pencil icon at the
 90 |       top, to open `GitHub's code editor`_. Once you finish editing the file,
 91 |       please write a message in the form at the bottom of the page describing
 92 |       which changes have you made and what are the motivations behind them and
 93 |       submit your proposal.
 94 | 
 95 | When working on documentation changes in your local machine, you can
 96 | compile them using |tox|_::
 97 | 
 98 |     tox -e docs
 99 | 
100 | and use Python's built-in web server for a preview in your web browser
101 | (``http://localhost:8000``)::
102 | 
103 |     python3 -m http.server --directory 'docs/_build/html'
104 | 
105 | 
106 | Code Contributions
107 | ==================
108 | 
109 | .. todo:: Please include a reference or explanation about the internals of the project.
110 | 
111 |    An architecture description, design principles or at least a summary of the
112 |    main concepts will make it easy for potential contributors to get started
113 |    quickly.
114 | 
115 | Submit an issue
116 | ---------------
117 | 
118 | Before you work on any non-trivial code contribution it's best to first create
119 | a report in the `issue tracker`_ to start a discussion on the subject.
120 | This often provides additional considerations and avoids unnecessary work.
121 | 
122 | Create an environment
123 | ---------------------
124 | 
125 | Before you start coding, we recommend creating an isolated `virtual
126 | environment`_ to avoid any problems with your installed Python packages.
127 | This can easily be done via either |virtualenv|_::
128 | 
129 |     virtualenv <PATH TO VENV>
130 |     source <PATH TO VENV>/bin/activate
131 | 
132 | or Miniconda_::
133 | 
134 |     conda create -n datapub python=3 six virtualenv pytest pytest-cov
135 |     conda activate datapub
136 | 
137 | Clone the repository
138 | --------------------
139 | 
140 | #. Create an user account on |the repository service| if you do not already have one.
141 | #. Fork the project repository_: click on the *Fork* button near the top of the
142 |    page. This creates a copy of the code under your account on |the repository service|.
143 | #. Clone this copy to your local disk::
144 | 
145 |     git clone git@github.com:YourLogin/datapub.git
146 |     cd datapub
147 | 
148 | #. You should run::
149 | 
150 |     pip install -U pip setuptools -e .
151 | 
152 |    to be able to import the package under development in the Python REPL.
153 | 
154 |    .. todo:: if you are not using pre-commit, please remove the following item:
155 | 
156 | #. Install |pre-commit|_::
157 | 
158 |     pip install pre-commit
159 |     pre-commit install
160 | 
161 |    ``datapub`` comes with a lot of hooks configured to automatically help the
162 |    developer to check the code being written.
163 | 
164 | Implement your changes
165 | ----------------------
166 | 
167 | #. Create a branch to hold your changes::
168 | 
169 |     git checkout -b my-feature
170 | 
171 |    and start making changes. Never work on the main branch!
172 | 
173 | #. Start your work on this branch. Don't forget to add docstrings_ to new
174 |    functions, modules and classes, especially if they are part of public APIs.
175 | 
176 | #. Add yourself to the list of contributors in ``AUTHORS.rst``.
177 | 
178 | #. When you’re done editing, do::
179 | 
180 |     git add <MODIFIED FILES>
181 |     git commit
182 | 
183 |    to record your changes in git_.
184 | 
185 |    .. todo:: if you are not using pre-commit, please remove the following item:
186 | 
187 |    Please make sure to see the validation messages from |pre-commit|_ and fix
188 |    any eventual issues.
189 |    This should automatically use flake8_/black_ to check/fix the code style
190 |    in a way that is compatible with the project.
191 | 
192 |    .. important:: Don't forget to add unit tests and documentation in case your
193 |       contribution adds an additional feature and is not just a bugfix.
194 | 
195 |       Moreover, writing a `descriptive commit message`_ is highly recommended.
196 |       In case of doubt, you can check the commit history with::
197 | 
198 |          git log --graph --decorate --pretty=oneline --abbrev-commit --all
199 | 
200 |       to look for recurring communication patterns.
201 | 
202 | #. Please check that your changes don't break any unit tests with::
203 | 
204 |     tox
205 | 
206 |    (after having installed |tox|_ with ``pip install tox`` or ``pipx``).
207 | 
208 |    You can also use |tox|_ to run several other pre-configured tasks in the
209 |    repository. Try ``tox -av`` to see a list of the available checks.
210 | 
211 | Submit your contribution
212 | ------------------------
213 | 
214 | #. If everything works fine, push your local branch to |the repository service| with::
215 | 
216 |     git push -u origin my-feature
217 | 
218 | #. Go to the web page of your fork and click |contribute button|
219 |    to send your changes for review.
220 | 
221 |    .. todo:: if you are using GitHub, you can uncomment the following paragraph
222 | 
223 |       Find more detailed information in `creating a PR`_. You might also want to open
224 |       the PR as a draft first and mark it as ready for review after the feedbacks
225 |       from the continuous integration (CI) system or any required fixes.
226 | 
227 | 
228 | Troubleshooting
229 | ---------------
230 | 
231 | The following tips can be used when facing problems to build or test the
232 | package:
233 | 
234 | #. Make sure to fetch all the tags from the upstream repository_.
235 |    The command ``git describe --abbrev=0 --tags`` should return the version you
236 |    are expecting. If you are trying to run CI scripts in a fork repository,
237 |    make sure to push all the tags.
238 |    You can also try to remove all the egg files or the complete egg folder, i.e.,
239 |    ``.eggs``, as well as the ``*.egg-info`` folders in the ``src`` folder or
240 |    potentially in the root of your project.
241 | 
242 | #. Sometimes |tox|_ misses out when new dependencies are added, especially to
243 |    ``setup.cfg`` and ``docs/requirements.txt``. If you find any problems with
244 |    missing dependencies when running a command with |tox|_, try to recreate the
245 |    ``tox`` environment using the ``-r`` flag. For example, instead of::
246 | 
247 |     tox -e docs
248 | 
249 |    Try running::
250 | 
251 |     tox -r -e docs
252 | 
253 | #. Make sure to have a reliable |tox|_ installation that uses the correct
254 |    Python version (e.g., 3.7+). When in doubt you can run::
255 | 
256 |     tox --version
257 |     # OR
258 |     which tox
259 | 
260 |    If you have trouble and are seeing weird errors upon running |tox|_, you can
261 |    also try to create a dedicated `virtual environment`_ with a |tox|_ binary
262 |    freshly installed. For example::
263 | 
264 |     virtualenv .venv
265 |     source .venv/bin/activate
266 |     .venv/bin/pip install tox
267 |     .venv/bin/tox -e all
268 | 
269 | #. `Pytest can drop you`_ in an interactive session in the case an error occurs.
270 |    In order to do that you need to pass a ``--pdb`` option (for example by
271 |    running ``tox -- -k <NAME OF THE FALLING TEST> --pdb``).
272 |    You can also setup breakpoints manually instead of using the ``--pdb`` option.
273 | 
274 | 
275 | Maintainer tasks
276 | ================
277 | 
278 | Releases
279 | --------
280 | 
281 | .. todo:: This section assumes you are using PyPI to publicly release your package.
282 | 
283 |    If instead you are using a different/private package index, please update
284 |    the instructions accordingly.
285 | 
286 | If you are part of the group of maintainers and have correct user permissions
287 | on PyPI_, the following steps can be used to release a new version for
288 | ``datapub``:
289 | 
290 | #. Make sure all unit tests are successful.
291 | #. Tag the current commit on the main branch with a release tag, e.g., ``v1.2.3``.
292 | #. Push the new tag to the upstream repository_, e.g., ``git push upstream v1.2.3``
293 | #. Clean up the ``dist`` and ``build`` folders with ``tox -e clean``
294 |    (or ``rm -rf dist build``)
295 |    to avoid confusion with old builds and Sphinx docs.
296 | #. Run ``tox -e build`` and check that the files in ``dist`` have
297 |    the correct version (no ``.dirty`` or git_ hash) according to the git_ tag.
298 |    Also check the sizes of the distributions, if they are too big (e.g., >
299 |    500KB), unwanted clutter may have been accidentally included.
300 | #. Run ``tox -e publish -- --repository pypi`` and check that everything was
301 |    uploaded to PyPI_ correctly.
302 | 
303 | 
304 | 
305 | .. [#contrib1] Even though, these resources focus on open source projects and
306 |    communities, the general ideas behind collaborating with other developers
307 |    to collectively create software are general and can be applied to all sorts
308 |    of environments, including private companies and proprietary code bases.
309 | 
310 | 
311 | .. <-- start -->
312 | .. todo:: Please review and change the following definitions:
313 | 
314 | .. |the repository service| replace:: GitHub
315 | .. |contribute button| replace:: "Create pull request"
316 | 
317 | .. _repository: https://github.com/<USERNAME>/datapub
318 | .. _issue tracker: https://github.com/<USERNAME>/datapub/issues
319 | .. <-- end -->
320 | 
321 | 
322 | .. |virtualenv| replace:: ``virtualenv``
323 | .. |pre-commit| replace:: ``pre-commit``
324 | .. |tox| replace:: ``tox``
325 | 
326 | 
327 | .. _black: https://pypi.org/project/black/
328 | .. _CommonMark: https://commonmark.org/
329 | .. _contribution-guide.org: https://www.contribution-guide.org/
330 | .. _creating a PR: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request
331 | .. _descriptive commit message: https://chris.beams.io/posts/git-commit
332 | .. _docstrings: https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html
333 | .. _first-contributions tutorial: https://github.com/firstcontributions/first-contributions
334 | .. _flake8: https://flake8.pycqa.org/en/stable/
335 | .. _git: https://git-scm.com
336 | .. _GitHub's fork and pull request workflow: https://guides.github.com/activities/forking/
337 | .. _guide created by FreeCodeCamp: https://github.com/FreeCodeCamp/how-to-contribute-to-open-source
338 | .. _Miniconda: https://docs.conda.io/en/latest/miniconda.html
339 | .. _MyST: https://myst-parser.readthedocs.io/en/latest/syntax/syntax.html
340 | .. _other kinds of contributions: https://opensource.guide/how-to-contribute
341 | .. _pre-commit: https://pre-commit.com/
342 | .. _PyPI: https://pypi.org/
343 | .. _PyScaffold's contributor's guide: https://pyscaffold.org/en/stable/contributing.html
344 | .. _Pytest can drop you: https://docs.pytest.org/en/stable/how-to/failures.html#using-python-library-pdb-with-pytest
345 | .. _Python Software Foundation's Code of Conduct: https://www.python.org/psf/conduct/
346 | .. _reStructuredText: https://www.sphinx-doc.org/en/master/usage/restructuredtext/
347 | .. _Sphinx: https://www.sphinx-doc.org/en/master/
348 | .. _tox: https://tox.wiki/en/stable/
349 | .. _virtual environment: https://realpython.com/python-virtual-environments-a-primer/
350 | .. _virtualenv: https://virtualenv.pypa.io/en/stable/
351 | 
352 | .. _GitHub web interface: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files
353 | .. _GitHub's code editor: https://docs.github.com/en/repositories/working-with-files/managing-files/editing-files
354 | 


--------------------------------------------------------------------------------
/sources.json:
--------------------------------------------------------------------------------
   1 | [
   2 |   {
   3 |     "nome": "Assembleia Legislativa do Estado de Goiás",
   4 |     "url": "assembleia_legislativa_do_estado_de_goias_al-go",
   5 |     "sigla": "al_go",
   6 |     "extracted": false,
   7 |     "processed": false,
   8 |     "structured": false
   9 |   },
  10 |   {
  11 |     "nome": "Assembleia Legislativa do Estado de Mato Grosso do Sul",
  12 |     "url": "assembleia_legislativa_do_estado_de_mato_grosso_do_sul_al-ms",
  13 |     "sigla": "al_ms",
  14 |     "extracted": false,
  15 |     "processed": false,
  16 |     "structured": false
  17 |   },
  18 |   {
  19 |     "nome": "Assembleia Legislativa do Estado de Minas Gerais",
  20 |     "url": "assembleia_legislativa_do_estado_de_minas_gerais_al-mg",
  21 |     "sigla": "al_mg",
  22 |     "extracted": false,
  23 |     "processed": false,
  24 |     "structured": false
  25 |   },
  26 |   {
  27 |     "nome": "Assembleia Legislativa do Estado de Pernambuco",
  28 |     "url": "assembleia_legislativa_do_estado_de_pernambuco_al-pe",
  29 |     "sigla": "al_pe",
  30 |     "extracted": false,
  31 |     "processed": false,
  32 |     "structured": false
  33 |   },
  34 |   {
  35 |     "nome": "Assembleia Legislativa do Estado de Roraima",
  36 |     "url": "assembleia_legislativa_do_estado_de_roraima_al-rr",
  37 |     "sigla": "al_rr",
  38 |     "extracted": false,
  39 |     "processed": false,
  40 |     "structured": false
  41 |   },
  42 |   {
  43 |     "nome": "Assembleia Legislativa do Estado de Tocantins",
  44 |     "url": "assembleia_legislativa_do_estado_de_tocantins_al-to",
  45 |     "sigla": "al_to",
  46 |     "extracted": false,
  47 |     "processed": false,
  48 |     "structured": false
  49 |   },
  50 |   {
  51 |     "nome": "Assembleia Legislativa do Estado do Maranhão",
  52 |     "url": "assembleia_legislativa_do_estado_do_maranhao_al-ma",
  53 |     "sigla": "al_ma",
  54 |     "extracted": false,
  55 |     "processed": false,
  56 |     "structured": false
  57 |   },
  58 |   {
  59 |     "nome": "Assembleia Legislativa do Estado do Paraná",
  60 |     "url": "assembleia_legislativa_do_estado_do_parana_al-pr",
  61 |     "sigla": "al_pr",
  62 |     "extracted": false,
  63 |     "processed": false,
  64 |     "structured": false
  65 |   },
  66 |   {
  67 |     "nome": "Assembleia Legislativa do Estado do Pará",
  68 |     "url": "assembleia_legislativa_do_estado_do_para_al-pa",
  69 |     "sigla": "al_pa",
  70 |     "extracted": false,
  71 |     "processed": false,
  72 |     "structured": false
  73 |   },
  74 |   {
  75 |     "nome": "Associação Amazonense de Municípios",
  76 |     "url": "associacao_amazonense_de_municipios_aam",
  77 |     "sigla": "aam",
  78 |     "extracted": false,
  79 |     "processed": false,
  80 |     "structured": false
  81 |   },
  82 |   {
  83 |     "nome": "Associação Estadual de Municípios do Rio de Janeiro",
  84 |     "url": "associacao_estadual_de_municipios_do_rio_de_janeiro_aemerj",
  85 |     "sigla": "aemerj",
  86 |     "extracted": false,
  87 |     "processed": false,
  88 |     "structured": false
  89 |   },
  90 |   {
  91 |     "nome": "Associação Goiana de Municípios",
  92 |     "url": "associacao_goiana_de_municipios_agm",
  93 |     "sigla": "agm",
  94 |     "extracted": false,
  95 |     "processed": false,
  96 |     "structured": false
  97 |   },
  98 |   {
  99 |     "nome": "Associação Mato-Grossense dos Municípios",
 100 |     "url": "associacao_mato-grossense_dos_municipios_amm-mt",
 101 |     "sigla": "amm_mt",
 102 |     "extracted": false,
 103 |     "processed": false,
 104 |     "structured": false
 105 |   },
 106 |   {
 107 |     "nome": "Associação Mineira de Municípios",
 108 |     "url": "associacao_mineira_de_municipios_amm-mg",
 109 |     "sigla": "amm_mg",
 110 |     "extracted": false,
 111 |     "processed": false,
 112 |     "structured": false
 113 |   },
 114 |   {
 115 |     "nome": "Associação Municipalista de Pernambuco",
 116 |     "url": "associacao_municipalista_de_pernambuco_amupe",
 117 |     "sigla": "amupe",
 118 |     "extracted": false,
 119 |     "processed": false,
 120 |     "structured": false
 121 |   },
 122 |   {
 123 |     "nome": "Associação Paulista de Municípios",
 124 |     "url": "associacao_paulista_de_municipios_apm",
 125 |     "sigla": "apm",
 126 |     "extracted": false,
 127 |     "processed": false,
 128 |     "structured": false
 129 |   },
 130 |   {
 131 |     "nome": "Associação Rondoniense de Municípios",
 132 |     "url": "associacao_rondoniense_de_municipios_arom",
 133 |     "sigla": "arom",
 134 |     "extracted": false,
 135 |     "processed": false,
 136 |     "structured": false
 137 |   },
 138 |   {
 139 |     "nome": "Associação de Municípios Alagoanos",
 140 |     "url": "associacao_de_municipios_alagoanos_ama",
 141 |     "sigla": "ama",
 142 |     "extracted": false,
 143 |     "processed": false,
 144 |     "structured": false
 145 |   },
 146 |   {
 147 |     "nome": "Associação do Sul, Extremo Sul e Sudoeste da Bahia",
 148 |     "url": "associacao_do_sul_extremo_sul_e_sudoeste_da_bahia_amurc",
 149 |     "sigla": "amurc",
 150 |     "extracted": false,
 151 |     "processed": false,
 152 |     "structured": false
 153 |   },
 154 |   {
 155 |     "nome": "Associação dos Municípios do Mato Grosso do Sul",
 156 |     "url": "associacao_dos_municipios_do_mato_grosso_do_sul_assomasul",
 157 |     "sigla": "assomasul",
 158 |     "extracted": false,
 159 |     "processed": false,
 160 |     "structured": false
 161 |   },
 162 |   {
 163 |     "nome": "Associação dos Municípios do Paraná",
 164 |     "url": "associacao_dos_municipios_do_parana_amp",
 165 |     "sigla": "amp",
 166 |     "extracted": false,
 167 |     "processed": false,
 168 |     "structured": false
 169 |   },
 170 |   {
 171 |     "nome": "Associação dos Municípios e Prefeitos do Estado do Ceará",
 172 |     "url": "associacao_dos_municipios_e_prefeitos_do_estado_do_ceara_aprece",
 173 |     "sigla": "aprece",
 174 |     "extracted": false,
 175 |     "processed": false,
 176 |     "structured": false
 177 |   },
 178 |   {
 179 |     "nome": "CJM12 - Auditoria da 12ª CJM",
 180 |     "url": "cjm12_-_auditoria_da_12_cjm_12cjm",
 181 |     "sigla": "12cjm",
 182 |     "extracted": false,
 183 |     "processed": false,
 184 |     "structured": false
 185 |   },
 186 |   {
 187 |     "nome": "Conselho Nacional de Justiça",
 188 |     "url": "conselho_nacional_de_justica_cnj",
 189 |     "sigla": "cnj",
 190 |     "extracted": false,
 191 |     "processed": false,
 192 |     "structured": false
 193 |   },
 194 |   {
 195 |     "nome": "Conselho Superior da Justiça do Trabalho",
 196 |     "url": "conselho_superior_da_justica_do_trabalho_csjt",
 197 |     "sigla": "csjt",
 198 |     "extracted": false,
 199 |     "processed": false,
 200 |     "structured": false
 201 |   },
 202 |   {
 203 |     "nome": "Corregedoria Regional de Justiça do Trabalho da 16ª Região",
 204 |     "url": "corregedoria_regional_de_justica_do_trabalho_da_16_regiao_pjecortrt16",
 205 |     "sigla": "pjecortrt16",
 206 |     "extracted": false,
 207 |     "processed": false,
 208 |     "structured": false
 209 |   },
 210 |   {
 211 |     "nome": "Corregedoria Regional de Justiça do Trabalho da 23ª Região",
 212 |     "url": "corregedoria_regional_de_justica_do_trabalho_da_23_regiao_pjecortrt23",
 213 |     "sigla": "pjecortrt23",
 214 |     "extracted": false,
 215 |     "processed": false,
 216 |     "structured": false
 217 |   },
 218 |   {
 219 |     "nome": "Corregedoria Regional de Justiça do Trabalho da 5ª Região",
 220 |     "url": "corregedoria_regional_de_justica_do_trabalho_da_5_regiao_pjecortrt5",
 221 |     "sigla": "pjecortrt5",
 222 |     "extracted": false,
 223 |     "processed": false,
 224 |     "structured": false
 225 |   },
 226 |   {
 227 |     "nome": "Diario Oficial - Prefeitura Municipal de Niterói",
 228 |     "url": "diario_oficial_-_prefeitura_municipal_de_niteroi_dom-nit-rj",
 229 |     "sigla": "dom_nit_rj",
 230 |     "extracted": false,
 231 |     "processed": false,
 232 |     "structured": false
 233 |   },
 234 |   {
 235 |     "nome": "Diário Eletrônico da Ordem dos Advogados do Brasil",
 236 |     "url": "diario_eletronico_da_ordem_dos_advogados_do_brasil_deoab",
 237 |     "sigla": "deoab",
 238 |     "extracted": false,
 239 |     "processed": false,
 240 |     "structured": false
 241 |   },
 242 |   {
 243 |     "nome": "Diário Oficial da União",
 244 |     "url": "diario_oficial_da_uniao_dou",
 245 |     "sigla": "dou",
 246 |     "extracted": false,
 247 |     "processed": false,
 248 |     "structured": false
 249 |   },
 250 |   {
 251 |     "nome": "Diário Oficial do Distrito Federal",
 252 |     "url": "diario_oficial_do_distrito_federal_dodf",
 253 |     "sigla": "dodf",
 254 |     "extracted": false,
 255 |     "processed": false,
 256 |     "structured": false
 257 |   },
 258 |   {
 259 |     "nome": "Diário Oficial do Estado da Bahia",
 260 |     "url": "diario_oficial_do_estado_da_bahia_doeba",
 261 |     "sigla": "doe_ba",
 262 |     "extracted": false,
 263 |     "processed": false,
 264 |     "structured": false
 265 |   },
 266 |   {
 267 |     "nome": "Diário Oficial do Estado da Paraíba",
 268 |     "url": "diario_oficial_do_estado_da_paraiba_doepb",
 269 |     "sigla": "doe_pb",
 270 |     "extracted": false,
 271 |     "processed": false,
 272 |     "structured": false
 273 |   },
 274 |   {
 275 |     "nome": "Diário Oficial do Estado de Alagoas",
 276 |     "url": "diario_oficial_do_estado_de_alagoas_doeal",
 277 |     "sigla": "doe_al",
 278 |     "extracted": false,
 279 |     "processed": false,
 280 |     "structured": false
 281 |   },
 282 |   {
 283 |     "nome": "Diário Oficial do Estado de Goiás",
 284 |     "url": "diario_oficial_do_estado_de_goias_doego",
 285 |     "sigla": "doe_go",
 286 |     "extracted": false,
 287 |     "processed": false,
 288 |     "structured": false
 289 |   },
 290 |   {
 291 |     "nome": "Diário Oficial do Estado de Minas Gerais",
 292 |     "url": "diario_oficial_do_estado_de_minas_gerais_doemg",
 293 |     "sigla": "doe_mg",
 294 |     "extracted": false,
 295 |     "processed": false,
 296 |     "structured": false
 297 |   },
 298 |   {
 299 |     "nome": "Diário Oficial do Estado de Pernambuco",
 300 |     "url": "diario_oficial_do_estado_de_pernambuco_doepe",
 301 |     "sigla": "doe_pe",
 302 |     "extracted": false,
 303 |     "processed": false,
 304 |     "structured": false
 305 |   },
 306 |   {
 307 |     "nome": "Diário Oficial do Estado de Rondônia",
 308 |     "url": "diario_oficial_do_estado_de_rondonia_doero",
 309 |     "sigla": "doe_ro",
 310 |     "extracted": false,
 311 |     "processed": false,
 312 |     "structured": false
 313 |   },
 314 |   {
 315 |     "nome": "Diário Oficial do Estado de Roraima",
 316 |     "url": "diario_oficial_do_estado_de_roraima_doerr",
 317 |     "sigla": "doe_rr",
 318 |     "extracted": false,
 319 |     "processed": false,
 320 |     "structured": false
 321 |   },
 322 |   {
 323 |     "nome": "Diário Oficial do Estado de Santa Catarina",
 324 |     "url": "diario_oficial_do_estado_de_santa_catarina_doesc",
 325 |     "sigla": "doe_sc",
 326 |     "extracted": false,
 327 |     "processed": false,
 328 |     "structured": false
 329 |   },
 330 |   {
 331 |     "nome": "Diário Oficial do Estado de Sergipe",
 332 |     "url": "diario_oficial_do_estado_de_sergipe_doese",
 333 |     "sigla": "doe_se",
 334 |     "extracted": false,
 335 |     "processed": false,
 336 |     "structured": false
 337 |   },
 338 |   {
 339 |     "nome": "Diário Oficial do Estado de São Paulo",
 340 |     "url": "diario_oficial_do_estado_de_sao_paulo_dosp",
 341 |     "sigla": "doe_sp",
 342 |     "extracted": false,
 343 |     "processed": false,
 344 |     "structured": false
 345 |   },
 346 |   {
 347 |     "nome": "Diário Oficial do Estado do Acre",
 348 |     "url": "diario_oficial_do_estado_do_acre_doeac",
 349 |     "sigla": "doe_ac",
 350 |     "extracted": false,
 351 |     "processed": false,
 352 |     "structured": false
 353 |   },
 354 |   {
 355 |     "nome": "Diário Oficial do Estado do Amapá",
 356 |     "url": "diario_oficial_do_estado_do_amapa_doeap",
 357 |     "sigla": "doe_ap",
 358 |     "extracted": false,
 359 |     "processed": false,
 360 |     "structured": false
 361 |   },
 362 |   {
 363 |     "nome": "Diário Oficial do Estado do Amazonas",
 364 |     "url": "diario_oficial_do_estado_do_amazonas_doeam",
 365 |     "sigla": "doe_am",
 366 |     "extracted": false,
 367 |     "processed": false,
 368 |     "structured": false
 369 |   },
 370 |   {
 371 |     "nome": "Diário Oficial do Estado do Ceará",
 372 |     "url": "diario_oficial_do_estado_do_ceara_doece",
 373 |     "sigla": "doe_ce",
 374 |     "extracted": false,
 375 |     "processed": false,
 376 |     "structured": false
 377 |   },
 378 |   {
 379 |     "nome": "Diário Oficial do Estado do Espírito Santo",
 380 |     "url": "diario_oficial_do_estado_do_espirito_santo_doees",
 381 |     "sigla": "doe_es",
 382 |     "extracted": false,
 383 |     "processed": false,
 384 |     "structured": false
 385 |   },
 386 |   {
 387 |     "nome": "Diário Oficial do Estado do Maranhão",
 388 |     "url": "diario_oficial_do_estado_do_maranhao_doema",
 389 |     "sigla": "doe_ma",
 390 |     "extracted": false,
 391 |     "processed": false,
 392 |     "structured": false
 393 |   },
 394 |   {
 395 |     "nome": "Diário Oficial do Estado do Mato Grosso",
 396 |     "url": "diario_oficial_do_estado_do_mato_grosso_doemt",
 397 |     "sigla": "doe_mt",
 398 |     "extracted": false,
 399 |     "processed": false,
 400 |     "structured": false
 401 |   },
 402 |   {
 403 |     "nome": "Diário Oficial do Estado do Mato Grosso do Sul",
 404 |     "url": "diario_oficial_do_estado_do_mato_grosso_do_sul_doems",
 405 |     "sigla": "doe_ms",
 406 |     "extracted": false,
 407 |     "processed": false,
 408 |     "structured": false
 409 |   },
 410 |   {
 411 |     "nome": "Diário Oficial do Estado do Paraná",
 412 |     "url": "diario_oficial_do_estado_do_parana_doepr",
 413 |     "sigla": "doe_pr",
 414 |     "extracted": false,
 415 |     "processed": false,
 416 |     "structured": false
 417 |   },
 418 |   {
 419 |     "nome": "Diário Oficial do Estado do Pará",
 420 |     "url": "diario_oficial_do_estado_do_para_doepa",
 421 |     "sigla": "doe_pa",
 422 |     "extracted": false,
 423 |     "processed": false,
 424 |     "structured": false
 425 |   },
 426 |   {
 427 |     "nome": "Diário Oficial do Estado do Piauí",
 428 |     "url": "diario_oficial_do_estado_do_piaui_doepi",
 429 |     "sigla": "doe_pi",
 430 |     "extracted": false,
 431 |     "processed": false,
 432 |     "structured": false
 433 |   },
 434 |   {
 435 |     "nome": "Diário Oficial do Estado do Rio Grande do Norte",
 436 |     "url": "diario_oficial_do_estado_do_rio_grande_do_norte_doern",
 437 |     "sigla": "doe_rn",
 438 |     "extracted": false,
 439 |     "processed": false,
 440 |     "structured": false
 441 |   },
 442 |   {
 443 |     "nome": "Diário Oficial do Estado do Rio Grande do Sul",
 444 |     "url": "diario_oficial_do_estado_do_rio_grande_do_sul_doers",
 445 |     "sigla": "doe_rs",
 446 |     "extracted": false,
 447 |     "processed": false,
 448 |     "structured": false
 449 |   },
 450 |   {
 451 |     "nome": "Diário Oficial do Estado do Rio de Janeiro",
 452 |     "url": "diario_oficial_do_estado_do_rio_de_janeiro_doerj",
 453 |     "sigla": "doe_rj",
 454 |     "extracted": false,
 455 |     "processed": false,
 456 |     "structured": false
 457 |   },
 458 |   {
 459 |     "nome": "Diário Oficial do Estado do Tocantins",
 460 |     "url": "diario_oficial_do_estado_do_tocantins_doeto",
 461 |     "sigla": "doe_to",
 462 |     "extracted": false,
 463 |     "processed": false,
 464 |     "structured": false
 465 |   },
 466 |   {
 467 |     "nome": "Diário Oficial do Município de Americana",
 468 |     "url": "diario_oficial_do_municipio_de_americana_dom-americ",
 469 |     "sigla": "dom_americ",
 470 |     "extracted": false,
 471 |     "processed": false,
 472 |     "structured": false
 473 |   },
 474 |   {
 475 |     "nome": "Diário Oficial do Município de Barueri",
 476 |     "url": "diario_oficial_do_municipio_de_barueri_dom-bru",
 477 |     "sigla": "dom_bru",
 478 |     "extracted": false,
 479 |     "processed": false,
 480 |     "structured": false
 481 |   },
 482 |   {
 483 |     "nome": "Diário Oficial do Município de Bauru",
 484 |     "url": "diario_oficial_do_municipio_de_bauru_dom-bauru",
 485 |     "sigla": "dom_bauru",
 486 |     "extracted": false,
 487 |     "processed": false,
 488 |     "structured": false
 489 |   },
 490 |   {
 491 |     "nome": "Diário Oficial do Município de Belém",
 492 |     "url": "diario_oficial_do_municipio_de_belem_dom-belem",
 493 |     "sigla": "dom_belem",
 494 |     "extracted": false,
 495 |     "processed": false,
 496 |     "structured": false
 497 |   },
 498 |   {
 499 |     "nome": "Diário Oficial do Município de Boa Vista",
 500 |     "url": "diario_oficial_do_municipio_de_boa_vista_dom-bvb",
 501 |     "sigla": "dom_bvb",
 502 |     "extracted": false,
 503 |     "processed": false,
 504 |     "structured": false
 505 |   },
 506 |   {
 507 |     "nome": "Diário Oficial do Município de Camaçari",
 508 |     "url": "diario_oficial_do_municipio_de_camacari_dom-camacari",
 509 |     "sigla": "dom_camacari",
 510 |     "extracted": false,
 511 |     "processed": false,
 512 |     "structured": false
 513 |   },
 514 |   {
 515 |     "nome": "Diário Oficial do Município de Campinas",
 516 |     "url": "diario_oficial_do_municipio_de_campinas_dom-camp",
 517 |     "sigla": "dom_camp",
 518 |     "extracted": false,
 519 |     "processed": false,
 520 |     "structured": false
 521 |   },
 522 |   {
 523 |     "nome": "Diário Oficial do Município de Campo Grande",
 524 |     "url": "diario_oficial_do_municipio_de_campo_grande_dom-campog",
 525 |     "sigla": "dom_campog",
 526 |     "extracted": false,
 527 |     "processed": false,
 528 |     "structured": false
 529 |   },
 530 |   {
 531 |     "nome": "Diário Oficial do Município de Campos dos Goytacazes",
 532 |     "url": "diario_oficial_do_municipio_de_campos_dos_goytacazes_dom-goy-rj",
 533 |     "sigla": "dom_goy_rj",
 534 |     "extracted": false,
 535 |     "processed": false,
 536 |     "structured": false
 537 |   },
 538 |   {
 539 |     "nome": "Diário Oficial do Município de Caxias do Sul",
 540 |     "url": "diario_oficial_do_municipio_de_caxias_do_sul_dom-cxs",
 541 |     "sigla": "dom_cxs",
 542 |     "extracted": false,
 543 |     "processed": false,
 544 |     "structured": false
 545 |   },
 546 |   {
 547 |     "nome": "Diário Oficial do Município de Curitiba",
 548 |     "url": "diario_oficial_do_municipio_de_curitiba_dom-ctba",
 549 |     "sigla": "dom_ctba",
 550 |     "extracted": false,
 551 |     "processed": false,
 552 |     "structured": false
 553 |   },
 554 |   {
 555 |     "nome": "Diário Oficial do Município de Duque de Caxias",
 556 |     "url": "diario_oficial_do_municipio_de_duque_de_caxias_dom-dcs",
 557 |     "sigla": "dom_dcs",
 558 |     "extracted": false,
 559 |     "processed": false,
 560 |     "structured": false
 561 |   },
 562 |   {
 563 |     "nome": "Diário Oficial do Município de Florianópolis",
 564 |     "url": "diario_oficial_do_municipio_de_florianopolis_dom-fln-sc",
 565 |     "sigla": "dom_fln_sc",
 566 |     "extracted": false,
 567 |     "processed": false,
 568 |     "structured": false
 569 |   },
 570 |   {
 571 |     "nome": "Diário Oficial do Município de Fortaleza",
 572 |     "url": "diario_oficial_do_municipio_de_fortaleza_dom-for",
 573 |     "sigla": "dom_for",
 574 |     "extracted": false,
 575 |     "processed": false,
 576 |     "structured": false
 577 |   },
 578 |   {
 579 |     "nome": "Diário Oficial do Município de Goiânia",
 580 |     "url": "diario_oficial_do_municipio_de_goiania_dom-gyn",
 581 |     "sigla": "dom_gyn",
 582 |     "extracted": false,
 583 |     "processed": false,
 584 |     "structured": false
 585 |   },
 586 |   {
 587 |     "nome": "Diário Oficial do Município de Guarulhos",
 588 |     "url": "diario_oficial_do_municipio_de_guarulhos_dom-gru",
 589 |     "sigla": "dom_gru",
 590 |     "extracted": false,
 591 |     "processed": false,
 592 |     "structured": false
 593 |   },
 594 |   {
 595 |     "nome": "Diário Oficial do Município de João Pessoa",
 596 |     "url": "diario_oficial_do_municipio_de_joao_pessoa_dom-jpa",
 597 |     "sigla": "dom_jpa",
 598 |     "extracted": false,
 599 |     "processed": false,
 600 |     "structured": false
 601 |   },
 602 |   {
 603 |     "nome": "Diário Oficial do Município de Macapá",
 604 |     "url": "diario_oficial_do_municipio_de_macapa_dom-macapa",
 605 |     "sigla": "dom_macapa",
 606 |     "extracted": false,
 607 |     "processed": false,
 608 |     "structured": false
 609 |   },
 610 |   {
 611 |     "nome": "Diário Oficial do Município de Maceió",
 612 |     "url": "diario_oficial_do_municipio_de_maceio_dom-maceio",
 613 |     "sigla": "dom_maceio",
 614 |     "extracted": false,
 615 |     "processed": false,
 616 |     "structured": false
 617 |   },
 618 |   {
 619 |     "nome": "Diário Oficial do Município de Manaus",
 620 |     "url": "diario_oficial_do_municipio_de_manaus_dom-manaus",
 621 |     "sigla": "dom_manaus",
 622 |     "extracted": false,
 623 |     "processed": false,
 624 |     "structured": false
 625 |   },
 626 |   {
 627 |     "nome": "Diário Oficial do Município de Marília",
 628 |     "url": "diario_oficial_do_municipio_de_marilia_dom-mar",
 629 |     "sigla": "dom_mar",
 630 |     "extracted": false,
 631 |     "processed": false,
 632 |     "structured": false
 633 |   },
 634 |   {
 635 |     "nome": "Diário Oficial do Município de Natal",
 636 |     "url": "diario_oficial_do_municipio_de_natal_dom-natal",
 637 |     "sigla": "dom_natal",
 638 |     "extracted": false,
 639 |     "processed": false,
 640 |     "structured": false
 641 |   },
 642 |   {
 643 |     "nome": "Diário Oficial do Município de Osasco",
 644 |     "url": "diario_oficial_do_municipio_de_osasco_dom-osasco",
 645 |     "sigla": "dom_osasco",
 646 |     "extracted": false,
 647 |     "processed": false,
 648 |     "structured": false
 649 |   },
 650 |   {
 651 |     "nome": "Diário Oficial do Município de Palmas",
 652 |     "url": "diario_oficial_do_municipio_de_palmas_dom-pmw",
 653 |     "sigla": "dom_pmw",
 654 |     "extracted": false,
 655 |     "processed": false,
 656 |     "structured": false
 657 |   },
 658 |   {
 659 |     "nome": "Diário Oficial do Município de Ponta Porã",
 660 |     "url": "diario_oficial_do_municipio_de_ponta_pora_dom-pmg-ms",
 661 |     "sigla": "dom_pmg_ms",
 662 |     "extracted": false,
 663 |     "processed": false,
 664 |     "structured": false
 665 |   },
 666 |   {
 667 |     "nome": "Diário Oficial do Município de Porto Alegre",
 668 |     "url": "diario_oficial_do_municipio_de_porto_alegre_dom-poa",
 669 |     "sigla": "dom_poa",
 670 |     "extracted": false,
 671 |     "processed": false,
 672 |     "structured": false
 673 |   },
 674 |   {
 675 |     "nome": "Diário Oficial do Município de Porto Velho",
 676 |     "url": "diario_oficial_do_municipio_de_porto_velho_dom-pvh",
 677 |     "sigla": "dom_pvh",
 678 |     "extracted": false,
 679 |     "processed": false,
 680 |     "structured": false
 681 |   },
 682 |   {
 683 |     "nome": "Diário Oficial do Município de Salvador",
 684 |     "url": "diario_oficial_do_municipio_de_salvador_dom-ssa",
 685 |     "sigla": "dom_ssa",
 686 |     "extracted": false,
 687 |     "processed": false,
 688 |     "structured": false
 689 |   },
 690 |   {
 691 |     "nome": "Diário Oficial do Município de Santos",
 692 |     "url": "diario_oficial_do_municipio_de_santos_dom-santos",
 693 |     "sigla": "dom_santos",
 694 |     "extracted": false,
 695 |     "processed": false,
 696 |     "structured": false
 697 |   },
 698 |   {
 699 |     "nome": "Diário Oficial do Município de Sorocaba",
 700 |     "url": "diario_oficial_do_municipio_de_sorocaba_dom-sod-sp",
 701 |     "sigla": "dom_sod_sp",
 702 |     "extracted": false,
 703 |     "processed": false,
 704 |     "structured": false
 705 |   },
 706 |   {
 707 |     "nome": "Diário Oficial do Município de São Bernardo do Campo",
 708 |     "url": "diario_oficial_do_municipio_de_sao_bernardo_do_campo_dom-sbc",
 709 |     "sigla": "dom_sbc",
 710 |     "extracted": false,
 711 |     "processed": false,
 712 |     "structured": false
 713 |   },
 714 |   {
 715 |     "nome": "Diário Oficial do Município de São Gonçalo",
 716 |     "url": "diario_oficial_do_municipio_de_sao_goncalo_dom-qsd-rj",
 717 |     "sigla": "dom_qsd_rj",
 718 |     "extracted": false,
 719 |     "processed": false,
 720 |     "structured": false
 721 |   },
 722 |   {
 723 |     "nome": "Diário Oficial do Município de São Luís",
 724 |     "url": "diario_oficial_do_municipio_de_sao_luis_dom-slz",
 725 |     "sigla": "dom_slz",
 726 |     "extracted": false,
 727 |     "processed": false,
 728 |     "structured": false
 729 |   },
 730 |   {
 731 |     "nome": "Diário Oficial do Município de São Paulo",
 732 |     "url": "diario_oficial_do_municipio_de_sao_paulo_dom-sp",
 733 |     "sigla": "dom_sp",
 734 |     "extracted": false,
 735 |     "processed": false,
 736 |     "structured": false
 737 |   },
 738 |   {
 739 |     "nome": "Diário Oficial do Município de Teresina",
 740 |     "url": "diario_oficial_do_municipio_de_teresina_dom-the",
 741 |     "sigla": "dom_the",
 742 |     "extracted": false,
 743 |     "processed": false,
 744 |     "structured": false
 745 |   },
 746 |   {
 747 |     "nome": "Diário Oficial do Município de Uberaba",
 748 |     "url": "diario_oficial_do_municipio_de_uberaba_dom-ubera",
 749 |     "sigla": "dom_ubera",
 750 |     "extracted": false,
 751 |     "processed": false,
 752 |     "structured": false
 753 |   },
 754 |   {
 755 |     "nome": "Diário Oficial do Município de Uberlândia",
 756 |     "url": "diario_oficial_do_municipio_de_uberlandia_dom-uberl",
 757 |     "sigla": "dom_uberl",
 758 |     "extracted": false,
 759 |     "processed": false,
 760 |     "structured": false
 761 |   },
 762 |   {
 763 |     "nome": "Diário Oficial do Município de Vitória",
 764 |     "url": "diario_oficial_do_municipio_de_vitoria_dom-vix",
 765 |     "sigla": "dom_vix",
 766 |     "extracted": false,
 767 |     "processed": false,
 768 |     "structured": false
 769 |   },
 770 |   {
 771 |     "nome": "Diário Oficial dos Municípios Capixabas",
 772 |     "url": "diario_oficial_dos_municipios_capixabas_dom-capix",
 773 |     "sigla": "dom_capix",
 774 |     "extracted": false,
 775 |     "processed": false,
 776 |     "structured": false
 777 |   },
 778 |   {
 779 |     "nome": "Diário Oficial dos Municípios de Santa Catarina",
 780 |     "url": "diario_oficial_dos_municipios_de_santa_catarina_dom-sc",
 781 |     "sigla": "dom_sc",
 782 |     "extracted": false,
 783 |     "processed": false,
 784 |     "structured": false
 785 |   },
 786 |   {
 787 |     "nome": "Diário da Justiça Militar do Estado de Minas Gerais",
 788 |     "url": "diario_da_justica_militar_do_estado_de_minas_gerais_djmmg",
 789 |     "sigla": "djm_mg",
 790 |     "extracted": false,
 791 |     "processed": false,
 792 |     "structured": false
 793 |   },
 794 |   {
 795 |     "nome": "Diário da Justiça Militar do Estado de São Paulo",
 796 |     "url": "diario_da_justica_militar_do_estado_de_sao_paulo_djmsp",
 797 |     "sigla": "djm_sp",
 798 |     "extracted": false,
 799 |     "processed": false,
 800 |     "structured": false
 801 |   },
 802 |   {
 803 |     "nome": "Diário de Justiça da União",
 804 |     "url": "diario_de_justica_da_uniao_dju",
 805 |     "sigla": "dju",
 806 |     "extracted": false,
 807 |     "processed": false,
 808 |     "structured": false
 809 |   },
 810 |   {
 811 |     "nome": "Diário de Justiça do Distrito Federal",
 812 |     "url": "diario_de_justica_do_distrito_federal_djdf",
 813 |     "sigla": "djdf",
 814 |     "extracted": false,
 815 |     "processed": false,
 816 |     "structured": false
 817 |   },
 818 |   {
 819 |     "nome": "Diário de Justiça do Estado da Bahia",
 820 |     "url": "diario_de_justica_do_estado_da_bahia_djba",
 821 |     "sigla": "djba",
 822 |     "extracted": false,
 823 |     "processed": false,
 824 |     "structured": false
 825 |   },
 826 |   {
 827 |     "nome": "Diário de Justiça do Estado da Paraíba",
 828 |     "url": "diario_de_justica_do_estado_da_paraiba_djpb",
 829 |     "sigla": "djpb",
 830 |     "extracted": false,
 831 |     "processed": false,
 832 |     "structured": false
 833 |   },
 834 |   {
 835 |     "nome": "Diário de Justiça do Estado de Alagoas",
 836 |     "url": "diario_de_justica_do_estado_de_alagoas_djal",
 837 |     "sigla": "djal",
 838 |     "extracted": false,
 839 |     "processed": false,
 840 |     "structured": false
 841 |   },
 842 |   {
 843 |     "nome": "Diário de Justiça do Estado de Goiás",
 844 |     "url": "diario_de_justica_do_estado_de_goias_djgo",
 845 |     "sigla": "djgo",
 846 |     "extracted": false,
 847 |     "processed": false,
 848 |     "structured": false
 849 |   },
 850 |   {
 851 |     "nome": "Diário de Justiça do Estado de Minas Gerais",
 852 |     "url": "diario_de_justica_do_estado_de_minas_gerais_djmg",
 853 |     "sigla": "djmg",
 854 |     "extracted": false,
 855 |     "processed": false,
 856 |     "structured": false
 857 |   },
 858 |   {
 859 |     "nome": "Diário de Justiça do Estado de Pernambuco",
 860 |     "url": "diario_de_justica_do_estado_de_pernambuco_djpe",
 861 |     "sigla": "djpe",
 862 |     "extracted": false,
 863 |     "processed": false,
 864 |     "structured": false
 865 |   },
 866 |   {
 867 |     "nome": "Diário de Justiça do Estado de Rondônia",
 868 |     "url": "diario_de_justica_do_estado_de_rondonia_djro",
 869 |     "sigla": "djro",
 870 |     "extracted": false,
 871 |     "processed": false,
 872 |     "structured": false
 873 |   },
 874 |   {
 875 |     "nome": "Diário de Justiça do Estado de Roraima",
 876 |     "url": "diario_de_justica_do_estado_de_roraima_djrr",
 877 |     "sigla": "djrr",
 878 |     "extracted": false,
 879 |     "processed": false,
 880 |     "structured": false
 881 |   },
 882 |   {
 883 |     "nome": "Diário de Justiça do Estado de Santa Catarina",
 884 |     "url": "diario_de_justica_do_estado_de_santa_catarina_djsc",
 885 |     "sigla": "djsc",
 886 |     "extracted": false,
 887 |     "processed": false,
 888 |     "structured": false
 889 |   },
 890 |   {
 891 |     "nome": "Diário de Justiça do Estado de Sergipe",
 892 |     "url": "diario_de_justica_do_estado_de_sergipe_djse",
 893 |     "sigla": "djse",
 894 |     "extracted": false,
 895 |     "processed": false,
 896 |     "structured": false
 897 |   },
 898 |   {
 899 |     "nome": "Diário de Justiça do Estado de São Paulo",
 900 |     "url": "diario_de_justica_do_estado_de_sao_paulo_djsp",
 901 |     "sigla": "djsp",
 902 |     "extracted": false,
 903 |     "processed": false,
 904 |     "structured": false
 905 |   },
 906 |   {
 907 |     "nome": "Diário de Justiça do Estado de Tocantins",
 908 |     "url": "diario_de_justica_do_estado_de_tocantins_djto",
 909 |     "sigla": "djto",
 910 |     "extracted": false,
 911 |     "processed": false,
 912 |     "structured": false
 913 |   },
 914 |   {
 915 |     "nome": "Diário de Justiça do Estado do Acre",
 916 |     "url": "diario_de_justica_do_estado_do_acre_djac",
 917 |     "sigla": "djac",
 918 |     "extracted": false,
 919 |     "processed": false,
 920 |     "structured": false
 921 |   },
 922 |   {
 923 |     "nome": "Diário de Justiça do Estado do Amapá",
 924 |     "url": "diario_de_justica_do_estado_do_amapa_djap",
 925 |     "sigla": "djap",
 926 |     "extracted": false,
 927 |     "processed": false,
 928 |     "structured": false
 929 |   },
 930 |   {
 931 |     "nome": "Diário de Justiça do Estado do Amazonas",
 932 |     "url": "diario_de_justica_do_estado_do_amazonas_djam",
 933 |     "sigla": "djam",
 934 |     "extracted": false,
 935 |     "processed": false,
 936 |     "structured": false
 937 |   },
 938 |   {
 939 |     "nome": "Diário de Justiça do Estado do Ceará",
 940 |     "url": "diario_de_justica_do_estado_do_ceara_djce",
 941 |     "sigla": "djce",
 942 |     "extracted": false,
 943 |     "processed": false,
 944 |     "structured": false
 945 |   },
 946 |   {
 947 |     "nome": "Diário de Justiça do Estado do Maranhão",
 948 |     "url": "diario_de_justica_do_estado_do_maranhao_djma",
 949 |     "sigla": "djma",
 950 |     "extracted": false,
 951 |     "processed": false,
 952 |     "structured": false
 953 |   },
 954 |   {
 955 |     "nome": "Diário de Justiça do Estado do Mato Grosso",
 956 |     "url": "diario_de_justica_do_estado_do_mato_grosso_djmt",
 957 |     "sigla": "djmt",
 958 |     "extracted": false,
 959 |     "processed": false,
 960 |     "structured": false
 961 |   },
 962 |   {
 963 |     "nome": "Diário de Justiça do Estado do Mato Grosso do Sul",
 964 |     "url": "diario_de_justica_do_estado_do_mato_grosso_do_sul_djms",
 965 |     "sigla": "djms",
 966 |     "extracted": false,
 967 |     "processed": false,
 968 |     "structured": false
 969 |   },
 970 |   {
 971 |     "nome": "Diário de Justiça do Estado do Paraná",
 972 |     "url": "diario_de_justica_do_estado_do_parana_djpr",
 973 |     "sigla": "djpr",
 974 |     "extracted": false,
 975 |     "processed": false,
 976 |     "structured": false
 977 |   },
 978 |   {
 979 |     "nome": "Diário de Justiça do Estado do Pará",
 980 |     "url": "diario_de_justica_do_estado_do_para_djpa",
 981 |     "sigla": "djpa",
 982 |     "extracted": false,
 983 |     "processed": false,
 984 |     "structured": false
 985 |   },
 986 |   {
 987 |     "nome": "Diário de Justiça do Estado do Piauí",
 988 |     "url": "diario_de_justica_do_estado_do_piaui_djpi",
 989 |     "sigla": "djpi",
 990 |     "extracted": false,
 991 |     "processed": false,
 992 |     "structured": false
 993 |   },
 994 |   {
 995 |     "nome": "Diário de Justiça do Estado do Rio Grande do Norte",
 996 |     "url": "diario_de_justica_do_estado_do_rio_grande_do_norte_djrn",
 997 |     "sigla": "djrn",
 998 |     "extracted": false,
 999 |     "processed": false,
1000 |     "structured": false
1001 |   },
1002 |   {
1003 |     "nome": "Diário de Justiça do Estado do Rio Grande do Sul",
1004 |     "url": "diario_de_justica_do_estado_do_rio_grande_do_sul_djrs",
1005 |     "sigla": "djrs",
1006 |     "extracted": false,
1007 |     "processed": false,
1008 |     "structured": false
1009 |   },
1010 |   {
1011 |     "nome": "Diário de Justiça do Estado do Rio de Janeiro",
1012 |     "url": "diario_de_justica_do_estado_do_rio_de_janeiro_djrj",
1013 |     "sigla": "djrj",
1014 |     "extracted": false,
1015 |     "processed": false,
1016 |     "structured": false
1017 |   },
1018 |   {
1019 |     "nome": "Federação Goiana de Municípios",
1020 |     "url": "federacao_goiana_de_municipios_fgm",
1021 |     "sigla": "fgm",
1022 |     "extracted": false,
1023 |     "processed": false,
1024 |     "structured": false
1025 |   },
1026 |   {
1027 |     "nome": "Federação das Associações de Municípios da Paraíba",
1028 |     "url": "federacao_das_associacoes_de_municipios_da_paraiba_famup",
1029 |     "sigla": "famup",
1030 |     "extracted": false,
1031 |     "processed": false,
1032 |     "structured": false
1033 |   },
1034 |   {
1035 |     "nome": "Federação das Associações de Municípios do Estado do Pará",
1036 |     "url": "federacao_das_associacoes_de_municipios_do_estado_do_para_famep",
1037 |     "sigla": "famep",
1038 |     "extracted": false,
1039 |     "processed": false,
1040 |     "structured": false
1041 |   },
1042 |   {
1043 |     "nome": "Federação das Associações de Municípios do Rio Grande do Sul",
1044 |     "url": "federacao_das_associacoes_de_municipios_do_rio_grande_do_sul_famurs",
1045 |     "sigla": "famurs",
1046 |     "extracted": false,
1047 |     "processed": false,
1048 |     "structured": false
1049 |   },
1050 |   {
1051 |     "nome": "Federação dos Municípios do Rio Grande do Norte",
1052 |     "url": "federacao_dos_municipios_do_rio_grande_do_norte_femurn",
1053 |     "sigla": "femurn",
1054 |     "extracted": false,
1055 |     "processed": false,
1056 |     "structured": false
1057 |   },
1058 | 
1059 |   {
1060 |     "nome": "Ministério Público de Pernambuco",
1061 |     "url": "ministerio_publico_de_pernambuco_mp-pe",
1062 |     "sigla": "mp_pe",
1063 |     "extracted": false,
1064 |     "processed": false,
1065 |     "structured": false
1066 |   },
1067 |   {
1068 |     "nome": "Ministério Público do Estado da Santa Catarina",
1069 |     "url": "ministerio_publico_do_estado_da_santa_catarina_mp-sc",
1070 |     "sigla": "mp_sc",
1071 |     "extracted": false,
1072 |     "processed": false,
1073 |     "structured": false
1074 |   },
1075 |   {
1076 |     "nome": "Ministério Público do Estado de Minas Gerais",
1077 |     "url": "ministerio_publico_do_estado_de_minas_gerais_mp-mg",
1078 |     "sigla": "mp_mg",
1079 |     "extracted": false,
1080 |     "processed": false,
1081 |     "structured": false
1082 |   },
1083 |   {
1084 |     "nome": "Ministério Público do Estado do Amapá",
1085 |     "url": "ministerio_publico_do_estado_do_amapa_mp-ap",
1086 |     "sigla": "mp_ap",
1087 |     "extracted": false,
1088 |     "processed": false,
1089 |     "structured": false
1090 |   },
1091 |   {
1092 |     "nome": "Ministério Público do Estado do Mato Grosso do Sul",
1093 |     "url": "ministerio_publico_do_estado_do_mato_grosso_do_sul_mp-ms",
1094 |     "sigla": "mp_ms",
1095 |     "extracted": false,
1096 |     "processed": false,
1097 |     "structured": false
1098 |   },
1099 |   {
1100 |     "nome": "Ministério Público do Estado do Rio Grande do Sul",
1101 |     "url": "ministerio_publico_do_estado_do_rio_grande_do_sul_mp-rs",
1102 |     "sigla": "mp_rs",
1103 |     "extracted": false,
1104 |     "processed": false,
1105 |     "structured": false
1106 |   },
1107 |   {
1108 |     "nome": "PJECOR - Corregedoria Geral",
1109 |     "url": "pjecor_-_corregedoria_geral_pjecoroutros",
1110 |     "sigla": "pjecoroutros",
1111 |     "extracted": false,
1112 |     "processed": false,
1113 |     "structured": false
1114 |   },
1115 |   {
1116 |     "nome": "Revista da Propriedade Industrial",
1117 |     "url": "revista_da_propriedade_industrial_rpi",
1118 |     "sigla": "rpi",
1119 |     "extracted": false,
1120 |     "processed": false,
1121 |     "structured": false
1122 |   },
1123 |   {
1124 |     "nome": "SEEU - Diário de Justiça do Estado do Acre",
1125 |     "url": "seeu_-_diario_de_justica_do_estado_do_acre_seeu-tjac",
1126 |     "sigla": "seeu_tjac",
1127 |     "extracted": false,
1128 |     "processed": false,
1129 |     "structured": false
1130 |   },
1131 |   {
1132 |     "nome": "SEEU - Justiça Federal no Rio Grande do Norte",
1133 |     "url": "seeu_-_justica_federal_no_rio_grande_do_norte_seeu-sjrn",
1134 |     "sigla": "seeu_sjrn",
1135 |     "extracted": false,
1136 |     "processed": false,
1137 |     "structured": false
1138 |   },
1139 |   {
1140 |     "nome": "SEEU - Tribunal Regional Federal da 2ª Região",
1141 |     "url": "seeu_-_tribunal_regional_federal_da_2_regiao_seeu-trf2",
1142 |     "sigla": "seeu_trf2",
1143 |     "extracted": false,
1144 |     "processed": false,
1145 |     "structured": false
1146 |   },
1147 |   {
1148 |     "nome": "SEEU - Tribunal Regional Federal da 5ª Região",
1149 |     "url": "seeu_-_tribunal_regional_federal_da_5_regiao_seeu-trf5",
1150 |     "sigla": "seeu_trf5",
1151 |     "extracted": false,
1152 |     "processed": false,
1153 |     "structured": false
1154 |   },
1155 |   {
1156 |     "nome": "SEEU - Tribunal de Justiça Militar de Minas Gerais",
1157 |     "url": "seeu_-_tribunal_de_justica_militar_de_minas_gerais_seeu-tjmmg",
1158 |     "sigla": "seeu_tjmmg",
1159 |     "extracted": false,
1160 |     "processed": false,
1161 |     "structured": false
1162 |   },
1163 |   {
1164 |     "nome": "SEEU - Tribunal de Justiça de Alagoas",
1165 |     "url": "seeu_-_tribunal_de_justica_de_alagoas_seeu-tjal",
1166 |     "sigla": "seeu_tjal",
1167 |     "extracted": false,
1168 |     "processed": false,
1169 |     "structured": false
1170 |   },
1171 |   {
1172 |     "nome": "SEEU - Tribunal de Justiça de Rondônia",
1173 |     "url": "seeu_-_tribunal_de_justica_de_rondonia_seeu-tjro",
1174 |     "sigla": "seeu_tjro",
1175 |     "extracted": false,
1176 |     "processed": false,
1177 |     "structured": false
1178 |   },
1179 |   {
1180 |     "nome": "SEEU - Tribunal de Justiça do Estado do Espírito Santo",
1181 |     "url": "seeu_-_tribunal_de_justica_do_estado_do_espirito_santo_seeu-tjes",
1182 |     "sigla": "seeu_tjes",
1183 |     "extracted": false,
1184 |     "processed": false,
1185 |     "structured": false
1186 |   },
1187 |   {
1188 |     "nome": "SEEU - Tribunal de Justiça do Piauí",
1189 |     "url": "seeu_-_tribunal_de_justica_do_piaui_seeu-tjpi",
1190 |     "sigla": "seeu_tjpi",
1191 |     "extracted": false,
1192 |     "processed": false,
1193 |     "structured": false
1194 |   },
1195 |   {
1196 |     "nome": "SEEU CJM4 - Auditoria da 4ª CJM - Aberto",
1197 |     "url": "seeu_cjm4_-_auditoria_da_4_cjm_-_aberto_seeu-4cjm",
1198 |     "sigla": "seeu_4cjm",
1199 |     "extracted": false,
1200 |     "processed": false,
1201 |     "structured": false
1202 |   },
1203 |   {
1204 |     "nome": "Sistema Eletrônico de Execução Unificado do TJAP",
1205 |     "url": "sistema_eletronico_de_execucao_unificado_do_tjap_seeu-tjap",
1206 |     "sigla": "seeu_tjap",
1207 |     "extracted": false,
1208 |     "processed": false,
1209 |     "structured": false
1210 |   },
1211 |   {
1212 |     "nome": "Sistema Eletrônico de Execução Unificado do TJDFT",
1213 |     "url": "sistema_eletronico_de_execucao_unificado_do_tjdft_seeu-tjdft",
1214 |     "sigla": "seeu_tjdft",
1215 |     "extracted": false,
1216 |     "processed": false,
1217 |     "structured": false
1218 |   },
1219 |   {
1220 |     "nome": "Sistema Eletrônico de Execução Unificado do TJPE",
1221 |     "url": "sistema_eletronico_de_execucao_unificado_do_tjpe_seeu-tjpe",
1222 |     "sigla": "seeu_tjpe",
1223 |     "extracted": false,
1224 |     "processed": false,
1225 |     "structured": false
1226 |   },
1227 |   {
1228 |     "nome": "Sistema Eletrônico de Execução Unificado do TRF6",
1229 |     "url": "sistema_eletronico_de_execucao_unificado_do_trf6_seeu-trf6",
1230 |     "sigla": "seeu_trf6",
1231 |     "extracted": false,
1232 |     "processed": false,
1233 |     "structured": false
1234 |   },
1235 |   {
1236 |     "nome": "Sistema Eletrônico de Execução Unificado do Tribunal de Justiça da Par",
1237 |     "url": "sistema_eletronico_de_execucao_unificado_do_tribunal_de_justica_da_par_seeu-tjpb",
1238 |     "sigla": "seeu_tjpb",
1239 |     "extracted": false,
1240 |     "processed": false,
1241 |     "structured": false
1242 |   },
1243 |   {
1244 |     "nome": "Superior Tribunal Militar",
1245 |     "url": "superior_tribunal_militar_stm",
1246 |     "sigla": "stm",
1247 |     "extracted": false,
1248 |     "processed": false,
1249 |     "structured": false
1250 |   },
1251 |   {
1252 |     "nome": "Superior Tribunal de Justiça",
1253 |     "url": "superior_tribunal_de_justica_stj",
1254 |     "sigla": "stj",
1255 |     "extracted": false,
1256 |     "processed": false,
1257 |     "structured": false
1258 |   },
1259 |   {
1260 |     "nome": "Tribunal Regional do Trabalho da 23ª Região",
1261 |     "url": "tribunal_regional_do_trabalho_da_23_regiao_trt-23",
1262 |     "sigla": "trt_23",
1263 |     "extracted": false,
1264 |     "processed": false,
1265 |     "structured": false
1266 |   },
1267 |   {
1268 |     "nome": "Tribunal Regional do Trabalho da 24ª Região",
1269 |     "url": "tribunal_regional_do_trabalho_da_24_regiao_trt-24",
1270 |     "sigla": "trt_24",
1271 |     "extracted": false,
1272 |     "processed": false,
1273 |     "structured": false
1274 |   },
1275 |   {
1276 |     "nome": "Tribunal Regional do Trabalho da 2ª Região",
1277 |     "url": "tribunal_regional_do_trabalho_da_2_regiao_trt-2",
1278 |     "sigla": "trt_2",
1279 |     "extracted": false,
1280 |     "processed": false,
1281 |     "structured": false
1282 |   },
1283 |   {
1284 |     "nome": "Tribunal Regional do Trabalho da 3ª Região",
1285 |     "url": "tribunal_regional_do_trabalho_da_3_regiao_trt-3",
1286 |     "sigla": "trt_3",
1287 |     "extracted": false,
1288 |     "processed": false,
1289 |     "structured": false
1290 |   },
1291 |   {
1292 |     "nome": "Tribunal Regional do Trabalho da 4ª Região",
1293 |     "url": "tribunal_regional_do_trabalho_da_4_regiao_trt-4",
1294 |     "sigla": "trt_4",
1295 |     "extracted": false,
1296 |     "processed": false,
1297 |     "structured": false
1298 |   },
1299 |   {
1300 |     "nome": "Tribunal Regional do Trabalho da 5ª Região",
1301 |     "url": "tribunal_regional_do_trabalho_da_5_regiao_trt-5",
1302 |     "sigla": "trt_5",
1303 |     "extracted": false,
1304 |     "processed": false,
1305 |     "structured": false
1306 |   },
1307 |   {
1308 |     "nome": "Tribunal Regional do Trabalho da 6ª Região",
1309 |     "url": "tribunal_regional_do_trabalho_da_6_regiao_trt-6",
1310 |     "sigla": "trt_6",
1311 |     "extracted": false,
1312 |     "processed": false,
1313 |     "structured": false
1314 |   },
1315 |   {
1316 |     "nome": "Tribunal Regional do Trabalho da 7ª Região",
1317 |     "url": "tribunal_regional_do_trabalho_da_7_regiao_trt-7",
1318 |     "sigla": "trt_7",
1319 |     "extracted": false,
1320 |     "processed": false,
1321 |     "structured": false
1322 |   },
1323 |   {
1324 |     "nome": "Tribunal Regional do Trabalho da 8ª Região",
1325 |     "url": "tribunal_regional_do_trabalho_da_8_regiao_trt-8",
1326 |     "sigla": "trt_8",
1327 |     "extracted": false,
1328 |     "processed": false,
1329 |     "structured": false
1330 |   },
1331 |   {
1332 |     "nome": "Tribunal Regional do Trabalho da 9ª Região",
1333 |     "url": "tribunal_regional_do_trabalho_da_9_regiao_trt-9",
1334 |     "sigla": "trt_9",
1335 |     "extracted": false,
1336 |     "processed": false,
1337 |     "structured": false
1338 |   },
1339 |   {
1340 |     "nome": "Tribunal Superior Eleitoral",
1341 |     "url": "tribunal_superior_eleitoral_tse",
1342 |     "sigla": "tse",
1343 |     "extracted": false,
1344 |     "processed": false,
1345 |     "structured": false
1346 |   },
1347 |   {
1348 |     "nome": "Tribunal Superior do Trabalho",
1349 |     "url": "tribunal_superior_do_trabalho_tst",
1350 |     "sigla": "tst",
1351 |     "extracted": false,
1352 |     "processed": false,
1353 |     "structured": false
1354 |   },
1355 |   {
1356 |     "nome": "Tribunal de Justiça Militar do Rio Grande do Sul",
1357 |     "url": "tribunal_de_justica_militar_do_rio_grande_do_sul_tjm-rs",
1358 |     "sigla": "tjm_rs",
1359 |     "extracted": false,
1360 |     "processed": false,
1361 |     "structured": false
1362 |   },
1363 |   {
1364 |     "nome": "Tribunal de Justiça do Espírito Santo",
1365 |     "url": "tribunal_de_justica_do_espirito_santo_tj-es",
1366 |     "sigla": "tj_es",
1367 |     "extracted": false,
1368 |     "processed": false,
1369 |     "structured": false
1370 |   }
1371 | ]
1372 | 


--------------------------------------------------------------------------------