├── runtime.txt ├── transparenciagovbr ├── __init__.py ├── utils │ ├── __init__.py │ ├── cities.py │ ├── print_spider_names.py │ ├── date.py │ ├── io.py │ └── fields.py ├── spiders │ ├── __init__.py │ ├── pagamento.py │ ├── execucao_despesa.py │ ├── orcamento_despesa.py │ ├── pagamento_historico.py │ ├── auxilio_emergencial.py │ ├── base.py │ └── despesa_item_empenho.py ├── pipelines.py ├── items.py ├── exporters.py ├── fields.py ├── settings.py └── middlewares.py ├── requirements-development.txt ├── .github └── FUNDING.yml ├── pensionista ├── requirements.txt ├── import-pgsql.sh ├── README.md ├── list_zips.py ├── indexes.sql ├── download-old.sh └── convert.py ├── Makefile ├── requirements.txt ├── .gitignore ├── scrapy.cfg ├── schema ├── auxilio_emergencial.csv ├── despesa_item_empenho.csv ├── pagamento_historico.csv ├── orcamento_despesa.csv ├── pagamento.csv ├── execucao_despesa.csv └── despesa_empenho.csv ├── import-postgresql.sh ├── run.sh ├── README.md ├── create_mirror_script.py ├── scripts └── auxilio_emergencial.py └── LICENSE /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.8.2 2 | -------------------------------------------------------------------------------- /transparenciagovbr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /transparenciagovbr/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-development.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | autoflake 4 | black 5 | ipython 6 | isort 7 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: 
-------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | custom: https://apoia.se/brasilio 4 | -------------------------------------------------------------------------------- /pensionista/requirements.txt: -------------------------------------------------------------------------------- 1 | calculadora-do-cidadao 2 | https://github.com/turicas/rows/archive/develop.zip 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | fix-imports: 2 | autoflake --in-place --recursive --remove-unused-variables --remove-all-unused-imports . 3 | isort -rc . 4 | black . 5 | 6 | .PHONY: fix-imports 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cached_property 2 | https://github.com/turicas/rows/archive/develop.zip 3 | requests # TODO: remove when dependency bug in rows is fixed 4 | s3cmd 5 | scrapy 6 | tqdm 7 | -------------------------------------------------------------------------------- /transparenciagovbr/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | *.pyc 3 | *~ 4 | .*.sw? 
5 | .DS_Store 6 | .activate 7 | .coverage 8 | .directory 9 | .env 10 | .idea/* 11 | .scrapy 12 | .tox 13 | MANIFEST 14 | build/* 15 | data/* 16 | dist/* 17 | download.sh 18 | mirror.sh 19 | reg_settings.py 20 | -------------------------------------------------------------------------------- /transparenciagovbr/utils/cities.py: -------------------------------------------------------------------------------- 1 | import rows 2 | 3 | from transparenciagovbr import settings 4 | 5 | cities_filename = settings.REPOSITORY_PATH / "data" / "populacao-estimada-2020.csv" 6 | city_name_by_id = { 7 | row.city_ibge_code: row.city for row in rows.import_from_csv(cities_filename) 8 | } 9 | -------------------------------------------------------------------------------- /transparenciagovbr/utils/print_spider_names.py: -------------------------------------------------------------------------------- 1 | from scrapy import spiderloader 2 | from scrapy.utils import project 3 | 4 | settings = project.get_project_settings() 5 | spider_loader = spiderloader.SpiderLoader.from_settings(settings) 6 | spiders = spider_loader.list() 7 | for spider_name in spiders: 8 | print(spider_name) 9 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = transparenciagovbr.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = transparenciagovbr 12 | -------------------------------------------------------------------------------- /transparenciagovbr/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to 
the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TransparenciagovbrPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /transparenciagovbr/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TransparenciagovbrItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /pensionista/import-pgsql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$DATABASE_URL" ]; then 4 | echo "ERROR: must set $DATABASE_URL with postgres connection string" 5 | exit 1 6 | fi 7 | 8 | for table in cadastro observacao remuneracao; do 9 | rows pgimport \ 10 | --dialect=excel \ 11 | --input-encoding=utf-8 \ 12 | --schema=schema/pensionista_${table}.csv \ 13 | data/output/pensionista_${table}.csv.gz \ 14 | $DATABASE_URL \ 15 | pensionista_${table} 16 | done 17 | -------------------------------------------------------------------------------- /pensionista/README.md: -------------------------------------------------------------------------------- 1 | # Dados de pensionistas 2 | 3 | ## Instalação 4 | 5 | ```shell 6 | pip install -r requirements.txt 7 | ``` 8 | 9 | ## Execução 10 | 11 | Baixe os arquivos de pensionistas disponíveis [nesse 12 | site](http://transparencia.gov.br/download-de-dados/servidores) e coloque-os em 13 | `data/download/`. 
Depois, execute: 14 | 15 | ```shell 16 | python convert.py 17 | ``` 18 | 19 | Os arquivos `cadastro.csv.gz`, `observacao.csv.gz` e `remuneracao.csv.gz` serão 20 | gerados em `data/output/`. 21 | -------------------------------------------------------------------------------- /transparenciagovbr/spiders/pagamento.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider 4 | from transparenciagovbr.utils.date import today 5 | 6 | 7 | class PagamentoSpider(TransparenciaBaseSpider): 8 | name = "pagamento" 9 | base_url = "http://www.portaldatransparencia.gov.br/download-de-dados/despesas/{year}{month:02d}{day:02d}" 10 | start_date = datetime.date(2013, 3, 31) 11 | end_date = today() 12 | publish_frequency = "daily" 13 | filename_suffix = "_Despesas_Pagamento.csv" 14 | schema_filename = "pagamento.csv" 15 | -------------------------------------------------------------------------------- /transparenciagovbr/spiders/execucao_despesa.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider 4 | from transparenciagovbr.utils.date import today 5 | 6 | 7 | class ExecucaoDespesaSpider(TransparenciaBaseSpider): 8 | name = "execucao_despesa" 9 | base_url = "http://transparencia.gov.br/download-de-dados/despesas-execucao/{year}{month:02d}" 10 | start_date = datetime.date(2014, 1, 1) 11 | end_date = today() 12 | publish_frequency = "monthly" 13 | filename_suffix = "_Despesas.csv" 14 | schema_filename = "execucao_despesa.csv" 15 | -------------------------------------------------------------------------------- /transparenciagovbr/spiders/orcamento_despesa.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider 4 | from 
transparenciagovbr.utils.date import today 5 | 6 | 7 | class OrcamentoDespesaSpider(TransparenciaBaseSpider): 8 | name = "orcamento_despesa" 9 | base_url = "http://transparencia.gov.br/download-de-dados/orcamento-despesa/{year}" 10 | start_date = datetime.date(2014, 1, 1) 11 | end_date = today() 12 | publish_frequency = "yearly" 13 | filename_suffix = "_OrcamentoDespesa.zip.csv" 14 | schema_filename = "orcamento_despesa.csv" 15 | -------------------------------------------------------------------------------- /schema/auxilio_emergencial.csv: -------------------------------------------------------------------------------- 1 | original_name,field_name,internal_field_type,field_type 2 | MÊS DISPONIBILIZAÇÃO,ano_mes,integer,integer 3 | UF,uf,text,text 4 | CÓDIGO MUNICÍPIO IBGE,codigo_ibge_municipio,integer,integer 5 | NOME MUNICÍPIO,municipio,text,text 6 | NIS BENEFICIÁRIO,nis_beneficiario,custom_integer,integer 7 | CPF BENEFICIÁRIO,cpf_beneficiario,cpf,text 8 | NOME BENEFICIÁRIO,beneficiario,text,text 9 | NIS RESPONSÁVEL,nis_responsavel,custom_integer,integer 10 | CPF RESPONSÁVEL,cpf_responsavel,cpf,text 11 | NOME RESPONSÁVEL,responsavel,custom_text,text 12 | ENQUADRAMENTO,enquadramento,text,text 13 | PARCELA,parcela,custom_integer,integer 14 | OBSERVAÇÃO,observacao,custom_text,text 15 | VALOR BENEFÍCIO,valor,money_real,decimal 16 | -------------------------------------------------------------------------------- /import-postgresql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCHEMA_PATH="schema" 6 | OUTPUT_PATH="data/output" 7 | 8 | function import_table() { 9 | tablename="$1" 10 | 11 | echo "DROP TABLE IF EXISTS ${tablename};" | psql "$POSTGRESQL_URI" 12 | time rows pgimport \ 13 | --schema="$SCHEMA_PATH/${tablename}.csv" \ 14 | --input-encoding="utf-8" \ 15 | --dialect="excel" \ 16 | "$OUTPUT_PATH/${tablename}.csv.gz" \ 17 | "$POSTGRESQL_URI" \ 18 | "$tablename" 19 | } 20 | 21 | if [ -z 
"$POSTGRESQL_URI" ]; then 22 | echo "ERROR: you must set POSTGRESQL_URI environment variable." 23 | exit 1 24 | fi 25 | 26 | if [ ! -z "$1" ]; then 27 | import_table $1 28 | else 29 | for table in pagamento pagamento_historico execucao_despesa orcamento_despesa; do 30 | import_table $table 31 | done 32 | fi 33 | -------------------------------------------------------------------------------- /pensionista/list_zips.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.parse import urljoin 3 | from urllib.request import urlopen 4 | 5 | 6 | def ckan_package_resources(base_url, resource_id): 7 | template_url = urljoin(base_url, "/api/3/action/package_show?id={resource_id}") 8 | url = template_url.format(resource_id=resource_id) 9 | response = urlopen(url) 10 | data = json.loads(response.read()) 11 | return data["result"]["resources"] 12 | 13 | 14 | if __name__ == "__main__": 15 | resources = ckan_package_resources( 16 | base_url="http://www.dados.gov.br", resource_id="c76a1bc6-2330-4b05-b3dd-491124931496" 17 | ) 18 | 19 | for resource in resources: 20 | if not resource["url"].lower().endswith(".zip"): 21 | continue 22 | print(resource["url"]) 23 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | OUTPUT_PATH=data/output 5 | LOG_PATH=data/log 6 | LOG_LEVEL=INFO 7 | if [ "$1" = "--use-mirror" ]; then 8 | OPTS="-a use_mirror=true" 9 | shift 10 | else 11 | OPTS="" 12 | fi 13 | 14 | run_spider() { 15 | spider="$1" 16 | 17 | mkdir -p $LOG_PATH $OUTPUT_PATH 18 | log_filename="$LOG_PATH/${spider}.log" 19 | output_filename="$OUTPUT_PATH/${spider}.csv.gz" 20 | rm -rf $log_filename $output_filename 21 | echo "Running ${spider} - check $log_filename for logs and $output_filename for output" 22 | time scrapy crawl \ 23 | --loglevel=$LOG_LEVEL \ 24 | 
import gzip

from scrapy.exporters import CsvItemExporter


class GzipCsvItemExporter(CsvItemExporter):
    """Gzip-compressed CSV exporter

    To use it, add
    ::

        FEED_EXPORTERS = {
            'csv.gz': 'myproject.exporters.GzipCsvItemExporter',
        }
        FEED_FORMAT = 'csv.gz'

    to settings.py and then run scrapy crawl like this::

        scrapy crawl foo -o item.csv.gz

    (if `FEED_FORMAT` is not explicitly specified, you'll need to add
    `-t csv.gz` to the command above)
    """

    def __init__(self, fobj, **kwargs):
        """Reopen the feed file with a large buffer and wrap it in gzip.

        The file object handed in is closed and the same path is reopened
        in binary write mode with an 8 MiB buffer; every CSV row written by
        the parent exporter then goes through ``self.gzfile``.
        """
        filename = fobj.name
        fobj.close()
        # Keep a handle on the raw file: GzipFile.close() does NOT close the
        # file object it was given, so we must close it ourselves.
        self._raw_file = open(filename, mode="wb", buffering=8 * 1024 * 1024)
        self.gzfile = gzip.GzipFile(fileobj=self._raw_file)
        super().__init__(self.gzfile, **kwargs)

    def finish_exporting(self):
        """Write the gzip trailer, then flush and close the underlying file."""
        try:
            self.gzfile.close()
        finally:
            # Always release the raw file so buffered bytes reach the disk
            # even if writing the gzip trailer fails.
            self._raw_file.close()
pensionista_cadastro (pessoa_uuid); 7 | CREATE INDEX idx_penscad_uuid2 ON pensionista_cadastro (representante_legal_uuid); 8 | CREATE INDEX idx_penscad_uuid3 ON pensionista_cadastro (instituidor_pensao_uuid); 9 | CREATE INDEX idx_penscad_orig ON pensionista_cadastro (sistema_origem); 10 | 11 | CREATE INDEX idx_pensrem_id ON pensionista_remuneracao (id_servidor_portal, ano, mes, sistema_origem); 12 | CREATE INDEX idx_pensrem_uuid ON pensionista_remuneracao (pessoa_uuid); 13 | CREATE INDEX idx_pensrem_orig ON pensionista_remuneracao (sistema_origem); 14 | 15 | ALTER TABLE pensionista_cadastro ADD PRIMARY KEY (id_servidor_portal, ano, mes, sistema_origem); 16 | -------------------------------------------------------------------------------- /transparenciagovbr/spiders/pagamento_historico.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import io 4 | import zipfile 5 | 6 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider 7 | from transparenciagovbr.utils.io import NotNullTextWrapper 8 | 9 | 10 | class PagamentoHistSpider(TransparenciaBaseSpider): 11 | name = "pagamento_historico" 12 | base_url = "http://www.portaltransparencia.gov.br/download-de-dados/historico-gastos-diretos-pagamentos/{year}{month:02d}" 13 | start_date = datetime.date(2011, 1, 1) 14 | end_date = datetime.date(2012, 12, 31) 15 | publish_frequency = "monthly" 16 | schema_filename = "pagamento_historico.csv" 17 | 18 | def parse_zip_response(self, response): 19 | zf = zipfile.ZipFile(io.BytesIO(response.body)) 20 | assert len(zf.filelist) == 1 21 | fobj = NotNullTextWrapper( 22 | zf.open(zf.filelist[0].filename), encoding=self.encoding 23 | ) 24 | reader = csv.DictReader(fobj, delimiter="\t") 25 | 26 | for row in reader: 27 | new = self.schema.deserialize(row) 28 | if new is not None: 29 | yield new 30 | -------------------------------------------------------------------------------- 
# --- transparenciagovbr/utils/date.py ---
import calendar
import datetime


def today():
    """Return the current local date as a ``datetime.date``."""
    return datetime.date.today()


def next_day(date):
    """Return the day after ``date``."""
    return date + datetime.timedelta(days=1)


def next_month(date):
    """Return the same day of the following month as a ``datetime.date``.

    When the following month is shorter (e.g. Jan 31 -> February), the day
    is clamped to that month's last day instead of raising ``ValueError``.
    """
    year = date.year + (date.month // 12)
    month = (date.month % 12) + 1
    last_day = calendar.monthrange(year, month)[1]
    return datetime.date(year=year, month=month, day=min(date.day, last_day))


def next_year(date):
    """Return the same calendar day one year later.

    Feb 29 maps to Feb 28, since the year after a leap year is never a
    leap year. Using ``replace`` (rather than adding 365/366 days) keeps
    month and day stable for dates after February.
    """
    if date.month == 2 and date.day == 29:
        return date.replace(year=date.year + 1, day=28)
    return date.replace(year=date.year + 1)


def next_date(date, interval="daily"):
    """Advance ``date`` by one ``interval``.

    ``interval`` must be ``"daily"``, ``"monthly"`` or ``"yearly"``; any
    other value raises ``KeyError``.
    """
    from_interval = {"daily": next_day, "monthly": next_month, "yearly": next_year}

    return from_interval[interval](date)


def date_range(start, stop, interval="daily"):
    """Yield dates from ``start`` (inclusive) up to ``stop`` (exclusive).

    Each value is computed from the previous one, so a clamped day sticks:
    monthly steps from Jan 31 continue as Feb 28/29, Mar 28/29, ...
    """
    current = start
    while current < stop:
        yield current
        current = next_date(date=current, interval=interval)


def date_to_dict(date):
    """Return ``date`` as a dict with ``year``, ``month`` and ``day`` keys."""
    return {"year": date.year, "month": date.month, "day": date.day}


# --- transparenciagovbr/utils/io.py ---
import re
from csv import DictReader
from io import TextIOWrapper
from zipfile import ZipFile


class NotNullTextWrapper(TextIOWrapper):
    """``TextIOWrapper`` that strips NUL characters from everything it reads."""

    def read(self, *args, **kwargs):
        data = super().read(*args, **kwargs)
        return data.replace("\x00", "")

    def readline(self, *args, **kwargs):
        data = super().readline(*args, **kwargs)
        return data.replace("\x00", "")


def parse_zip(filename_or_fobj, inner_filename_suffix, encoding):
    """Yield rows (as dicts) from the matching ``;``-delimited CSVs in a ZIP.

    ``inner_filename_suffix`` is either a plain suffix compared with
    ``str.endswith`` or a compiled ``re.Pattern`` searched in each member
    name. Matching members are decoded with ``encoding``.

    The archive and each member stream are closed once the generator is
    exhausted or closed. A file object passed in by the caller is left open,
    because ``ZipFile`` only closes files it opened itself.
    """
    with ZipFile(filename_or_fobj) as zf:
        for file_info in zf.filelist:
            filename = file_info.filename
            if isinstance(inner_filename_suffix, re.Pattern):
                file_matches = inner_filename_suffix.search(filename) is not None
            else:
                file_matches = filename.endswith(inner_filename_suffix)

            if not file_matches:
                continue

            with TextIOWrapper(zf.open(filename), encoding=encoding) as fobj:
                yield from DictReader(fobj, delimiter=";")
import calendar
import datetime

from transparenciagovbr.spiders.base import TransparenciaBaseSpider
from transparenciagovbr.utils.cities import city_name_by_id
from transparenciagovbr.utils.date import today

# The crawl stops one month before today. The day is clamped to the length
# of that month: building the date with today's day unchanged raises
# ValueError at import time whenever the previous month is shorter
# (e.g. on March 31 it would ask for "February 31").
day = today()
last_month = day.month - 1 if day.month > 1 else 12
year = day.year if day.month > 1 else day.year - 1
end_date = datetime.date(
    year, last_month, min(day.day, calendar.monthrange(year, last_month)[1])
)


class AuxilioEmergencialSpider(TransparenciaBaseSpider):
    """Spider for the monthly "auxílio emergencial" download files."""

    name = "auxilio_emergencial"
    base_url = "http://transparencia.gov.br/download-de-dados/auxilio-emergencial/{year}{month:02d}"
    start_date = datetime.date(2020, 4, 1)
    end_date = end_date
    publish_frequency = "monthly"
    filename_suffix = "_AuxilioEmergencial.csv"
    schema_filename = "auxilio_emergencial.csv"

    def convert_row(self, row):
        """Convert ``row`` via the base class, then normalize the city name.

        When the row carries an IBGE city code, ``municipio`` is replaced by
        the name registered for that code in ``city_name_by_id``; a code
        missing from that mapping raises ``KeyError``.
        """
        # NOTE(review): assumes the base ``convert_row`` always returns a
        # dict-like row (never None) — confirm against spiders/base.py.
        row = super().convert_row(row)

        if row["codigo_ibge_municipio"] is not None:
            # Force a nicer city name (with accents and proper upper/lower
            # case). :)
            row["municipio"] = city_name_by_id[row["codigo_ibge_municipio"]]
        return row
Código Função,codigo_funcao,custom_integer,integer 7 | Código Grupo Despesa,codigo_grupo_despesa,text,text 8 | Código Órgão,codigo_orgao,custom_integer,integer 9 | Código Órgão Superior,codigo_orgao_superior,custom_integer,integer 10 | Código Programa,codigo_programa,custom_integer,integer 11 | Código Subfunção,codigo_subfuncao,custom_integer,integer 12 | Código Unidade Gestora,codigo_unidade_gestora,custom_integer,integer 13 | Data Pagamento,data_pagamento,brazilian_date,date 14 | Nome Elemento Despesa,elemento_despesa,text,text 15 | ,em_sigilo,bool,bool 16 | Nome Favorecido,favorecido,text,text 17 | Nome Função,funcao,text,text 18 | Gestão Pagamento,gestao_pagamento,text,text 19 | Nome Grupo Despesa,grupo_despesa,text,text 20 | Linguagem Cidadã,linguagem_cidada,text,text 21 | Número Documento,numero_documento,text,text 22 | Nome Órgao,orgao,text,text 23 | Nome Órgão Superior,orgao_superior,text,text 24 | Nome Programa,programa,text,text 25 | Nome Subfunção,subfuncao,text,text 26 | Nome Unidade Gestora,unidade_gestora,text,text 27 | Valor,valor,money_real,decimal 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scraper do Portal da Transparência do Governo Federal 2 | 3 | ## Instalando 4 | 5 | ```shell 6 | pyenv virtualenv 3.7.3 transparencia-gov-br 7 | pyenv activate transparencia-gov-br 8 | pip install -r requirements.txt 9 | ``` 10 | 11 | ## Rodando 12 | 13 | Todos os spiders: 14 | 15 | ```shell 16 | ./run.sh 17 | ``` 18 | 19 | Apenas um spider: 20 | 21 | ```shell 22 | ./run.sh nome_do_spider 23 | ``` 24 | 25 | > Nota: consulte os nomes dos spiders disponíveis em 26 | > [transparenciagovbr/spiders/](transparenciagovbr/spiders/). 27 | 28 | Pode ser interessante rodar algum script de extração fora de um spider (por 29 | limitações do scrapy). 
Veja os scripts disponíveis na pasta `scripts` e 30 | execute-os com o parâmetro `--help` para ver as opções disponíveis. 31 | 32 | 33 | ## Importando no PostgreSQL 34 | 35 | Antes, instale as dependências, rode os spiders e crie uma variável com a URI 36 | de conexão com o banco: 37 | 38 | ```shell 39 | pip install psycopg2-binary tqdm 40 | ./run.sh 41 | export POSTGRESQL_URI="postgres://usuario:senha@host:porta/banco" 42 | ``` 43 | 44 | Depois, execute o script para importar todas as tabelas: 45 | 46 | 47 | ```shell 48 | ./import-postgresql.sh 49 | ``` 50 | 51 | Ou apenas a tabela de um spider específico: 52 | 53 | ```shell 54 | ./import-postgresql.sh nome_do_spider 55 | ``` 56 | 57 | > Nota: consulte os nomes dos spiders disponíveis em 58 | > [transparenciagovbr/spiders/](transparenciagovbr/spiders/). 59 | -------------------------------------------------------------------------------- /schema/orcamento_despesa.csv: -------------------------------------------------------------------------------- 1 | original_name,field_name,internal_field_type,field_type 2 | CÓDIGO AÇÃO,codigo_acao,text,text 3 | CÓDIGO CATEGORIA ECONÔMICA,codigo_categoria_economica,custom_integer,integer 4 | CÓDIGO ELEMENTO DE DESPESA,codigo_elemento_de_despesa,text,text 5 | CÓDIGO FUNÇÃO,codigo_funcao,custom_integer,integer 6 | CÓDIGO GRUPO DE DESPESA,codigo_grupo_de_despesa,text,text 7 | CÓDIGO ÓRGÃO SUBORDINADO,codigo_orgao_subordinado,custom_integer,integer 8 | CÓDIGO ÓRGÃO SUPERIOR,codigo_orgao_superior,custom_integer,integer 9 | CÓDIGO PROGRAMA ORÇAMENTÁRIO,codigo_programa_orcamentario,custom_integer,integer 10 | CÓDIGO SUBFUNÇÃO,codigo_subfuncao,custom_integer,integer 11 | CÓDIGO UNIDADE ORÇAMENTÁRIA,codigo_unidade_orcamentaria,custom_integer,integer 12 | ,em_sigilo,bool,bool 13 | EXERCÍCIO,exercicio,text,text 14 | NOME AÇÃO,nome_acao,text,text 15 | NOME CATEGORIA ECONÔMICA,nome_categoria_economica,text,text 16 | NOME ELEMENTO DE DESPESA,nome_elemento_de_despesa,text,text 17 | NOME 
FUNÇÃO,nome_funcao,text,text 18 | NOME GRUPO DE DESPESA,nome_grupo_de_despesa,text,text 19 | NOME ÓRGÃO SUBORDINADO,nome_orgao_subordinado,text,text 20 | NOME ÓRGÃO SUPERIOR,nome_orgao_superior,text,text 21 | NOME PROGRAMA ORÇAMENTÁRIO,nome_programa_orcamentario,text,text 22 | NOME SUBFUNÇÃO,nome_subfuncao,text,text 23 | NOME UNIDADE ORÇAMENTÁRIA,nome_unidade_orcamentaria,text,text 24 | ORÇAMENTO ATUALIZADO (R$),orcamento_atualizado,money_real,decimal 25 | ORÇAMENTO INICIAL (R$),orcamento_inicial,money_real,decimal 26 | ORÇAMENTO REALIZADO (R$),orcamento_realizado,money_real,decimal 27 | -------------------------------------------------------------------------------- /schema/pagamento.csv: -------------------------------------------------------------------------------- 1 | original_name,field_name,internal_field_type,field_type 2 | Categoria de Despesa,categoria_de_despesa,text,text 3 | Código Categoria de Despesa,codigo_categoria_de_despesa,text,text 4 | Código Elemento de Despesa,codigo_elemento_de_despesa,text,text 5 | Código Favorecido,codigo_favorecido,text,text 6 | Código Gestão,codigo_gestao,custom_integer,integer 7 | Código Grupo de Despesa,codigo_grupo_de_despesa,text,text 8 | Código Modalidade de Aplicação,codigo_modalidade_de_aplicacao,text,text 9 | Código Órgão,codigo_orgao,custom_integer,integer 10 | Código Órgão Superior,codigo_orgao_superior,custom_integer,integer 11 | Código Pagamento,codigo_pagamento,text,text 12 | Código Pagamento Resumido,codigo_pagamento_resumido,text,text 13 | Código Plano Orçamentário,codigo_plano_orcamentario,text,text 14 | Código Programa Governo,codigo_programa_governo,text,text 15 | Código Tipo Documento,codigo_tipo_documento,text,text 16 | Código Unidade Gestora,codigo_unidade_gestora,custom_integer,integer 17 | Data Emissão,data_emissao,brazilian_date,date 18 | Elemento de Despesa,elemento_de_despesa,text,text 19 | ,em_sigilo,bool,bool 20 | Extraorçamentário,extraorcamentario,text,text 21 | 
Favorecido,favorecido,text,text 22 | Gestão,gestao,text,text 23 | Grupo de Despesa,grupo_de_despesa,text,text 24 | Modalidade de Aplicação,modalidade_de_aplicacao,text,text 25 | Observação,observacao,custom_text,text 26 | Órgão,orgao,text,text 27 | Órgão Superior,orgao_superior,text,text 28 | Plano Orçamentário,plano_orcamentario,text,text 29 | Processo,processo,text,text 30 | Nome Programa Governo,programa_governo,text,text 31 | Valor Utilizado na Conversão,taxa_de_conversao,money_real,decimal 32 | Tipo Documento,tipo_documento,text,text 33 | Tipo OB,tipo_ob,text,text 34 | Unidade Gestora,unidade_gestora,text,text 35 | Valor do Pagamento Convertido pra R$,valor_convertido_para_reais,money_real,decimal 36 | Valor Original do Pagamento,valor_original,money_real,decimal 37 | -------------------------------------------------------------------------------- /create_mirror_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import stat 3 | from textwrap import dedent 4 | from urllib.parse import urlparse 5 | 6 | from scrapy import spiderloader 7 | from scrapy.utils import project 8 | 9 | from transparenciagovbr.utils.date import date_range, date_to_dict 10 | 11 | output_filename = "mirror.sh" 12 | settings = project.get_project_settings() 13 | spider_loader = spiderloader.SpiderLoader.from_settings(settings) 14 | spiders = spider_loader.list() 15 | with open(output_filename, mode="w") as fobj: 16 | fobj.write( 17 | dedent( 18 | """ 19 | #!/bin/bash 20 | 21 | mirror_file() { 22 | url="$1" 23 | download_path="$2" 24 | mirror_uri="$3" 25 | 26 | aria2c \\ 27 | --summary-interval=0 \\ 28 | --dir=$(dirname "$download_path") \\ 29 | --out=$(basename "$download_path") \\ 30 | "$url" 31 | if [ -e "$download_path" ]; then 32 | s3cmd put "$download_path" "$mirror_uri" 33 | rm "$download_path" 34 | fi 35 | } 36 | """ 37 | ).strip() 38 | ) 39 | fobj.write(f"\nmkdir -p {settings['DOWNLOAD_PATH']}\n") 40 | for spider_name in 
def main():
    """Convert an Auxílio Emergencial ZIP file into a CSV file.

    Command-line arguments:
        input_filename: path of the ZIP file to read.
        output_filename: path of the CSV file to write (opened through
            `rows.utils.open_compressed`).
        --buffering: buffer size, in bytes, used for the output file
            (default: 4 MiB).
        --schema-filename: schema CSV used to deserialize each row
            (default: "auxilio_emergencial.csv").
    """
    # TODO: move this `main` to a general command-line interface so we can run
    # any extractor by command-line.

    parser = argparse.ArgumentParser()
    parser.add_argument("input_filename")
    parser.add_argument("output_filename")
    # `type=int` is required: without it a value passed on the command line
    # arrives as `str` and cannot be used as a buffer size.
    parser.add_argument("--buffering", type=int, default=4 * 1024 * 1024)
    parser.add_argument("--schema-filename", default="auxilio_emergencial.csv")
    args = parser.parse_args()

    schema = Schema(args.schema_filename)
    filename = Path(args.input_filename)
    fobj = rows.utils.open_compressed(
        args.output_filename, mode="w", buffering=args.buffering
    )
    try:
        writer = rows.utils.CsvLazyDictWriter(fobj)
        data = extract_rows(schema, filename)
        for row in tqdm(data, desc=f"Extracting {filename.name}"):
            writer.writerow(row)
    finally:
        # Close the output even when extraction fails midway, so whatever
        # was already buffered gets flushed to disk.
        fobj.close()
class BrazilianBoolField(BoolField):
    """Boolean field for Portuguese "yes"/"no" ("sim"/"não") strings.

    Only overrides class-level constants of `BoolField`; no method is
    redefined here.
    """

    # Identifier of this field type.
    name = "brazilian_bool"
    # NOTE(review): presumably `BoolField` compares the input against these
    # tuples when deserializing — confirm against the `rows` implementation.
    # Spellings of "yes" in the three casings listed.
    TRUE_VALUES = ("SIM", "sim", "Sim")
    # Spellings of "no", with and without the tilde, in the same casings.
    FALSE_VALUES = ("NÃO", "NAO", "Não", "Nao", "não", "nao")
class CustomIntegerField(IntegerField):
    """Locale-aware field class to represent integer

    Accepts numbers starting with 0 and removes unnecessary characters
    (leading zeros and the ordinal indicator "ª").
    """

    name = "custom_integer"

    @classmethod
    def deserialize(cls, value, *args, **kwargs):
        """Convert `value` into an integer.

        Returns `None` when `is_null(value)` is true and returns `value`
        unchanged when it is already an instance of `cls.TYPE`. Otherwise the
        value is converted to a string, cleaned up and handed to
        `IntegerField.deserialize`.
        """
        if is_null(value):
            return None
        elif isinstance(value, cls.TYPE):
            return value

        value = as_string(value).strip()
        value = value.replace("ª", "")
        # Drop leading zeros, but keep a single "0" when the value is made
        # only of zeros: stripping every digit would leave an empty string
        # and the zero would be lost.
        # NOTE(review): presumably the parent class maps "" to None — confirm.
        stripped = value.lstrip("0")
        if value and not stripped:
            stripped = "0"
        return super().deserialize(stripped)
http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_112019.zip 6 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_122019.zip 7 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_012020.zip 8 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_022020.zip 9 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_032020.zip 10 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_042020.zip 11 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1994.zip 12 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1995.zip 13 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1996.zip 14 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1997.zip 15 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1998.zip 16 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1999.zip 17 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2000.zip 18 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2001.zip 19 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2002.zip 20 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2003.zip 21 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2004.zip 22 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2005.zip 23 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2006.zip 24 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2007.zip 25 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2008.zip 26 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2009.zip 27 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2010.zip 28 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2011.zip 29 | wget -c -t 0 
def schema_path_from_filename(filename):
    """Return, as a string, the absolute path of `filename` inside the
    `schema` directory located under `settings.REPOSITORY_PATH`."""
    schema_file = settings.REPOSITORY_PATH / "schema" / filename
    return str(schema_file.absolute())
class Schema:
    """Deserialize raw rows according to a schema CSV file.

    The schema file is read through `load_schema` (field name -> field class)
    and `field_mapping_from_csv` (original column name -> field name).
    """

    def __init__(self, schema_filename):
        """Build the list of per-field deserializers for `schema_filename`."""
        schema = load_schema(schema_filename)
        field_mapping = field_mapping_from_csv(schema_filename)

        # List of (field_name, original_field_name, deserialize) tuples, in
        # the order given by the field mapping.
        self.fields = []
        for original_field_name, field_name in field_mapping.items():
            if field_name == "em_sigilo":
                # This column always starts as "f"; `deserialize` below
                # flips it to "t" when a secrecy marker is found in the row.
                deserialize = lambda value: "f"
            else:
                deserialize = schema[field_name].deserialize
            self.fields.append((field_name, original_field_name, deserialize))

    def deserialize(self, row):
        """Return a new dict with the deserialized fields of `row`.

        `row` is consumed: every known column is popped from it. Columns
        missing from `row` are deserialized from `None`.

        Raises:
            ValueError: if `row` still has keys left after all schema fields
                were popped (columns the schema does not know about).
        """
        new = {
            field_name: deserialize(row.pop(original_field_name, None))
            for field_name, original_field_name, deserialize in self.fields
        }
        if row:
            raise ValueError(f"Missing fields during deserialization: {', '.join(row.keys())}")
        # Iterate over a snapshot: when the schema has no "em_sigilo" column,
        # the assignment below adds a key, and adding keys to a dict while
        # iterating over it raises RuntimeError.
        for key, value in list(new.items()):
            if value in EM_SIGILO_STRINGS:
                new[key] = None
                new["em_sigilo"] = "t"
        return new
orgao_superior,text,text,Órgão Superior 12 | codigo_orgao,text,text,Código Órgão 13 | orgao,text,text,Órgão 14 | codigo_unidade_gestora,text,text,Código Unidade Gestora 15 | unidade_gestora,text,text,Unidade Gestora 16 | codigo_gestao,text,text,Código Gestão 17 | gestao,text,text,Gestão 18 | codigo_favorecido,text,text,Código Favorecido 19 | favorecido,text,text,Favorecido 20 | observacao,text,text,Observação 21 | codigo_esfera_orcamentaria,text,text,Código Esfera Orçamentária 22 | esfera_orcamentaria,text,text,Esfera Orçamentária 23 | codigo_tipo_credito,text,text,Código Tipo Crédito 24 | tipo_credito,text,text,Tipo Crédito 25 | codigo_grupo_fonte_recurso,text,text,Código Grupo Fonte Recurso 26 | grupo_fonte_recurso,text,text,Grupo Fonte Recurso 27 | codigo_fonte_recurso,text,text,Código Fonte Recurso 28 | fonte_recurso,text,text,Fonte Recurso 29 | codigo_unidade_orcamentaria,text,text,Código Unidade Orçamentária 30 | unidade_orcamentaria,text,text,Unidade Orçamentária 31 | codigo_funcao,text,text,Código Função 32 | funcao,text,text,Função 33 | codigo_subfuncao,text,text,Código SubFunção 34 | subfuncao,text,text,SubFunção 35 | codigo_programa,text,text,Código Programa 36 | programa,text,text,Programa 37 | codigo_acao,text,text,Código Ação 38 | acao,text,text,Ação 39 | linguagem_cidada,text,text,Linguagem Cidadã 40 | codigo_subtitulo_localizador,text,text,Código Subtítulo (Localizador) 41 | subtitulo_localizador,text,text,Subtítulo (Localizador) 42 | codigo_plano_orcamentario,text,text,Código Plano Orçamentário 43 | plano_orcamentario,text,text,Plano Orçamentário 44 | codigo_programa_governo,text,text,Código Programa Governo 45 | programa_governo,text,text,Nome Programa Governo 46 | autor_emenda,text,text,Autor Emenda 47 | codigo_categoria_de_despesa,text,text,Código Categoria de Despesa 48 | categoria_despesa,text,text,Categoria de Despesa 49 | codigo_grupo_despesa,text,text,Código Grupo de Despesa 50 | grupo_despesa,text,text,Grupo de Despesa 51 | 
def parse_csv_rows(filename_or_fobj, inner_filename_suffix, encoding, schema):
    """Lazily yield the rows read by `parse_zip`, deserialized by `schema`.

    Rows for which `schema.deserialize` returns `None` are skipped.
    """
    raw_rows = parse_zip(
        filename_or_fobj=filename_or_fobj,
        inner_filename_suffix=inner_filename_suffix,
        encoding=encoding,
    )
    deserialized_rows = map(schema.deserialize, raw_rows)
    yield from (item for item in deserialized_rows if item is not None)
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.93 Safari/537.36", 35 | } 36 | encoding = "iso-8859-1" 37 | mirror_url = "https://data.brasil.io/mirror/transparenciagovbr/{dataset}/{filename}" 38 | 39 | 40 | def __init__(self, use_mirror="False", save_file="True", *args, **kwargs): 41 | super().__init__(*args, **kwargs) 42 | self.use_mirror = use_mirror.lower() == "true" 43 | self.save_file = save_file.lower() == "true" 44 | 45 | @property 46 | def schema(self): 47 | return Schema(self.schema_filename) 48 | 49 | def make_filename(self, url): 50 | return settings.DOWNLOAD_PATH / self.name / urlparse(url).path.rsplit("/", maxsplit=1)[-1] 51 | 52 | def start_requests(self): 53 | for date in date_range( 54 | start=self.start_date, stop=self.end_date, interval=self.publish_frequency 55 | ): 56 | url = self.base_url.format(**date_to_dict(date)) 57 | if self.use_mirror: 58 | url = self.mirror_url.format( 59 | dataset=self.name, 60 | filename=urlparse(url).path.rsplit("/", maxsplit=1)[-1], 61 | ) 62 | elif self.save_file: 63 | filename = self.make_filename(url) 64 | if filename.exists(): 65 | url = f"file://{filename.absolute()}" 66 | yield scrapy.Request(url, callback=self.parse_zip_response) 67 | 68 | def parse_zip_response(self, response): 69 | # If it's set to save file and the response comes from the Web, then 70 | # save it to the disk. 
class Text(str):
    """`str` subclass with helpers to slice a text around a substring.

    Every helper returns a new `Text`, so calls can be chained.
    """

    def until(self, substr):
        """Return the text before the first occurrence of `substr`.

        When `substr` is not found the whole text is returned.
        """
        index = self.find(substr)
        if index == -1:
            # `find` signals "not found" with -1; using it as a slice bound
            # would silently drop the last character.
            return Text(self)
        return Text(self[:index])

    def starting_at(self, substr):
        """Return the text from the first occurrence of `substr` onwards.

        When `substr` is not found an empty `Text` is returned.
        """
        index = self.find(substr)
        if index == -1:
            return Text("")
        return Text(self[index:])

    def after(self, substr):
        """Return the text following the first occurrence of `substr`.

        When `substr` is not found an empty `Text` is returned.
        """
        index = self.find(substr)
        if index == -1:
            # Without this guard the slice would start at `len(substr) - 1`,
            # returning an arbitrary tail of the text.
            return Text("")
        return Text(self[index + len(substr):])
def extract_extra_fields(row):
    """Return the extra fields extracted from the description of `row`.

    All fields are `None` unless the row's `elemento_despesa` is
    "MATERIAL DE CONSUMO"; in that case they are filled with the result of
    `parse_description(row["descricao"])`.
    """
    extra = dict.fromkeys(
        (
            "quantidade",
            "unidade",
            "item",
            "marca",
            "item_processo",
            "item_material",
            "descricao_restante",
        )
    )
    # TODO: add fields related to services and other elements
    if row["elemento_despesa"] == "MATERIAL DE CONSUMO":
        extra.update(parse_description(row["descricao"]))
    return extra
"http://transparencia.gov.br/download-de-dados/despesas/{year}{month:02d}{day:02d}" 108 | start_date = datetime.date(2013, 3, 31) 109 | end_date = today() 110 | publish_frequency = "daily" 111 | filename_suffix = "_Despesas_Empenho.csv" 112 | schema_filename = "despesa_empenho.csv" 113 | -------------------------------------------------------------------------------- /transparenciagovbr/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pathlib import Path 4 | 5 | # Scrapy settings for transparenciagovbr project 6 | # 7 | # For simplicity, this file contains only settings considered important or 8 | # commonly used. You can find more settings consulting the documentation: 9 | # 10 | # https://doc.scrapy.org/en/latest/topics/settings.html 11 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 12 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 13 | 14 | BOT_NAME = "transparenciagovbr" 15 | 16 | SPIDER_MODULES = ["transparenciagovbr.spiders"] 17 | NEWSPIDER_MODULE = "transparenciagovbr.spiders" 18 | 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | # USER_AGENT = 'transparenciagovbr (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | # CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | # DOWNLOAD_DELAY = 3 33 | # The download delay setting will honor only one of: 34 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | # CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | # COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | # 
TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | # DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | # } 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'transparenciagovbr.middlewares.TransparenciagovbrSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'transparenciagovbr.middlewares.TransparenciagovbrDownloaderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | # EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | # } 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | # ITEM_PIPELINES = { 70 | # 'transparenciagovbr.pipelines.TransparenciagovbrPipeline': 300, 71 | # } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | # AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | # AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | # AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | HTTPCACHE_ENABLED = True 89 | HTTPCACHE_EXPIRATION_SECS = 
class TransparenciagovbrSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards what it receives without changing it. Not all hooks
    need to be defined: when one is missing, Scrapy acts as if the middleware
    does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: builds the middleware and subscribes it to
        # the `spider_opened` signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response on its way into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs on what the spider returned for `response`.
        # Must return an iterable of Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Runs when the spider (or another middleware's
        # `process_spider_input`) raises an exception.
        # Should return either None or an iterable of Response, dict or Item
        # objects.
        return None

    def process_start_requests(self, start_requests, spider):
        # Same idea as `process_spider_output`, but for the start requests,
        # which have no response associated.
        # Must return only requests (not items).
        yield from start_requests

    def spider_opened(self, spider):
        message = "Spider opened: %s" % spider.name
        spider.logger.info(message)
@lru_cache(maxsize=32 * 1024)
def convert_date(value):
    """Convert a "dd/mm/yyyy" string into the "yyyy-mm-dd" format.

    Surrounding whitespace is ignored and a blank value yields `None`.
    Results are memoized by `lru_cache`.
    """
    cleaned = value.strip()
    if cleaned:
        parsed = datetime.datetime.strptime(cleaned, "%d/%m/%Y")
        return parsed.date().isoformat()
    return None
def convert_row(row):
    """Return a new dict with normalized keys and cleaned-up values.

    Keys go through `normalize_key`. Values are stripped; values made only of
    zeros, "-" or "--" become `None`. Entries with neither key nor value are
    dropped. Non-null values are converted with `convert_date` when the
    normalized key starts with "data_", or with `convert_number` when the
    original key mentions "R$" or "U$".
    """
    converted = {}
    for original_key, raw_value in row.items():
        key = normalize_key(original_key)
        value = raw_value.strip()

        only_zeros = bool(value) and set(value) == {"0"}
        if only_zeros or value in ("-", "--"):
            value = None
        if not (key or value):
            continue

        if value is not None:
            if key.startswith("data_"):
                value = convert_date(value)
            elif "R$" in original_key or "U$" in original_key:
                value = convert_number(value)

        converted[key] = value
    return converted
def extract_year_month(filename):
    """Extract year and month from ZIP filename

    `filename` must expose a ``.name`` attribute (e.g. a ``pathlib.Path``).
    The first four characters of the name are read as the year and the next
    two as the month; both are returned as integers in a ``(year, month)``
    tuple. Raises ``ValueError`` if those characters are not digits.
    """

    part = filename.name.lower().split(".zip")[0]
    return int(part[:4]), int(part[4:6])


def extract_origin_system(filename):
    """Return the last "_"-separated token of `filename` (a string) before
    its ".zip" suffix, e.g. ``"a_b_XYZ.zip"`` -> ``"XYZ"``.
    """

    return filename.split(".zip")[0].split("_")[-1]


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("table_name", choices=("cadastro", "remuneracao", "observacao"))
    args = parser.parse_args()

    # Make sure all working paths exist before anything
    DATA_PATH = Path(__file__).parent / "data"
    DOWNLOAD_PATH = DATA_PATH / "download"
    OUTPUT_PATH = DATA_PATH / "output"
    for path in (DATA_PATH, DOWNLOAD_PATH, OUTPUT_PATH):
        path.mkdir(parents=True, exist_ok=True)

    # Create one compressed-CSV writer. The output handle gets its own name so
    # it is never shadowed by the per-CSV input handles and can be closed at
    # the end (an unclosed compressed stream may be left truncated).
    output_filename = OUTPUT_PATH / f"pensionista_{args.table_name}.csv.gz"
    output_fobj = open_compressed(output_filename, mode="w", buffering=8 * 1024 * 1024)
    writer = CsvLazyDictWriter(output_fobj)

    # Read each ZIP file, then each inner ZIP file, then filter desired
    # inner-inner CSV file, convert it and write to the output CSV.
    progress_bar = tqdm()
    try:
        zip_filenames = sorted(DOWNLOAD_PATH.glob("*.zip"), key=extract_year_month)
        for zip_filename in zip_filenames:
            year, month = extract_year_month(zip_filename)
            period = f"{year}-{month:02d}"
            progress_bar.desc = period
            progress_bar.refresh()
            with ZipFile(zip_filename) as zf:
                for fileinfo in zf.filelist:
                    origin_system = extract_origin_system(fileinfo.filename)
                    progress_bar.desc = f"{period}/{origin_system}"
                    with zf.open(fileinfo.filename) as inner_fobj, ZipFile(inner_fobj) as inner_zf:
                        for inner_fileinfo in inner_zf.filelist:
                            table_name = (
                                inner_fileinfo.filename.split(".")[0]
                                .split("_")[-1]
                                .lower()
                                .replace("observacoes", "observacao")
                            )
                            if table_name != args.table_name:  # We don't want this file
                                continue
                            progress_bar.desc = f"{period}/{origin_system}.{table_name}"
                            with inner_zf.open(inner_fileinfo.filename) as csv_fobj:
                                for row in read_csv(csv_fobj, origin_system, year, month):
                                    writer.writerow(row)
                                    progress_bar.update()
    finally:
        # Always release the progress bar and flush/close the output file,
        # even when a conversion error interrupts the run.
        progress_bar.close()
        output_fobj.close()
18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | --------------------------------------------------------------------------------