├── runtime.txt
├── transparenciagovbr
    ├── __init__.py
    ├── utils
    │   ├── __init__.py
    │   ├── cities.py
    │   ├── print_spider_names.py
    │   ├── date.py
    │   ├── io.py
    │   └── fields.py
    ├── spiders
    │   ├── __init__.py
    │   ├── pagamento.py
    │   ├── execucao_despesa.py
    │   ├── orcamento_despesa.py
    │   ├── pagamento_historico.py
    │   ├── auxilio_emergencial.py
    │   ├── base.py
    │   └── despesa_item_empenho.py
    ├── pipelines.py
    ├── items.py
    ├── exporters.py
    ├── fields.py
    ├── settings.py
    └── middlewares.py
├── requirements-development.txt
├── .github
    └── FUNDING.yml
├── pensionista
    ├── requirements.txt
    ├── import-pgsql.sh
    ├── README.md
    ├── list_zips.py
    ├── indexes.sql
    ├── download-old.sh
    └── convert.py
├── Makefile
├── requirements.txt
├── .gitignore
├── scrapy.cfg
├── schema
    ├── auxilio_emergencial.csv
    ├── despesa_item_empenho.csv
    ├── pagamento_historico.csv
    ├── orcamento_despesa.csv
    ├── pagamento.csv
    ├── execucao_despesa.csv
    └── despesa_empenho.csv
├── import-postgresql.sh
├── run.sh
├── README.md
├── create_mirror_script.py
├── scripts
    └── auxilio_emergencial.py
└── LICENSE


/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.8.2
2 | 


--------------------------------------------------------------------------------
/transparenciagovbr/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/transparenciagovbr/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements-development.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | 
3 | autoflake
4 | black
5 | ipython
6 | isort
7 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | custom: https://apoia.se/brasilio
4 | 


--------------------------------------------------------------------------------
/pensionista/requirements.txt:
--------------------------------------------------------------------------------
1 | calculadora-do-cidadao
2 | https://github.com/turicas/rows/archive/develop.zip
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | fix-imports:
2 | 	autoflake --in-place --recursive --remove-unused-variables --remove-all-unused-imports .
3 | 	isort -rc .
4 | 	black .
5 | 
6 | .PHONY:	fix-imports
7 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cached_property
2 | https://github.com/turicas/rows/archive/develop.zip
3 | requests  # TODO: remove when dependency bug in rows is fixed
4 | s3cmd
5 | scrapy
6 | tqdm
7 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.egg-info/
 2 | *.pyc
 3 | *~
 4 | .*.sw?
 5 | .DS_Store
 6 | .activate
 7 | .coverage
 8 | .directory
 9 | .env
10 | .idea/*
11 | .scrapy
12 | .tox
13 | MANIFEST
14 | build/*
15 | data/*
16 | dist/*
17 | download.sh
18 | mirror.sh
19 | reg_settings.py
20 | 


--------------------------------------------------------------------------------
/transparenciagovbr/utils/cities.py:
--------------------------------------------------------------------------------
1 | import rows
2 | 
3 | from transparenciagovbr import settings
4 | 
5 | cities_filename = settings.REPOSITORY_PATH / "data" / "populacao-estimada-2020.csv"
6 | city_name_by_id = {
7 |     row.city_ibge_code: row.city for row in rows.import_from_csv(cities_filename)
8 | }
9 | 


--------------------------------------------------------------------------------
/transparenciagovbr/utils/print_spider_names.py:
--------------------------------------------------------------------------------
 1 | from scrapy import spiderloader
 2 | from scrapy.utils import project
 3 | 
 4 | # Print the name of every spider the project's spider loader knows about,
 5 | # one per line. run.sh calls this script when no spider name is given.
 6 | settings = project.get_project_settings()
 7 | spider_loader = spiderloader.SpiderLoader.from_settings(settings)
 8 | spiders = spider_loader.list()
 9 | for spider_name in spiders:
10 |     print(spider_name)
11 | 


--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = transparenciagovbr.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = transparenciagovbr
12 | 


--------------------------------------------------------------------------------
/transparenciagovbr/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class TransparenciagovbrPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/transparenciagovbr/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # https://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class TransparenciagovbrItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/pensionista/import-pgsql.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Import the pensionista CSV exports into PostgreSQL with `rows pgimport`.
 4 | # Requires DATABASE_URL to hold the PostgreSQL connection string.
 5 | if [ -z "$DATABASE_URL" ]; then
 6 | 	# The dollar sign is escaped so the variable NAME is printed: expanding it
 7 | 	# here would always yield an empty string, since this branch only runs
 8 | 	# when the variable is unset or empty.
 9 | 	echo "ERROR: must set \$DATABASE_URL with postgres connection string"
10 | 	exit 1
11 | fi
12 | 
13 | # One table per exported file; schema and data paths are derived from the name.
14 | for table in cadastro observacao remuneracao; do
15 | 	rows pgimport \
16 | 		--dialect=excel \
17 | 		--input-encoding=utf-8 \
18 | 		--schema="schema/pensionista_${table}.csv" \
19 | 		"data/output/pensionista_${table}.csv.gz" \
20 | 		"$DATABASE_URL" \
21 | 		"pensionista_${table}"
22 | done
23 | 


--------------------------------------------------------------------------------
/pensionista/README.md:
--------------------------------------------------------------------------------
 1 | # Dados de pensionistas
 2 | 
 3 | ## Instalação
 4 | 
 5 | ```shell
 6 | pip install -r requirements.txt
 7 | ```
 8 | 
 9 | ## Execução
10 | 
11 | Baixe os arquivos de pensionistas disponíveis [nesse
12 | site](http://transparencia.gov.br/download-de-dados/servidores) e coloque-os em
13 | `data/download/`. Depois, execute:
14 | 
15 | ```shell
16 | python convert.py <cadastro|observacao|remuneracao>
17 | ```
18 | 
19 | Os arquivos `cadastro.csv.gz`, `observacao.csv.gz` e `remuneracao.csv.gz` serão
20 | gerados em `data/output/`.
21 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/pagamento.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider
 4 | from transparenciagovbr.utils.date import today
 5 | 
 6 | 
 7 | class PagamentoSpider(TransparenciaBaseSpider):
 8 |     name = "pagamento"
 9 |     base_url = "http://www.portaldatransparencia.gov.br/download-de-dados/despesas/{year}{month:02d}{day:02d}"
10 |     start_date = datetime.date(2013, 3, 31)
11 |     end_date = today()
12 |     publish_frequency = "daily"
13 |     filename_suffix = "_Despesas_Pagamento.csv"
14 |     schema_filename = "pagamento.csv"
15 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/execucao_despesa.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider
 4 | from transparenciagovbr.utils.date import today
 5 | 
 6 | 
 7 | class ExecucaoDespesaSpider(TransparenciaBaseSpider):
 8 |     name = "execucao_despesa"
 9 |     base_url = "http://transparencia.gov.br/download-de-dados/despesas-execucao/{year}{month:02d}"
10 |     start_date = datetime.date(2014, 1, 1)
11 |     end_date = today()
12 |     publish_frequency = "monthly"
13 |     filename_suffix = "_Despesas.csv"
14 |     schema_filename = "execucao_despesa.csv"
15 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/orcamento_despesa.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider
 4 | from transparenciagovbr.utils.date import today
 5 | 
 6 | 
 7 | class OrcamentoDespesaSpider(TransparenciaBaseSpider):
 8 |     name = "orcamento_despesa"
 9 |     base_url = "http://transparencia.gov.br/download-de-dados/orcamento-despesa/{year}"
10 |     start_date = datetime.date(2014, 1, 1)
11 |     end_date = today()
12 |     publish_frequency = "yearly"
13 |     filename_suffix = "_OrcamentoDespesa.zip.csv"
14 |     schema_filename = "orcamento_despesa.csv"
15 | 


--------------------------------------------------------------------------------
/schema/auxilio_emergencial.csv:
--------------------------------------------------------------------------------
 1 | original_name,field_name,internal_field_type,field_type
 2 | MÊS DISPONIBILIZAÇÃO,ano_mes,integer,integer
 3 | UF,uf,text,text
 4 | CÓDIGO MUNICÍPIO IBGE,codigo_ibge_municipio,integer,integer
 5 | NOME MUNICÍPIO,municipio,text,text
 6 | NIS BENEFICIÁRIO,nis_beneficiario,custom_integer,integer
 7 | CPF BENEFICIÁRIO,cpf_beneficiario,cpf,text
 8 | NOME BENEFICIÁRIO,beneficiario,text,text
 9 | NIS RESPONSÁVEL,nis_responsavel,custom_integer,integer
10 | CPF RESPONSÁVEL,cpf_responsavel,cpf,text
11 | NOME RESPONSÁVEL,responsavel,custom_text,text
12 | ENQUADRAMENTO,enquadramento,text,text
13 | PARCELA,parcela,custom_integer,integer
14 | OBSERVAÇÃO,observacao,custom_text,text
15 | VALOR BENEFÍCIO,valor,money_real,decimal
16 | 


--------------------------------------------------------------------------------
/import-postgresql.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | SCHEMA_PATH="schema"
 6 | OUTPUT_PATH="data/output"
 7 | 
 8 | # Drop and re-import one table from its gzipped CSV export.
 9 | # $1: table name; also the basename of the schema and data files.
10 | function import_table() {
11 | 	# `local` keeps the variable from leaking into the caller's scope.
12 | 	local tablename="$1"
13 | 
14 | 	echo "DROP TABLE IF EXISTS ${tablename};" | psql "$POSTGRESQL_URI"
15 | 	time rows pgimport \
16 | 		--schema="$SCHEMA_PATH/${tablename}.csv" \
17 | 		--input-encoding="utf-8" \
18 | 		--dialect="excel" \
19 | 		"$OUTPUT_PATH/${tablename}.csv.gz" \
20 | 		"$POSTGRESQL_URI" \
21 | 		"$tablename"
22 | }
23 | 
24 | if [ -z "$POSTGRESQL_URI" ]; then
25 | 	echo "ERROR: you must set POSTGRESQL_URI environment variable."
26 | 	exit 1
27 | fi
28 | 
29 | # With an argument, import only that table; otherwise import the default set.
30 | # Arguments are quoted so they reach import_table as a single word.
31 | if [ -n "$1" ]; then
32 | 	import_table "$1"
33 | else
34 | 	for table in pagamento pagamento_historico execucao_despesa orcamento_despesa; do
35 | 		import_table "$table"
36 | 	done
37 | fi
38 | 


--------------------------------------------------------------------------------
/pensionista/list_zips.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from urllib.parse import urljoin
 3 | from urllib.request import urlopen
 4 | 
 5 | 
 6 | def ckan_package_resources(base_url, resource_id):
 7 |     template_url = urljoin(base_url, "/api/3/action/package_show?id={resource_id}")
 8 |     url = template_url.format(resource_id=resource_id)
 9 |     response = urlopen(url)
10 |     data = json.loads(response.read())
11 |     return data["result"]["resources"]
12 | 
13 | 
14 | if __name__ == "__main__":
15 |     resources = ckan_package_resources(
16 |         base_url="http://www.dados.gov.br", resource_id="c76a1bc6-2330-4b05-b3dd-491124931496"
17 |     )
18 | 
19 |     for resource in resources:
20 |         if not resource["url"].lower().endswith(".zip"):
21 |             continue
22 |         print(resource["url"])
23 | 


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run one or more spiders, writing a gzipped CSV and a log file per spider.
 4 | # Usage: ./run.sh [--use-mirror] [spider ...]
 5 | set -e
 6 | OUTPUT_PATH=data/output
 7 | LOG_PATH=data/log
 8 | LOG_LEVEL=INFO
 9 | # Optional first flag: forwarded to the spiders as the `use_mirror` argument.
10 | if [ "$1" = "--use-mirror" ]; then
11 | 	OPTS="-a use_mirror=true"
12 | 	shift
13 | else
14 | 	OPTS=""
15 | fi
16 | 
17 | # Run a single spider.
18 | # $1: spider name; also used as basename of the log and output files.
19 | run_spider() {
20 | 	spider="$1"
21 | 
22 | 	mkdir -p $LOG_PATH $OUTPUT_PATH
23 | 	log_filename="$LOG_PATH/${spider}.log"
24 | 	output_filename="$OUTPUT_PATH/${spider}.csv.gz"
25 | 	# Start from a clean slate: remove the previous log and output.
26 | 	rm -rf $log_filename $output_filename
27 | 	echo "Running ${spider} - check $log_filename for logs and $output_filename for output"
28 | 	# $OPTS is left unquoted on purpose so it splits into separate arguments.
29 | 	time scrapy crawl \
30 | 		--loglevel=$LOG_LEVEL \
31 | 		--logfile=$log_filename \
32 | 		$OPTS \
33 | 		$spider \
34 | 		-t "csv.gz" \
35 | 		-o $output_filename
36 | }
37 | 
38 | # Spiders come from the remaining arguments, or from the project's spider
39 | # list when none are given.
40 | if [ ! -z "$1" ]; then
41 | 	spiders="$@"
42 | else
43 | 	spiders="$(python transparenciagovbr/utils/print_spider_names.py)"
44 | fi
45 | # $spiders is left unquoted on purpose so it splits into one name per word.
46 | for spider in $spiders; do
47 | 	run_spider $spider
48 | done
49 | 


--------------------------------------------------------------------------------
/transparenciagovbr/exporters.py:
--------------------------------------------------------------------------------
 1 | import gzip
 2 | 
 3 | from scrapy.exporters import CsvItemExporter
 4 | 
 5 | 
 6 | # Code from <https://github.com/scrapy/scrapy/issues/2174>
 7 | class GzipCsvItemExporter(CsvItemExporter):
 8 |     """Gzip-compressed CSV exporter
 9 | 
10 |     To use it, add
11 |     ::
12 | 
13 |         FEED_EXPORTERS = {
14 |             'csv.gz': 'myproject.exporters.GzipCsvItemExporter',
15 |         }
16 |         FEED_FORMAT = 'csv.gz'
17 | 
18 |     to settings.py and then run scrapy crawl like this::
19 | 
20 |         scrapy crawl foo -o item.csv.gz
21 | 
22 |     (if `FEED_FORMAT` is not explicitly specified, you'll need to add
23 |     `-t csv.gz` to the command above)
24 |     """
25 | 
26 |     def __init__(self, fobj, **kwargs):
27 |         filename = fobj.name
28 |         fobj.close()
29 |         fobj = open(filename, mode="wb", buffering=8 * 1024 * 1024)
30 |         self.gzfile = gzip.GzipFile(fileobj=fobj)
31 |         super().__init__(self.gzfile, **kwargs)
32 | 
33 |     def finish_exporting(self):
34 |         self.gzfile.close()
35 | 


--------------------------------------------------------------------------------
/pensionista/indexes.sql:
--------------------------------------------------------------------------------
 1 | CREATE INDEX idx_pensobs_id ON pensionista_observacao (id_servidor_portal, ano, mes, sistema_origem);
 2 | CREATE INDEX idx_pensobs_uuid ON pensionista_observacao (pessoa_uuid);
 3 | CREATE INDEX idx_pensobs_orig ON pensionista_observacao (sistema_origem);
 4 | 
 5 | CREATE INDEX idx_penscad_id ON pensionista_cadastro (id_servidor_portal, ano, mes, sistema_origem);
 6 | CREATE INDEX idx_penscad_uuid1 ON pensionista_cadastro (pessoa_uuid);
 7 | CREATE INDEX idx_penscad_uuid2 ON pensionista_cadastro (representante_legal_uuid);
 8 | CREATE INDEX idx_penscad_uuid3 ON pensionista_cadastro (instituidor_pensao_uuid);
 9 | CREATE INDEX idx_penscad_orig ON pensionista_cadastro (sistema_origem);
10 | 
11 | CREATE INDEX idx_pensrem_id ON pensionista_remuneracao (id_servidor_portal, ano, mes, sistema_origem);
12 | CREATE INDEX idx_pensrem_uuid ON pensionista_remuneracao (pessoa_uuid);
13 | CREATE INDEX idx_pensrem_orig ON pensionista_remuneracao (sistema_origem);
14 | 
15 | ALTER TABLE pensionista_cadastro ADD PRIMARY KEY (id_servidor_portal, ano, mes, sistema_origem);
16 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/pagamento_historico.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import datetime
 3 | import io
 4 | import zipfile
 5 | 
 6 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider
 7 | from transparenciagovbr.utils.io import NotNullTextWrapper
 8 | 
 9 | 
10 | class PagamentoHistSpider(TransparenciaBaseSpider):
11 |     name = "pagamento_historico"
12 |     base_url = "http://www.portaltransparencia.gov.br/download-de-dados/historico-gastos-diretos-pagamentos/{year}{month:02d}"
13 |     start_date = datetime.date(2011, 1, 1)
14 |     end_date = datetime.date(2012, 12, 31)
15 |     publish_frequency = "monthly"
16 |     schema_filename = "pagamento_historico.csv"
17 | 
18 |     def parse_zip_response(self, response):
19 |         zf = zipfile.ZipFile(io.BytesIO(response.body))
20 |         assert len(zf.filelist) == 1
21 |         fobj = NotNullTextWrapper(
22 |             zf.open(zf.filelist[0].filename), encoding=self.encoding
23 |         )
24 |         reader = csv.DictReader(fobj, delimiter="\t")
25 | 
26 |         for row in reader:
27 |             new = self.schema.deserialize(row)
28 |             if new is not None:
29 |                 yield new
30 | 


--------------------------------------------------------------------------------
/transparenciagovbr/utils/date.py:
--------------------------------------------------------------------------------
 1 | import calendar
 2 | import datetime
 3 | 
 4 | 
 5 | def today():
 6 |     date = datetime.datetime.now()
 7 |     return datetime.date(date.year, date.month, date.day)
 8 | 
 9 | 
10 | def next_day(date):
11 |     return date + datetime.timedelta(days=1)
12 | 
13 | 
14 | def next_month(date):
15 |     return datetime.date(
16 |         year=date.year + (date.month // 12), month=(date.month % 12) + 1, day=date.day
17 |     )
18 | 
19 | 
20 | def next_year(date):
21 |     if calendar.isleap(date.year):
22 |         days_to_add = 366
23 |     else:
24 |         days_to_add = 365
25 |     return date + datetime.timedelta(days=days_to_add)
26 | 
27 | 
28 | def next_date(date, interval="daily"):
29 |     from_interval = {"daily": next_day, "monthly": next_month, "yearly": next_year}
30 | 
31 |     return from_interval[interval](date)
32 | 
33 | 
34 | def date_range(start, stop, interval="daily"):
35 |     current = start
36 |     while current < stop:
37 |         yield current
38 |         current = next_date(date=current, interval=interval)
39 | 
40 | 
41 | def date_to_dict(date):
42 |     return {"year": date.year, "month": date.month, "day": date.day}
43 | 


--------------------------------------------------------------------------------
/transparenciagovbr/utils/io.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from csv import DictReader
 3 | from io import TextIOWrapper
 4 | from zipfile import ZipFile
 5 | 
 6 | 
 7 | class NotNullTextWrapper(TextIOWrapper):
 8 |     def read(self, *args, **kwargs):
 9 |         data = super().read(*args, **kwargs)
10 |         return data.replace("\x00", "")
11 | 
12 |     def readline(self, *args, **kwargs):
13 |         data = super().readline(*args, **kwargs)
14 |         return data.replace("\x00", "")
15 | 
16 | 
17 | def parse_zip(filename_or_fobj, inner_filename_suffix, encoding):
18 |     zf = ZipFile(filename_or_fobj)
19 |     for file_info in zf.filelist:
20 |         filename = file_info.filename
21 |         if isinstance(inner_filename_suffix, re.Pattern):
22 |             file_matches = bool(inner_filename_suffix.findall(filename))
23 |         else:
24 |             file_matches = filename.endswith(inner_filename_suffix)
25 | 
26 |         if file_matches:
27 |             fobj = TextIOWrapper(
28 |                 zf.open(filename), encoding=encoding
29 |             )
30 |             reader = DictReader(fobj, delimiter=";")
31 |             for row in reader:
32 |                 yield row
33 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/auxilio_emergencial.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider
 4 | from transparenciagovbr.utils.cities import city_name_by_id
 5 | from transparenciagovbr.utils.date import today
 6 | 
 7 | day = today()
 8 | last_month = day.month - 1 if day.month > 1 else 12
 9 | year = day.year if day.month > 1 else day.year - 1
10 | end_date = datetime.date(year, last_month, day.day)
11 | 
12 | 
13 | class AuxilioEmergencialSpider(TransparenciaBaseSpider):
14 |     name = "auxilio_emergencial"
15 |     base_url = "http://transparencia.gov.br/download-de-dados/auxilio-emergencial/{year}{month:02d}"
16 |     start_date = datetime.date(2020, 4, 1)
17 |     end_date = end_date
18 |     publish_frequency = "monthly"
19 |     filename_suffix = "_AuxilioEmergencial.csv"
20 |     schema_filename = "auxilio_emergencial.csv"
21 | 
22 |     def convert_row(self, row):
23 |         row = super().convert_row(row)
24 | 
25 |         if row["codigo_ibge_municipio"] is not None:
26 |             # Força nome de município a ser mais bonito (com acentos,
27 |             # maiúsculas e minúsculas). :)
28 |             row["municipio"] = city_name_by_id[row["codigo_ibge_municipio"]]
29 |         return row
30 | 


--------------------------------------------------------------------------------
/schema/despesa_item_empenho.csv:
--------------------------------------------------------------------------------
 1 | field_name,field_type,internal_field_type,original_name
 2 | categoria_despesa,text,text,Categoria de Despesa
 3 | codigo_categoria_despesa,text,text,Código Categoria de Despesa
 4 | codigo_elemento_despesa,integer,text,Código Elemento de Despesa
 5 | codigo_empenho,text,text,Código Empenho
 6 | codigo_grupo_despesa,text,text,Código Grupo de Despesa
 7 | codigo_modalidade_aplicacao,text,text,Código Modalidade de Aplicação
 8 | codigo_subelemento_despesa,integer,text,Código SubElemento de Despesa
 9 | descricao,text,text,Descrição
10 | elemento_despesa,text,text,Elemento de Despesa
11 | em_sigilo,bool,bool,(coluna criada pelo script)
12 | grupo_despesa,text,text,Grupo de Despesa
13 | id_empenho,integer,integer,Id Empenho
14 | modalidade_aplicacao,text,text,Modalidade de Aplicação
15 | quantidade,decimal,money_real,Quantidade
16 | subelemento_despesa,text,text,SubElemento de Despesa
17 | valor_total,decimal,money_real,Valor Total
18 | valor_unitario,decimal,money_real,Valor Unitário
19 | sequencial,integer,integer,Sequencial
20 | valor_atual,decimal,money_real,Valor Atual
21 | data,date,date,
22 | unidade,text,text,
23 | item,text,text,
24 | marca,text,text,
25 | item_processo,text,text,
26 | item_material,text,text,
27 | descricao_restante,text,text,


--------------------------------------------------------------------------------
/schema/pagamento_historico.csv:
--------------------------------------------------------------------------------
 1 | original_name,field_name,internal_field_type,field_type
 2 | Nome Ação,acao,text,text
 3 | Código Ação,codigo_acao,text,text
 4 | Código Elemento Despesa,codigo_elemento_despesa,text,text
 5 | Código Favorecido,codigo_favorecido,text,text
 6 | Código Função,codigo_funcao,custom_integer,integer
 7 | Código Grupo Despesa,codigo_grupo_despesa,text,text
 8 | Código Órgão,codigo_orgao,custom_integer,integer
 9 | Código Órgão Superior,codigo_orgao_superior,custom_integer,integer
10 | Código Programa,codigo_programa,custom_integer,integer
11 | Código Subfunção,codigo_subfuncao,custom_integer,integer
12 | Código Unidade Gestora,codigo_unidade_gestora,custom_integer,integer
13 | Data Pagamento,data_pagamento,brazilian_date,date
14 | Nome Elemento Despesa,elemento_despesa,text,text
15 | ,em_sigilo,bool,bool
16 | Nome Favorecido,favorecido,text,text
17 | Nome Função,funcao,text,text
18 | Gestão Pagamento,gestao_pagamento,text,text
19 | Nome Grupo Despesa,grupo_despesa,text,text
20 | Linguagem Cidadã,linguagem_cidada,text,text
21 | Número Documento,numero_documento,text,text
22 | Nome Órgao,orgao,text,text
23 | Nome Órgão Superior,orgao_superior,text,text
24 | Nome Programa,programa,text,text
25 | Nome Subfunção,subfuncao,text,text
26 | Nome Unidade Gestora,unidade_gestora,text,text
27 | Valor,valor,money_real,decimal
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Scraper do Portal da Transparência do Governo Federal
 2 | 
 3 | ## Instalando
 4 | 
 5 | ```shell
 6 | pyenv virtualenv 3.7.3 transparencia-gov-br
 7 | pyenv activate transparencia-gov-br
 8 | pip install -r requirements.txt
 9 | ```
10 | 
11 | ## Rodando
12 | 
13 | Todos os spiders:
14 | 
15 | ```shell
16 | ./run.sh
17 | ```
18 | 
19 | Apenas um spider:
20 | 
21 | ```shell
22 | ./run.sh <nome-do-spider>
23 | ```
24 | 
25 | > Nota: consulte os nomes dos spiders disponíveis em
26 | > [transparenciagovbr/spiders/](transparenciagovbr/spiders/).
27 | 
28 | Pode ser interessante rodar algum script de extração fora de um spider (por
29 | limitações do scrapy). Veja os scripts disponíveis na pasta `scripts` e
30 | execute-os com o parâmetro `--help` para ver as opções disponíveis.
31 | 
32 | 
33 | ## Importando no PostgreSQL
34 | 
35 | Antes, instale as dependências, rode os spiders e crie uma variável com a URI
36 | de conexão com o banco:
37 | 
38 | ```shell
39 | pip install psycopg2-binary tqdm
40 | ./run.sh
41 | export POSTGRESQL_URI="postgres://usuario:senha@host:porta/banco"
42 | ```
43 | 
44 | Depois, execute o script para importar todas as tabelas:
45 | 
46 | 
47 | ```shell
48 | ./import-postgresql.sh
49 | ```
50 | 
51 | Ou apenas a tabela de um spider específico:
52 | 
53 | ```shell
54 | ./import-postgresql.sh <nome-do-spider>
55 | ```
56 | 
57 | > Nota: consulte os nomes dos spiders disponíveis em
58 | > [transparenciagovbr/spiders/](transparenciagovbr/spiders/).
59 | 


--------------------------------------------------------------------------------
/schema/orcamento_despesa.csv:
--------------------------------------------------------------------------------
 1 | original_name,field_name,internal_field_type,field_type
 2 | CÓDIGO AÇÃO,codigo_acao,text,text
 3 | CÓDIGO CATEGORIA ECONÔMICA,codigo_categoria_economica,custom_integer,integer
 4 | CÓDIGO ELEMENTO DE DESPESA,codigo_elemento_de_despesa,text,text
 5 | CÓDIGO FUNÇÃO,codigo_funcao,custom_integer,integer
 6 | CÓDIGO GRUPO DE DESPESA,codigo_grupo_de_despesa,text,text
 7 | CÓDIGO ÓRGÃO SUBORDINADO,codigo_orgao_subordinado,custom_integer,integer
 8 | CÓDIGO ÓRGÃO SUPERIOR,codigo_orgao_superior,custom_integer,integer
 9 | CÓDIGO PROGRAMA ORÇAMENTÁRIO,codigo_programa_orcamentario,custom_integer,integer
10 | CÓDIGO SUBFUNÇÃO,codigo_subfuncao,custom_integer,integer
11 | CÓDIGO UNIDADE ORÇAMENTÁRIA,codigo_unidade_orcamentaria,custom_integer,integer
12 | ,em_sigilo,bool,bool
13 | EXERCÍCIO,exercicio,text,text
14 | NOME AÇÃO,nome_acao,text,text
15 | NOME CATEGORIA ECONÔMICA,nome_categoria_economica,text,text
16 | NOME ELEMENTO DE DESPESA,nome_elemento_de_despesa,text,text
17 | NOME FUNÇÃO,nome_funcao,text,text
18 | NOME GRUPO DE DESPESA,nome_grupo_de_despesa,text,text
19 | NOME ÓRGÃO SUBORDINADO,nome_orgao_subordinado,text,text
20 | NOME ÓRGÃO SUPERIOR,nome_orgao_superior,text,text
21 | NOME PROGRAMA ORÇAMENTÁRIO,nome_programa_orcamentario,text,text
22 | NOME SUBFUNÇÃO,nome_subfuncao,text,text
23 | NOME UNIDADE ORÇAMENTÁRIA,nome_unidade_orcamentaria,text,text
24 | ORÇAMENTO ATUALIZADO (R$),orcamento_atualizado,money_real,decimal
25 | ORÇAMENTO INICIAL (R$),orcamento_inicial,money_real,decimal
26 | ORÇAMENTO REALIZADO (R$),orcamento_realizado,money_real,decimal
27 | 


--------------------------------------------------------------------------------
/schema/pagamento.csv:
--------------------------------------------------------------------------------
 1 | original_name,field_name,internal_field_type,field_type
 2 | Categoria de Despesa,categoria_de_despesa,text,text
 3 | Código Categoria de Despesa,codigo_categoria_de_despesa,text,text
 4 | Código Elemento de Despesa,codigo_elemento_de_despesa,text,text
 5 | Código Favorecido,codigo_favorecido,text,text
 6 | Código Gestão,codigo_gestao,custom_integer,integer
 7 | Código Grupo de Despesa,codigo_grupo_de_despesa,text,text
 8 | Código Modalidade de Aplicação,codigo_modalidade_de_aplicacao,text,text
 9 | Código Órgão,codigo_orgao,custom_integer,integer
10 | Código Órgão Superior,codigo_orgao_superior,custom_integer,integer
11 | Código Pagamento,codigo_pagamento,text,text
12 | Código Pagamento Resumido,codigo_pagamento_resumido,text,text
13 | Código Plano Orçamentário,codigo_plano_orcamentario,text,text
14 | Código Programa Governo,codigo_programa_governo,text,text
15 | Código Tipo Documento,codigo_tipo_documento,text,text
16 | Código Unidade Gestora,codigo_unidade_gestora,custom_integer,integer
17 | Data Emissão,data_emissao,brazilian_date,date
18 | Elemento de Despesa,elemento_de_despesa,text,text
19 | ,em_sigilo,bool,bool
20 | Extraorçamentário,extraorcamentario,text,text
21 | Favorecido,favorecido,text,text
22 | Gestão,gestao,text,text
23 | Grupo de Despesa,grupo_de_despesa,text,text
24 | Modalidade de Aplicação,modalidade_de_aplicacao,text,text
25 | Observação,observacao,custom_text,text
26 | Órgão,orgao,text,text
27 | Órgão Superior,orgao_superior,text,text
28 | Plano Orçamentário,plano_orcamentario,text,text
29 | Processo,processo,text,text
30 | Nome Programa Governo,programa_governo,text,text
31 | Valor Utilizado na Conversão,taxa_de_conversao,money_real,decimal
32 | Tipo Documento,tipo_documento,text,text
33 | Tipo OB,tipo_ob,text,text
34 | Unidade Gestora,unidade_gestora,text,text
35 | Valor do Pagamento Convertido pra R$,valor_convertido_para_reais,money_real,decimal
36 | Valor Original do Pagamento,valor_original,money_real,decimal
37 | 


--------------------------------------------------------------------------------
/create_mirror_script.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import stat
 3 | from textwrap import dedent
 4 | from urllib.parse import urlparse
 5 | 
 6 | from scrapy import spiderloader
 7 | from scrapy.utils import project
 8 | 
 9 | from transparenciagovbr.utils.date import date_range, date_to_dict
10 | 
11 | # Build "mirror.sh": a bash script with one `mirror_file` call per URL derived
12 | # from each spider's base_url, date range and publish frequency.
13 | output_filename = "mirror.sh"
14 | settings = project.get_project_settings()
15 | spider_loader = spiderloader.SpiderLoader.from_settings(settings)
16 | spiders = spider_loader.list()
17 | with open(output_filename, mode="w") as fobj:
18 |     # Header: shebang plus the `mirror_file` shell function (download with
19 |     # aria2c, upload with s3cmd, then remove the local copy).
20 |     fobj.write(
21 |         dedent(
22 |             """
23 |     #!/bin/bash
24 | 
25 |     mirror_file() {
26 |         url="$1"
27 |         download_path="$2"
28 |         mirror_uri="$3"
29 | 
30 |         aria2c \\
31 |                 --summary-interval=0 \\
32 |                 --dir=$(dirname "$download_path") \\
33 |                 --out=$(basename "$download_path") \\
34 |                 "$url"
35 |         if [ -e "$download_path" ]; then
36 |             s3cmd put "$download_path" "$mirror_uri"
37 |             rm "$download_path"
38 |         fi
39 |     }
40 |     """
41 |         ).strip()
42 |     )
43 |     fobj.write(f"\nmkdir -p {settings['DOWNLOAD_PATH']}\n")
44 |     for spider_name in spiders:
45 |         fobj.write(f"\n# {spider_name}\n")
46 |         SpiderClass = spider_loader.load(spider_name)
47 |         # One `mirror_file` line per date in the spider's range (stop excluded).
48 |         for date in date_range(
49 |             start=SpiderClass.start_date,
50 |             stop=SpiderClass.end_date,
51 |             interval=SpiderClass.publish_frequency,
52 |         ):
53 |             url = SpiderClass.base_url.format(**date_to_dict(date))
54 |             # The last path segment of the URL doubles as the local file name.
55 |             filename = urlparse(url).path.rsplit("/", maxsplit=1)[-1]
56 |             mirror_uri = f"s3://mirror/transparenciagovbr/{spider_name}/{filename}"
57 |             download_path = settings["DOWNLOAD_PATH"] / filename
58 |             fobj.write(f"mirror_file {url} {download_path} {mirror_uri}\n")
59 | # chmod 750 mirror.sh
60 | os.chmod(
61 |     output_filename,
62 |     stat.S_IRUSR + stat.S_IWUSR + stat.S_IXUSR + stat.S_IRGRP + stat.S_IXGRP,
63 | )
64 | 


--------------------------------------------------------------------------------
/scripts/auxilio_emergencial.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import sys
 3 | 
 4 | from pathlib import Path
 5 | 
 6 | import rows
 7 | from tqdm import tqdm
 8 | 
 9 | sys.path.insert(0, str(Path(__file__).parent.parent.absolute()))  # noqa
10 | from transparenciagovbr.utils.cities import city_name_by_id
11 | from transparenciagovbr.utils.fields import Schema
12 | from transparenciagovbr.utils.io import parse_zip
13 | 
14 | 
def extract_rows(schema, filename):
    """Yield the deserialized rows of the "_AuxilioEmergencial.csv" ZIP member.

    Each row produced by ``parse_zip`` goes through ``schema.deserialize``;
    results that are ``None`` are skipped. When a row has a non-null
    ``codigo_ibge_municipio``, its ``municipio`` is replaced by the name
    looked up in ``city_name_by_id``.
    """
    raw_rows = parse_zip(
        filename_or_fobj=filename,
        inner_filename_suffix="_AuxilioEmergencial.csv",
        encoding="iso-8859-1",
    )
    for raw_row in raw_rows:
        record = schema.deserialize(raw_row)
        if record is None:
            continue
        city_id = record["codigo_ibge_municipio"]
        if city_id is not None:
            # Force the city name to be prettier (with accents, upper and
            # lower case). :)
            record["municipio"] = city_name_by_id[city_id]
        yield record
29 | 
30 | 
def main():
    """Command-line entry point: extract one ZIP file into a CSV output file.

    Positional arguments are the input ZIP filename and the output filename
    (opened through ``rows.utils.open_compressed``). ``--buffering`` is the
    buffer size handed to that call and ``--schema-filename`` selects the
    schema passed to ``Schema``.
    """
    # TODO: move this `main` to a general command-line interface so we can run
    # any extractor by command-line.

    parser = argparse.ArgumentParser()
    parser.add_argument("input_filename")
    parser.add_argument("output_filename")
    # type=int: without it a value given on the command line stays a string,
    # while the default is an integer.
    parser.add_argument("--buffering", type=int, default=4 * 1024 * 1024)
    parser.add_argument("--schema-filename", default="auxilio_emergencial.csv")
    args = parser.parse_args()

    schema = Schema(args.schema_filename)
    filename = Path(args.input_filename)
    fobj = rows.utils.open_compressed(
        args.output_filename, mode="w", buffering=args.buffering
    )
    try:
        writer = rows.utils.CsvLazyDictWriter(fobj)
        data = extract_rows(schema, filename)
        for row in tqdm(data, desc=f"Extracting {filename.name}"):
            writer.writerow(row)
    finally:
        # Close the output even if extraction fails midway.
        fobj.close()


if __name__ == "__main__":
    main()
60 | 


--------------------------------------------------------------------------------
/schema/execucao_despesa.csv:
--------------------------------------------------------------------------------
 1 | original_name,field_name,internal_field_type,field_type
 2 | Nome Ação,acao,text,text
 3 | Ano e mês do lançamento,anomes_lancamento,text,text
 4 | Código Ação,codigo_acao,text,text
 5 | Código Elemento de Despesa,codigo_elemento_despesa,text,text
 6 | Código Função,codigo_funcao,custom_integer,integer
 7 | Código Gestão,codigo_gestao,custom_integer,integer
 8 | Código Grupo de Despesa,codigo_grupo_despesa,text,text
 9 | Código Modalidade da Despesa,codigo_modalidade_despesa,text,text
10 | Código Órgão Subordinado,codigo_orgao_subordinado,custom_integer,integer
11 | Código Órgão Superior,codigo_orgao_superior,custom_integer,integer
12 | Código Plano Orçamentário,codigo_plano_orcamentario,text,text
13 | Código Programa Governo,codigo_programa_governo,text,text
14 | Código Programa Orçamentário,codigo_programa_orcamentario,custom_integer,integer
15 | Código Subfução,codigo_subfucao,custom_integer,integer
16 | Código Unidade Gestora,codigo_unidade_gestora,custom_integer,integer
17 | Código Unidade Orçamentária,codigo_unidade_orcamentaria,custom_integer,integer
18 | Nome Elemento de Despesa,elemento_despesa,text,text
19 | ,em_sigilo,bool,bool
20 | Nome Função,funcao,text,text
21 | Nome Gestão,gestao,text,text
22 | Nome Grupo de Despesa,grupo_despesa,text,text
23 | Modalidade da Despesa,modalidade_despesa,text,text
24 | Nome Órgão Subordinado,orgao_subordinado,text,text
25 | Nome Órgão Superior,orgao_superior,text,text
26 | Plano Orçamentário,plano_orcamentario,text,text
27 | Nome Programa Governo,programa_governo,text,text
28 | Nome Programa Orçamentário,programa_orcamentario,text,text
29 | Nome Subfunção,subfuncao,text,text
30 | Nome Unidade Gestora,unidade_gestora,text,text
31 | Nome Unidade Orçamentária,unidade_orcamentaria,text,text
32 | Valor Empenhado (R$),valor_empenhado,money_real,decimal
33 | Valor Liquidado (R$),valor_liquidado,money_real,decimal
34 | Valor Pago (R$),valor_pago,money_real,decimal
35 | Valor Restos a Pagar Cancelado (R$),valor_restos_a_pagar_cancelado,money_real,decimal
36 | Valor Restos a Pagar Inscritos (R$),valor_restos_a_pagar_inscritos,money_real,decimal
37 | Valor Restos a Pagar Pagos (R$),valor_restos_a_pagar_pagos,money_real,decimal
38 | 


--------------------------------------------------------------------------------
/transparenciagovbr/fields.py:
--------------------------------------------------------------------------------
 1 | from rows.fields import (
 2 |     BoolField,
 3 |     DateField,
 4 |     DecimalField,
 5 |     IntegerField,
 6 |     TextField,
 7 |     as_string,
 8 |     is_null,
 9 | )
10 | 
11 | 
class BrazilianBoolField(BoolField):
    """BoolField variant whose true/false spellings are Portuguese "sim"/"não"."""

    # Value looked up from the schema CSVs' ``internal_field_type`` column
    name = "brazilian_bool"
    # Spellings listed as true values
    TRUE_VALUES = ("SIM", "sim", "Sim")
    # Spellings listed as false values, with and without the tilde
    FALSE_VALUES = ("NÃO", "NAO", "Não", "Nao", "não", "nao")
16 | 
17 | 
class BrazilianDateField(DateField):
    """DateField variant configured for day/month/year input such as 31/12/2020."""

    # Value looked up from the schema CSVs' ``internal_field_type`` column
    name = "brazilian_date"
    # strptime-style pattern: two-digit day, two-digit month, four-digit year
    INPUT_FORMAT = "%d/%m/%Y"
21 | 
22 | 
class CPFField(TextField):
    """TextField to clean-up unneeded chars in CPF"""

    name = "cpf"

    @classmethod
    def deserialize(cls, value, *args, **kwargs):
        """Return ``value`` without dots and dashes, or ``None`` for null input.

        Raises:
            ValueError: if the cleaned value is not exactly 11 characters long.
        """
        if is_null(value):
            return None

        cleaned = as_string(value).strip().replace(".", "").replace("-", "")
        # Explicit check instead of `assert`, which is removed under
        # `python -O` and would then let malformed values through. The value
        # itself is kept out of the message since it is personal data.
        if len(cleaned) != 11:
            raise ValueError(
                f"CPF must have 11 characters after clean-up, got {len(cleaned)}"
            )
        return cleaned
37 | 
38 | 
class CustomIntegerField(IntegerField):
    """Integer field that tolerates leading zeros and the ordinal mark "ª".

    Accepts numbers starting with 0 and removes unnecessary characters.
    """

    name = "custom_integer"

    @classmethod
    def deserialize(cls, value, *args, **kwargs):
        """Return ``value`` as an integer, or ``None`` for null input.

        Values that already are of the field's type are returned unchanged.
        """
        if is_null(value):
            return None
        elif isinstance(value, cls.TYPE):
            return value

        value = as_string(value).strip()
        value = value.replace("ª", "")
        without_zeros = value.lstrip("0")
        if value and not without_zeros:
            # The value was made only of zeros ("0", "000", ...): keep one so
            # it is parsed as 0 instead of becoming an empty string.
            without_zeros = "0"
        return super().deserialize(without_zeros)
59 | 
60 | 
class CustomTextField(TextField):
    """TextField that maps "nothing here" placeholder texts to ``None``."""

    name = "custom_text"

    @classmethod
    def deserialize(cls, value, *args, **kwargs):
        """Return ``value`` unless it is null or one of the placeholder texts."""
        if not is_null(value) and value not in ("Não há", "Não se aplica"):
            return value
        return None
72 | 
73 | 
class MoneyRealField(DecimalField):
    """DecimalField for amounts written with a comma as the decimal separator."""

    name = "money_real"

    @classmethod
    def deserialize(cls, value, *args, **kwargs):
        """Return ``value`` as a decimal, or ``None`` for null input.

        Extra positional/keyword arguments are accepted (and ignored) so the
        signature matches the other field classes in this module.

        >>> MoneyRealField.deserialize("89188,11")
        Decimal('89188.11')
        """
        if is_null(value):
            return None
        elif isinstance(value, cls.TYPE):
            return value

        # as_string lets non-string inputs through instead of failing on
        # a missing `.replace` method.
        value = as_string(value).replace(",", ".")
        return super().deserialize(value)
90 | 


--------------------------------------------------------------------------------
/pensionista/download-old.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | mkdir -p data/download
 4 | cd data/download
 5 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_112019.zip
 6 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_122019.zip
 7 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_012020.zip
 8 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_022020.zip
 9 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_032020.zip
10 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_042020.zip
11 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1994.zip
12 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1995.zip
13 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1996.zip
14 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1997.zip
15 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1998.zip
16 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/1999.zip
17 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2000.zip
18 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2001.zip
19 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2002.zip
20 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2003.zip
21 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2004.zip
22 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2005.zip
23 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2006.zip
24 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2007.zip
25 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2008.zip
26 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2009.zip
27 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2010.zip
28 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2011.zip
29 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2012.zip
30 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2013.zip
31 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2014.zip
32 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2015.zip
33 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2016.zip
34 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2017.zip
35 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2018.zip
36 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/2019.zip
37 | wget -c -t 0 http://repositorio.dados.gov.br/segrt/pensionistas/PENSIONISTAS_052020.zip
38 | cd -
39 | 


--------------------------------------------------------------------------------
/transparenciagovbr/utils/fields.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | import rows
 4 | 
 5 | from transparenciagovbr import fields, settings
 6 | 
 7 | 
# Placeholder texts published in place of secret data (roughly: "detailing of
# the information is blocked" and "information protected by secrecy, under the
# law, to guarantee the security of society and the State"). Schema.deserialize
# replaces a value equal to one of them with None and sets "em_sigilo" to "t".
EM_SIGILO_STRINGS = (
    "Detalhamento das informações bloqueado.",
    "Informações protegidas por sigilo, nos termos da legislação, para garantia da segurança da sociedade e do Estado",
)
12 | 
13 | 
def schema_path_from_filename(filename):
    """Return, as a string, the absolute path of ``filename`` inside the
    repository's ``schema`` directory."""
    schema_dir = settings.REPOSITORY_PATH / "schema"
    schema_file = schema_dir / filename
    return str(schema_file.absolute())
16 | 
17 | 
def load_schema(filename):
    """Map each schema field name to the field class that deserializes it.

    Reads the schema CSV named ``filename`` and resolves every row's
    ``internal_field_type`` against the ``rows.fields`` classes (keyed by
    their lower-cased name without the "Field" suffix) plus the project's
    custom field classes (keyed by their ``name`` attribute).

    Returns:
        OrderedDict of ``field_name -> field class``, in CSV row order.

    Raises:
        KeyError: if a row's ``internal_field_type`` matches no known class.
    """
    schema_path = schema_path_from_filename(filename)
    table = rows.import_from_csv(schema_path)
    # Our internal context will be all available rows.fields + our custom
    # fields
    rows_context = {
        field_name.replace("Field", "").lower(): getattr(rows.fields, field_name)
        for field_name in rows.fields.__all__
        if "Field" in field_name and field_name != "Field"
    }
    custom_context = {}
    for type_name in dir(fields):
        FieldClass = getattr(fields, type_name)
        # Classes merely imported from rows.fields are already covered by
        # `rows_context`; only the ones defined by the project are added here.
        if "Field" in type_name and FieldClass.__module__ != "rows.fields":
            custom_context[FieldClass.name] = FieldClass
    # Custom classes take precedence when both contexts share a key.
    context = {**rows_context, **custom_context}
    return OrderedDict(
        (row.field_name, context[row.internal_field_type]) for row in table
    )
38 | 
39 | 
def field_mapping_from_csv(csvfile):
    """Return a dict of ``original_name -> field_name`` read from the schema
    CSV named ``csvfile``."""
    table = rows.import_from_csv(schema_path_from_filename(csvfile))
    mapping = {}
    for schema_row in table:
        mapping[schema_row.original_name] = schema_row.field_name
    return mapping
45 | 
46 | 
class Schema:
    """Deserialize raw rows according to one of the schema CSV files.

    ``self.fields`` is a list of ``(field_name, original_field_name,
    deserialize)`` tuples, one per schema row.
    """

    def __init__(self, schema_filename):
        """Load the field classes and the name mapping for ``schema_filename``."""
        schema = load_schema(schema_filename)
        field_mapping = field_mapping_from_csv(schema_filename)

        self.fields = []
        for original_field_name, field_name in field_mapping.items():
            if field_name == "em_sigilo":
                # This column is computed in `deserialize`; it starts as "f".
                deserialize = self._em_sigilo_default
            else:
                deserialize = schema[field_name].deserialize
            self.fields.append((field_name, original_field_name, deserialize))

    @staticmethod
    def _em_sigilo_default(value):
        """Return "f", the initial "em_sigilo" value, ignoring the input."""
        return "f"

    def deserialize(self, row):
        """Convert ``row`` (keyed by original names) into a dict keyed by field name.

        Keys are popped from ``row`` as they are consumed, so the dict passed
        in is emptied. Values equal to one of ``EM_SIGILO_STRINGS`` become
        ``None``; when that happens and the schema declares an "em_sigilo"
        field, that field is set to "t".

        Raises:
            ValueError: if ``row`` has keys that are not in the schema.
        """
        new = {
            field_name: deserialize(row.pop(original_field_name, None))
            for field_name, original_field_name, deserialize in self.fields
        }
        if row:
            leftover = ", ".join(str(key) for key in row)
            raise ValueError(f"Missing fields during deserialization: {leftover}")
        # Collect the keys first: inserting "em_sigilo" while iterating over
        # `new` raises RuntimeError for schemas that do not declare that field.
        secret_keys = [key for key, value in new.items() if value in EM_SIGILO_STRINGS]
        for key in secret_keys:
            new[key] = None
        if secret_keys and "em_sigilo" in new:
            new["em_sigilo"] = "t"
        return new
73 | 


--------------------------------------------------------------------------------
/schema/despesa_empenho.csv:
--------------------------------------------------------------------------------
 1 | field_name,field_type,internal_field_type,original_name
 2 | id,integer,integer,Id Empenho
 3 | codigo,text,text,Código Empenho
 4 | codigo_resumido,text,text,Código Empenho Resumido
 5 | data_emissao,text,text,Data Emissão
 6 | codigo_tipo_documento,text,text,Código Tipo Documento
 7 | tipo_documento,text,text,Tipo Documento
 8 | tipo,text,text,Tipo Empenho
 9 | especie,text,text,Espécie Empenho
10 | codigo_orgao_superior,text,text,Código Órgão Superior
11 | orgao_superior,text,text,Órgão Superior
12 | codigo_orgao,text,text,Código Órgão
13 | orgao,text,text,Órgão
14 | codigo_unidade_gestora,text,text,Código Unidade Gestora
15 | unidade_gestora,text,text,Unidade Gestora
16 | codigo_gestao,text,text,Código Gestão
17 | gestao,text,text,Gestão
18 | codigo_favorecido,text,text,Código Favorecido
19 | favorecido,text,text,Favorecido
20 | observacao,text,text,Observação
21 | codigo_esfera_orcamentaria,text,text,Código Esfera Orçamentária
22 | esfera_orcamentaria,text,text,Esfera Orçamentária
23 | codigo_tipo_credito,text,text,Código Tipo Crédito
24 | tipo_credito,text,text,Tipo Crédito
25 | codigo_grupo_fonte_recurso,text,text,Código Grupo Fonte Recurso
26 | grupo_fonte_recurso,text,text,Grupo Fonte Recurso
27 | codigo_fonte_recurso,text,text,Código Fonte Recurso
28 | fonte_recurso,text,text,Fonte Recurso
29 | codigo_unidade_orcamentaria,text,text,Código Unidade Orçamentária
30 | unidade_orcamentaria,text,text,Unidade Orçamentária
31 | codigo_funcao,text,text,Código Função
32 | funcao,text,text,Função
33 | codigo_subfuncao,text,text,Código SubFunção
34 | subfuncao,text,text,SubFunção
35 | codigo_programa,text,text,Código Programa
36 | programa,text,text,Programa
37 | codigo_acao,text,text,Código Ação
38 | acao,text,text,Ação
39 | linguagem_cidada,text,text,Linguagem Cidadã
40 | codigo_subtitulo_localizador,text,text,Código Subtítulo (Localizador)
41 | subtitulo_localizador,text,text,Subtítulo (Localizador)
42 | codigo_plano_orcamentario,text,text,Código Plano Orçamentário
43 | plano_orcamentario,text,text,Plano Orçamentário
44 | codigo_programa_governo,text,text,Código Programa Governo
45 | programa_governo,text,text,Nome Programa Governo
46 | autor_emenda,text,text,Autor Emenda
47 | codigo_categoria_de_despesa,text,text,Código Categoria de Despesa
48 | categoria_despesa,text,text,Categoria de Despesa
49 | codigo_grupo_despesa,text,text,Código Grupo de Despesa
50 | grupo_despesa,text,text,Grupo de Despesa
51 | codigo_modalidade_aplicacao,text,text,Código Modalidade de Aplicação
52 | modalidade_aplicacao,text,text,Modalidade de Aplicação
53 | codigo_elemento_despesa,text,text,Código Elemento de Despesa
54 | elemento_despesa,text,text,Elemento de Despesa
55 | processo,text,text,Processo
56 | modalidade_licitacao,text,text,Modalidade de Licitação
57 | inciso,text,text,Inciso
58 | amparo,text,text,Amparo
59 | referencia_dispensa_inexigibilidade,text,text,Referência de Dispensa ou Inexigibilidade
60 | codigo_convenio,text,text,Código Convênio
61 | contrato_repasse_parceria_outros,text,text,Contrato de Repasse / Termo de Parceria / Outros
62 | valor_original,decimal,money_real,Valor Original do Empenho
63 | valor_reais,decimal,money_real,Valor do Empenho Convertido pra R$
64 | valor_conversao,decimal,money_real,Valor Utilizado na Conversão
65 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/base.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import io
 3 | import zipfile
 4 | from urllib.parse import urlparse
 5 | 
 6 | import scrapy
 7 | from cached_property import cached_property
 8 | 
 9 | from transparenciagovbr import settings
10 | from transparenciagovbr.utils.date import date_range, date_to_dict
11 | from transparenciagovbr.utils.fields import Schema
12 | from transparenciagovbr.utils.io import parse_zip
13 | 
14 | 
def parse_csv_rows(filename_or_fobj, inner_filename_suffix, encoding, schema):
    """Yield ``schema.deserialize(row)`` for every row ``parse_zip`` produces,
    leaving out results that are ``None``."""
    raw_rows = parse_zip(
        filename_or_fobj=filename_or_fobj,
        inner_filename_suffix=inner_filename_suffix,
        encoding=encoding,
    )
    for record in map(schema.deserialize, raw_rows):
        if record is not None:
            yield record
25 | 
26 | 
class TransparenciaBaseSpider(scrapy.Spider):
    """Base spider: request one ZIP file per date and yield its CSV rows.

    Subclasses are expected to define ``name``, ``base_url``, ``start_date``,
    ``end_date``, ``publish_frequency``, ``filename_suffix`` and
    ``schema_filename``, all of which are read by the methods below.
    """

    allowed_domains = [
        "portaldatransparencia.gov.br",
        "transparencia.gov.br",
        "data.brasil.io",
    ]
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.93 Safari/537.36",
    }
    encoding = "iso-8859-1"
    mirror_url = "https://data.brasil.io/mirror/transparenciagovbr/{dataset}/{filename}"

    def __init__(self, use_mirror="False", save_file="True", *args, **kwargs):
        """Parse the ``use_mirror``/``save_file`` flags.

        Both are compared case-insensitively with "true"; ``str()`` is applied
        first so real booleans are accepted as well as strings.
        """
        super().__init__(*args, **kwargs)
        self.use_mirror = str(use_mirror).lower() == "true"
        self.save_file = str(save_file).lower() == "true"

    @cached_property
    def schema(self):
        """Schema for ``self.schema_filename``.

        Cached so the schema CSV is not read again for every response.
        """
        return Schema(self.schema_filename)

    def make_filename(self, url):
        """Return the local path (under the spider's download directory) for ``url``."""
        return settings.DOWNLOAD_PATH / self.name / urlparse(url).path.rsplit("/", maxsplit=1)[-1]

    def start_requests(self):
        """Yield one request per date between ``start_date`` and ``end_date``.

        With ``use_mirror`` the mirror URL is requested; otherwise, with
        ``save_file``, a previously saved local copy is used when it exists.
        """
        for date in date_range(
            start=self.start_date, stop=self.end_date, interval=self.publish_frequency
        ):
            url = self.base_url.format(**date_to_dict(date))
            if self.use_mirror:
                url = self.mirror_url.format(
                    dataset=self.name,
                    filename=urlparse(url).path.rsplit("/", maxsplit=1)[-1],
                )
            elif self.save_file:
                filename = self.make_filename(url)
                if filename.exists():
                    url = f"file://{filename.absolute()}"
            yield scrapy.Request(url, callback=self.parse_zip_response)

    def parse_zip_response(self, response):
        """Optionally save the ZIP body to disk, then yield its deserialized rows."""
        # If it's set to save file and the response comes from the Web, then
        # save it to the disk.
        if self.save_file and not response.request.url.startswith("file://"):
            filename = self.make_filename(response.request.url)
            # exist_ok avoids failing when another response created the
            # directory between a check and the mkdir call.
            filename.parent.mkdir(parents=True, exist_ok=True)
            with open(filename, mode="wb") as fobj:
                fobj.write(response.body)

        yield from parse_csv_rows(
            filename_or_fobj=io.BytesIO(response.body),
            inner_filename_suffix=self.filename_suffix,
            encoding=self.encoding,
            schema=self.schema,
        )
84 | 


--------------------------------------------------------------------------------
/transparenciagovbr/spiders/despesa_item_empenho.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | from decimal import Decimal, InvalidOperation
  3 | from urllib.parse import urlparse
  4 | 
  5 | from transparenciagovbr import settings
  6 | from transparenciagovbr.spiders.base import TransparenciaBaseSpider
  7 | from transparenciagovbr.utils.date import today
  8 | 
  9 | 
 10 | class Text(str):
 11 | 
 12 |     def until(self, substr):
 13 |         return Text(self[:self.find(substr)])
 14 | 
 15 |     def starting_at(self, substr):
 16 |         return Text(self[self.find(substr):])
 17 | 
 18 |     def after(self, substr):
 19 |         return Text(self[self.find(substr) + len(substr):])
 20 | 
 21 | 
 22 | def parse_description(text):
 23 |     """Extrai dados estruturados do texto da descrição"""
 24 | 
 25 |     new = {
 26 |         "descricao_restante": "",
 27 |         "item": "",
 28 |         "item_material": "",
 29 |         "item_processo": "",
 30 |         "marca": "",
 31 |         "quantidade": "",
 32 |         "unidade": "",
 33 |     }
 34 | 
 35 |     if len(text) < 78 or "MARCA:" not in text:
 36 |         return new
 37 | 
 38 |     # TODO: verificar a possibilidade de transformar essa função num conjunto
 39 |     # de expressões regulares (provavelmente rodarão mais rapidamente)
 40 | 
 41 |     part1, part2 = Text(text[:78].strip()), Text(text[78:])
 42 |     try:
 43 |         new["quantidade"] = Decimal(part1.until(" ").strip().replace(".", "").replace(",", "."))
 44 |     except InvalidOperation:
 45 |         return new
 46 |     new["unidade"] = part1.after(" ").strip()
 47 | 
 48 |     item = part2.until(",")
 49 |     if item and item[0] == item[-1] == "'":
 50 |         item = item[1:-1]
 51 |     new["item"] = item
 52 |     rest = part2.after(",")
 53 | 
 54 |     new["descricao_restante"] = rest.until("MARCA:").strip()
 55 |     rest = rest.after("MARCA:")
 56 | 
 57 |     new["marca"] = rest.until("ITEM DO PROCESSO:").strip()
 58 |     rest = rest.after("ITEM DO PROCESSO:")
 59 | 
 60 |     new["item_processo"] = rest.until("ITEM DE MATERIAL:").strip()
 61 |     rest = rest.after("ITEM DE MATERIAL:")
 62 | 
 63 |     new["item_material"] = rest.strip()
 64 | 
 65 |     return new
 66 | 
 67 | 
 68 | def extract_extra_fields(row):
 69 |     new = {
 70 |         "quantidade": None,
 71 |         "unidade": None,
 72 |         "item": None,
 73 |         "marca": None,
 74 |         "item_processo": None,
 75 |         "item_material": None,
 76 |         "descricao_restante": None,
 77 |     }
 78 |     if row["elemento_despesa"] != "MATERIAL DE CONSUMO":
 79 |         return new
 80 |     # TODO: add fields related to services and other elements
 81 | 
 82 |     new.update(parse_description(row["descricao"]))
 83 |     return new
 84 | 
 85 | class DespesaMixin:
 86 |     def make_filename(self, url):
 87 |         return settings.DOWNLOAD_PATH / "despesa" / urlparse(url).path.rsplit("/", maxsplit=1)[-1]
 88 | 
 89 | 
 90 | class DespesaItemEmpenhoSpider(DespesaMixin, TransparenciaBaseSpider):
 91 |     name = "despesa_item_empenho"
 92 |     base_url = "http://transparencia.gov.br/download-de-dados/despesas/{year}{month:02d}{day:02d}"
 93 |     start_date = datetime.date(2013, 3, 31)
 94 |     end_date = today()
 95 |     publish_frequency = "daily"
 96 |     filename_suffix = "_Despesas_ItemEmpenho.csv"
 97 |     schema_filename = "despesa_item_empenho.csv"
 98 | 
 99 |     def parse_zip_response(self, response):
100 |         for row in super().parse_zip_response(response):
101 |             row.update(extract_extra_fields(row))
102 |             yield row
103 | 
104 | 
class DespesaEmpenhoSpider(DespesaMixin, TransparenciaBaseSpider):
    """Spider for the "_Despesas_Empenho.csv" member of the expenses ZIP files."""

    name = "despesa_empenho"
    # URL template; the placeholders are filled with each date's year, month and day
    base_url = "http://transparencia.gov.br/download-de-dados/despesas/{year}{month:02d}{day:02d}"
    start_date = datetime.date(2013, 3, 31)
    # Evaluated once, when this class body is executed
    end_date = today()
    publish_frequency = "daily"
    # Suffix of the CSV file to read inside each downloaded ZIP
    filename_suffix = "_Despesas_Empenho.csv"
    schema_filename = "despesa_empenho.csv"
113 | 


--------------------------------------------------------------------------------
/transparenciagovbr/settings.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from pathlib import Path
  4 | 
  5 | # Scrapy settings for transparenciagovbr project
  6 | #
  7 | # For simplicity, this file contains only settings considered important or
  8 | # commonly used. You can find more settings consulting the documentation:
  9 | #
 10 | #     https://doc.scrapy.org/en/latest/topics/settings.html
 11 | #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 12 | #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 13 | 
 14 | BOT_NAME = "transparenciagovbr"
 15 | 
 16 | SPIDER_MODULES = ["transparenciagovbr.spiders"]
 17 | NEWSPIDER_MODULE = "transparenciagovbr.spiders"
 18 | 
 19 | 
 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 21 | # USER_AGENT = 'transparenciagovbr (+http://www.yourdomain.com)'
 22 | 
# Whether to obey robots.txt rules: disabled here, so robots.txt is not
# consulted before requests are made.
ROBOTSTXT_OBEY = False
 25 | 
 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 27 | # CONCURRENT_REQUESTS = 32
 28 | 
 29 | # Configure a delay for requests for the same website (default: 0)
 30 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
 31 | # See also autothrottle settings and docs
 32 | # DOWNLOAD_DELAY = 3
 33 | # The download delay setting will honor only one of:
 34 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
 35 | # CONCURRENT_REQUESTS_PER_IP = 16
 36 | 
 37 | # Disable cookies (enabled by default)
 38 | # COOKIES_ENABLED = False
 39 | 
 40 | # Disable Telnet Console (enabled by default)
 41 | # TELNETCONSOLE_ENABLED = False
 42 | 
 43 | # Override the default request headers:
 44 | # DEFAULT_REQUEST_HEADERS = {
 45 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 46 | #   'Accept-Language': 'en',
 47 | # }
 48 | 
 49 | # Enable or disable spider middlewares
 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
 51 | # SPIDER_MIDDLEWARES = {
 52 | #    'transparenciagovbr.middlewares.TransparenciagovbrSpiderMiddleware': 543,
 53 | # }
 54 | 
 55 | # Enable or disable downloader middlewares
 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
 57 | # DOWNLOADER_MIDDLEWARES = {
 58 | #    'transparenciagovbr.middlewares.TransparenciagovbrDownloaderMiddleware': 543,
 59 | # }
 60 | 
 61 | # Enable or disable extensions
 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
 63 | # EXTENSIONS = {
 64 | #    'scrapy.extensions.telnet.TelnetConsole': None,
 65 | # }
 66 | 
 67 | # Configure item pipelines
 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 69 | # ITEM_PIPELINES = {
 70 | #    'transparenciagovbr.pipelines.TransparenciagovbrPipeline': 300,
 71 | # }
 72 | 
 73 | # Enable and configure the AutoThrottle extension (disabled by default)
 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
 75 | # AUTOTHROTTLE_ENABLED = True
 76 | # The initial download delay
 77 | # AUTOTHROTTLE_START_DELAY = 5
 78 | # The maximum download delay to be set in case of high latencies
 79 | # AUTOTHROTTLE_MAX_DELAY = 60
 80 | # The average number of requests Scrapy should be sending in parallel to
 81 | # each remote server
 82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 83 | # Enable showing throttling stats for every response received:
 84 | # AUTOTHROTTLE_DEBUG = False
 85 | 
 86 | # Enable and configure HTTP caching (disabled by default)
 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
# Per the Scrapy documentation, zero means cached requests never expire.
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Register the project's gzip-compressed CSV exporter and make it the default
# feed format.
FEED_EXPORTERS = {"csv.gz": "transparenciagovbr.exporters.GzipCsvItemExporter"}
FEED_FORMAT = "csv.gz"

# Project-specific paths: the repository root is the parent of this package's
# directory; downloaded files go under data/download inside it.
REPOSITORY_PATH = Path(__file__).parent.parent
DOWNLOAD_PATH = REPOSITORY_PATH / "data" / "download"

# Response size (in bytes) at which the downloader starts to warn: 2 GiB.
DOWNLOAD_WARNSIZE = 2 * 1024 * 1024 * 1024
101 | 


--------------------------------------------------------------------------------
/transparenciagovbr/middlewares.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define here the models for your spider middleware
  4 | #
  5 | # See documentation in:
  6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
  7 | 
  8 | from scrapy import signals
  9 | 
 10 | 
 11 | class TransparenciagovbrSpiderMiddleware(object):
 12 |     # Not all methods need to be defined. If a method is not defined,
 13 |     # scrapy acts as if the spider middleware does not modify the
 14 |     # passed objects.
 15 | 
 16 |     @classmethod
 17 |     def from_crawler(cls, crawler):
 18 |         # This method is used by Scrapy to create your spiders.
 19 |         s = cls()
 20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 21 |         return s
 22 | 
 23 |     def process_spider_input(self, response, spider):
 24 |         # Called for each response that goes through the spider
 25 |         # middleware and into the spider.
 26 | 
 27 |         # Should return None or raise an exception.
 28 |         return None
 29 | 
 30 |     def process_spider_output(self, response, result, spider):
 31 |         # Called with the results returned from the Spider, after
 32 |         # it has processed the response.
 33 | 
 34 |         # Must return an iterable of Request, dict or Item objects.
 35 |         for i in result:
 36 |             yield i
 37 | 
 38 |     def process_spider_exception(self, response, exception, spider):
 39 |         # Called when a spider or process_spider_input() method
 40 |         # (from other spider middleware) raises an exception.
 41 | 
 42 |         # Should return either None or an iterable of Response, dict
 43 |         # or Item objects.
 44 |         pass
 45 | 
 46 |     def process_start_requests(self, start_requests, spider):
 47 |         # Called with the start requests of the spider, and works
 48 |         # similarly to the process_spider_output() method, except
 49 |         # that it doesn’t have a response associated.
 50 | 
 51 |         # Must return only requests (not items).
 52 |         for r in start_requests:
 53 |             yield r
 54 | 
 55 |     def spider_opened(self, spider):
 56 |         spider.logger.info("Spider opened: %s" % spider.name)
 57 | 
 58 | 
 59 | class TransparenciagovbrDownloaderMiddleware(object):
 60 |     # Not all methods need to be defined. If a method is not defined,
 61 |     # scrapy acts as if the downloader middleware does not modify the
 62 |     # passed objects.
 63 | 
 64 |     @classmethod
 65 |     def from_crawler(cls, crawler):
 66 |         # This method is used by Scrapy to create your spiders.
 67 |         s = cls()
 68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 69 |         return s
 70 | 
 71 |     def process_request(self, request, spider):
 72 |         # Called for each request that goes through the downloader
 73 |         # middleware.
 74 | 
 75 |         # Must either:
 76 |         # - return None: continue processing this request
 77 |         # - or return a Response object
 78 |         # - or return a Request object
 79 |         # - or raise IgnoreRequest: process_exception() methods of
 80 |         #   installed downloader middleware will be called
 81 |         return None
 82 | 
 83 |     def process_response(self, request, response, spider):
 84 |         # Called with the response returned from the downloader.
 85 | 
 86 |         # Must either;
 87 |         # - return a Response object
 88 |         # - return a Request object
 89 |         # - or raise IgnoreRequest
 90 |         return response
 91 | 
 92 |     def process_exception(self, request, exception, spider):
 93 |         # Called when a download handler or a process_request()
 94 |         # (from other downloader middleware) raises an exception.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this exception
 98 |         # - return a Response object: stops process_exception() chain
 99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 | 
102 |     def spider_opened(self, spider):
103 |         spider.logger.info("Spider opened: %s" % spider.name)
104 | 


--------------------------------------------------------------------------------
/pensionista/convert.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import datetime
  3 | import io
  4 | from functools import lru_cache
  5 | from pathlib import Path
  6 | from uuid import NAMESPACE_URL, uuid5
  7 | from zipfile import ZipFile
  8 | 
  9 | from rows.fields import slug
 10 | from rows.utils import CsvLazyDictWriter, open_compressed
 11 | from tqdm import tqdm
 12 | 
 13 | 
# Module-level alias used by convert_date(); presumably here to avoid
# repeating the `datetime.datetime.strptime` attribute lookups on every call.
strptime = datetime.datetime.strptime
 15 | 
 16 | @lru_cache(maxsize=1024 ** 2)
 17 | def convert_number(value):
 18 |     return value.replace(".", "").replace(",", ".")
 19 | 
 20 | 
 21 | @lru_cache(maxsize=32 * 1024)
 22 | def convert_date(value):
 23 |     value = value.strip()
 24 |     if not value:
 25 |         return None
 26 |     return str(strptime(value, "%d/%m/%Y").date())
 27 | 
 28 | 
 29 | @lru_cache(maxsize=1024 * 1024)
 30 | def person_uuid(cpf, name):
 31 |     """Create UUID based on URLid methodology"""
 32 | 
 33 |     if cpf is None:
 34 |         cpf = "***********"
 35 |     assert len(cpf) == 11, f"Invalid CPF: {repr(cpf)}"
 36 |     internal_id = cpf[3:9] + "-" + slug(name).upper().replace("_", "-")
 37 |     return str(uuid5(NAMESPACE_URL, f"https://id.brasil.io/person/v1/{internal_id}/"))
 38 | 
 39 | 
 40 | @lru_cache(maxsize=128)
 41 | def normalize_key(text):
 42 | 
 43 |     text = text.replace("(R$)", "_brl_").replace("(U$)", "_usd_")
 44 |     result = (
 45 |         slug(text)
 46 |         .replace("_registradas_em_sistemas_de_pessoal_", "_")
 47 |         .replace("_programa_desligamento_voluntario_mp_792_2017_", "_deslig_voluntario_")
 48 |     )
 49 |     return result
 50 | 
 51 | 
 52 | def convert_row(row):
 53 |     new = {}
 54 |     for original_key, value in row.items():
 55 |         key = normalize_key(original_key)
 56 |         value = value.strip()
 57 |         if (value and value[0] == "0" and value[-1] == "0" and set(value) == {"0"}) or value in ("-", "--"):
 58 |             value = None
 59 |         if not key and not value:
 60 |             continue
 61 | 
 62 |         if key.startswith("data_") and value is not None:
 63 |             value = convert_date(value)
 64 |         elif value is not None and ("R$" in original_key or "U$" in original_key):
 65 |             value = convert_number(value)
 66 | 
 67 |         new[key] = value
 68 |     return new
 69 | 
 70 | 
 71 | def read_csv(fobj, table_name, year, month, input_encoding="iso-8859-1", delimiter=";"):
 72 |     """Read binary `fobj` as CSV, convert each row, adding `table_name` as a column"""
 73 | 
 74 |     fobj = io.TextIOWrapper(fobj, encoding=input_encoding)
 75 |     reader = csv.DictReader(fobj, delimiter=delimiter)
 76 |     for row in reader:
 77 |         new = convert_row(row)
 78 |         if "(*)" in new.get("ano", ""):  # Invalid row
 79 |             continue
 80 |         if "ano" not in new:
 81 |             new["ano"] = year
 82 |         if "mes" not in new:
 83 |             new["mes"] = month
 84 |         if "PENSIONISTA MENOR DE 16 ANOS" in new["cpf"]:
 85 |             new["menor_16"] = True
 86 |             new["cpf"] = None
 87 |         else:
 88 |             new["menor_16"] = False
 89 |             new["cpf"] = new["cpf"].replace(".", "").replace("-", "")
 90 |         new["sistema_origem"] = table_name
 91 |         new["pessoa_uuid"] = person_uuid(new["cpf"], new["nome"])
 92 |         if table_name == "cadastro":
 93 |             for key in ("representante_legal", "instituidor"):
 94 |                 new[f"cpf_{key}"] = new[f"cpf_{key}"].replace(".", "").replace("-", "")
 95 |                 new[f"{key}_uuid"] = person_uuid(new[f"cpf_{key}"], new[f"nome_{key}"])
 96 |         yield new
 97 | 
 98 | 
 99 | def extract_year_month(filename):
100 |     """Extract year and month from ZIP filename"""
101 | 
102 |     part = filename.name.lower().split(".zip")[0]
103 |     return int(part[:4]), int(part[4:6])
104 | 
105 | 
def extract_origin_system(filename):
    """Return the last "_"-separated token of `filename` before ".zip".

    `filename` is a plain string; the ".zip" match is case-sensitive.
    """
    stem = filename.partition(".zip")[0]
    return stem.rpartition("_")[2]
108 | 
109 | 
if __name__ == "__main__":
    # Command-line entry point: convert every downloaded ZIP into a single
    # compressed CSV holding the rows of the requested table.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("table_name", choices=("cadastro", "remuneracao", "observacao"))
    args = parser.parse_args()

    # Make sure all working paths exist before anything
    DATA_PATH = Path(__file__).parent / "data"
    DOWNLOAD_PATH = DATA_PATH / "download"
    OUTPUT_PATH = DATA_PATH / "output"
    for path in (DATA_PATH, DOWNLOAD_PATH, OUTPUT_PATH):
        path.mkdir(parents=True, exist_ok=True)

    # Create one compressed-CSV writer
    output_filename = OUTPUT_PATH / f"pensionista_{args.table_name}.csv.gz"
    output_fobj = open_compressed(output_filename, mode="w", buffering=8 * 1024 * 1024)
    writer = CsvLazyDictWriter(output_fobj)

    # Read each ZIP file, then each inner ZIP file, then filter desired
    # inner-inner CSV file, convert it and write to the output CSV.
    progress_bar = tqdm()
    try:
        for zip_path in sorted(DOWNLOAD_PATH.glob("*.zip"), key=extract_year_month):
            year, month = extract_year_month(zip_path)
            period = f"{year}-{month:02d}"
            progress_bar.desc = period
            progress_bar.refresh()
            with ZipFile(zip_path) as zf:
                for fileinfo in zf.filelist:
                    origin_system = extract_origin_system(fileinfo.filename)
                    progress_bar.desc = f"{period}/{origin_system}"
                    with zf.open(fileinfo.filename) as inner_fobj, ZipFile(inner_fobj) as inner_zf:
                        for inner_fileinfo in inner_zf.filelist:
                            table_name = (
                                inner_fileinfo.filename.split(".")[0]
                                .split("_")[-1]
                                .lower()
                                .replace("observacoes", "observacao")
                            )
                            if table_name != args.table_name:  # We don't want this file
                                continue
                            progress_bar.desc = f"{period}/{origin_system}.{table_name}"
                            # NOTE(review): read_csv() receives the origin
                            # system as its `table_name` argument, so its
                            # `table_name == "cadastro"` branch is not
                            # reached from here -- confirm this is intended.
                            with inner_zf.open(inner_fileinfo.filename) as csv_fobj:
                                for row in read_csv(csv_fobj, origin_system, year, month):
                                    writer.writerow(row)
                                    progress_bar.update()
    finally:
        progress_bar.close()
        # Close the output explicitly so the buffered, compressed data is
        # flushed to disk even when a conversion error interrupts the run.
        output_fobj.close()
154 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                    GNU LESSER GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 | 
  9 |   This version of the GNU Lesser General Public License incorporates
 10 | the terms and conditions of version 3 of the GNU General Public
 11 | License, supplemented by the additional permissions listed below.
 12 | 
 13 |   0. Additional Definitions.
 14 | 
 15 |   As used herein, "this License" refers to version 3 of the GNU Lesser
 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
 17 | General Public License.
 18 | 
 19 |   "The Library" refers to a covered work governed by this License,
 20 | other than an Application or a Combined Work as defined below.
 21 | 
 22 |   An "Application" is any work that makes use of an interface provided
 23 | by the Library, but which is not otherwise based on the Library.
 24 | Defining a subclass of a class defined by the Library is deemed a mode
 25 | of using an interface provided by the Library.
 26 | 
 27 |   A "Combined Work" is a work produced by combining or linking an
 28 | Application with the Library.  The particular version of the Library
 29 | with which the Combined Work was made is also called the "Linked
 30 | Version".
 31 | 
 32 |   The "Minimal Corresponding Source" for a Combined Work means the
 33 | Corresponding Source for the Combined Work, excluding any source code
 34 | for portions of the Combined Work that, considered in isolation, are
 35 | based on the Application, and not on the Linked Version.
 36 | 
 37 |   The "Corresponding Application Code" for a Combined Work means the
 38 | object code and/or source code for the Application, including any data
 39 | and utility programs needed for reproducing the Combined Work from the
 40 | Application, but excluding the System Libraries of the Combined Work.
 41 | 
 42 |   1. Exception to Section 3 of the GNU GPL.
 43 | 
 44 |   You may convey a covered work under sections 3 and 4 of this License
 45 | without being bound by section 3 of the GNU GPL.
 46 | 
 47 |   2. Conveying Modified Versions.
 48 | 
 49 |   If you modify a copy of the Library, and, in your modifications, a
 50 | facility refers to a function or data to be supplied by an Application
 51 | that uses the facility (other than as an argument passed when the
 52 | facility is invoked), then you may convey a copy of the modified
 53 | version:
 54 | 
 55 |    a) under this License, provided that you make a good faith effort to
 56 |    ensure that, in the event an Application does not supply the
 57 |    function or data, the facility still operates, and performs
 58 |    whatever part of its purpose remains meaningful, or
 59 | 
 60 |    b) under the GNU GPL, with none of the additional permissions of
 61 |    this License applicable to that copy.
 62 | 
 63 |   3. Object Code Incorporating Material from Library Header Files.
 64 | 
 65 |   The object code form of an Application may incorporate material from
 66 | a header file that is part of the Library.  You may convey such object
 67 | code under terms of your choice, provided that, if the incorporated
 68 | material is not limited to numerical parameters, data structure
 69 | layouts and accessors, or small macros, inline functions and templates
 70 | (ten or fewer lines in length), you do both of the following:
 71 | 
 72 |    a) Give prominent notice with each copy of the object code that the
 73 |    Library is used in it and that the Library and its use are
 74 |    covered by this License.
 75 | 
 76 |    b) Accompany the object code with a copy of the GNU GPL and this license
 77 |    document.
 78 | 
 79 |   4. Combined Works.
 80 | 
 81 |   You may convey a Combined Work under terms of your choice that,
 82 | taken together, effectively do not restrict modification of the
 83 | portions of the Library contained in the Combined Work and reverse
 84 | engineering for debugging such modifications, if you also do each of
 85 | the following:
 86 | 
 87 |    a) Give prominent notice with each copy of the Combined Work that
 88 |    the Library is used in it and that the Library and its use are
 89 |    covered by this License.
 90 | 
 91 |    b) Accompany the Combined Work with a copy of the GNU GPL and this license
 92 |    document.
 93 | 
 94 |    c) For a Combined Work that displays copyright notices during
 95 |    execution, include the copyright notice for the Library among
 96 |    these notices, as well as a reference directing the user to the
 97 |    copies of the GNU GPL and this license document.
 98 | 
 99 |    d) Do one of the following:
100 | 
101 |        0) Convey the Minimal Corresponding Source under the terms of this
102 |        License, and the Corresponding Application Code in a form
103 |        suitable for, and under terms that permit, the user to
104 |        recombine or relink the Application with a modified version of
105 |        the Linked Version to produce a modified Combined Work, in the
106 |        manner specified by section 6 of the GNU GPL for conveying
107 |        Corresponding Source.
108 | 
109 |        1) Use a suitable shared library mechanism for linking with the
110 |        Library.  A suitable mechanism is one that (a) uses at run time
111 |        a copy of the Library already present on the user's computer
112 |        system, and (b) will operate properly with a modified version
113 |        of the Library that is interface-compatible with the Linked
114 |        Version.
115 | 
116 |    e) Provide Installation Information, but only if you would otherwise
117 |    be required to provide such information under section 6 of the
118 |    GNU GPL, and only to the extent that such information is
119 |    necessary to install and execute a modified version of the
120 |    Combined Work produced by recombining or relinking the
121 |    Application with a modified version of the Linked Version. (If
122 |    you use option 4d0, the Installation Information must accompany
123 |    the Minimal Corresponding Source and Corresponding Application
124 |    Code. If you use option 4d1, you must provide the Installation
125 |    Information in the manner specified by section 6 of the GNU GPL
126 |    for conveying Corresponding Source.)
127 | 
128 |   5. Combined Libraries.
129 | 
130 |   You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 | 
136 |    a) Accompany the combined library with a copy of the same work based
137 |    on the Library, uncombined with any other library facilities,
138 |    conveyed under the terms of this License.
139 | 
140 |    b) Give prominent notice with the combined library that part of it
141 |    is a work based on the Library, and explaining where to find the
142 |    accompanying uncombined form of the same work.
143 | 
144 |   6. Revised Versions of the GNU Lesser General Public License.
145 | 
146 |   The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 | 
151 |   Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 | 
161 |   If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 


--------------------------------------------------------------------------------