├── data
│   └── logs
│       ├── .gitkeep
│       ├── .gitignore
│       └── calculate_lines.sh
├── src
│   ├── .gitignore
│   ├── tests
│   │   ├── test-create-table.py
│   │   └── test-convert-to-parquet.py
│   ├── convert_state_csv_data_to_parquet.py
│   ├── count_number_of_lines.py
│   ├── 01_extrair_eventos_relacionados_a_votos.ipynb
│   ├── 03B_calcular_metricas_temporais.ipynb
│   ├── 02_isolar_timestamps_eventos.ipynb
│   ├── 03A_calcular_metricas_tempo.ipynb
│   └── test-basic-queries.ipynb
├── duckdb
│   ├── requirements.txt
│   └── Dockerfile
├── streamlit
│   ├── .gitignore
│   ├── app
│   │   ├── requirements.txt
│   │   ├── maps
│   │   │   └── donwload_files.txt
│   │   ├── maps.py
│   │   ├── data.py
│   │   ├── main.py
│   │   └── widgets.py
│   └── Dockerfile
├── docker-compose.yaml
├── convert_encoding_from_files.py
├── extract_log_files.py
├── download_log_urnas.py
└── README.md
/data/logs/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.parquet
2 | *.csv
3 | .tmp
--------------------------------------------------------------------------------
/data/logs/.gitignore:
--------------------------------------------------------------------------------
1 | *.zip
2 | *.logjez
3 | *.csv
--------------------------------------------------------------------------------
/duckdb/requirements.txt:
--------------------------------------------------------------------------------
1 | duckdb
2 | pandas==2.2.1
--------------------------------------------------------------------------------
/streamlit/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.svg
3 | *.zip
--------------------------------------------------------------------------------
/streamlit/app/requirements.txt:
--------------------------------------------------------------------------------
1 | altair
2 | pandas
3 | duckdb
4 | streamlit
5 | matplotlib
6 | geopandas
7 | seaborn
--------------------------------------------------------------------------------
/duckdb/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/minimal-notebook
2 |
3 | COPY requirements.txt /app/requirements.txt
4 | WORKDIR /app
5 |
6 | RUN pip install -r requirements.txt --upgrade
7 | WORKDIR /src
8 |
9 | # EXPOSE JUPYTER PORT
10 | EXPOSE 8888
11 |
--------------------------------------------------------------------------------
/streamlit/app/maps/donwload_files.txt:
--------------------------------------------------------------------------------
1 | https://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2022/Brasil/BR/BR_UF_2022.zip
2 | https://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2022/Brasil/BR/BR_Municipios_2022.zip
3 |
--------------------------------------------------------------------------------
/streamlit/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim
2 | WORKDIR /app
3 | COPY ./app/requirements.txt /app/requirements.txt
4 |
5 | RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/* && pip3 install -r requirements.txt  # curl is used by the HEALTHCHECK below
6 |
7 | EXPOSE 8500
8 |
9 | HEALTHCHECK CMD curl --fail http://localhost:8500/_stcore/health
10 |
11 | ENTRYPOINT ["streamlit", "run", "main.py", "--server.port=8500", "--server.address=0.0.0.0"]
--------------------------------------------------------------------------------
/src/tests/test-create-table.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 |
3 | cursor = duckdb.connect('test.db')
4 | print(cursor.execute(
5 | """
6 | CREATE OR REPLACE TABLE test_zz AS
7 | SELECT
8 | *
9 | FROM
10 | read_csv('/data/logs/2_ZZ/*_new.csv', filename=True)
11 | """
12 | ))
13 |
14 | # Select the data from the table LIMIT 10
15 | print(cursor.execute("SELECT * FROM test_zz LIMIT 10").fetchall())
--------------------------------------------------------------------------------
/src/tests/test-convert-to-parquet.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | import time
3 | cursor = duckdb.connect('test.db')
4 |
5 | tic = time.time()
6 | cursor.execute(
7 | """
8 | COPY (
9 | SELECT
10 | *
11 | FROM read_csv('/data/logs/2_ZZ/*_new.csv', filename=True)
12 | ) TO 'test_zz.parquet' (FORMAT 'parquet');
13 | """
14 | )
15 | toc = time.time()
16 | print(f"Time taken: {toc - tic} seconds")
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | duckdb:
4 | build: ./duckdb
5 | volumes:
6 | - ./data:/data
7 | - ./src:/src
8 | ports:
9 | - "8888:8888"
10 | command: start-notebook.sh --NotebookApp.token='1234'
11 | streamlit:
12 | build: ./streamlit
13 | volumes:
14 | - ./src:/src
15 | - ./streamlit/app/:/app
16 | ports:
17 | - "8600:8500"
18 | # command: streamlit run /src/app.py  # not needed: the image ENTRYPOINT already runs /app/main.py
19 |
--------------------------------------------------------------------------------
/data/logs/calculate_lines.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Array of Brazilian states
4 | states=("AC" "AL" "AP" "AM" "BA" "CE" "DF" "ES" "GO" "MA" "MT" "MS" "MG" "PA" "PB" "PR" "PE" "PI" "RJ" "RN" "RS" "RO" "RR" "SC" "SP" "SE" "TO")
5 |
6 | # Iterate over each state
7 | for state in "${states[@]}"
8 | do
9 | # Concatenate "2_" in front of the state
10 | state_with_prefix="2_$state"
11 | echo "Calculating total lines for $state_with_prefix"
12 | find "./$state_with_prefix" -type f -exec wc -l {} + | awk -v st="$state_with_prefix" '{total += $1} END {print "Total lines in " st ":", total}'
13 | done
14 |
--------------------------------------------------------------------------------
/src/convert_state_csv_data_to_parquet.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | import time
3 | import sys
4 |
5 |
6 | STATES = [
7 | "AC", "AL", "AM", "AP", "BA", "CE",
8 | "DF", "ES", "GO", "MA", "MG", "MS",
9 | "MT", "PA", "PB", "PE", "PI", "PR",
10 | "RJ", "RN", "RO", "RR", "RS", "SC",
11 | "SE", "SP", "TO", "ZZ"]
12 |
13 | if __name__ == "__main__":
14 | # get the first sys arg
15 | uf = sys.argv[1]
16 |
17 | # if sys arg not in the brazilian states
18 | if uf not in STATES:
19 | print("Invalid state")
20 | sys.exit(1)
21 |
22 | tic = time.time()
23 | cursor = duckdb.connect("")
24 | query = f"""
25 | COPY (
26 | SELECT
27 | *
28 | FROM read_csv('/data/logs/2_{uf}/*.csv', filename=True)
29 | ) TO '{uf}.parquet' (FORMAT 'parquet');
30 | """
31 |
32 | cursor.execute(query)
33 | toc = time.time()
34 | print(f"Time taken to convert {uf} to parquet: {toc - tic} seconds")
35 |
--------------------------------------------------------------------------------
/src/count_number_of_lines.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 | import time
3 | import sys
4 |
5 |
6 | STATES = [
7 | "AC", "AL", "AM", "AP", "BA", "CE",
8 | "DF", "ES", "GO", "MA", "MG", "MS",
9 | "MT", "PA", "PB", "PE", "PI", "PR",
10 | "RJ", "RN", "RO", "RR", "RS", "SC",
11 | "SE", "SP", "TO", "ZZ", "ALL"]
12 |
13 | if __name__ == "__main__":
14 | # get the first sys arg
15 | uf = sys.argv[1]
16 |
17 | # if sys arg not in the brazilian states
18 | if uf not in STATES:
19 | print("Invalid state")
20 | sys.exit(1)
21 |
22 | tic = time.time()
23 | cursor = duckdb.connect("")
24 |
25 | if uf == "ALL":
26 | query = f"""
27 | SELECT
28 | COUNT(*)
29 | FROM '*.parquet'
30 | """
31 | else:
32 | query = f"""
33 | SELECT
34 | COUNT(*)
35 | FROM '{uf}.parquet'
36 | """
37 |
38 | cursor.execute(query)
39 | toc = time.time()
40 | print(f"Time taken to count number of lines in {uf}: {toc - tic} seconds")
41 | print(cursor.fetchall())
42 |
--------------------------------------------------------------------------------
/convert_encoding_from_files.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tqdm
3 | import time
4 |
5 |
6 | if __name__ == "__main__":
7 |
8 | BASE_LOGS_PATH = "./data/logs"
9 | # list all directories in the base path
10 | directories = os.listdir(BASE_LOGS_PATH)
11 | command = "touch {} && iconv -f ISO-8859-1 -t UTF-8//TRANSLIT {} > {} && rm {}"
12 |
13 | tic = time.time()
14 | for directory in directories:
15 | path = BASE_LOGS_PATH + "/" + directory
16 | if not os.path.isdir(path):
17 | continue
18 |
19 | files = os.listdir(path)
20 | files = [file for file in files if file.endswith(".csv") and not file.endswith("_new.csv")]
21 | print(f"Processing directory {directory} with {len(files)} files")
22 |
23 | for file in tqdm.tqdm(files):
24 | # convert the encoding of the file
25 | filename = file.split(".")[0]
26 | new_filename = filename + "_new.csv"
27 |
28 | path_old_file = path + "/" + file
29 | path_new_file = path + "/" + new_filename
30 |
31 | os.system(command.format(path_new_file, path_old_file, path_new_file, path_old_file))
32 | toc = time.time()
33 |
34 | print(f"Conversion took {toc - tic} seconds")
--------------------------------------------------------------------------------
/extract_log_files.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | BASE_PATH = './data/logs'
4 |
5 |
6 | def unzip_log_files(zip_file):
7 |
8 | # Each ZIP file contains MULTIPLE *.logjez files
9 | # Each A.logjez file contains a logd.dat file that is the LOG file
10 | # This code extracts each A.logjez file and renames its logd.dat to A.csv
11 |
12 | # unzip file
13 | # extracting only the .logjez files
14 | filepath = zip_file[:-4]
15 | os.system(f'7z e {zip_file} -o{filepath} *.logjez -r')
16 |
17 | # Remove unnecessary files
18 | os.system(f'rm {zip_file}') # Zip file
19 |
20 | # list all files in the directory
21 | files = os.listdir(filepath)
22 |
23 | for file in files:
24 | # extract .logjez files
25 | # and rename to .csv
26 | if file.endswith('.logjez'):
27 | new_filename = file[:-7]
28 | os.system(
29 | f'7z e {filepath}/{file} -y -o{filepath}/{new_filename} \
30 | > /dev/null'
31 | )
32 | os.system(
33 | f'mv \
34 | {filepath}/{new_filename}/logd.dat \
35 | {filepath}/{new_filename}.csv'
36 | )
37 | os.system(
38 | f'rm -r {filepath}/{new_filename}'
39 | )
40 |
41 | os.system(f'chmod 777 -R {filepath}')
42 | os.system(f'rm {filepath}/*.logjez')
43 |
44 |
45 | if __name__ == "__main__":
46 | for file in os.listdir(BASE_PATH):
47 | if file.endswith('.zip'):
48 | unzip_log_files(os.path.join(BASE_PATH, file))
--------------------------------------------------------------------------------
/streamlit/app/maps.py:
--------------------------------------------------------------------------------
1 | import geopandas as gpd
2 | import matplotlib.pyplot as plt
3 | import re
4 | import streamlit as st
5 |
6 | @st.cache_data()
7 | def load_brazil_simplified_map():
8 | """
9 | Load the simplified map of Brazil.
10 | The simplification is done to reduce the file size
11 | and improve performance on the streamlit app.
12 |
13 | Returns:
14 | gpd.GeoDataFrame: GeoDataFrame with the simplified map of Brazil.
15 | """
16 |
17 | map_ufs = './maps/BR_UF_2022.zip'
18 |
19 | gdf = gpd.read_file(map_ufs)
20 | gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.01)
21 |
22 | return gdf
23 |
24 | @st.cache_data()
25 | def load_ufs_city_simplified_map():
26 | """
27 | Load the simplified map of Brazil with cities.
28 | """
29 | map_municipios = './maps/BR_Municipios_2022.zip'
30 |
31 | gdf = gpd.read_file(map_municipios)
32 | gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.01)
33 |
34 | return gdf
35 |
36 | def add_ufs_and_links_to_map(svg_image_buffer):
37 | """
38 | Generate links for each UF in the SVG image buffer,
39 | making the map clickable.
40 |
41 | Args:
42 | svg_image_buffer (str): SVG image buffer.
43 |
44 | Returns:
45 | str: SVG image buffer with links for each UF.
46 | """
47 |
48 | re_uf_map_pattern = r'(((.|\s)*?))'
49 | image_with_links = re.sub(
50 | re_uf_map_pattern,
51 | r"\1",
52 | svg_image_buffer
53 | )
54 | return image_with_links
55 |
--------------------------------------------------------------------------------
/download_log_urnas.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import queue
3 | import os
4 | import sys
5 | import logging
6 | from itertools import product
7 |
8 | BASE_URL = (
9 | 'https://cdn.tse.jus.br/estatistica/sead/eleicoes/' +
10 | 'eleicoes2022/arqurnatot/bu_imgbu_logjez_rdv_vscmr_2022_{}t_{}.zip'
11 | )
12 |
13 | UFS_BR = [
14 | 'AC', 'AL', 'AP', 'AM',
15 | 'BA', 'CE', 'DF', 'ES',
16 | 'GO', 'MA', 'MT', 'MS',
17 | 'MG', 'PA', 'PB', 'PR',
18 | 'PE', 'PI', 'RJ', 'RN',
19 | 'RS', 'RO', 'RR', 'SC',
20 | 'SP', 'SE', 'TO', 'ZZ'
21 | ]
22 | # TURNOS = [1, 2]
23 | TURNOS = [2]
24 |
25 |
26 | NUM_THREADS = 4
27 |
28 | # Create a queue to communicate with the worker threads
29 | turnos_uf_queue = queue.Queue()
30 |
31 | # Configure logging
32 | logging.basicConfig(
33 | level=logging.INFO,
34 | format='[%(asctime)s] %(message)s',
35 | datefmt='%d/%m/%y %H:%M:%S'
36 | )
37 |
38 |
39 | def download_file():
40 | uf_turno = turnos_uf_queue.get()
41 | url = BASE_URL.format(*uf_turno)
42 | path = os.path.join('data', 'logs', f'{uf_turno[0]}_{uf_turno[1]}.zip')
43 |
44 | logging.info(f'Downloading {url} to {path}')
45 |
46 | logging.info(f'Iniciando download de {url}')
47 | try:
48 | os.system(f'wget -O {path} {url}')
49 | except Exception as e:
50 | logging.error(f"Erro ao tentar baixar o arquivo {url}")
51 | logging.error(e)
52 | return
53 |
54 | logging.info(f'Finalizado download de {url}')
55 |
56 | if turnos_uf_queue.empty():
57 | logging.info('All downloads finished')
58 | else:
59 | logging.info(f'{turnos_uf_queue.qsize()} downloads remaining')
60 | download_file()
61 |
62 | turnos_uf_queue.task_done()
63 | return
64 |
65 |
66 | if __name__ == "__main__":
67 | ufs_br_download = UFS_BR
68 | if len(sys.argv) > 1:
69 | ufs_br_download = sys.argv[1:]
70 |
71 | logging.info(f'Iniciando download de {len(ufs_br_download)} arquivos')
72 | logging.info(f'UFs: {ufs_br_download}')
73 | logging.info(f'Turnos: {TURNOS}')
74 |
75 | for uf_br, turno in product(ufs_br_download, TURNOS):
76 | turnos_uf_queue.put((turno, uf_br))
77 |
78 | for i in range(NUM_THREADS):
79 | worker = threading.Thread(
80 | target=download_file,
81 | daemon=True
82 | )
83 | worker.start()
84 |
85 | turnos_uf_queue.join()
86 | logging.info("Done")
87 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Processing Logs of Electronic Ballot Boxes
3 | This repository contains Python + DuckDB scripts for processing logs from [Brazilian Electronic Ballot Boxes](https://international.tse.jus.br/en/electronic-ballot-box/presentation?set_language=en) to compute several time-related metrics (mean vote time, number of votes computed per 5-minute interval, percentage of successful biometric identifications).
4 |
5 | ## The Data
6 | The logs from the voting machines can be directly downloaded from the [TSE open data website](https://dadosabertos.tse.jus.br/dataset/resultados-2022-arquivos-transmitidos-para-totalizacao). This repository contains Python scripts that automatically download and extract the logs.
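For reference, the snippet below is a minimal sketch of what a single download amounts to; the actual `download_log_urnas.py` script parallelizes this with `wget` worker threads across all UFs and rounds. The round (`2`) and UF (`RN`) are just illustrative values.

```python
# Minimal sketch: fetch the 2nd-round log archive of one state (RN here).
# Uses the same URL template as download_log_urnas.py; assumes ./data/logs exists.
import urllib.request

BASE_URL = (
    'https://cdn.tse.jus.br/estatistica/sead/eleicoes/'
    'eleicoes2022/arqurnatot/bu_imgbu_logjez_rdv_vscmr_2022_{}t_{}.zip'
)

turno, uf = 2, 'RN'  # election round and state code
url = BASE_URL.format(turno, uf)
# The archives are large (hundreds of MB to several GB), so this can take a while.
urllib.request.urlretrieve(url, f'data/logs/{turno}_{uf}.zip')
```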
7 |
8 | ## What are the logs of the Electronic Ballot Boxes?
9 | Files that contain all operations performed on the machine, from the initial setup to the end of voting in the second round (if applicable). The files are stored in plain text, with each line representing an event. See an example below:
10 |
11 | ```
12 | 21/09/2022 17:21:41 INFO 67305985 LOGD Start of logd operations FDE9B0FC7A079096
13 | 21/09/2022 17:21:41 INFO 67305985 LOGD Machine turned on on 21/09/2022 at 17:20:16 B637C17E565B039B
14 | 21/09/2022 17:21:41 INFO 67305985 SCUE Starting application - Official - 1st round F82E007ACCAF93A5
15 | 21/09/2022 17:21:41 INFO 67305985 SCUE Application version: 8.26.0.0 - Jaguar D499E9A173814A70
16 | ```
17 | With these logs, it is possible to extract numerous pieces of information about the electoral process. Because of their verbosity, the ballot box logs are also very large: in their original format, the log files for a single Brazilian state range from 2GB to over 50GB, and all states combined reach 450GB! Robust processing tools and optimized file formats are therefore indispensable.
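This is why the pipeline converts the raw CSV logs to Parquet as early as possible. The block below is a simplified sketch of that conversion with DuckDB (the full version lives in `src/convert_state_csv_data_to_parquet.py`; the `2_SP` folder name is just an example of an extracted second-round state directory):

```python
# Minimal sketch: compact one state's extracted CSV logs into a single Parquet file.
import duckdb

con = duckdb.connect()  # an in-memory database is enough for a straight copy
con.execute("""
    COPY (
        SELECT * FROM read_csv('data/logs/2_SP/*.csv', filename=True)
    ) TO 'SP.parquet' (FORMAT 'parquet');
""")
```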
18 |
19 | ## Note on Approximations and Errors
20 | Processing the voting machine logs is not simple.
21 | Although the logs are easy to read, defining a process that perfectly isolates each vote is hard, because many different situations can occur during voting.
22 |
23 | The scripts in this repository aim to be as generic and simple as possible, to ease understanding and maintenance and to reduce the computational cost of processing. As a consequence, they may occasionally fail to capture ALL votes. The error rate (uncaptured votes) relative to the official TSE count is ~3% (experiment conducted with RN data).
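The core idea, implemented in `src/02_isolar_timestamps_eventos.ipynb`, is to treat the 'Aguardando digitação do título' event as the anchor that opens a new vote and to group every following log line under that vote. The query below is a simplified sketch of that heuristic (it assumes the event data extracted by notebook 01 is available as `UFS_VOTE_EVENTS.parquet`):

```python
# Simplified sketch of the vote-isolation heuristic: every anchor event starts a
# new vote_id, and all events up to the next anchor belong to the same vote.
import duckdb

con = duckdb.connect()
preview = con.execute("""
    SELECT
        *,
        SUM(CASE WHEN event_description = 'Aguardando digitação do título'
                 THEN 1 ELSE 0 END)
            OVER (PARTITION BY filename ORDER BY event_timestamp) AS vote_id
    FROM read_parquet('UFS_VOTE_EVENTS.parquet/*/*/*.parquet', hive_partitioning=True)
    LIMIT 10
""").df()
print(preview)
```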
24 |
--------------------------------------------------------------------------------
/streamlit/app/data.py:
--------------------------------------------------------------------------------
1 | import duckdb
2 |
3 | ZONE_GROUPS = [ f'{x}-{x+20}' for x in range(0, 800, 20) ]
4 | UFS = [
5 | "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS",
6 | "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RS", "RO", "RR", "SC",
7 | "SP", "SE", "TO", "ZZ", "ALL"
8 | ]
9 |
10 | class DuckDBConnector:
11 | # Connect to the database
12 | # singleton pattern
13 | _instance = None
14 | def __init__(self) -> None:
15 | self.connect()
16 |
17 | @staticmethod
18 | def get_instance():
19 | if DuckDBConnector._instance is None:
20 | DuckDBConnector._instance = DuckDBConnector()
21 | return DuckDBConnector._instance
22 |
23 |
24 | def connect(self):
25 | self.cursor = duckdb.connect()
26 |
27 |
28 | def calculate_zone_group(self, zone):
29 | if zone == 'ALL':
30 | return zone
31 |
32 | zone = int(zone)
33 | ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]
34 | for group in ZONE_GROUPS:
35 | if zone >= group[0] and zone < group[1]:
36 | return f"{group[0]}-{group[1]}"
37 |
38 |
39 | def get_vote_time_metrics(self, uf, turno, zone, section):
40 | table = """
41 | read_parquet(
42 | '/src/VOTES_TIME_METRICS.parquet/*/*/*/*.parquet',
43 | hive_partitioning=True,
44 | hive_types_autocast=0
45 | )
46 | """
47 | zone_group = self.calculate_zone_group(zone)
48 | zone = F"{int(zone):04d}" if zone != 'ALL' else zone
49 | section = F"{int(section):04d}" if section != 'ALL' else section
50 |
51 | zone_filter = f"AND zone_code = '{zone}' AND zone_group = '{zone_group}'"
52 | if uf == 'ALL':
53 | uf = "','".join(UFS)
54 | elif zone == 'ALL':
55 | zone = "','".join(ZONE_GROUPS)
56 | zone_filter = f"AND zone_group in ('{zone}', 'ALL')"
57 |
58 | query = f"""
59 | SELECT *
60 | FROM {table}
61 | WHERE 1=1
62 | AND turno = '{turno}'
63 | AND uf in ('{uf}')
64 | {zone_filter}
65 | AND section_code = '{section}'
66 | """
67 |
68 | data = self.cursor.execute(query).df()
69 | return data
70 |
71 |
72 | def get_metrics_over_time(self, uf, turno, zone, section):
73 | table = """
74 | read_parquet(
75 | '/src/VOTES_TIME_CUMULATIVE_METRICS_OVER_TIME.parquet/*/*/*/*.parquet',
76 | hive_partitioning=True,
77 | hive_types_autocast=0
78 | )
79 | """
80 | zone_group = self.calculate_zone_group(zone)
81 | zone = F"{int(zone):04d}" if zone != 'ALL' else zone
82 | section = F"{int(section):04d}" if section != 'ALL' else section
83 |
84 | fix_zone_code = """
85 | CASE WHEN zone_code IS NULL THEN 'ALL'
86 | ELSE zone_code
87 | END
88 | """
89 |
90 | query = f"""
91 | SELECT *
92 | FROM {table}
93 | WHERE 1=1
94 | AND turno = '{turno}'
95 | AND uf in ('{uf}')
96 | AND zone_group = '{zone_group}'
97 | AND {fix_zone_code} = '{zone}'
98 | AND section_code = '{section}'
99 | AND timestamp_voto_computado_5min != 'ALL'
100 | """
101 |
102 | data = self.cursor.execute(query).df()
103 | return data
--------------------------------------------------------------------------------
/streamlit/app/main.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | from widgets import (
4 | widget_bignumber_votos, widget_bignumber_secoes,
5 | widget_big_number_tempo_medio, widget_big_number_tempo_medio_bio,
6 | widget_big_number_tempo_total_voto,
7 | widget_tempo_medio_voto, widget_qtd_votos_intervalo_tempo,
8 | widget_numero_votos_intervalo_5min
9 | )
10 |
11 | UFS = [
12 | "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS",
13 | "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RS", "RO", "RR", "SC",
14 | "SP", "SE", "TO", "ZZ", "ALL"
15 | ]
16 | TURNOS = ['1', '2']
17 |
18 | def get_parameters_from_http_query_params():
19 | query_parameters = st.query_params
20 | select_parameters = lambda x, default, accepted: (
21 | default
22 | if x not in query_parameters
23 | else query_parameters[x] if query_parameters[x] in accepted
24 | else default
25 | )
26 | nr_zonas_secoes = [str(x) for x in range(0, 800)]
27 |
28 | uf = select_parameters('uf', 'ALL', UFS )
29 | turno = select_parameters('turno', '1', TURNOS )
30 | zona = select_parameters('zona', 'ALL', nr_zonas_secoes)
31 | secao = select_parameters('secao', 'ALL', nr_zonas_secoes)
32 |
33 | return uf, turno, zona, secao
34 |
35 | if __name__ == "__main__":
36 | st.set_page_config(layout="wide")
37 |
38 | uf, turno, zona, secao = get_parameters_from_http_query_params()
39 |
40 | st.title(f'Eleições em Números - Tempo de Votação')
41 | subtitulo = ''
42 | subtitulo = subtitulo + f' - {uf}' if uf != 'ALL' else subtitulo + " - Brasil"
43 | subtitulo = subtitulo + f' - Zona {zona}' if zona != 'ALL' else subtitulo
44 | subtitulo = subtitulo + f', Seção {secao}' if secao != 'ALL' else subtitulo
45 |
46 | col_subtitle, col_change_turn = st.columns([4, 1])
47 | # col_subtitle.markdown( subtitulo )
48 | # add button to change the turn
49 |
50 | outro_turno = '1' if turno == '2' else '2'
51 | query_parameters = f"?turno={outro_turno}&uf={uf}&zona={zona}&secao={secao}"
52 | st.components.v1.html(
53 | f"""
54 |
75 | """,
76 | height=70
77 | )
78 |
79 | # ============================
80 | # Big Number Widgets
81 | # ============================
82 |
83 | col_bignumber_votos, col_bignumber_secoes, col_bignumber_tmedio, col_bignumber_tmedio_bio, col_bignumber_tempo_total = st.columns(5)
84 | widget_bignumber_votos(col_bignumber_votos, turno, uf, zona, secao)
85 | widget_bignumber_secoes(col_bignumber_secoes, turno, uf, zona, secao)
86 | widget_big_number_tempo_medio(col_bignumber_tmedio, turno, uf, zona, secao)
87 | widget_big_number_tempo_medio_bio(col_bignumber_tmedio_bio, turno, uf, zona, secao)
88 | widget_big_number_tempo_total_voto(col_bignumber_tempo_total, turno, uf, zona, secao)
89 | st.divider()
90 |
91 | # =================================
92 | # Heatmap and Histogram Widgets
93 | # =================================
94 | col_map, col_histogram, col_temporal_series = st.columns( [.3, .2, .5] )
95 | widget_tempo_medio_voto(col_map, turno, uf, zona, secao)
96 | widget_qtd_votos_intervalo_tempo(col_histogram, turno, uf, zona, secao)
97 | widget_numero_votos_intervalo_5min(col_temporal_series, turno, uf, zona, secao)
98 |
99 | st.divider()
100 |
101 | # =================================
102 | # Foot note. Author: João Pedro. Data gathered from TSE Open Data Portal. All code available at github.
103 | # =================================
104 |
105 | st.text('Author: João Pedro. Dados coletados do Portal de Dados Abertos do TSE. All code available at Github.')
106 | st.text('O projeto é complexo. Os números podem não ser 100% precisos.')
107 |
--------------------------------------------------------------------------------
/src/01_extrair_eventos_relacionados_a_votos.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Extrair apenas eventos relacionados a votos e metadados"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Como o Log das urnas compila todo o histórico de eventos que ocorreu em uma unna, incluindo testes, carga, preparação, etc, para garantir uma consulta mais eficiente, são extraídos apenas os eventos relacionados aos votos em si.\n",
15 | "\n"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Importing libraries"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 1,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import duckdb\n",
32 | "import pandas as pd\n",
33 | "import time"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## Importing Data"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "DATASET = 'ALL_UFS.parquet'"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "cursor = duckdb.connect()"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Preparing Data"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "### Definindo os filtros"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "Colunas relacionadas a metadados da Seção Eleitoral"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "METADATA = [\n",
89 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Zona Eleitoral%'\",\n",
90 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Seção Eleitoral%'\",\n",
91 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Município%'\",\n",
92 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Local de Votação%'\",\n",
93 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Turno da UE%'\",\n",
94 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Identificação do Modelo de Urna%'\"\n",
95 | "]"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 4,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "COLUMN_EVENT_DESCRIPTION = 'event_description'\n",
105 | "\n",
106 | "EVENTS_DESCRIPTIONS = [\n",
107 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Urna pronta para receber vot%'\",\n",
108 | "]\n",
109 | "\n",
110 | "VOTES_DESCRIPTIONS = [\n",
111 | " # VOTOS\n",
112 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Aguardando digitação do título'\",\n",
113 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Título digitado pelo mesário'\",\n",
114 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Eleitor foi habilitado'\",\n",
115 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Voto confirmado par%'\",\n",
116 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'O voto do eleitor foi computado'\",\n",
117 | " \n",
118 | " # BIOMETRIA\n",
119 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE '%Digital%' \",\n",
120 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Dedo reconhecido%' \",\n",
121 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Solicita digital%' \",\n",
122 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Solicitação de dado pessoal do eleitor para habilitação manual' \",\n",
123 | "]\n",
124 | "\n",
125 | "ACCEPTED_DATES = [\n",
126 | " '2022-10-02', '2022-10-30', # Data constitucional da eleição\n",
127 | " '2022-10-03', '2022-10-31', # No caso da seção 'virar a noite' e acabar depois da meia noite, imagino que sejam casos RARÍSSIMOS\n",
128 | "]\n",
129 | "\n",
130 | "ALL_FILTERS = METADATA + EVENTS_DESCRIPTIONS + VOTES_DESCRIPTIONS"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "### Construindo e Executando a query"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "**Notas:** \n",
145 | "\n",
146 | "**1. Extração de metadados a partir do nome dos arquivos.**\n",
147 | " \n",
148 | "Cada arquivo TSV possui informações de uma Seção Eleitoral (que é a mesma coisa de uma Urna), e o nome do arquivo é a concatenação dos metadados da Seção Eleitoral:\n",
149 | "\n",
150 | " - Os 5 Primeiros Dígitos são o código do Município\n",
151 | " - Os 4 Dígitos seguintes são o código da Zona Eleitoral\n",
152 | " - Os 4 Dígitos seguintes são o código da Seção Eleitoral\n",
153 | "\n",
154 | "**2. Data da Eleição**\n",
155 | "\n",
156 | "A Data em que os eventos aconteceram é uma ótima forma de aproximar ainda mais os eventos que têm haver com a votação, uma vez que a votação no Brasil acontece em um único dia - aprende aí EUA ;)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 5,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "query = F\"\"\"\n",
166 | " SELECT \n",
167 | " *\n",
168 | " FROM (\n",
169 | " SELECT\n",
170 | " event_timestamp,\n",
171 | " event_timestamp::date AS event_date,\n",
172 | " event_type,\n",
173 | " some_id,\n",
174 | " event_system,\n",
175 | " event_description,\n",
176 | " event_id,\n",
177 | " \n",
178 | " REPLACE(SPLIT_PART(filename, '/', 5), '_new.csv', '') AS filename,\n",
179 | " \n",
180 | " -- Metadata from filename\n",
181 | " SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2), 1, 5 ) AS city_code,\n",
182 | " SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2), 6, 4 ) AS zone_code,\n",
183 | " SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2), 10, 4 ) AS section_code,\n",
184 | " REPLACE(SPLIT_PART(filename, '/', 4), '2_', '') AS uf\n",
185 | " FROM\n",
186 | " {DATASET}\n",
187 | " WHERE 1=1\n",
188 | " AND ( {' OR '.join(ALL_FILTERS)} )\n",
189 | " ) _\n",
190 | " WHERE 1=1\n",
191 | " AND event_date IN ({', '.join([F\"'{date}'\" for date in ACCEPTED_DATES])})\n",
192 | "\"\"\""
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "Para facilitar consultas, os arquivos parquet são particionados por DATA DO EVENTO e UF."
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 6,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "query = F\"\"\"\n",
209 | " COPY ({query}) TO 'UFS_VOTE_EVENTS.parquet' (FORMAT 'parquet', PARTITION_BY (event_date, uf), OVERWRITE_OR_IGNORE 1);\n",
210 | "\"\"\""
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 7,
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "data": {
220 | "application/vnd.jupyter.widget-view+json": {
221 | "model_id": "ca3b1617f6524b85b061c9579b6cc506",
222 | "version_major": 2,
223 | "version_minor": 0
224 | },
225 | "text/plain": [
226 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
227 | ]
228 | },
229 | "metadata": {},
230 | "output_type": "display_data"
231 | },
232 | {
233 | "name": "stdout",
234 | "output_type": "stream",
235 | "text": [
236 | "Time 1027.0729978084564s\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "tic = time.time()\n",
242 | "cursor.execute(query)\n",
243 | "toc = time.time()\n",
244 | "\n",
245 | "print(F\"Time {toc - tic}s\")"
246 | ]
247 | }
248 | ],
249 | "metadata": {
250 | "kernelspec": {
251 | "display_name": "base",
252 | "language": "python",
253 | "name": "python3"
254 | },
255 | "language_info": {
256 | "codemirror_mode": {
257 | "name": "ipython",
258 | "version": 3
259 | },
260 | "file_extension": ".py",
261 | "mimetype": "text/x-python",
262 | "name": "python",
263 | "nbconvert_exporter": "python",
264 | "pygments_lexer": "ipython3",
265 | "version": "3.11.5"
266 | }
267 | },
268 | "nbformat": 4,
269 | "nbformat_minor": 2
270 | }
271 |
--------------------------------------------------------------------------------
/src/03B_calcular_metricas_temporais.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Calcular Métricas - Tempo de Votação, Biometria, etc."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "---"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Importing libraries"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 7,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import duckdb\n",
31 | "import pandas as pd\n",
32 | "import time"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Importing Data"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 8,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "cursor = duckdb.connect()"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "Dados Brutos"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 9,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "TABLE = \"read_parquet('VOTES.parquet/*/*/*/*.parquet', hive_partitioning=True)\"\n",
65 | "ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "Adicionar TURNO e Timestamp final de Biometria"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "source = F\"\"\"\n",
82 | "(\n",
83 | " SELECT \n",
84 | " *,\n",
85 | " \n",
86 | " CASE event_date\n",
87 | " WHEN '2022-10-02' THEN 1\n",
88 | " WHEN '2022-10-03' THEN 1\n",
89 | " WHEN '2022-10-30' THEN 2\n",
90 | " WHEN '2022-10-31' THEN 2\n",
91 | " ELSE NULL\n",
92 | " END::INT AS turno,\n",
93 | "\n",
94 | " COALESCE(\n",
95 | " timestamp_biometria_1,\n",
96 | " timestamp_biometria_2,\n",
97 | " timestamp_biometria_3,\n",
98 | " timestamp_biometria_4,\n",
99 | " timestamp_biometria_manual\n",
100 | " ) AS timestamp_biometria_final,\n",
101 | "\n",
102 | " strftime( '%Y-%m-%d %H:', timestamp_voto_computado )\n",
103 | " || (EXTRACT(MINUTE FROM timestamp_voto_computado)//5)*5 + 5\n",
104 | " || ':00' AS timestamp_voto_computado_5min\n",
105 | " \n",
106 | " FROM \n",
107 | " {TABLE}\n",
108 | " -- WHERE uf='DF'\n",
109 | ") _\n",
110 | "\"\"\""
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "## Preparinga Data"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "Méticas no Cubo OLAP - Turno, UF, Zona, Seção.\n",
125 | "\n",
126 | "- Número de Votos\n",
127 | "- Número de Seções Eleitorais\n",
128 | "- Média, Soma, q50%, q90% do Tempo total de Voto, Tempo de Biometria, Tempo Total\n",
129 | "\n",
130 | "- Quantidade de Votos efetuados em até 30s, 1min, 1min30s, 2min, 2min30s, 3min+\n",
131 | "- Taxa de Sucesso da Biometria em 1 tentativa, 2 tentativas, 3 tentativas, 4 tentativas, Falha\n",
132 | "- Quantidade de Teclas Pressionadas\n",
133 | "- Quantidade de Cargos Distintos Votados"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "**Definição das métricas de tempo**"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 5,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "tempo_voto_total = \"EXTRACT(EPOCH FROM (timestamp_voto_computado - timestamp_titulo_digitado))\"\n",
150 | "tempo_voto = \"EXTRACT(EPOCH FROM (timestamp_voto_computado - timestamp_habilitacao_eleitor))\"\n",
151 | "tempo_biometria = \"EXTRACT(EPOCH FROM (timestamp_biometria_final - timestamp_titulo_digitado))\"\n",
152 | "\n",
153 | "fix_null_values = lambda column: F\"COALESCE({column}::VARCHAR(20), 'ALL')\""
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 6,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "query_metrics = F\"\"\"\n",
163 | " SELECT\n",
164 | " {fix_null_values('turno') } AS turno,\n",
165 | " {fix_null_values('timestamp_voto_computado_5min') } AS timestamp_voto_computado_5min,\n",
166 | " {fix_null_values('uf') } AS uf,\n",
167 | " zone_code,\n",
168 | " {fix_null_values('section_code') } AS section_code,\n",
169 | "\n",
170 | " COUNT(*) AS total_votos,\n",
171 | " SUM( {tempo_voto} ) AS tempo_voto_soma,\n",
172 | " SUM( {tempo_biometria} ) AS tempo_biometria_soma,\n",
173 | " SUM( {tempo_voto_total} ) AS tempo_voto_total_soma,\n",
174 | " \n",
175 | " FROM\n",
176 | " {source}\n",
177 | " WHERE quantidade_votos_computados = 1\n",
178 | " GROUP BY ROLLUP(turno, timestamp_voto_computado_5min, uf, zone_code, section_code)\n",
179 | "\"\"\""
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "**Salvar resultado intermediário**"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 7,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "query = F\"\"\"\n",
196 | " COPY (\n",
197 | " {\n",
198 | " query_metrics\n",
199 | " } )\n",
200 | " TO 'VOTES_TIME_METRICS_OVER_TIME.parquet' \n",
201 | " (FORMAT 'parquet', PARTITION_BY (turno, uf), OVERWRITE_OR_IGNORE 1);\n",
202 | "\"\"\""
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 8,
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "data": {
212 | "application/vnd.jupyter.widget-view+json": {
213 | "model_id": "a86d242b27054c0683c1dca6f79697d6",
214 | "version_major": 2,
215 | "version_minor": 0
216 | },
217 | "text/plain": [
218 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
219 | ]
220 | },
221 | "metadata": {},
222 | "output_type": "display_data"
223 | }
224 | ],
225 | "source": [
226 | "cursor.execute(query)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "Cálculo cumulativo das métricas"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 11,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "TABLE_METRICS = \"\"\"read_parquet(\n",
243 | " 'VOTES_TIME_METRICS_OVER_TIME.parquet/*/*/*.parquet', \n",
244 | " hive_partitioning=True,\n",
245 | " hive_types_autocast =0\n",
246 | ")\n",
247 | "\"\"\""
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 12,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "query_cumulative_metrics = F\"\"\"\n",
257 | " SELECT\n",
258 | " *,\n",
259 | " SUM(total_votos) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS total_votos_cumulativo,\n",
260 | " SUM(tempo_voto_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_voto_soma_cumulativo,\n",
261 | " SUM(tempo_biometria_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_biometria_soma_cumulativo,\n",
262 | " SUM(tempo_voto_total_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_voto_total_soma_cumulativo\n",
263 | " FROM\n",
264 | " {TABLE_METRICS}\n",
265 | "\"\"\""
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL para agilizar a leitura dos dados pelo Dashboard.\n",
273 | "\n",
274 | "As ZONAS foram agrupadas em grupos de 20, esse número é empírico."
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 13,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "query_metrics_with_zone_group = F\"\"\"\n",
284 | " SELECT\n",
285 | " *,\n",
286 | " CASE\n",
287 | " {\n",
288 | " \"\".join(\n",
289 | " [\n",
290 | " f\"WHEN zone_code IS NOT NULL AND zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n",
291 | " for min_zone, max_zone in ZONE_GROUPS\n",
292 | " ]\n",
293 | " )\n",
294 | " }\n",
295 | " ELSE 'ALL'\n",
296 | " END AS zone_group\n",
297 | " FROM (\n",
298 | " {query_cumulative_metrics}\n",
299 | " ) _\n",
300 | "\"\"\""
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 14,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "query = F\"\"\"\n",
310 | " COPY (\n",
311 | " {\n",
312 | " query_metrics_with_zone_group\n",
313 | " } )\n",
314 | " TO 'VOTES_TIME_CUMULATIVE_METRICS_OVER_TIME.parquet' \n",
315 | " (FORMAT 'parquet', PARTITION_BY (turno, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n",
316 | "\"\"\""
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 15,
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "application/vnd.jupyter.widget-view+json": {
327 | "model_id": "b9ea3c1f88764a56a54bce624acaf93e",
328 | "version_major": 2,
329 | "version_minor": 0
330 | },
331 | "text/plain": [
332 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
333 | ]
334 | },
335 | "metadata": {},
336 | "output_type": "display_data"
337 | },
338 | {
339 | "data": {
340 | "text/plain": [
341 | ""
342 | ]
343 | },
344 | "execution_count": 15,
345 | "metadata": {},
346 | "output_type": "execute_result"
347 | }
348 | ],
349 | "source": [
350 | "cursor.execute(query)"
351 | ]
352 | }
353 | ],
354 | "metadata": {
355 | "kernelspec": {
356 | "display_name": "base",
357 | "language": "python",
358 | "name": "python3"
359 | },
360 | "language_info": {
361 | "codemirror_mode": {
362 | "name": "ipython",
363 | "version": 3
364 | },
365 | "file_extension": ".py",
366 | "mimetype": "text/x-python",
367 | "name": "python",
368 | "nbconvert_exporter": "python",
369 | "pygments_lexer": "ipython3",
370 | "version": "3.11.5"
371 | }
372 | },
373 | "nbformat": 4,
374 | "nbformat_minor": 2
375 | }
376 |
--------------------------------------------------------------------------------
/src/02_isolar_timestamps_eventos.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Calcular o Tempo de Voto"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Este notebook tem como objetivo calcular o tempo de voto de um eleitor. \n",
15 | "O primeiro passo é definir exatamente o que é um voto, dado que o log das urnas contém apenas uma sequência de eventos.\n",
16 | "\n",
17 | "Na sequência, os votos são individualizados (um por linha) e o tempo de cada evento relevante é calculado."
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Importing libraries"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 28,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import duckdb\n",
34 | "import pandas as pd\n",
35 | "import time"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Importing Data"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 29,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "cursor = duckdb.connect()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 31,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "TABLE = \"read_parquet('UFS_VOTE_EVENTS.parquet/*/*/*.parquet', hive_partitioning=True)\""
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 32,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "source_data = f\"\"\"\n",
70 | " (\n",
71 | " SELECT\n",
72 | " *\n",
73 | " FROM {TABLE}\n",
74 | " ) AS source\n",
75 | "\"\"\""
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Preparinga Data"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "### Criando um ID único para cada voto"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "Como heurística, vamos criar um id único para cada voto, que será determinado a partir de uma operação 'âncora'.\n",
97 | "\n",
98 | "A operação servirá como marcação de que um voto foi iniciado e, todas as linhas entre uma operação âncora e a próxima, serão consideradas como um único voto."
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 33,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "ANCHOR_OPERATION = 'Aguardando digitação do título'\n",
108 | "ZONE_GROUPS = [ (0, 100), (101, 200), (201, 300), (301, 400), (401, 500) ]"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "Após uma exploração dos LOGS, a operação escolhida foi 'AGUARDANDO DIGITAÇÃO DO TÍTULO', exatamente por ser o PRIMEIRO e OBRIAGTÓRIO passo para que um voto seja autorizado."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 34,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "query_create_id = f\"\"\"\n",
125 | " (\n",
126 | " SELECT\n",
127 | " *,\n",
128 | " SUM(CASE WHEN event_description = '{ANCHOR_OPERATION}' THEN 1 ELSE 0 END) \n",
129 | " OVER (PARTITION BY event_date, uf, filename ORDER BY event_timestamp) AS vote_id,\n",
130 | " \n",
131 | " CASE\n",
132 | " {\n",
133 | " \"\".join(\n",
134 | " [\n",
135 | " f\"WHEN zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n",
136 | " for min_zone, max_zone in ZONE_GROUPS\n",
137 | " ]\n",
138 | " )\n",
139 | " }\n",
140 | " END AS zone_group\n",
141 | "\n",
142 | " FROM {source_data}\n",
143 | " WHERE \n",
144 | " uf = '' \n",
145 | " AND event_date = ''\n",
146 | " AND zone_code::INT BETWEEN AND \n",
147 | " ) AS query_vote_id\n",
148 | "\"\"\""
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "### Pivotando Timestamp dos eventos por id"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "Para calcular o tempo dos votos e dos eventos individuais que o compõem (biometria, voto) é necessário extrair o timestamp de cada evento."
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 35,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "timestamp_inicio_fim_voto = [\n",
172 | " f'''\n",
173 | " MAX(\n",
174 | " CASE WHEN event_description = 'Título digitado pelo mesário' THEN event_timestamp ELSE NULL END \n",
175 | " ) AS timestamp_titulo_digitado\n",
176 | " ''',\n",
177 | " f'''\n",
178 | " MAX(\n",
179 | " CASE WHEN event_description = 'O voto do eleitor foi computado' THEN event_timestamp ELSE NULL END \n",
180 | " ) AS timestamp_voto_computado\n",
181 | " '''\n",
182 | "]"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 36,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "VOTE_EVENTS = [\n",
192 | " 'Voto confirmado para [Conselheiro Distrital]',\n",
193 | " 'Voto confirmado para [Deputado Distrital]',\n",
194 | " 'Voto confirmado para [Deputado Estadual]',\n",
195 | " 'Voto confirmado para [Deputado Federal]',\n",
196 | " 'Voto confirmado para [Governador]',\n",
197 | " 'Voto confirmado para [Prefeito]',\n",
198 | " 'Voto confirmado para [Presidente]',\n",
199 | " 'Voto confirmado para [Senador]',\n",
200 | "]\n",
201 | "\n",
202 | "timestamp_vote_events = [\n",
203 | " f'''\n",
204 | " MAX(\n",
205 | " CASE WHEN event_description = \\'{event}\\' THEN event_timestamp ELSE NULL END \n",
206 | " ) AS timestamp_voto_{event.replace(\"Voto confirmado para [\", \"\").replace(\"]\", \"\").lower().replace(' ', '_')}\n",
207 | " '''\n",
208 | " for event in VOTE_EVENTS\n",
209 | "]"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 37,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "BIOMETRIA_TENTATIVAS = [\n",
219 | " 'Solicita digital. Tentativa [1] de [4]',\n",
220 | " 'Solicita digital. Tentativa [2] de [4]',\n",
221 | " 'Solicita digital. Tentativa [3] de [4]',\n",
222 | " 'Solicita digital. Tentativa [4] de [4]',\n",
223 | " 'Solicitação de dado pessoal do eleitor para habilitação manual',\n",
224 | " 'Eleitor foi habilitado'\n",
225 | "]\n",
226 | "\n",
227 | "timestamp_biometria_tentativas = [\n",
228 | " f'''\n",
229 | " MAX(\n",
230 | " CASE WHEN event_description = \\'{event}\\' THEN event_timestamp ELSE NULL END \n",
231 | " ) AS timestamp_biometria_{event.replace(\"Solicita digital. Tentativa [\", \"\").replace(\"] de [4]\", \"\").lower()}\n",
232 | " '''\n",
233 | " for event in BIOMETRIA_TENTATIVAS\n",
234 | " if event.startswith('Solicita digital')\n",
235 | "] + [\n",
236 | " f'''\n",
237 | " MAX(\n",
238 | " CASE WHEN event_description = \\'{BIOMETRIA_TENTATIVAS[-2]}\\' THEN event_timestamp ELSE NULL END \n",
239 | " ) AS timestamp_biometria_manual\n",
240 | " '''\n",
241 | "] + [\n",
242 | " f'''\n",
243 | " MAX(\n",
244 | " CASE WHEN event_description = \\'{BIOMETRIA_TENTATIVAS[-1]}\\' THEN event_timestamp ELSE NULL END \n",
245 | " ) AS timestamp_habilitacao_eleitor\n",
246 | " '''\n",
247 | "]\n",
248 | " "
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 38,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "query_pivot_timestamps = f\"\"\"(\n",
258 | " SELECT\n",
259 | " event_date, uf, filename, vote_id,\n",
260 | " \n",
261 | " MAX(city_code) AS city_code,\n",
262 | " MAX(zone_code) AS zone_code,\n",
263 | " MAX(zone_group) AS zone_group,\n",
264 | " MAX(section_code) AS section_code,\n",
265 | "\n",
266 | " SUM( (event_description='O voto do eleitor foi computado')::INT ) AS quantidade_votos_computados,\n",
267 | " SUM( (event_description ILIKE 'Solicita digital%')::INT ) AS quantidade_solicitacoes_biometria,\n",
268 | " SUM( (event_description ILIKE 'Voto confirmado para%')::INT ) AS quantidade_cargos_votados,\n",
269 | " MAX( (event_description='Solicitação de dado pessoal do eleitor para habilitação manual')::INT ) AS biometria_nao_funcionou,\n",
270 | "\n",
271 | " MIN( event_timestamp ) AS timestamp_primeiro_evento,\n",
272 | "\n",
273 | " {', '.join(timestamp_vote_events+timestamp_biometria_tentativas+timestamp_inicio_fim_voto)}\n",
274 | " \n",
275 | " FROM {query_create_id}\n",
276 | " GROUP BY event_date, uf, filename, vote_id\n",
277 | ")\n",
278 | "\"\"\""
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "### Construindo e Executando a query"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL por duas razões:\n",
293 | "\n",
294 | " - Facilitar a leitura dos dados posteriormente\n",
295 | " - Permitir a execução da query em partes, evitando a sobrecarga de memória ao processar todos os dados de uma vez\n",
296 | "\n",
297 | "As ZONAS foram agrupadas em grupos de 100, esse número é empírico, pensado para abarcar a grande maioria das UFs em um único grupo, já que a grande maioria dos estados não pssui mais de 100 zonas eleitorais, e dividir as UFs mais populosas em grupos menores."
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "ACCEPTED_DATES = [\n",
307 | " '2022-10-02', '2022-10-30', \n",
308 | " '2022-10-03', '2022-10-31',\n",
309 | "]\n",
310 | "UFS = [\n",
311 | " 'AC', 'AL', 'AM', 'AP', \n",
312 | " 'BA', \n",
313 | " 'CE', 'DF', 'ES', 'GO', \n",
314 | " 'MT', 'PA', 'PB', 'PE', \n",
315 | " 'MA',\n",
316 | " \n",
317 | " 'MG', 'MS', \n",
318 | " 'PI', 'PR', 'RJ', 'RN', \n",
319 | " 'RO', 'RR', 'RS', 'SC', \n",
320 | " 'SE', 'SP', 'TO', 'ZZ'\n",
321 | "]\n",
322 | "\n",
323 | "PROCESSING_TIMES = []\n",
324 | "\n",
325 | "for uf in UFS:\n",
326 | " for date in ACCEPTED_DATES:\n",
327 | " for zone_group in ZONE_GROUPS:\n",
328 | "\n",
329 | " \n",
330 | " query = F\"\"\"\n",
331 | " COPY \n",
332 | " {\n",
333 | " query_pivot_timestamps\n",
334 | " .replace('', uf)\n",
335 | " .replace('', date)\n",
336 | " .replace('', str(zone_group[0]))\n",
337 | " .replace('', str(zone_group[1]))\n",
338 | " } \n",
339 | " TO 'VOTES.parquet' \n",
340 | " (FORMAT 'parquet', PARTITION_BY (event_date, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n",
341 | " \"\"\"\n",
342 | " \n",
343 | " print(\"Processing \", uf, date)\n",
344 | " tic = time.time()\n",
345 | " cursor.execute(query)\n",
346 | " toc = time.time()\n",
347 | " print(F\"Time for {uf} {date} {zone_group}: {toc-tic}\")\n",
348 | "\n",
349 | " PROCESSING_TIMES.append({\n",
350 | " 'uf': uf,\n",
351 | " 'date': date,\n",
352 | " 'zone_group': zone_group,\n",
353 | " 'time': toc-tic\n",
354 | " })"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {},
360 | "source": [
361 | "Salvando o resultado dos tempos de processamento."
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": 42,
367 | "metadata": {},
368 | "outputs": [],
369 | "source": [
370 | "PROCESSING_TIMES\n",
371 | "\n",
372 | "# convert to pandas and save as csv\n",
373 | "df_processing_times = pd.DataFrame(PROCESSING_TIMES)\n",
374 | "df_processing_times.to_csv('processing_times.csv', index=False)"
375 | ]
376 | }
377 | ],
378 | "metadata": {
379 | "kernelspec": {
380 | "display_name": "base",
381 | "language": "python",
382 | "name": "python3"
383 | },
384 | "language_info": {
385 | "codemirror_mode": {
386 | "name": "ipython",
387 | "version": 3
388 | },
389 | "file_extension": ".py",
390 | "mimetype": "text/x-python",
391 | "name": "python",
392 | "nbconvert_exporter": "python",
393 | "pygments_lexer": "ipython3",
394 | "version": "3.11.5"
395 | }
396 | },
397 | "nbformat": 4,
398 | "nbformat_minor": 2
399 | }
400 |
--------------------------------------------------------------------------------
/streamlit/app/widgets.py:
--------------------------------------------------------------------------------
1 |
2 | import geopandas as gpd
3 | import pandas as pd
4 | import datetime
5 | import re
6 | import io
7 | import streamlit as st
8 | import seaborn as sns
9 |
10 | from maps import add_ufs_and_links_to_map, load_brazil_simplified_map, load_ufs_city_simplified_map
11 | from data import DuckDBConnector
12 | import numpy as np
13 |
14 | import matplotlib.pyplot as plt
15 | from matplotlib.colors import LinearSegmentedColormap
16 |
17 | @st.cache_resource
18 | def get_duckdb_connector():
19 | return DuckDBConnector.get_instance()
20 |
21 | PRIMARY_COLOR = "#0B1D51"
22 | HIGHLIGHT_COLOR = "#F08902"
23 |
24 | # Seaborn set theme
25 | # whitegrid style: white background
26 | # with light grid lines
27 | sns.set_style("whitegrid")
28 | sns.set_theme(style='whitegrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None)
29 |
30 | def format_number_mi_mil(number):
31 | number_mi = number//1e6
32 | number_mil = (number - number_mi*1e6) / 1e3
33 |
34 | number_formatted = f"{number_mi:.0f} Mihão" if number_mi > 0 else ''
35 | if number_mi > 0 and number_mil > 0:
36 | number_formatted += f" {number_mil:.0f} Mil"
37 | elif number_mil > 0:
38 | number_formatted = str(number_mil).replace('.', ',')
39 | number_formatted = number_formatted[:number_formatted.index(',')+2] + ' Mil'
40 | number_formatted = number_formatted.strip()
41 | return number_formatted
42 |
43 |
44 | def format_time(time_in_seconds):
45 |
46 | years = time_in_seconds // (365 * 24 * 3600)
47 | time_in_seconds = time_in_seconds % (365 * 24 * 3600)
48 | months = time_in_seconds // (30 * 24 * 3600)
49 | time_in_seconds = time_in_seconds % (30 * 24 * 3600)
50 | days = time_in_seconds // (24 * 3600)
51 | time_in_seconds = time_in_seconds % (24 * 3600)
52 | hours = time_in_seconds // 3600
53 | time_in_seconds %= 3600
54 | minutes = time_in_seconds // 60
55 | seconds = time_in_seconds % 60
56 |
57 | days = int(days)
58 | hours = int(hours)
59 | minutes = int(minutes)
60 | seconds = int(seconds)
61 |
62 |     time_formatted = ""
63 |     if seconds > 0:
64 |         time_formatted += f"{seconds:.0f}s"
65 |     if minutes > 0:
66 |         time_formatted = f"{minutes:.0f}m " + time_formatted
67 |     if hours > 0:
68 |         time_formatted = f"{hours:.0f}h " + time_formatted
69 |     if days > 0:
70 |         time_formatted = f"{days:.0f} dias " + time_formatted
71 |     if months > 0:
72 |         time_formatted = f"{months:.0f} Meses " + time_formatted
73 |         if months == 1:
74 |             time_formatted = time_formatted.replace('Meses', 'Mês')
75 |     if years > 0:
76 |         time_formatted = f"{years:.0f} Anos " + time_formatted
77 |         if years == 1:
78 |             time_formatted = time_formatted.replace('Anos', 'Ano')
79 | 
80 |     # Drop hours, minutes and seconds: only years, months and days are kept
81 |     time_formatted = re.sub(r'\d+[hms]', '', time_formatted)
82 | 
83 |     return time_formatted
84 |
85 |
86 | def format_number(number):
87 | return (
88 | f"{number//1e6:.0f} Mi"
89 | if number >= 1e6 else f"{number//1e3:.0f} Mil"
90 | if number >= 1e3 else f"{number:.0f}"
91 | )
92 |
93 |
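# ---------------------------------------------------------------------------
# Illustrative sanity check for the three formatters above (a minimal sketch,
# not part of the dashboard; assumes the module imports at the top resolve,
# e.g. inside the app container, when run as a script).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    print(format_number_mi_mil(1_250_000))  # -> "1 Milhão 250 Mil"
    print(format_number(89_318))            # -> "89 Mil"
    # format_time keeps only years/months/days; hours, minutes and seconds
    # are stripped by the final regex, so short durations may come out empty.
    print(format_time(45_000_000))          # -> roughly "1 Ano 5 Meses 5 dias"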
94 | def widget_numero_votos_intervalo_5min(container, turno, uf, zona, secao):
95 |
96 | metrics_df = get_duckdb_connector().get_metrics_over_time(uf, turno, zona, secao)
97 | metrics_df['timestamp_voto_computado_5min'] = pd.to_datetime(metrics_df['timestamp_voto_computado_5min'])
98 | metrics_df = metrics_df.sort_values('timestamp_voto_computado_5min')
99 | metrics_df = metrics_df.fillna( pd.NaT )
100 |
101 | # define x and y
102 | y_metric = metrics_df['total_votos'].astype(int)
103 |
104 | # Get the maximum value of y
105 | # and the corresponding x value
106 | # ------------------------------
107 | x_value_max_y, max_y = metrics_df.loc[y_metric.idxmax(), ['timestamp_voto_computado_5min', 'total_votos']]
108 | x_value_max_y_formatted = x_value_max_y.strftime('%H:%M')
109 | max_y_formatted = format_number_mi_mil(max_y)
110 |
111 | # lineplot with time series
112 | FIGSIZE = (10, 5)
113 | fig, ax = plt.subplots( figsize=FIGSIZE )
114 |
115 |     # keep only full hours as x-axis tick positions
116 | x_axis_values = (
117 | metrics_df
118 | .query("timestamp_voto_computado_5min.dt.minute == 0")
119 | ['timestamp_voto_computado_5min']
120 | )
121 | x_axis_labels = x_axis_values.dt.strftime('%H:%M')
122 |
123 | if uf in ['ALL', 'SP', 'MG']:
124 | y_axis_values = [ 5e4, 1e5, 2.5e5, 5e5, 7.5e5, 1e6 ]
125 | else:
126 | y_axis_values = [ 1e3, 3e3, 5e3, 1e4, 1.5e4, 2e4, 5e4, 1e5, 5e5 ]
127 | y_axis_labels = [format_number(y) for y in y_axis_values]
128 |
129 | sns.lineplot(
130 | x=metrics_df['timestamp_voto_computado_5min'],
131 | y=y_metric,
132 | ax=ax,
133 | color=PRIMARY_COLOR
134 | )
135 |
136 | # Fill area under the line
137 | # ------------------------
138 | ax.fill_between(
139 | metrics_df['timestamp_voto_computado_5min'],
140 | y_metric,
141 | 0,
142 | zorder=0,
143 | alpha=0.5,
144 | color=PRIMARY_COLOR
145 | )
146 |
147 | # Add vertical line at the maximum value
148 | # --------------------------------------
149 | ax.axvline(
150 | x=metrics_df.loc[y_metric.idxmax(), 'timestamp_voto_computado_5min'],
151 | color=HIGHLIGHT_COLOR,
152 | ymin=0,
153 | ymax=1,
154 | linestyle='-',
155 | linewidth=2
156 | )
157 |
158 | # Add a box in the line with the maximum value
159 | # left aligned
160 | # --------------------------------------------
161 | ax.text(
162 | x_value_max_y,
163 | 0.9*max_y,
164 | f"{max_y_formatted}",
165 | color='white',
166 | fontsize=10,
167 | ha='left',
168 | va='center',
169 | bbox=dict(facecolor=HIGHLIGHT_COLOR, alpha=1)
170 | )
171 |
172 |
173 | ax.set_xticks(x_axis_values)
174 | ax.set_xticklabels(x_axis_labels, rotation=45, ha='right', fontsize=10)
175 |
176 | # remove right and top spines
177 | ax.spines['right'].set_visible(False)
178 | ax.spines['left'].set_visible(False)
179 | ax.spines['bottom'].set_visible(False)
180 | ax.spines['top'].set_visible(False)
181 |
182 | ax.set_yticks(y_axis_values)
183 | ax.set_yticklabels(y_axis_labels, fontsize=10)
184 | # add horizontal grid lines on the y axis
185 | # in the background
186 | ax.yaxis.grid(True, linestyle='-', alpha=1)
187 | # remove x grid lines
188 | ax.xaxis.grid(False)
189 | # remove x and y labels
190 | ax.set_xlabel('')
191 | ax.set_ylabel('')
192 | # set y limit
193 | ax.set_ylim(0, max_y)
194 |
195 | container.markdown('#### Número de votos efetuados a cada 5min')
196 | container.pyplot(fig)
197 | container.markdown(f'#### Às {x_value_max_y_formatted}, houve o pico de votos, com **{max_y_formatted}** computados em 5 minutos!')
198 |
199 |
200 | def widget_tempo_medio_voto(container, turno, uf, zona, secao):
201 |
202 | if uf=='ALL':
203 | widget_heatmap_tempo_medio_voto_mapa(container, turno, uf, zona, secao)
204 | elif zona=='ALL':
205 | widget_tabela_tempo_medio_zonas(container, turno, uf, zona, secao)
206 |
207 |
208 | def widget_tabela_tempo_medio_zonas( container, turno, uf, zona, secao ):
209 |
210 | map_gdf = load_brazil_simplified_map()
211 | map_gdf = map_gdf.query(f"SIGLA_UF == '{uf}'")
212 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
213 | metrics_df = metrics_df[ ['zone_group', 'zone_code', 'total_votos', 'tempo_voto_medio'] ]
214 | metrics_df_all_zones = metrics_df.query("zone_code == 'ALL'")
215 | metrics_df = metrics_df.query("zone_code != 'ALL'")
216 |
217 | unique_zone_groups = list(metrics_df['zone_group'].unique())
218 | unique_zone_groups.sort( key=lambda x: int(x.split('-')[0]) )
219 |
220 | # plot a small map with the selected UF
221 | fig, ax = plt.subplots( figsize=(1, 1) )
222 | map_gdf.plot(ax=ax, color=HIGHLIGHT_COLOR)
223 | ax.axis('off')
224 | # add the sigla of the UF
225 | ax.text(
226 | map_gdf.centroid.x.values[0],
227 | map_gdf.centroid.y.values[0],
228 | uf,
229 | fontsize=8,
230 | weight='bold',
231 | ha='center',
232 | va='center',
233 | color='white'
234 | )
235 |
236 | x=.15
237 | col_map_uf, col_title = container.columns( [x, 1-x] )
238 | col_map_uf.pyplot(fig, use_container_width=True)
239 | col_title.markdown(f"### Detalhamento por Zona \n")
240 |
241 |     zone_group_tabs = container.tabs( unique_zone_groups )
242 |     for zone_group, zone_group_tab in zip(unique_zone_groups, zone_group_tabs):
243 |
244 | metrics_df_zone_group = metrics_df.query(f"zone_group == '{zone_group}'")
245 | top_3_most_last_zones = metrics_df_zone_group.sort_values('tempo_voto_medio', ascending=False).head(3)['zone_code'].values
246 |
247 | metrics_df_zone_group = metrics_df_zone_group.sort_values('zone_code')
248 | metrics_df_zone_group['tempo_voto_medio'] = metrics_df_zone_group['tempo_voto_medio'].apply(format_time)
249 | metrics_df_zone_group['total_votos'] = metrics_df_zone_group['total_votos'].apply(format_number)
250 |
251 |     # add medals to the 3 slowest zones (highest average vote time)
252 | # in the tempo_voto_medio column
253 |
254 | for medal, zone in zip(['🥇', '🥈', '🥉'], top_3_most_last_zones):
255 | metrics_df_zone_group.loc[metrics_df_zone_group['zone_code'] == zone, 'tempo_voto_medio'] = medal \
256 | + ' ' + metrics_df_zone_group.loc[metrics_df_zone_group['zone_code'] == zone, 'tempo_voto_medio']
257 |
258 |
259 | metrics_df_zone_group = metrics_df_zone_group.rename(
260 | columns={
261 | 'zone_code': 'Zona',
262 | 'total_votos': 'Votos',
263 | 'tempo_voto_medio': 'Tempo Médio'
264 | }
265 | ).drop(columns='zone_group')
266 |
267 | zone_group_tab.dataframe(
268 | metrics_df_zone_group
269 | .style
270 | .apply(
271 | lambda x:
272 | [
273 | f'background-color: {HIGHLIGHT_COLOR}; color: white; font-weight: bold; font-size: 15px'
274 | if x['Zona'] in top_3_most_last_zones else '',
275 | ]*len(x),
276 | axis=1
277 | ),
278 | height=400,
279 | use_container_width = True,
280 | hide_index=True
281 | )
282 |
283 |
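# ---------------------------------------------------------------------------
# Minimal sketch of the pandas Styler pattern used in the zone table above:
# an axis=1 apply returns one CSS string per cell, so a whole row can be
# highlighted when it matches a condition. Toy data, illustrative only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    _toy = pd.DataFrame({'Zona': ['0001', '0002'], 'Votos': ['1 Mil', '2 Mil']})
    _styled = _toy.style.apply(
        lambda row: [f'background-color: {HIGHLIGHT_COLOR}' if row['Zona'] == '0002' else ''] * len(row),
        axis=1,
    )
    print(_styled.to_html()[:300])  # the matching row carries the inline style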
284 | def widget_heatmap_tempo_medio_voto_mapa( container, turno, uf, zona, secao ):
285 | COLORMAP = 'coolwarm'
286 | RANGE_SECONDS_PLOT = 15
287 | FIGSIZE = (6, 6)
288 |
289 | map_gdf = load_brazil_simplified_map()
290 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
291 | map_gdf = map_gdf.merge(metrics_df, left_on='SIGLA_UF', right_on='uf', how='left')
292 | map_gdf = gpd.GeoDataFrame(map_gdf)
293 |
294 | tempo_voto_medio_ALL = metrics_df.query(f"uf == 'ALL'")['tempo_voto_medio'].max()
295 | map_gdf['tempo_voto_medio'] = map_gdf['tempo_voto_medio'] - tempo_voto_medio_ALL
296 |
297 | fig = plt.figure(figsize=FIGSIZE)
298 | ax = fig.add_subplot(1, 1, 1)
299 | ax.axis('off')
300 | UFS = map_gdf['uf'].unique()
301 |
302 | for uf in UFS:
303 | (
304 | map_gdf
305 | .query(f"uf == '{uf}'")
306 | .plot(
307 | column='tempo_voto_medio',
308 | ax=ax,
309 | cmap=COLORMAP,
310 | legend=False,
311 | vmin=-RANGE_SECONDS_PLOT,
312 | vmax=+RANGE_SECONDS_PLOT,
313 | gid=uf
314 | )
315 | )
316 |
317 | # add a horizontal colorbar
318 | sm = plt.cm.ScalarMappable(
319 | cmap=COLORMAP,
320 | norm=plt.Normalize(vmin=-RANGE_SECONDS_PLOT, vmax=+RANGE_SECONDS_PLOT)
321 | )
322 |
323 | cbar = fig.colorbar(sm, ax=ax, orientation='horizontal', pad=0.01, aspect=20, fraction=0.035)
324 | cbar.set_label('Segundos abaixo/acima da média', fontsize=10)
325 | cbar.ax.tick_params(labelsize=8)
326 |
327 | # save svg image to buffer
328 | svg_image_buffer = io.StringIO()
329 | plt.savefig(svg_image_buffer, format='svg')
330 | plt.close(fig)
331 |
332 | svg_image_with_links = add_ufs_and_links_to_map(svg_image_buffer.getvalue())
333 |
334 | container.markdown('#### Tempo Médio de Votação por UF')
335 | container.markdown(':point_down: Clique no Mapa para detalhes')
336 | container.markdown(svg_image_with_links, unsafe_allow_html=True)
337 |
338 |
339 | def widget_bignumber_votos( container, turno, uf, zona, secao ):
340 |
341 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
342 | if uf == 'ALL':
343 | votos = metrics_df.query(f"uf == 'ALL'")['total_votos'].max()
344 | else:
345 | votos = metrics_df['total_votos'].max()
346 |
347 | votos_formatado = f"{votos:,}".replace(',', ' ')
348 | container.metric(label=':white_check_mark: Votos', value=votos_formatado)
349 |
350 |
351 | def widget_bignumber_secoes( container, turno, uf, zona, secao ):
352 |
353 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
354 | if uf == 'ALL':
355 | secoes = metrics_df.query(f"uf == 'ALL'")['total_secoes'].max()
356 | else:
357 | secoes = metrics_df['total_secoes'].max()
358 |
359 | section_formatado = f"{secoes:,}".replace(',', ' ')
360 | container.metric(label=':pushpin: Seções', value=section_formatado)
361 |
362 |
363 | def widget_big_number_tempo_medio( container, turno, uf, zona, secao ):
364 |
365 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
366 | if uf == 'ALL':
367 | tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_voto_medio'].max()
368 | else:
369 | tempo_medio = metrics_df['tempo_voto_medio'].max()
370 |
371 | tempo_medio_formatado = format_time(tempo_medio)
372 | container.metric(label=':stopwatch: Tempo Médio', value=tempo_medio_formatado)
373 |
374 |
375 | def widget_big_number_tempo_medio_bio( container, turno, uf, zona, secao ):
376 |
377 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
378 | if uf == 'ALL':
379 | tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_biometria_medio'].max()
380 | else:
381 | tempo_medio = metrics_df['tempo_biometria_medio'].max()
382 |
383 | tempo_medio_formatado = format_time(tempo_medio)
384 | container.metric(label=':point_up: Tempo Médio Biometria', value=tempo_medio_formatado)
385 |
386 |
387 | def widget_big_number_tempo_total_voto( container, turno, uf, zona, secao ):
388 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
389 |
390 | if uf == 'ALL':
391 | tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_voto_soma'].max()
392 | else:
393 | tempo_medio = metrics_df['tempo_voto_soma'].max()
394 |
395 | tempo_medio_anos = tempo_medio / (365 * 24 * 3600)
396 | if tempo_medio_anos < 5:
397 | icon = ':baby:'
398 | elif tempo_medio_anos < 10:
399 | icon = ':boy:'
400 | elif tempo_medio_anos < 15:
401 | icon = ':child:'
402 | elif tempo_medio_anos < 30:
403 | icon = ':man:'
404 | elif tempo_medio_anos < 60:
405 | icon = ':older_adult:'
406 | else:
407 | icon = ':older_man:'
408 |
409 |
410 | tempo_medio_formatado = format_time(tempo_medio)
411 | container.metric(label=f'{icon} Tempo Total Gasto', value=tempo_medio_formatado)
412 |
413 |
414 | def widget_qtd_votos_intervalo_tempo( container, turno, uf, zona, secao ):
415 |
416 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
417 | if uf == 'ALL':
418 | metrics_df = metrics_df.query(f"uf == 'ALL'")
419 |
420 | format_time = lambda x: f"{x // 60}:{x % 60:02d}"
421 | # format number in Mi, Mil, and integer
422 | format_number = lambda number : (
423 | f"{number//1e6:.0f} Mi"
424 | if number >= 1e6 else f"{number//1e3:.0f} Mil"
425 | if number >= 1e3 else f"{number:.0f}"
426 | )
427 |
428 | extrair_intervalo_superior_segundos = lambda col: int(col.split('_')[-2])
429 | extrair_intervalo_inferior_segundos = lambda col: int(col.split('_')[-3])
430 |
431 | colunas_qtd_votos_intervalo = [
432 | 'votos_0_30_segundos', 'votos_30_60_segundos', 'votos_60_90_segundos',
433 | 'votos_90_120_segundos', 'votos_120_150_segundos',
434 | 'votos_150_180_segundos', 'votos_180_210_segundos',
435 | 'votos_210_300_segundos', 'votos_300_9999_segundos'
436 | ]
437 |
438 | valores_qtd_votos_intervalo = [
439 | (
440 | format_time(extrair_intervalo_inferior_segundos(col)) + " a " +
441 | format_time(extrair_intervalo_superior_segundos(col)),
442 | col,
443 | metrics_df[col].max()
444 | )
445 | if col != 'votos_300_9999_segundos' and col != 'votos_0_30_segundos'
446 | else ("mais de 5:00", col, metrics_df[col].max())
447 | if col == 'votos_300_9999_segundos'
448 | else ("até 0:30", col, metrics_df[col].max())
449 | for col in colunas_qtd_votos_intervalo
450 | ]
451 |     # reverse the order
452 | valores_qtd_votos_intervalo = valores_qtd_votos_intervalo[::-1]
453 |
454 | df_valores_qtd_votos_intervalo = pd.DataFrame(
455 | valores_qtd_votos_intervalo,
456 | columns=['intervalo', 'coluna', 'valor']
457 | )
458 |
459 | container.markdown('#### Em quantos minutos as pessoas votam?')
460 |
461 | # plot horizontal bar chart
462 | fig, ax = plt.subplots( figsize=(5, 12) )
463 | # df_valores_qtd_votos_intervalo.plot.barh(x='intervalo', y='valor', legend=False, width=.8, ax=ax)
464 |
465 | # make the barplot with seaborn
466 | sns.barplot(
467 | x='valor',
468 | y='intervalo',
469 | data=df_valores_qtd_votos_intervalo,
470 | color=PRIMARY_COLOR,
471 | ax=ax
472 | )
473 | fig.gca().invert_yaxis()
474 |
475 |     # highlight the biggest bar with the accent color
476 | max_value = df_valores_qtd_votos_intervalo['valor'].max()
477 | max_value_index = df_valores_qtd_votos_intervalo['valor'].idxmax()
478 | ax.patches[max_value_index].set_facecolor(HIGHLIGHT_COLOR)
479 | # add the % inside the biggest bar
480 | max_value_percent = max_value / df_valores_qtd_votos_intervalo['valor'].sum()
481 | ax.text(
482 | max_value - 0.05 * max_value,
483 | max_value_index,
484 | f"{max_value_percent:.1%}",
485 | color='white',
486 | ha = 'right',
487 | va = 'center',
488 | size=20
489 | )
490 |
491 | ax.set_xlabel('Quantidade de Votos')
492 | ax.set_ylabel('')
493 | # ax.set_title('Em quanto tempo as pessoas votam?\n', fontsize=20)
494 |
495 |     # remove top, right and bottom spines
496 | ax.spines['top'].set_visible(False)
497 | ax.spines['right'].set_visible(False)
498 | ax.spines['bottom'].set_visible(False)
499 |
500 | # remove x axis
501 | ax.xaxis.set_visible(False)
502 |
503 | # increase y axis font size
504 | ax.tick_params(axis='y', labelsize=20)
505 |
506 |     # add the value label at the end of each bar
507 | maior_valor = df_valores_qtd_votos_intervalo['valor'].max()
508 | offset = 0.05 * maior_valor
509 | for i, valor in enumerate(df_valores_qtd_votos_intervalo['valor']):
510 | ax.text(valor+offset, i, format_number(valor), color='black', va='center', fontsize=18)
511 |
512 | container.pyplot(fig)
513 |
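# ---------------------------------------------------------------------------
# Illustrative sketch of how the interval labels above are derived from the
# bucket column names (standalone, no DuckDB needed).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    _as_min_sec = lambda s: f"{s // 60}:{s % 60:02d}"
    for _col in ['votos_0_30_segundos', 'votos_90_120_segundos', 'votos_300_9999_segundos']:
        _lo, _hi = int(_col.split('_')[-3]), int(_col.split('_')[-2])
        if _col == 'votos_0_30_segundos':
            _label = "até 0:30"
        elif _col == 'votos_300_9999_segundos':
            _label = "mais de 5:00"
        else:
            _label = f"{_as_min_sec(_lo)} a {_as_min_sec(_hi)}"
        print(_col, '->', _label)  # e.g. votos_90_120_segundos -> 1:30 a 2:00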
514 |
515 |
516 |
--------------------------------------------------------------------------------
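A minimal sketch of how the widgets above could be wired into a Streamlit page (illustrative only; the repository's actual streamlit/app/main.py may differ, and the filter values shown here are assumptions):

import streamlit as st

from widgets import (
    widget_bignumber_votos,
    widget_bignumber_secoes,
    widget_big_number_tempo_medio,
    widget_numero_votos_intervalo_5min,
)

# Assumed filter values; in the real app they would come from sidebar widgets.
turno, uf, zona, secao = 1, 'ALL', 'ALL', 'ALL'

col_votos, col_secoes, col_tempo = st.columns(3)
widget_bignumber_votos(col_votos, turno, uf, zona, secao)
widget_bignumber_secoes(col_secoes, turno, uf, zona, secao)
widget_big_number_tempo_medio(col_tempo, turno, uf, zona, secao)

widget_numero_votos_intervalo_5min(st.container(), turno, uf, zona, secao)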
/src/03A_calcular_metricas_tempo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "## Compute Metrics - Voting Time, Biometrics Time, etc."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "---"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Importing libraries"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import duckdb\n",
31 | "import pandas as pd\n",
32 | "import time"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Importing Data"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "cursor = duckdb.connect()"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 |     "Raw Data"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 3,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "TABLE = \"read_parquet('VOTES.parquet/*/*/*/*.parquet', hive_partitioning=True)\"\n",
65 | "ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 |     "Add TURNO (election round) and the final biometric timestamp"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "source = F\"\"\"\n",
82 | "(\n",
83 | " SELECT \n",
84 | " *,\n",
85 | " CASE event_date\n",
86 | " WHEN '2022-10-02' THEN 1\n",
87 | " WHEN '2022-10-03' THEN 1\n",
88 | " WHEN '2022-10-30' THEN 2\n",
89 | " WHEN '2022-10-31' THEN 2\n",
90 | " ELSE NULL\n",
91 | " END::INT AS turno,\n",
92 | " COALESCE(\n",
93 | " timestamp_biometria_manual,\n",
94 | " timestamp_biometria_4,\n",
95 | " timestamp_biometria_3,\n",
96 | " timestamp_biometria_2,\n",
97 | " timestamp_biometria_1\n",
98 | " ) AS timestamp_biometria_final\n",
99 | " FROM \n",
100 | " {TABLE}\n",
101 | ") _\n",
102 | "\"\"\""
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 |     "## Preparing Data"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 |     "Metrics over the OLAP cube - Round (turno), UF, Zone, Section. The 'ALL' aggregation rows come from GROUP BY ROLLUP; a minimal sketch of that pattern follows after this notebook listing.\n",
117 |     "\n",
118 |     "- Number of votes\n",
119 |     "- Number of polling sections\n",
120 |     "- Mean, sum, 50th and 90th percentile of voting time, biometrics time and total time\n",
121 |     "\n",
122 |     "- Number of votes cast within 30s, 1min, 1min30s, 2min, 2min30s, 3min+\n",
123 |     "- Biometric success rate in 1, 2, 3 or 4 attempts, or failure\n",
124 |     "- Number of key presses\n",
125 |     "- Number of distinct offices voted"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 |     "**Definition of the time metrics**"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 5,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "tempo_voto_total = \"EXTRACT(EPOCH FROM (timestamp_voto_computado - timestamp_titulo_digitado))\"\n",
142 | "tempo_voto = \"EXTRACT(EPOCH FROM (timestamp_voto_computado - timestamp_habilitacao_eleitor))\"\n",
143 | "tempo_biometria = \"EXTRACT(EPOCH FROM (timestamp_biometria_final - timestamp_titulo_digitado))\"\n",
144 | "\n",
145 | "intervalos_tempo_segundos_votos = [0, 30, 60, 90, 120, 150, 180, 210, 300, 9999]\n",
146 | "contagem_de_votos_em_intervalos_de_tempo = \", \".join([\n",
147 | " F\"\"\"\n",
148 | " SUM( \n",
149 | " CASE WHEN \n",
150 | " {tempo_voto} >= {intervalos_tempo_segundos_votos[i]} \n",
151 | " AND {tempo_voto} < {intervalos_tempo_segundos_votos[i+1]}\n",
152 | " THEN 1 ELSE 0 END \n",
153 | " ) AS votos_{intervalos_tempo_segundos_votos[i]}_{intervalos_tempo_segundos_votos[i+1]}_segundos\n",
154 | " \"\"\"\n",
155 | " for i in range(0, len(intervalos_tempo_segundos_votos)-1)\n",
156 | "])"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 |     "**Count of distinct offices voted and total number of key presses**"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 |     "Approximation: each office's ballot-number digit count + 1 for the CONFIRM key (e.g. a presidential vote counts as 2 digits + 1 = 3 key presses)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 6,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "COLUNAS_VOTOS_CARGOS_NR_TECLAS = [\n",
180 | " # 2 digitos\n",
181 | " ('timestamp_voto_prefeito', 2), \n",
182 | " ('timestamp_voto_presidente', 2),\n",
183 | " ('timestamp_voto_governador', 2),\n",
184 | " \n",
185 | " # 3 digitos\n",
186 | " ('timestamp_voto_senador', 3),\n",
187 | "\n",
188 | " # 4 digitos\n",
189 | " ('timestamp_voto_deputado_distrital', 4), \n",
190 | " ('timestamp_voto_deputado_federal', 4),\n",
191 | "\n",
192 | " # 5 digitos\n",
193 | " ('timestamp_voto_deputado_estadual', 5),\n",
194 | "]\n",
195 | "\n",
196 | "nr_total_cargos_votados = \" + \".join([\n",
197 | " F\"({coluna} IS NOT NULL)::INT\"\n",
198 | " for coluna, _ in COLUNAS_VOTOS_CARGOS_NR_TECLAS\n",
199 | "])\n",
200 | "\n",
201 | "nr_total_teclas_digitadas = \" + \".join([\n",
202 | " F\"({coluna} IS NOT NULL)::INT*({teclas}+1)\"\n",
203 | " for coluna, teclas in COLUNAS_VOTOS_CARGOS_NR_TECLAS\n",
204 | "])"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 7,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "fix_null_values = lambda column: F\"COALESCE({column}::VARCHAR(10), 'ALL')\""
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 8,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "query_metrics = F\"\"\"\n",
223 | " SELECT\n",
224 | " {fix_null_values('turno') } AS turno,\n",
225 | " {fix_null_values('uf') } AS uf,\n",
226 | " {fix_null_values('zone_code') } AS zone_code,\n",
227 | " {fix_null_values('section_code') } AS section_code,\n",
228 | "\n",
229 | " COUNT(*) AS total_votos,\n",
230 | " COUNT( DISTINCT uf || zone_code || section_code ) AS total_secoes,\n",
231 | "\n",
232 | " SUM( {tempo_voto} ) AS tempo_voto_soma,\n",
233 | " AVG( {tempo_voto} ) AS tempo_voto_medio,\n",
234 | " --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_voto}) AS tempo_voto_mediana,\n",
235 | " --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_voto}) AS tempo_voto_90percentil,\n",
236 | "\n",
237 | " SUM( {tempo_biometria} ) AS tempo_biometria_soma,\n",
238 | " AVG( {tempo_biometria} ) AS tempo_biometria_medio,\n",
239 | " --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_biometria}) AS tempo_biometria_mediana,\n",
240 | " --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_biometria}) AS tempo_biometria_90percentil,\n",
241 | "\n",
242 | " SUM( {tempo_voto_total} ) AS tempo_voto_total_soma,\n",
243 | " AVG( {tempo_voto_total} ) AS tempo_voto_total_medio,\n",
244 | " --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_voto_total}) AS tempo_voto_total_mediana,\n",
245 | " --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_voto_total}) AS tempo_voto_total_90percentil,\n",
246 | " \n",
247 | " {contagem_de_votos_em_intervalos_de_tempo},\n",
248 | " 1-AVG(biometria_nao_funcionou::INT) AS tx_sucesso_biometria,\n",
249 | "\n",
250 | " MAX({nr_total_cargos_votados}) AS nr_total_cargos_votados,\n",
251 | " SUM({nr_total_teclas_digitadas}) AS nr_total_teclas_digitadas\n",
252 | "\n",
253 | " FROM\n",
254 | " {source}\n",
255 | " WHERE quantidade_votos_computados = 1\n",
256 | " GROUP BY ROLLUP(turno, uf, zone_code, section_code)\n",
257 | "\"\"\""
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 |     "The output parquet files are partitioned by ROUND (turno), UF and ELECTORAL ZONE GROUP to speed up reads from the dashboard.\n",
265 |     "\n",
266 |     "Zones are grouped in buckets of 20; this bucket size is empirical."
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 9,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "query_metrics_with_zone_group = F\"\"\"\n",
276 | " SELECT\n",
277 | " *,\n",
278 | " CASE\n",
279 | " {\n",
280 | " \"\".join(\n",
281 | " [\n",
282 | " f\"WHEN zone_code!='ALL' AND zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n",
283 | " for min_zone, max_zone in ZONE_GROUPS\n",
284 | " ]\n",
285 | " )\n",
286 | " }\n",
287 | " ELSE zone_code\n",
288 | " END AS zone_group\n",
289 | " FROM (\n",
290 | " {query_metrics}\n",
291 | " ) _\n",
292 | "\"\"\""
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 10,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "query = F\"\"\"\n",
302 | " COPY (\n",
303 | " {\n",
304 | " query_metrics_with_zone_group\n",
305 | " } )\n",
306 | " TO 'VOTES_TIME_METRICS.parquet' \n",
307 | " (FORMAT 'parquet', PARTITION_BY (turno, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n",
308 | "\"\"\""
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 11,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "data": {
318 | "application/vnd.jupyter.widget-view+json": {
319 | "model_id": "df1f82e654f446ccb9e0f3171cf3edef",
320 | "version_major": 2,
321 | "version_minor": 0
322 | },
323 | "text/plain": [
324 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
325 | ]
326 | },
327 | "metadata": {},
328 | "output_type": "display_data"
329 | },
330 | {
331 | "data": {
332 | "text/plain": [
333 | ""
334 | ]
335 | },
336 | "execution_count": 11,
337 | "metadata": {},
338 | "output_type": "execute_result"
339 | }
340 | ],
341 | "source": [
342 | "cursor.execute(query)"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 12,
348 | "metadata": {},
349 | "outputs": [
350 | {
351 | "data": {
664 | "text/plain": [
665 | " turno uf zone_code section_code total_votos total_secoes \\\n",
666 | "0 1 DF 0014 ALL 89318 308 \n",
667 | "1 1 DF 0015 ALL 134744 505 \n",
668 | "2 1 DF 0017 ALL 100240 378 \n",
669 | "3 1 DF 0005 0050 294 1 \n",
670 | "4 1 DF 0015 0229 281 1 \n",
671 | "... ... .. ... ... ... ... \n",
672 | "6625 1 DF 0021 0082 253 1 \n",
673 | "6626 1 DF 0021 0332 234 1 \n",
674 | "6627 1 DF 0021 0318 312 1 \n",
675 | "6628 1 DF 0021 0185 315 1 \n",
676 | "6629 1 DF ALL ALL 1779224 6610 \n",
677 | "\n",
678 | " tempo_voto_soma tempo_voto_medio tempo_biometria_soma \\\n",
679 | "0 4345747.0 48.654773 869184.0 \n",
680 | "1 6785043.0 50.355066 1171645.0 \n",
681 | "2 5899118.0 58.849940 1344604.0 \n",
682 | "3 17810.0 60.578231 3067.0 \n",
683 | "4 14992.0 53.352313 2154.0 \n",
684 | "... ... ... ... \n",
685 | "6625 13693.0 54.122530 2232.0 \n",
686 | "6626 12897.0 55.115385 2474.0 \n",
687 | "6627 17853.0 57.221154 3879.0 \n",
688 | "6628 16997.0 53.958730 2877.0 \n",
689 | "6629 99817162.0 56.101515 18036165.0 \n",
690 | "\n",
691 | " tempo_biometria_medio ... votos_90_120_segundos \\\n",
692 | "0 10.492576 ... 2633.0 \n",
693 | "1 9.482934 ... 4310.0 \n",
694 | "2 14.337855 ... 5206.0 \n",
695 | "3 11.193431 ... 23.0 \n",
696 | "4 8.381323 ... 16.0 \n",
697 | "... ... ... ... \n",
698 | "6625 9.073171 ... 16.0 \n",
699 | "6626 10.850877 ... 5.0 \n",
700 | "6627 13.104730 ... 14.0 \n",
701 | "6628 9.558140 ... 15.0 \n",
702 | "6629 10.941314 ... 81643.0 \n",
703 | "\n",
704 | " votos_120_150_segundos votos_150_180_segundos votos_180_210_segundos \\\n",
705 | "0 906.0 409.0 237.0 \n",
706 | "1 1623.0 837.0 425.0 \n",
707 | "2 2062.0 1012.0 587.0 \n",
708 | "3 8.0 4.0 1.0 \n",
709 | "4 5.0 1.0 0.0 \n",
710 | "... ... ... ... \n",
711 | "6625 6.0 1.0 1.0 \n",
712 | "6626 6.0 0.0 1.0 \n",
713 | "6627 5.0 4.0 1.0 \n",
714 | "6628 7.0 7.0 0.0 \n",
715 | "6629 33515.0 16930.0 9692.0 \n",
716 | "\n",
717 | " votos_210_300_segundos votos_300_9999_segundos tx_sucesso_biometria \\\n",
718 | "0 276.0 174.0 0.953615 \n",
719 | "1 494.0 244.0 0.963776 \n",
720 | "2 793.0 364.0 0.931993 \n",
721 | "3 3.0 1.0 0.955782 \n",
722 | "4 1.0 0.0 0.975089 \n",
723 | "... ... ... ... \n",
724 | "6625 2.0 0.0 0.984190 \n",
725 | "6626 2.0 3.0 0.957265 \n",
726 | "6627 5.0 1.0 0.971154 \n",
727 | "6628 0.0 0.0 0.946032 \n",
728 | "6629 11890.0 5609.0 0.952647 \n",
729 | "\n",
730 | " nr_total_cargos_votados nr_total_teclas_digitadas zone_group \n",
731 | "0 5 1745890.0 0-20 \n",
732 | "1 5 2667519.0 0-20 \n",
733 | "2 5 1997143.0 0-20 \n",
734 | "3 5 5880.0 0-20 \n",
735 | "4 5 5620.0 0-20 \n",
736 | "... ... ... ... \n",
737 | "6625 5 5060.0 20-40 \n",
738 | "6626 5 4680.0 20-40 \n",
739 | "6627 5 6240.0 20-40 \n",
740 | "6628 5 6300.0 20-40 \n",
741 | "6629 5 35370312.0 ALL \n",
742 | "\n",
743 | "[6630 rows x 25 columns]"
744 | ]
745 | },
746 | "execution_count": 12,
747 | "metadata": {},
748 | "output_type": "execute_result"
749 | }
750 | ],
751 | "source": [
752 | "table = \"\"\"\n",
753 | " read_parquet(\n",
754 | " 'VOTES_TIME_METRICS.parquet/*/*/*/*.parquet', \n",
755 | " hive_partitioning=True,\n",
756 | " hive_types_autocast=0\n",
757 | " )\n",
758 | " \"\"\"\n",
759 | "turno = 1\n",
760 | "uf = 'DF'\n",
761 | "zone_group = 'ALL'\n",
762 | "zone = 1\n",
763 | "\n",
764 | "\n",
765 | "query = f\"\"\"\n",
766 | " SELECT *\n",
767 | " FROM {table}\n",
768 | " WHERE 1=1\n",
769 | " AND turno = '{turno}'\n",
770 | " AND uf = '{uf}'\n",
771 | " -- AND zone_group = '{zone_group}'\n",
772 | " -- AND zone_code = {zone}\n",
773 | " \"\"\"\n",
774 | "\n",
775 | "df = cursor.execute(query).df()\n",
776 | "df"
777 | ]
778 | },
779 | {
780 | "cell_type": "code",
781 | "execution_count": 13,
782 | "metadata": {},
783 | "outputs": [
784 | {
785 | "data": {
786 | "text/plain": [
787 | "Index(['turno', 'uf', 'zone_code', 'section_code', 'total_votos',\n",
788 | " 'total_secoes', 'tempo_voto_soma', 'tempo_voto_medio',\n",
789 | " 'tempo_biometria_soma', 'tempo_biometria_medio',\n",
790 | " 'tempo_voto_total_soma', 'tempo_voto_total_medio',\n",
791 | " 'votos_0_30_segundos', 'votos_30_60_segundos', 'votos_60_90_segundos',\n",
792 | " 'votos_90_120_segundos', 'votos_120_150_segundos',\n",
793 | " 'votos_150_180_segundos', 'votos_180_210_segundos',\n",
794 | " 'votos_210_300_segundos', 'votos_300_9999_segundos',\n",
795 | " 'tx_sucesso_biometria', 'nr_total_cargos_votados',\n",
796 | " 'nr_total_teclas_digitadas', 'zone_group'],\n",
797 | " dtype='object')"
798 | ]
799 | },
800 | "execution_count": 13,
801 | "metadata": {},
802 | "output_type": "execute_result"
803 | }
804 | ],
805 | "source": [
806 | "df.columns"
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": null,
812 | "metadata": {},
813 | "outputs": [],
814 | "source": []
815 | }
816 | ],
817 | "metadata": {
818 | "kernelspec": {
819 | "display_name": "base",
820 | "language": "python",
821 | "name": "python3"
822 | },
823 | "language_info": {
824 | "codemirror_mode": {
825 | "name": "ipython",
826 | "version": 3
827 | },
828 | "file_extension": ".py",
829 | "mimetype": "text/x-python",
830 | "name": "python",
831 | "nbconvert_exporter": "python",
832 | "pygments_lexer": "ipython3",
833 | "version": "3.11.5"
834 | }
835 | },
836 | "nbformat": 4,
837 | "nbformat_minor": 2
838 | }
839 |
--------------------------------------------------------------------------------
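A minimal, self-contained sketch of the GROUP BY ROLLUP + COALESCE(..., 'ALL') pattern used in the notebook above, on toy data (illustrative only; column names mirror the notebook, the values are made up):

import duckdb

con = duckdb.connect()
con.execute("""
    CREATE TABLE votos AS
    SELECT * FROM (VALUES
        (1, 'DF', '0014', '0001', 52.0),
        (1, 'DF', '0014', '0002', 61.0),
        (1, 'AC', '0007', '0001', 58.0)
    ) t(turno, uf, zone_code, section_code, tempo_voto)
""")

df = con.execute("""
    SELECT
        COALESCE(turno::VARCHAR(10), 'ALL') AS turno,
        COALESCE(uf, 'ALL')                 AS uf,
        COALESCE(zone_code, 'ALL')          AS zone_code,
        COALESCE(section_code, 'ALL')       AS section_code,
        COUNT(*)                            AS total_votos,
        AVG(tempo_voto)                     AS tempo_voto_medio
    FROM votos
    GROUP BY ROLLUP(turno, uf, zone_code, section_code)
    ORDER BY turno, uf, zone_code, section_code
""").df()

# Rows where a column reads 'ALL' are the pre-aggregated rollup levels the
# dashboard filters on (e.g. uf='DF', zone_code='ALL' is the whole-UF total).
print(df)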
/src/test-basic-queries.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "# Query Performance"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 |     "## Importing Libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import duckdb\n",
24 | "import time"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 |     "## Connecting to the database"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "cursor = duckdb.connect()\n",
41 | "DATABASE = '*.parquet'"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 14,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "def execute_query_and_calculate_time(cursor, query, return_df=False):\n",
51 | " \n",
52 | " if return_df:\n",
53 | " tic = time.time()\n",
54 | " cursor.execute(query)\n",
55 | " df = cursor.df()\n",
56 | " toc = time.time()\n",
57 | " return df, toc - tic\n",
58 | " else:\n",
59 | " tic = time.time()\n",
60 | " cursor.execute(query)\n",
61 | " toc = time.time()\n",
62 | " return toc - tic"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 |     "## Queries"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 |     "### Record Count"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 3,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "query = f\"SELECT COUNT(*) FROM '{DATABASE}'\""
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 4,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "Number of rows: 4,283,329,488\n",
98 | "Time: 1.47s\n"
99 | ]
100 | }
101 | ],
102 | "source": [
103 | "tic = time.time()\n",
104 | "\n",
105 | "cursor.execute(query)\n",
106 | "n_rows = cursor.fetchone()\n",
107 | "\n",
108 | "toc = time.time()\n",
109 | "\n",
110 | "print(f\"Number of rows: {n_rows[0]:,}\")\n",
111 | "print(f\"Time: {toc - tic:.2f}s\")"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 |     "### First Records"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 6,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "Time: 0.82s\n"
131 | ]
132 | },
133 | {
134 | "data": {
218 | "text/plain": [
219 | " event_timestamp event_type some_id event_system \\\n",
220 | "0 2022-10-26 10:39:36 INFO 67305985 LOGD \n",
221 | "1 2022-10-26 10:39:36 INFO 67305985 LOGD \n",
222 | "2 2022-10-26 10:39:36 INFO 67305985 SCUE \n",
223 | "3 2022-10-26 10:39:36 INFO 67305985 SCUE \n",
224 | "4 2022-10-26 10:39:38 INFO 67305985 SCUE \n",
225 | "\n",
226 | " event_description event_id \\\n",
227 | "0 Início das operações do logd E2C58C3021D6DB87 \n",
228 | "1 Urna ligada em 26/10/2022 às 10:38:20 DFBD462E26E8F1EA \n",
229 | "2 Iniciando aplicação - Oficial - 1º turno B8E2CBFADB3EF46B \n",
230 | "3 Versão da aplicação: 8.26.0.0 - Onça-pintada AC76A5B17419CB2E \n",
231 | "4 Urna operando com rede elétrica ED0703CBF6110D2C \n",
232 | "\n",
233 | " filename \n",
234 | "0 /data/logs/2_AC/o00407-0100700090001_new.csv \n",
235 | "1 /data/logs/2_AC/o00407-0100700090001_new.csv \n",
236 | "2 /data/logs/2_AC/o00407-0100700090001_new.csv \n",
237 | "3 /data/logs/2_AC/o00407-0100700090001_new.csv \n",
238 | "4 /data/logs/2_AC/o00407-0100700090001_new.csv "
239 | ]
240 | },
241 | "execution_count": 6,
242 | "metadata": {},
243 | "output_type": "execute_result"
244 | }
245 | ],
246 | "source": [
247 | "query = f\"\"\"\n",
248 | " SELECT \n",
249 | " *\n",
250 | " FROM '{DATABASE}' LIMIT 5\n",
251 | "\"\"\"\n",
252 | "\n",
253 | "tic = time.time()\n",
254 | "cursor.execute(query)\n",
255 | "df_result = cursor.df()\n",
256 | "toc = time.time()\n",
257 | "\n",
258 | "print(f\"Time: {toc - tic:.2f}s\")\n",
259 | "df_result"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 |     "### First records + RN filter"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 7,
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "name": "stdout",
276 | "output_type": "stream",
277 | "text": [
278 | "Time: 69.65s\n"
279 | ]
280 | }
281 | ],
282 | "source": [
283 | "query = f\"\"\"\n",
284 | " SELECT \n",
285 | " *\n",
286 | " FROM '{DATABASE}'\n",
287 | " WHERE filename ILIKE '%RN%'\n",
288 | " LIMIT 500\n",
289 | "\"\"\"\n",
290 | "\n",
291 | "tic = time.time()\n",
292 | "cursor.execute(query)\n",
293 | "df_result = cursor.df()\n",
294 | "toc = time.time()\n",
295 | "\n",
296 | "print(f\"Time: {toc - tic:.2f}s\")"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 11,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "name": "stdout",
306 | "output_type": "stream",
307 | "text": [
308 | "Time: 91.12s\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "query = f\"\"\"\n",
314 | " SELECT \n",
315 | " *\n",
316 | " FROM '{DATABASE}'\n",
317 | " WHERE filename ILIKE '%SP%'\n",
318 | " LIMIT 500\n",
319 | "\"\"\"\n",
320 | "\n",
321 | "tic = time.time()\n",
322 | "cursor.execute(query)\n",
323 | "df_result = cursor.df()\n",
324 | "toc = time.time()\n",
325 | "\n",
326 | "print(f\"Time: {toc - tic:.2f}s\")"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {},
332 | "source": [
333 | "### Distinct"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "event_type"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 12,
346 | "metadata": {},
347 | "outputs": [
348 | {
349 | "name": "stdout",
350 | "output_type": "stream",
351 | "text": [
352 | "Time: 5.69s\n"
353 | ]
354 | }
355 | ],
356 | "source": [
357 | "query = f\"\"\"\n",
358 | " SELECT DISTINCT\n",
359 | " event_type\n",
360 | " FROM '{DATABASE}'\n",
361 | "\"\"\"\n",
362 | "\n",
363 | "tic = time.time()\n",
364 | "cursor.execute(query)\n",
365 | "df_result = cursor.df()\n",
366 | "toc = time.time()\n",
367 | "\n",
368 | "print(f\"Time: {toc - tic:.2f}s\")"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "event_description"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 14,
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "name": "stdout",
385 | "output_type": "stream",
386 | "text": [
387 | "Time: 29.33s\n"
388 | ]
389 | }
390 | ],
391 | "source": [
392 | "query = f\"\"\"\n",
393 | " SELECT DISTINCT\n",
394 | " event_description\n",
395 | " FROM '{DATABASE}'\n",
396 | "\"\"\"\n",
397 | "\n",
398 | "tic = time.time()\n",
399 | "cursor.execute(query)\n",
400 | "df_result = cursor.df()\n",
401 | "toc = time.time()\n",
402 | "\n",
403 | "print(f\"Time: {toc - tic:.2f}s\")"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "### Group By"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 3,
416 | "metadata": {},
417 | "outputs": [
418 | {
419 | "data": {
420 | "application/vnd.jupyter.widget-view+json": {
421 | "model_id": "2941ff10abd0446cb443aafd4e0fc77c",
422 | "version_major": 2,
423 | "version_minor": 0
424 | },
425 | "text/plain": [
426 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
427 | ]
428 | },
429 | "metadata": {},
430 | "output_type": "display_data"
431 | },
432 | {
433 | "name": "stdout",
434 | "output_type": "stream",
435 | "text": [
436 | "Time: 6.77s\n"
437 | ]
438 | }
439 | ],
440 | "source": [
441 | "query = f\"\"\"\n",
442 | " SELECT \n",
443 | " event_system,\n",
444 | " COUNT(*) AS qtd_linhas\n",
445 | " FROM '{DATABASE}'\n",
446 | " GROUP BY event_system\n",
447 | "\"\"\"\n",
448 | "\n",
449 | "tic = time.time()\n",
450 | "cursor.execute(query) \n",
451 | "df_result = cursor.df()\n",
452 | "toc = time.time()\n",
453 | "\n",
454 | "print(f\"Time: {toc - tic:.2f}s\")"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 4,
460 | "metadata": {},
461 | "outputs": [
462 | {
463 | "data": {
557 | "text/plain": [
558 | " event_system qtd_linhas\n",
559 | "0 INITJE 3044304\n",
560 | "1 VERIFICADOR 37931\n",
561 | "2 STE 394\n",
562 | "3 LOGD 17978454\n",
563 | "4 ADH 5188\n",
564 | "5 SA 784\n",
565 | "6 SCUE 39756883\n",
566 | "7 VPP 223388\n",
567 | "8 VO\u0014A 1\n",
568 | "9 VOTA 3879701660\n",
569 | "10 RED 76691\n",
570 | "11 GAP 262715525\n",
571 | "12 ATUE 79788285"
572 | ]
573 | },
574 | "execution_count": 4,
575 | "metadata": {},
576 | "output_type": "execute_result"
577 | }
578 | ],
579 | "source": [
580 | "df_result"
581 | ]
582 | },
583 | {
584 | "cell_type": "markdown",
585 | "metadata": {},
586 | "source": [
587 |     "### Group By + Filter"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 9,
593 | "metadata": {},
594 | "outputs": [
595 | {
596 | "data": {
597 | "application/vnd.jupyter.widget-view+json": {
598 | "model_id": "fde6505a3b484b28880584a0d5f7bb84",
599 | "version_major": 2,
600 | "version_minor": 0
601 | },
602 | "text/plain": [
603 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
604 | ]
605 | },
606 | "metadata": {},
607 | "output_type": "display_data"
608 | },
609 | {
610 | "name": "stdout",
611 | "output_type": "stream",
612 | "text": [
613 | "Time: 7.98s\n"
614 | ]
615 | }
616 | ],
617 | "source": [
618 | "query = f\"\"\"\n",
619 | " SELECT \n",
620 | " event_type,\n",
621 | " COUNT(*) AS qtd_linhas\n",
622 | " FROM '{DATABASE}'\n",
623 | " WHERE event_system='VOTA' OR event_system='RED'\n",
624 | " GROUP BY event_type\n",
625 | "\"\"\"\n",
626 | "\n",
627 | "tic = time.time()\n",
628 | "cursor.execute(query) \n",
629 | "df_result = cursor.df()\n",
630 | "toc = time.time()\n",
631 | "\n",
632 | "print(f\"Time: {toc - tic:.2f}s\")"
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": 10,
638 | "metadata": {},
639 | "outputs": [
640 | {
641 | "data": {
685 | "text/plain": [
686 | " event_type qtd_linhas\n",
687 | "0 ALERTA 50460553\n",
688 | "1 ERRO 1024682\n",
689 | "2 INFO 3828293116"
690 | ]
691 | },
692 | "execution_count": 10,
693 | "metadata": {},
694 | "output_type": "execute_result"
695 | }
696 | ],
697 | "source": [
698 | "df_result"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 |     "### Check whether event_id is unique"
706 | ]
707 | },
708 | {
709 | "cell_type": "markdown",
710 | "metadata": {},
711 | "source": [
712 |     "[WIP] Brief description of why each query exists, what it does and how it is used day to day"
713 | ]
714 | },
715 | {
716 | "cell_type": "markdown",
717 | "metadata": {},
718 | "source": [
719 |     "1 - Using GROUP BY"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": 4,
725 | "metadata": {},
726 | "outputs": [
727 | {
728 | "data": {
729 | "application/vnd.jupyter.widget-view+json": {
730 | "model_id": "db88550cf5cf4fc7a508e50d1839f168",
731 | "version_major": 2,
732 | "version_minor": 0
733 | },
734 | "text/plain": [
735 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
736 | ]
737 | },
738 | "metadata": {},
739 | "output_type": "display_data"
740 | },
741 | {
742 | "ename": "",
743 | "evalue": "",
744 | "output_type": "error",
745 | "traceback": [
746 | "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
747 | "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
748 | "\u001b[1;31mClick here for more info. \n",
749 | "\u001b[1;31mView Jupyter log for further details."
750 | ]
751 | }
752 | ],
753 | "source": [
754 | "query = f\"\"\"\n",
755 | " SELECT\n",
756 | " COUNT(*) \n",
757 | " FROM (\n",
758 | " SELECT \n",
759 | " event_id,\n",
760 | " COUNT(*)\n",
761 | " FROM '{DATABASE}'\n",
762 | " GROUP BY event_id\n",
763 | " HAVING COUNT(*) > 1\n",
764 | " )\n",
765 | "\"\"\"\n",
766 | "\n",
767 | "tic = time.time()\n",
768 | "cursor.execute(query) \n",
769 | "df_result = cursor.df()\n",
770 | "toc = time.time()\n",
771 | "\n",
772 | "print(f\"Time: {toc - tic:.2f}s\")"
773 | ]
774 | },
775 | {
776 | "cell_type": "markdown",
777 | "metadata": {},
778 | "source": [
779 |     "2 - Using a window function"
780 | ]
781 | },
782 | {
783 | "cell_type": "code",
784 | "execution_count": 3,
785 | "metadata": {},
786 | "outputs": [
787 | {
788 | "data": {
789 | "application/vnd.jupyter.widget-view+json": {
790 | "model_id": "f529e0b4f1e149e6bdab3ade2e1d665f",
791 | "version_major": 2,
792 | "version_minor": 0
793 | },
794 | "text/plain": [
795 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
796 | ]
797 | },
798 | "metadata": {},
799 | "output_type": "display_data"
800 | }
801 | ],
802 | "source": [
803 | "query = f\"\"\"\n",
804 | " SELECT\n",
805 | " event_id, qtd_linhas\n",
806 | " FROM (\n",
807 | " SELECT \n",
808 | " event_id,\n",
809 | " COUNT(*) OVER( PARTITION BY event_id ) AS qtd_linhas\n",
810 | " FROM '{DATABASE}'\n",
811 | " ) _\n",
812 | " WHERE qtd_linhas > 1\n",
813 | " LIMIT 1\n",
814 | "\"\"\"\n",
815 | "\n",
816 | "tic = time.time()\n",
817 | "cursor.execute(query) \n",
818 | "df_result = cursor.df()\n",
819 | "toc = time.time()\n",
820 | "\n",
821 | "print(f\"Time: {toc - tic:.2f}s\")"
822 | ]
823 | },
824 | {
825 | "cell_type": "markdown",
826 | "metadata": {},
827 | "source": [
828 |     "3 - Using COUNT DISTINCT"
829 | ]
830 | },
831 | {
832 | "cell_type": "code",
833 | "execution_count": 5,
834 | "metadata": {},
835 | "outputs": [
836 | {
837 | "data": {
838 | "application/vnd.jupyter.widget-view+json": {
839 | "model_id": "71e66f8947c24b12b66fb5b11196bf63",
840 | "version_major": 2,
841 | "version_minor": 0
842 | },
843 | "text/plain": [
844 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
845 | ]
846 | },
847 | "metadata": {},
848 | "output_type": "display_data"
849 | }
850 | ],
851 | "source": [
852 | "query = f\"\"\"\n",
853 | " SELECT COUNT(*)\n",
854 | " FROM (\n",
855 | " SELECT DISTINCT event_id \n",
856 | " FROM '{DATABASE}'\n",
857 | " ) _\n",
858 | "\"\"\"\n",
859 | "\n",
860 | "tic = time.time()\n",
861 | "cursor.execute(query) \n",
862 | "df_result = cursor.df()\n",
863 | "toc = time.time()\n",
864 | "\n",
865 | "print(f\"Time: {toc - tic:.2f}s\")"
866 | ]
867 | },
868 | {
869 | "cell_type": "markdown",
870 | "metadata": {},
871 | "source": [
872 |     "4 - Using DISTINCT + write to disk"
873 | ]
874 | },
875 | {
876 | "cell_type": "code",
877 | "execution_count": 4,
878 | "metadata": {},
879 | "outputs": [
880 | {
881 | "data": {
882 | "application/vnd.jupyter.widget-view+json": {
883 | "model_id": "f70e43b1e2f24f38adebfcd80815d560",
884 | "version_major": 2,
885 | "version_minor": 0
886 | },
887 | "text/plain": [
888 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
889 | ]
890 | },
891 | "metadata": {},
892 | "output_type": "display_data"
893 | },
894 | {
895 | "ename": "",
896 | "evalue": "",
897 | "output_type": "error",
898 | "traceback": [
899 | "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
900 | "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
901 | "\u001b[1;31mClick here for more info. \n",
902 | "\u001b[1;31mView Jupyter log for further details."
903 | ]
904 | }
905 | ],
906 | "source": [
907 | "query = f\"\"\"\n",
908 | " COPY (\n",
909 | " SELECT DISTINCT event_id \n",
910 | " FROM '{DATABASE}'\n",
911 | " ) TO 'event_id.parquet' \n",
912 | " (FORMAT 'parquet')\n",
913 | "\"\"\"\n",
914 | "\n",
915 | "tic = time.time()\n",
916 | "cursor.execute(query) \n",
917 | "df_result = cursor.df()\n",
918 | "toc = time.time()\n",
919 | "\n",
920 | "print(f\"Time: {toc - tic:.2f}s\")"
921 | ]
922 | },
923 | {
924 | "cell_type": "markdown",
925 | "metadata": {},
926 | "source": [
927 |     "### Distinct messages"
928 | ]
929 | },
930 | {
931 | "cell_type": "markdown",
932 | "metadata": {},
933 | "source": [
934 |     "First approach"
935 | ]
936 | },
937 | {
938 | "cell_type": "code",
939 | "execution_count": 15,
940 | "metadata": {},
941 | "outputs": [
942 | {
943 | "data": {
944 | "application/vnd.jupyter.widget-view+json": {
945 | "model_id": "f37594cbd3e24d858c8a99e7f2841d9e",
946 | "version_major": 2,
947 | "version_minor": 0
948 | },
949 | "text/plain": [
950 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
951 | ]
952 | },
953 | "metadata": {},
954 | "output_type": "display_data"
955 | },
956 | {
957 | "name": "stdout",
958 | "output_type": "stream",
959 | "text": [
960 | "Time: 159.81s\n"
961 | ]
962 | },
963 | {
964 | "data": {
965 | "text/plain": [
966 | "159.71511435508728"
967 | ]
968 | },
969 | "execution_count": 15,
970 | "metadata": {},
971 | "output_type": "execute_result"
972 | }
973 | ],
974 | "source": [
975 | "query = f\"\"\"\n",
976 | " SELECT DISTINCT\n",
977 | " regexp_replace(event_description, '[0-9]', 'X', 'g') AS event_description\n",
978 | " FROM '{DATABASE}'\n",
979 | "\"\"\"\n",
980 | "\n",
981 |     "df_result, duration = execute_query_and_calculate_time(cursor, query, return_df=True)\n",
982 |     "print(f\"Time: {duration:.2f}s\")\n",
983 | "df_result"
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "execution_count": 22,
989 | "metadata": {},
990 | "outputs": [
991 | {
992 | "data": {
993 | "application/vnd.jupyter.widget-view+json": {
994 | "model_id": "34112b0f74864cbf812b851009072faf",
995 | "version_major": 2,
996 | "version_minor": 0
997 | },
998 | "text/plain": [
999 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
1000 | ]
1001 | },
1002 | "metadata": {},
1003 | "output_type": "display_data"
1004 | }
1005 | ],
1006 | "source": [
1007 | "# Identificador da mídia de carga\n",
1008 | "# Serial da MI copiada da MV da urna original\n",
1009 | "# Serial de votação da MV\n",
1010 | "# Verificação de assinatura de dado por etapa\n",
1011 | "\n",
1012 | "query = f\"\"\"\n",
1013 | " SELECT DISTINCT\n",
1014 | " CASE\n",
1015 | " WHEN event_description ILIKE 'Identificador da mídia de carga%' \n",
1016 | " THEN 'Identificador da mídia de carga'\n",
1017 | "\n",
1018 | " WHEN event_description ILIKE 'Serial da MI copiada da MV da urna original%' \n",
1019 | " THEN 'Serial da MI copiada da MV da urna original'\n",
1020 | "\n",
1021 | " WHEN event_description ILIKE 'Serial de votação da MV%' \n",
1022 | " THEN 'Serial de votação da MV'\n",
1023 | "\n",
1024 | " WHEN event_description ILIKE 'Verificação de assinatura de dado por etapa%' \n",
1025 | " THEN 'Verificação de assinatura de dado por etapa'\n",
1026 | "\n",
1027 | " WHEN event_description ILIKE 'Número de série da MR%'\n",
1028 | " THEN 'Número de série da MR'\n",
1029 | " \n",
1030 | " ELSE regexp_replace(event_description, '[0-9]', 'X', 'g') \n",
1031 | " END AS event_description\n",
1032 | " FROM '{DATABASE}'\n",
1033 | "\"\"\"\n",
1034 | "\n",
1035 | "df_result, duration = execute_query_and_calculate_time(cursor, query, return_df=True)"
1036 | ]
1037 | },
1038 | {
1039 | "cell_type": "code",
1040 | "execution_count": 23,
1041 | "metadata": {},
1042 | "outputs": [
1043 | {
1044 | "name": "stdout",
1045 | "output_type": "stream",
1046 | "text": [
1047 | "Time: 478.24s\n",
1048 | "Number of rows: 1,391\n"
1049 | ]
1050 | }
1051 | ],
1052 | "source": [
1053 | "print(f\"Time: {duration:.2f}s\")\n",
1054 | "print(f\"Number of rows: {df_result.shape[0]:,}\")\n",
1055 | "df_result.to_csv('event_description.csv', index=False)"
1056 | ]
1057 | },
1058 | {
1059 | "cell_type": "code",
1060 | "execution_count": null,
1061 | "metadata": {},
1062 | "outputs": [],
1063 | "source": []
1064 | }
1065 | ],
1066 | "metadata": {
1067 | "kernelspec": {
1068 | "display_name": "Python 3 (ipykernel)",
1069 | "language": "python",
1070 | "name": "python3"
1071 | },
1072 | "language_info": {
1073 | "codemirror_mode": {
1074 | "name": "ipython",
1075 | "version": 3
1076 | },
1077 | "file_extension": ".py",
1078 | "mimetype": "text/x-python",
1079 | "name": "python",
1080 | "nbconvert_exporter": "python",
1081 | "pygments_lexer": "ipython3",
1082 | "version": "3.11.5"
1083 | }
1084 | },
1085 | "nbformat": 4,
1086 | "nbformat_minor": 2
1087 | }
1088 |
--------------------------------------------------------------------------------