├── data └── logs │ ├── .gitkeep │ ├── .gitignore │ └── calculate_lines.sh ├── src ├── .gitignore ├── tests │ ├── test-create-table.py │ └── test-convert-to-parquet.py ├── convert_state_csv_data_to_parquet.py ├── count_number_of_lines.py ├── 01_extrair_eventos_relacionados_a_votos.ipynb ├── 03B_calcular_metricas_temporais.ipynb ├── 02_isolar_timestamps_eventos.ipynb ├── 03A_calcular_metricas_tempo.ipynb └── test-basic-queries.ipynb ├── duckdb ├── requirements.txt └── Dockerfile ├── streamlit ├── .gitignore ├── app │ ├── requirements.txt │ ├── maps │ │ └── donwload_files.txt │ ├── maps.py │ ├── data.py │ ├── main.py │ └── widgets.py └── Dockerfile ├── docker-compose.yaml ├── convert_encoding_from_files.py ├── extract_log_files.py ├── download_log_urnas.py └── README.md /data/logs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.parquet 2 | *.csv 3 | .tmp -------------------------------------------------------------------------------- /data/logs/.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | *.logjez 3 | *.csv -------------------------------------------------------------------------------- /duckdb/requirements.txt: -------------------------------------------------------------------------------- 1 | duckdb 2 | pandas==2.2.1 -------------------------------------------------------------------------------- /streamlit/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.svg 3 | *.zip -------------------------------------------------------------------------------- /streamlit/app/requirements.txt: -------------------------------------------------------------------------------- 1 | altair 2 | pandas 3 | duckdb 4 | streamlit 5 | matplotlib 6 | geopandas 7 | seaborn -------------------------------------------------------------------------------- /duckdb/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/minimal-notebook 2 | 3 | COPY requirements.txt /app/requirements.txt 4 | WORKDIR /app 5 | 6 | RUN pip install -r requirements.txt --upgrade 7 | WORKDIR /src 8 | 9 | # EXPOSE JUPYTER PORT 10 | EXPOSE 8888 11 | -------------------------------------------------------------------------------- /streamlit/app/maps/donwload_files.txt: -------------------------------------------------------------------------------- 1 | https://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2022/Brasil/BR/BR_UF_2022.zip 2 | https://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2022/Brasil/BR/BR_Municipios_2022.zip 3 | -------------------------------------------------------------------------------- /streamlit/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /app 3 | COPY ./app/requirements.txt /app/requirements.txt 4 | 5 | RUN pip3 install -r requirements.txt 6 | 7 | EXPOSE 8500 8 | 9 | HEALTHCHECK CMD curl --fail http://localhost:8500/_stcore/health 10 | 11 | ENTRYPOINT ["streamlit", "run", "main.py", "--server.port=8500", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- 
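streamlit/app/maps/donwload_files.txt above only lists the IBGE shapefile URLs; the repository does not ship a script that fetches them, and streamlit/app/maps.py later reads the zips from ./maps/. A minimal, hypothetical sketch of that download step (the destination directory and the use of plain urllib are assumptions, not part of the repository) could be:

```python
# Hypothetical helper (not part of the repository): download the IBGE
# shapefiles listed in streamlit/app/maps/donwload_files.txt into
# streamlit/app/maps/, where maps.py expects to find them.
import os
import urllib.request

MAPS_DIR = "streamlit/app/maps"

with open(os.path.join(MAPS_DIR, "donwload_files.txt")) as url_list:
    for url in url_list:
        url = url.strip()
        if not url:
            continue
        # Keep the original file name (e.g. BR_UF_2022.zip) next to the URL list
        destination = os.path.join(MAPS_DIR, url.rsplit("/", 1)[-1])
        print(f"Downloading {url} -> {destination}")
        urllib.request.urlretrieve(url, destination)
```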
/src/tests/test-create-table.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | 3 | cursor = duckdb.connect('test.db') 4 | print(cursor.execute( 5 | """ 6 | CREATE OR REPLACE TABLE test_zz AS 7 | SELECT 8 | * 9 | FROM 10 | read_csv('/data/logs/2_ZZ/*_new.csv', filename=True) 11 | """ 12 | )) 13 | 14 | # Select the data from the table LIMIT 10 -------------------------------------------------------------------------------- /src/tests/test-convert-to-parquet.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import time 3 | cursor = duckdb.connect('test.db') 4 | 5 | tic = time.time() 6 | cursor.execute( 7 | """ 8 | COPY ( 9 | SELECT 10 | * 11 | FROM read_csv('/data/logs/2_ZZ/*_new.csv', filename=True) 12 | ) TO 'test_zz.parquet' (FORMAT 'parquet'); 13 | """ 14 | ) 15 | toc = time.time() 16 | print(f"Time taken: {toc - tic} seconds") -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | duckdb: 4 | build: ./duckdb 5 | volumes: 6 | - ./data:/data 7 | - ./src:/src 8 | ports: 9 | - "8888:8888" 10 | command: start-notebook.sh --NotebookApp.token='1234' 11 | streamlit: 12 | build: ./streamlit 13 | volumes: 14 | - ./src:/src 15 | - ./streamlit/app/:/app 16 | ports: 17 | - "8600:8500" 18 | command: streamlit run /src/app.py 19 | -------------------------------------------------------------------------------- /data/logs/calculate_lines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Array of Brazilian states 4 | states=("AC" "AL" "AP" "AM" "BA" "CE" "DF" "ES" "GO" "MA" "MT" "MS" "MG" "PA" "PB" "PR" "PE" "PI" "RJ" "RN" "RS" "RO" "RR" "SC" "SP" "SE" "TO") 5 | 6 | # Iterate over each state 7 | for state in "${states[@]}" 8 | do 9 | # Concatenate "2_" in front of the state 10 | state_with_prefix="2_$state" 11 | echo "Calculating total lines for $state_with_prefix" 12 | find "./$state_with_prefix" -type f -exec wc -l {} + | awk -v st="$state_with_prefix" '{total += $1} END {print "Total lines in " st ":", total}' 13 | done 14 | -------------------------------------------------------------------------------- /src/convert_state_csv_data_to_parquet.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import time 3 | import sys 4 | 5 | 6 | STATES = [ 7 | "AC", "AL", "AM", "AP", "BA", "CE", 8 | "DF", "ES", "GO", "MA", "MG", "MS", 9 | "MT", "PA", "PB", "PE", "PI", "PR", 10 | "RJ", "RN", "RO", "RR", "RS", "SC", 11 | "SE", "SP", "TO", "ZZ"] 12 | 13 | if __name__ == "__main__": 14 | # get the first sys arg 15 | uf = sys.argv[1] 16 | 17 | # if sys arg not in the brazilian states 18 | if uf not in STATES: 19 | print("Invalid state") 20 | sys.exit(1) 21 | 22 | tic = time.time() 23 | cursor = duckdb.connect("") 24 | query = f""" 25 | COPY ( 26 | SELECT 27 | * 28 | FROM read_csv('/data/logs/2_{uf}/*.csv', filename=True) 29 | ) TO '{uf}.parquet' (FORMAT 'parquet'); 30 | """ 31 | 32 | cursor.execute(query) 33 | toc = time.time() 34 | print(f"Time taken to convert {uf} to parquet: {toc - tic} seconds") 35 | -------------------------------------------------------------------------------- /src/count_number_of_lines.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | import time 3 | 
import sys 4 | 5 | 6 | STATES = [ 7 | "AC", "AL", "AM", "AP", "BA", "CE", 8 | "DF", "ES", "GO", "MA", "MG", "MS", 9 | "MT", "PA", "PB", "PE", "PI", "PR", 10 | "RJ", "RN", "RO", "RR", "RS", "SC", 11 | "SE", "SP", "TO", "ZZ", "ALL"] 12 | 13 | if __name__ == "__main__": 14 | # get the first sys arg 15 | uf = sys.argv[1] 16 | 17 | # if sys arg not in the brazilian states 18 | if uf not in STATES: 19 | print("Invalid state") 20 | sys.exit(1) 21 | 22 | tic = time.time() 23 | cursor = duckdb.connect("") 24 | 25 | if uf == "ALL": 26 | query = f""" 27 | SELECT 28 | COUNT(*) 29 | FROM '*.parquet' 30 | """ 31 | else: 32 | query = f""" 33 | SELECT 34 | COUNT(*) 35 | FROM '{uf}.parquet' 36 | """ 37 | 38 | cursor.execute(query) 39 | toc = time.time() 40 | print(f"Time taken to count number of lines in {uf}: {toc - tic} seconds") 41 | print(cursor.fetchall()) 42 | -------------------------------------------------------------------------------- /convert_encoding_from_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tqdm 3 | import time 4 | 5 | 6 | if __name__ == "__main__": 7 | 8 | BASE_LOGS_PATH = "./data/logs" 9 | # list all directories in the base path 10 | directories = os.listdir(BASE_LOGS_PATH) 11 | command = "touch {} && iconv -f ISO-8859-1 -t UTF-8//TRANSLIT {} > {} && rm {}" 12 | 13 | tic = time.time() 14 | for directory in directories: 15 | path = BASE_LOGS_PATH + "/" + directory 16 | if not os.path.isdir(path): 17 | continue 18 | 19 | files = os.listdir(path) 20 | files = [file for file in files if file.endswith(".csv") and not file.endswith("_new.csv")] 21 | print(f"Processing directory {directory} with {len(files)} files") 22 | 23 | for file in tqdm.tqdm(files): 24 | # convert the encoding of the file 25 | filename = file.split(".")[0] 26 | new_filename = filename + "_new.csv" 27 | 28 | path_old_file = path + "/" + file 29 | path_new_file = path + "/" + new_filename 30 | 31 | os.system(command.format(path_new_file, path_old_file, path_new_file, path_old_file)) 32 | toc = time.time() 33 | 34 | print(f"Conversion took {toc - tic} seconds") -------------------------------------------------------------------------------- /extract_log_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | BASE_PATH = './data/logs' 4 | 5 | 6 | def unzip_log_files(zip_file): 7 | 8 | # Each ZIP file contains MULTIPLE *.logjez files 9 | # Each A.logjez file contains a logd.dat file that is the LOG file 10 | # This code extract all A.logjez file and rename its logd.dat to A.csv 11 | 12 | # unzip file 13 | # extracting only the .logjez files 14 | filepath = zip_file[:-4] 15 | os.system(f'7z e {zip_file} -o{filepath} *.logjez -r') 16 | 17 | # Remove unnecessary files 18 | os.system(f'rm {zip_file}') # Zip file 19 | 20 | # list all files in the directory 21 | files = os.listdir(filepath) 22 | 23 | for file in files: 24 | # extract .logjez files 25 | # and rename to .csv 26 | if file.endswith('.logjez'): 27 | new_filename = file[:-7] 28 | os.system( 29 | f'7z e {filepath}/{file} -y -o{filepath}/{new_filename} \ 30 | > /dev/null' 31 | ) 32 | os.system( 33 | f'mv \ 34 | {filepath}/{new_filename}/logd.dat \ 35 | {filepath}/{new_filename}.csv' 36 | ) 37 | os.system( 38 | f'rm -r {filepath}/{new_filename}' 39 | ) 40 | 41 | os.system(f'chmod 777 -R {filepath}') 42 | os.system(f'rm {filepath}/*.logjez') 43 | 44 | 45 | if __name__ == "__main__": 46 | for file in os.listdir(BASE_PATH): 47 | if 
file.endswith('.zip'): 48 | unzip_log_files(os.path.join(BASE_PATH, file)) -------------------------------------------------------------------------------- /streamlit/app/maps.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | import matplotlib.pyplot as plt 3 | import re 4 | import streamlit as st 5 | 6 | @st.cache_data() 7 | def load_brazil_simplified_map(): 8 | """ 9 | Load the simplified map of Brazil. 10 | The simplification is done to reduce the file size 11 | and improve performance on the streamlit app. 12 | 13 | Returns: 14 | gpd.GeoDataFrame: GeoDataFrame with the simplified map of Brazil. 15 | """ 16 | 17 | map_ufs = './maps/BR_UF_2022.zip' 18 | 19 | gdf = gpd.read_file(map_ufs) 20 | gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.01) 21 | 22 | return gdf 23 | 24 | @st.cache_data() 25 | def load_ufs_city_simplified_map(): 26 | """ 27 | Load the simplified map of Brazil with cities. 28 | """ 29 | map_municipios = './maps/BR_Municipios_2022.zip' 30 | 31 | gdf = gpd.read_file(map_municipios) 32 | gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.01) 33 | 34 | return gdf 35 | 36 | def add_ufs_and_links_to_map(svg_image_buffer): 37 | """ 38 | Generate links for each UF in the SVG image buffer 39 | and make the map clickable. 40 | 41 | Args: 42 | svg_image_buffer (str): SVG image buffer. 43 | 44 | Returns: 45 | str: SVG image buffer with links for each UF. 46 | """ 47 | 48 | re_uf_map_pattern = r'(((.|\s)*?))' 49 | image_with_links = re.sub( 50 | re_uf_map_pattern, 51 | r"\1", 52 | svg_image_buffer 53 | ) 54 | return image_with_links 55 | -------------------------------------------------------------------------------- /download_log_urnas.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import queue 3 | import os 4 | import sys 5 | import logging 6 | from itertools import product 7 | 8 | BASE_URL = ( 9 | 'https://cdn.tse.jus.br/estatistica/sead/eleicoes/' + 10 | 'eleicoes2022/arqurnatot/bu_imgbu_logjez_rdv_vscmr_2022_{}t_{}.zip' 11 | ) 12 | 13 | UFS_BR = [ 14 | 'AC', 'AL', 'AP', 'AM', 15 | 'BA', 'CE', 'DF', 'ES', 16 | 'GO', 'MA', 'MT', 'MS', 17 | 'MG', 'PA', 'PB', 'PR', 18 | 'PE', 'PI', 'RJ', 'RN', 19 | 'RS', 'RO', 'RR', 'SC', 20 | 'SP', 'SE', 'TO', 'ZZ' 21 | ] 22 | # TURNOS = [1, 2] 23 | TURNOS = [2] 24 | 25 | 26 | NUM_THREADS = 4 27 | 28 | # Create a queue to communicate with the worker threads 29 | turnos_uf_queue = queue.Queue() 30 | 31 | # Configure logging 32 | logging.basicConfig( 33 | level=logging.INFO, 34 | format='[%(asctime)s] %(message)s', 35 | datefmt='%d/%m/%y %H:%M:%S' 36 | ) 37 | 38 | 39 | def download_file(): 40 | uf_turno = turnos_uf_queue.get() 41 | url = BASE_URL.format(*uf_turno) 42 | path = os.path.join('data', 'logs', f'{uf_turno[0]}_{uf_turno[1]}.zip') 43 | 44 | logging.info(f'Downloading {url} to {path}') 45 | 46 | logging.info(f'Iniciando download de {url}') 47 | try: 48 | os.system(f'wget -O {path} {url}') 49 | except Exception as e: 50 | logging.error(f"Erro ao tentar baixar o arquivo {url}") 51 | logging.error(e) 52 | return 53 | 54 | logging.info(f'Finalizado download de {url}') 55 | 56 | if turnos_uf_queue.empty(): 57 | logging.info('All downloads finished') 58 | else: 59 | logging.info(f'{turnos_uf_queue.qsize()} downloads remaining') 60 | download_file() 61 | 62 | turnos_uf_queue.task_done() 63 | return 64 | 65 | 66 | if __name__ == "__main__": 67 | ufs_br_download = UFS_BR 68 | if len(sys.argv) > 1: 69 |
ufs_br_download = sys.argv[1:] 70 | 71 | logging.info(f'Iniciando download de {len(ufs_br_download)} arquivos') 72 | logging.info(f'UFs: {ufs_br_download}') 73 | logging.info(f'Turnos: {TURNOS}') 74 | 75 | for uf_br, turno in product(ufs_br_download, TURNOS): 76 | turnos_uf_queue.put((turno, uf_br)) 77 | 78 | for i in range(NUM_THREADS): 79 | worker = threading.Thread( 80 | target=download_file, 81 | daemon=True 82 | ) 83 | worker.start() 84 | 85 | turnos_uf_queue.join() 86 | logging.info("Done") 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Processing Logs of Electronic Ballot Boxes 3 | This repository contains Python + DuckDB scripts for processing logs from [Brazilian Electronic Ballot Boxes](https://international.tse.jus.br/en/electronic-ballot-box/presentation?set_language=en) to compute several time-related metrics (mean vote time, number of votes computed in 5-minute intervals, and biometric identification success rate). 4 | 5 | ## The Data 6 | The logs from the voting machines can be downloaded directly from the [TSE open data website](https://dadosabertos.tse.jus.br/dataset/resultados-2022-arquivos-transmitidos-para-totalizacao). This repository contains Python scripts that automatically download and extract the logs. 7 | 8 | ## What are the logs of the Electronic Ballot Boxes? 9 | Files that contain all operations performed on the machine, from the initial setup to the end of voting in the second round (if applicable). The files are stored in plain text, with each line representing an event. See an example below: 10 | 11 | ``` 12 | 21/09/2022 17:21:41 INFO 67305985 LOGD Start of logd operations FDE9B0FC7A079096 13 | 21/09/2022 17:21:41 INFO 67305985 LOGD Machine turned on on 21/09/2022 at 17:20:16 B637C17E565B039B 14 | 21/09/2022 17:21:41 INFO 67305985 SCUE Starting application - Official - 1st round F82E007ACCAF93A5 15 | 21/09/2022 17:21:41 INFO 67305985 SCUE Application version: 8.26.0.0 - Jaguar D499E9A173814A70 16 | ``` 17 | With these logs, it is possible to extract numerous pieces of information about the electoral process. Due to their verbosity, the logs of the Ballot Boxes are very large. In their original format, the set of log files for a single Brazilian state can range from 2GB to over 50GB, with all the files combined reaching 450GB! Therefore, robust processing tools and optimized file formats are indispensable. 18 | 19 | ## Note on Approximations and Errors 20 | Processing the logs of the voting machines is not a simple task. 21 | Although they are easy to read, defining a process that perfectly isolates each vote is complex because numerous situations can occur during the voting process. 22 | 23 | The scripts coded here attempt to be as generic and simple as possible, to facilitate understanding and maintenance and to reduce the computational cost of processing. Therefore, they may occasionally not capture ALL votes perfectly. The error rate (uncaptured votes) relative to the official TSE count is ~3% (experiment conducted with RN data).
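## A quick look at the converted logs
The exact reading options used by the pipeline live in the scripts and notebooks under `src/`. As a rough, hedged illustration, a directory of converted log files can be previewed with DuckDB as below; the tab delimiter, the absence of a header row, and the column names are assumptions borrowed from the notebooks, not guarantees about the file layout.

```python
# Hypothetical sketch (not one of the repository scripts): preview a few
# converted log events with DuckDB. Delimiter, header handling and column
# names below are assumptions.
import duckdb

cursor = duckdb.connect()
preview = cursor.execute(
    """
    SELECT *
    FROM read_csv(
        '/data/logs/2_ZZ/*_new.csv',
        delim='\t',  -- Python expands '\t' to a literal tab before DuckDB sees it
        header=False,
        filename=True,
        columns={
            'event_timestamp': 'VARCHAR',
            'event_type': 'VARCHAR',
            'some_id': 'VARCHAR',
            'event_system': 'VARCHAR',
            'event_description': 'VARCHAR',
            'event_id': 'VARCHAR'
        }
    )
    LIMIT 10
    """
).df()
print(preview)
```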
24 | -------------------------------------------------------------------------------- /streamlit/app/data.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | 3 | ZONE_GROUPS = [ f'{x}-{x+20}' for x in range(0, 800, 20) ] 4 | UFS = [ 5 | "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS", 6 | "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RS", "RO", "RR", "SC", 7 | "SP", "SE", "TO", "ZZ", "ALL" 8 | ] 9 | 10 | class DuckDBConnector: 11 | # Connect to the database 12 | # singleton pattern 13 | _instance = None 14 | def __init__(self) -> None: 15 | self.connect() 16 | 17 | @staticmethod 18 | def get_instance(): 19 | if DuckDBConnector._instance is None: 20 | DuckDBConnector._instance = DuckDBConnector() 21 | return DuckDBConnector._instance 22 | 23 | 24 | def connect(self): 25 | self.cursor = duckdb.connect() 26 | 27 | 28 | def calculate_zone_group(self, zone): 29 | if zone == 'ALL': 30 | return zone 31 | 32 | zone = int(zone) 33 | ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ] 34 | for group in ZONE_GROUPS: 35 | if zone >= group[0] and zone < group[1]: 36 | return f"{group[0]}-{group[1]}" 37 | 38 | 39 | def get_vote_time_metrics(self, uf, turno, zone, section): 40 | table = """ 41 | read_parquet( 42 | '/src/VOTES_TIME_METRICS.parquet/*/*/*/*.parquet', 43 | hive_partitioning=True, 44 | hive_types_autocast=0 45 | ) 46 | """ 47 | zone_group = self.calculate_zone_group(zone) 48 | zone = F"{int(zone):04d}" if zone != 'ALL' else zone 49 | section = F"{int(section):04d}" if section != 'ALL' else section 50 | 51 | zone_filter = f"AND zone_code = '{zone}' AND zone_group = '{zone_group}'" 52 | if uf == 'ALL': 53 | uf = "','".join(UFS) 54 | elif zone == 'ALL': 55 | zone = "','".join(ZONE_GROUPS) 56 | zone_filter = f"AND zone_group in ('{zone}', 'ALL')" 57 | 58 | query = f""" 59 | SELECT * 60 | FROM {table} 61 | WHERE 1=1 62 | AND turno = '{turno}' 63 | AND uf in ('{uf}') 64 | {zone_filter} 65 | AND section_code = '{section}' 66 | """ 67 | 68 | data = self.cursor.execute(query).df() 69 | return data 70 | 71 | 72 | def get_metrics_over_time(self, uf, turno, zone, section): 73 | table = """ 74 | read_parquet( 75 | '/src/VOTES_TIME_CUMULATIVE_METRICS_OVER_TIME.parquet/*/*/*/*.parquet', 76 | hive_partitioning=True, 77 | hive_types_autocast=0 78 | ) 79 | """ 80 | zone_group = self.calculate_zone_group(zone) 81 | zone = F"{int(zone):04d}" if zone != 'ALL' else zone 82 | section = F"{int(section):04d}" if section != 'ALL' else section 83 | 84 | fix_zone_code = """ 85 | CASE WHEN zone_code IS NULL THEN 'ALL' 86 | ELSE zone_code 87 | END 88 | """ 89 | 90 | query = f""" 91 | SELECT * 92 | FROM {table} 93 | WHERE 1=1 94 | AND turno = '{turno}' 95 | AND uf in ('{uf}') 96 | AND zone_group = '{zone_group}' 97 | AND {fix_zone_code} = '{zone}' 98 | AND section_code = '{section}' 99 | AND timestamp_voto_computado_5min != 'ALL' 100 | """ 101 | 102 | data = self.cursor.execute(query).df() 103 | return data -------------------------------------------------------------------------------- /streamlit/app/main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | from widgets import ( 4 | widget_bignumber_votos, widget_bignumber_secoes, 5 | widget_big_number_tempo_medio, widget_big_number_tempo_medio_bio, 6 | widget_big_number_tempo_total_voto, 7 | widget_tempo_medio_voto, widget_qtd_votos_intervalo_tempo, 8 | widget_numero_votos_intervalo_5min 9 | ) 10 | 11 | UFS = [ 12 | "AC", 
"AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS", 13 | "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RS", "RO", "RR", "SC", 14 | "SP", "SE", "TO", "ZZ", "ALL" 15 | ] 16 | TURNOS = ['1', '2'] 17 | 18 | def get_parameters_from_http_query_params(): 19 | query_parameters = st.query_params 20 | select_parameters = lambda x, default, accepted: ( 21 | default 22 | if x not in query_parameters 23 | else query_parameters[x] if query_parameters[x] in accepted 24 | else default 25 | ) 26 | nr_zonas_secoes = [str(x) for x in range(0, 800)] 27 | 28 | uf = select_parameters('uf', 'ALL', UFS ) 29 | turno = select_parameters('turno', '1', TURNOS ) 30 | zona = select_parameters('zona', 'ALL', nr_zonas_secoes) 31 | secao = select_parameters('secao', 'ALL', nr_zonas_secoes) 32 | 33 | return uf, turno, zona, secao 34 | 35 | if __name__ == "__main__": 36 | st.set_page_config(layout="wide") 37 | 38 | uf, turno, zona, secao = get_parameters_from_http_query_params() 39 | 40 | st.title(f'Eleições em Números - Tempo de Votação') 41 | subtitulo = '' 42 | subtitulo = subtitulo + f' - {uf}' if uf != 'ALL' else subtitulo + " - Brasil" 43 | subtitulo = subtitulo + f' - Zona {zona}' if zona != 'ALL' else subtitulo 44 | subtitulo = subtitulo + f', Seção {secao}' if secao != 'ALL' else subtitulo 45 | 46 | col_subtitle, col_change_turn = st.columns([4, 1]) 47 | # col_subtitle.markdown( subtitulo ) 48 | # add button to change the turn 49 | 50 | outro_turno = '1' if turno == '2' else '2' 51 | query_parameters = f"?turno={outro_turno}&uf={uf}&zona={zona}&secao={secao}" 52 | st.components.v1.html( 53 | f""" 54 |
55 | 56 | 69 | 70 |

72 | {subtitulo} 73 |

74 |
75 | """, 76 | height=70 77 | ) 78 | 79 | # ============================ 80 | # Big Number Widgets 81 | # ============================ 82 | 83 | col_bignumber_votos, col_bignumber_secoes, col_bignumber_tmedio, col_bignumber_tmedio_bio, col_bignumber_tempo_total = st.columns(5) 84 | widget_bignumber_votos(col_bignumber_votos, turno, uf, zona, secao) 85 | widget_bignumber_secoes(col_bignumber_secoes, turno, uf, zona, secao) 86 | widget_big_number_tempo_medio(col_bignumber_tmedio, turno, uf, zona, secao) 87 | widget_big_number_tempo_medio_bio(col_bignumber_tmedio_bio, turno, uf, zona, secao) 88 | widget_big_number_tempo_total_voto(col_bignumber_tempo_total, turno, uf, zona, secao) 89 | st.divider() 90 | 91 | # ================================= 92 | # Heatmap and Histogram Widgets 93 | # ================================= 94 | col_map, col_histogram, col_temporal_series = st.columns( [.3, .2, .5] ) 95 | widget_tempo_medio_voto(col_map, turno, uf, zona, secao) 96 | widget_qtd_votos_intervalo_tempo(col_histogram, turno, uf, zona, secao) 97 | widget_numero_votos_intervalo_5min(col_temporal_series, turno, uf, zona, secao) 98 | 99 | st.divider() 100 | 101 | # ================================= 102 | # Foot note. Author: João Pedro. Data gathered from TSE Open Data Portal. All code available at github. 103 | # ================================= 104 | 105 | st.text('Author: João Pedro. Dados coletados do Portal de Dados Abertos do TSE. All code available at Github.') 106 | st.text('O projeto é complexo. Os resultados podem não ser 100% precisos.') 107 | -------------------------------------------------------------------------------- /src/01_extrair_eventos_relacionados_a_votos.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Extrair apenas eventos relacionados a votos e metadados" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Como o Log das urnas compila todo o histórico de eventos que ocorreu em uma urna, incluindo testes, carga, preparação, etc, para garantir uma consulta mais eficiente, são extraídos apenas os eventos relacionados aos votos em si.\n", 15 | "\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Importing libraries" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import duckdb\n", 32 | "import pandas as pd\n", 33 | "import time" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Importing Data" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "DATASET = 'ALL_UFS.parquet'" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "cursor = duckdb.connect()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Preparing Data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Definindo os filtros" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Colunas relacionadas a metadados da Seção Eleitoral" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 |
"source": [ 88 | "METADATA = [\n", 89 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Zona Eleitoral%'\",\n", 90 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Seção Eleitoral%'\",\n", 91 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Município%'\",\n", 92 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Local de Votação%'\",\n", 93 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Turno da UE%'\",\n", 94 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Identificação do Modelo de Urna%'\"\n", 95 | "]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "COLUMN_EVENT_DESCRIPTION = 'event_description'\n", 105 | "\n", 106 | "EVENTS_DESCRIPTIONS = [\n", 107 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Urna pronta para receber vot%'\",\n", 108 | "]\n", 109 | "\n", 110 | "VOTES_DESCRIPTIONS = [\n", 111 | " # VOTOS\n", 112 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Aguardando digitação do título'\",\n", 113 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Título digitado pelo mesário'\",\n", 114 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Eleitor foi habilitado'\",\n", 115 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Voto confirmado par%'\",\n", 116 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'O voto do eleitor foi computado'\",\n", 117 | " \n", 118 | " # BIOMETRIA\n", 119 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE '%Digital%' \",\n", 120 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Dedo reconhecido%' \",\n", 121 | " F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Solicita digital%' \",\n", 122 | " F\"{COLUMN_EVENT_DESCRIPTION} = 'Solicitação de dado pessoal do eleitor para habilitação manual' \",\n", 123 | "]\n", 124 | "\n", 125 | "ACCEPTED_DATES = [\n", 126 | " '2022-10-02', '2022-10-30', # Data constitucional da eleição\n", 127 | " '2022-10-03', '2022-10-31', # No caso da seção 'virar a noite' e acabar depois da meia noite, imagino que sejam casos RARÍSSIMOS\n", 128 | "]\n", 129 | "\n", 130 | "ALL_FILTERS = METADATA + EVENTS_DESCRIPTIONS + VOTES_DESCRIPTIONS" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### Construindo e Executando a query" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "**Notas:** \n", 145 | "\n", 146 | "**1. Extração de metadados a partir do nome dos arquivos.**\n", 147 | " \n", 148 | "Cada arquivo TSV possui informações de uma Seção Eleitoral (que é a mesma coisa de uma Urna), e o nome do arquivo é a concatenação dos metadados da Seção Eleitoral:\n", 149 | "\n", 150 | " - Os 5 Primeiros Dígitos são o código do Município\n", 151 | " - Os 4 Dígitos seguintes são o código da Zona Eleitoral\n", 152 | " - Os 4 Dígitos seguintes são o código da Seção Eleitoral\n", 153 | "\n", 154 | "**2. 
Data da Eleição**\n", 155 | "\n", 156 | "A Data em que os eventos aconteceram é uma ótima forma de aproximar ainda mais os eventos que têm haver com a votação, uma vez que a votação no Brasil acontece em um único dia - aprende aí EUA ;)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 5, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "query = F\"\"\"\n", 166 | " SELECT \n", 167 | " *\n", 168 | " FROM (\n", 169 | " SELECT\n", 170 | " event_timestamp,\n", 171 | " event_timestamp::date AS event_date,\n", 172 | " event_type,\n", 173 | " some_id,\n", 174 | " event_system,\n", 175 | " event_description,\n", 176 | " event_id,\n", 177 | " \n", 178 | " REPLACE(SPLIT_PART(filename, '/', 5), '_new.csv', '') AS filename,\n", 179 | " \n", 180 | " -- Metadata from filename\n", 181 | " SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2), 1, 5 ) AS city_code,\n", 182 | " SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2), 6, 4 ) AS zone_code,\n", 183 | " SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2), 10, 4 ) AS section_code,\n", 184 | " REPLACE(SPLIT_PART(filename, '/', 4), '2_', '') AS uf\n", 185 | " FROM\n", 186 | " {DATASET}\n", 187 | " WHERE 1=1\n", 188 | " AND ( {' OR '.join(ALL_FILTERS)} )\n", 189 | " ) _\n", 190 | " WHERE 1=1\n", 191 | " AND event_date IN ({', '.join([F\"'{date}'\" for date in ACCEPTED_DATES])})\n", 192 | "\"\"\"" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "Para facilitar consultas, os arquivos parquet são particionados por DATA DO EVENTO e UF." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 6, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "query = F\"\"\"\n", 209 | " COPY ({query}) TO 'UFS_VOTE_EVENTS.parquet' (FORMAT 'parquet', PARTITION_BY (event_date, uf), OVERWRITE_OR_IGNORE 1);\n", 210 | "\"\"\"" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 7, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "application/vnd.jupyter.widget-view+json": { 221 | "model_id": "ca3b1617f6524b85b061c9579b6cc506", 222 | "version_major": 2, 223 | "version_minor": 0 224 | }, 225 | "text/plain": [ 226 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 227 | ] 228 | }, 229 | "metadata": {}, 230 | "output_type": "display_data" 231 | }, 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "Time 1027.0729978084564s\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "tic = time.time()\n", 242 | "cursor.execute(query)\n", 243 | "toc = time.time()\n", 244 | "\n", 245 | "print(F\"Time {toc - tic}s\")" 246 | ] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "base", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.11.5" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 2 270 | } 271 | -------------------------------------------------------------------------------- /src/03B_calcular_metricas_temporais.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"metadata": {}, 6 | "source": [ 7 | "## Calcular Métricas - Tempo de Votação, Biometria, etc." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Importing libraries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 7, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import duckdb\n", 31 | "import pandas as pd\n", 32 | "import time" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Importing Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 8, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "cursor = duckdb.connect()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Dados Brutos" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "TABLE = \"read_parquet('VOTES.parquet/*/*/*/*.parquet', hive_partitioning=True)\"\n", 65 | "ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Adicionar TURNO e Timestamp final de Biometria" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "source = F\"\"\"\n", 82 | "(\n", 83 | " SELECT \n", 84 | " *,\n", 85 | " \n", 86 | " CASE event_date\n", 87 | " WHEN '2022-10-02' THEN 1\n", 88 | " WHEN '2022-10-03' THEN 1\n", 89 | " WHEN '2022-10-30' THEN 2\n", 90 | " WHEN '2022-10-31' THEN 2\n", 91 | " ELSE NULL\n", 92 | " END::INT AS turno,\n", 93 | "\n", 94 | " COALESCE(\n", 95 | " timestamp_biometria_1,\n", 96 | " timestamp_biometria_2,\n", 97 | " timestamp_biometria_3,\n", 98 | " timestamp_biometria_4,\n", 99 | " timestamp_biometria_manual\n", 100 | " ) AS timestamp_biometria_final,\n", 101 | "\n", 102 | " strftime( '%Y-%m-%d %H:', timestamp_voto_computado )\n", 103 | " || (EXTRACT(MINUTE FROM timestamp_voto_computado)//5)*5 + 5\n", 104 | " || ':00' AS timestamp_voto_computado_5min\n", 105 | " \n", 106 | " FROM \n", 107 | " {TABLE}\n", 108 | " -- WHERE uf='DF'\n", 109 | ") _\n", 110 | "\"\"\"" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Preparinga Data" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Méticas no Cubo OLAP - Turno, UF, Zona, Seção.\n", 125 | "\n", 126 | "- Número de Votos\n", 127 | "- Número de Seções Eleitorais\n", 128 | "- Média, Soma, q50%, q90% do Tempo total de Voto, Tempo de Biometria, Tempo Total\n", 129 | "\n", 130 | "- Quantidade de Votos efetuados em até 30s, 1min, 1min30s, 2min, 2min30s, 3min+\n", 131 | "- Taxa de Sucesso da Biometria em 1 tentativa, 2 tentativas, 3 tentativas, 4 tentativas, Falha\n", 132 | "- Quantidade de Teclas Pressionadas\n", 133 | "- Quantidade de Cargos Distintos Votados" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "**Definição das métricas de tempo**" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 5, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "tempo_voto_total = \"EXTRACT(EPOCH FROM (timestamp_voto_computado - timestamp_titulo_digitado))\"\n", 150 | "tempo_voto = \"EXTRACT(EPOCH FROM 
(timestamp_voto_computado - timestamp_habilitacao_eleitor))\"\n", 151 | "tempo_biometria = \"EXTRACT(EPOCH FROM (timestamp_biometria_final - timestamp_titulo_digitado))\"\n", 152 | "\n", 153 | "fix_null_values = lambda column: F\"COALESCE({column}::VARCHAR(20), 'ALL')\"" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "query_metrics = F\"\"\"\n", 163 | " SELECT\n", 164 | " {fix_null_values('turno') } AS turno,\n", 165 | " {fix_null_values('timestamp_voto_computado_5min') } AS timestamp_voto_computado_5min,\n", 166 | " {fix_null_values('uf') } AS uf,\n", 167 | " zone_code,\n", 168 | " {fix_null_values('section_code') } AS section_code,\n", 169 | "\n", 170 | " COUNT(*) AS total_votos,\n", 171 | " SUM( {tempo_voto} ) AS tempo_voto_soma,\n", 172 | " SUM( {tempo_biometria} ) AS tempo_biometria_soma,\n", 173 | " SUM( {tempo_voto_total} ) AS tempo_voto_total_soma,\n", 174 | " \n", 175 | " FROM\n", 176 | " {source}\n", 177 | " WHERE quantidade_votos_computados = 1\n", 178 | " GROUP BY ROLLUP(turno, timestamp_voto_computado_5min, uf, zone_code, section_code)\n", 179 | "\"\"\"" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "**Salvar resultado intermediário**" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 7, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "query = F\"\"\"\n", 196 | " COPY (\n", 197 | " {\n", 198 | " query_metrics\n", 199 | " } )\n", 200 | " TO 'VOTES_TIME_METRICS_OVER_TIME.parquet' \n", 201 | " (FORMAT 'parquet', PARTITION_BY (turno, uf), OVERWRITE_OR_IGNORE 1);\n", 202 | "\"\"\"" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 8, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "application/vnd.jupyter.widget-view+json": { 213 | "model_id": "a86d242b27054c0683c1dca6f79697d6", 214 | "version_major": 2, 215 | "version_minor": 0 216 | }, 217 | "text/plain": [ 218 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 219 | ] 220 | }, 221 | "metadata": {}, 222 | "output_type": "display_data" 223 | } 224 | ], 225 | "source": [ 226 | "cursor.execute(query)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Cálculo cumulativo das métricas" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "TABLE_METRICS = \"\"\"read_parquet(\n", 243 | " 'VOTES_TIME_METRICS_OVER_TIME.parquet/*/*/*.parquet', \n", 244 | " hive_partitioning=True,\n", 245 | " hive_types_autocast =0\n", 246 | ")\n", 247 | "\"\"\"" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 12, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "query_cumulative_metrics = F\"\"\"\n", 257 | " SELECT\n", 258 | " *,\n", 259 | " SUM(total_votos) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS total_votos_cumulativo,\n", 260 | " SUM(tempo_voto_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_voto_soma_cumulativo,\n", 261 | " SUM(tempo_biometria_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_biometria_soma_cumulativo,\n", 262 | " SUM(tempo_voto_total_soma) OVER (PARTITION BY turno, uf, 
zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_voto_total_soma_cumulativo\n", 263 | " FROM\n", 264 | " {TABLE_METRICS}\n", 265 | "\"\"\"" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL para agilizar a leitura dos dados pelo Dashboard.\n", 273 | "\n", 274 | "As ZONAS foram agrupadas em grupos de 20, esse número é empírico." 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 13, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "query_metrics_with_zone_group = F\"\"\"\n", 284 | " SELECT\n", 285 | " *,\n", 286 | " CASE\n", 287 | " {\n", 288 | " \"\".join(\n", 289 | " [\n", 290 | " f\"WHEN zone_code IS NOT NULL AND zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n", 291 | " for min_zone, max_zone in ZONE_GROUPS\n", 292 | " ]\n", 293 | " )\n", 294 | " }\n", 295 | " ELSE 'ALL'\n", 296 | " END AS zone_group\n", 297 | " FROM (\n", 298 | " {query_cumulative_metrics}\n", 299 | " ) _\n", 300 | "\"\"\"" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 14, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "query = F\"\"\"\n", 310 | " COPY (\n", 311 | " {\n", 312 | " query_metrics_with_zone_group\n", 313 | " } )\n", 314 | " TO 'VOTES_TIME_CUMULATIVE_METRICS_OVER_TIME.parquet' \n", 315 | " (FORMAT 'parquet', PARTITION_BY (turno, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n", 316 | "\"\"\"" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 15, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "application/vnd.jupyter.widget-view+json": { 327 | "model_id": "b9ea3c1f88764a56a54bce624acaf93e", 328 | "version_major": 2, 329 | "version_minor": 0 330 | }, 331 | "text/plain": [ 332 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 333 | ] 334 | }, 335 | "metadata": {}, 336 | "output_type": "display_data" 337 | }, 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "" 342 | ] 343 | }, 344 | "execution_count": 15, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "cursor.execute(query)" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "base", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.11.5" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /src/02_isolar_timestamps_eventos.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calcular o Tempo de Voto" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Este notebook tem como objetivo calcular o tempo de voto de um eleitor. 
\n", 15 | "O primeiro passo é definir exatamente o que é um voto, dado que o log das urnas contém apenas uma sequência de eventos.\n", 16 | "\n", 17 | "Na sequência, os votos são individualizados (um por linha) e o tempo de cada evento relevante é calculado." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Importing libraries" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 28, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import duckdb\n", 34 | "import pandas as pd\n", 35 | "import time" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Importing Data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 29, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "cursor = duckdb.connect()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 31, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "TABLE = \"read_parquet('UFS_VOTE_EVENTS.parquet/*/*/*.parquet', hive_partitioning=True)\"" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 32, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "source_data = f\"\"\"\n", 70 | " (\n", 71 | " SELECT\n", 72 | " *\n", 73 | " FROM {TABLE}\n", 74 | " ) AS source\n", 75 | "\"\"\"" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Preparing Data" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Criando um ID único para cada voto" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "Como heurística, vamos criar um id único para cada voto, que será determinado a partir de uma operação 'âncora'.\n", 97 | "\n", 98 | "A operação servirá como marcação de que um voto foi iniciado e todas as linhas entre uma operação âncora e a próxima serão consideradas como um único voto." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 33, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "ANCHOR_OPERATION = 'Aguardando digitação do título'\n", 108 | "ZONE_GROUPS = [ (0, 100), (101, 200), (201, 300), (301, 400), (401, 500) ]" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Após uma exploração dos LOGS, a operação escolhida foi 'AGUARDANDO DIGITAÇÃO DO TÍTULO', exatamente por ser o PRIMEIRO e OBRIGATÓRIO passo para que um voto seja autorizado."
116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 34, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "query_create_id = f\"\"\"\n", 125 | " (\n", 126 | " SELECT\n", 127 | " *,\n", 128 | " SUM(CASE WHEN event_description = '{ANCHOR_OPERATION}' THEN 1 ELSE 0 END) \n", 129 | " OVER (PARTITION BY event_date, uf, filename ORDER BY event_timestamp) AS vote_id,\n", 130 | " \n", 131 | " CASE\n", 132 | " {\n", 133 | " \"\".join(\n", 134 | " [\n", 135 | " f\"WHEN zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n", 136 | " for min_zone, max_zone in ZONE_GROUPS\n", 137 | " ]\n", 138 | " )\n", 139 | " }\n", 140 | " END AS zone_group\n", 141 | "\n", 142 | " FROM {source_data}\n", 143 | " WHERE \n", 144 | " uf = '' \n", 145 | " AND event_date = ''\n", 146 | " AND zone_code::INT BETWEEN AND \n", 147 | " ) AS query_vote_id\n", 148 | "\"\"\"" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Pivotando Timestamp dos eventos por id" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Para calcular o tempo dos votos e dos eventos individuais que o compõem (biometria, voto) é necessário extrair o timestamp de cada evento." 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 35, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "timestamp_inicio_fim_voto = [\n", 172 | " f'''\n", 173 | " MAX(\n", 174 | " CASE WHEN event_description = 'Título digitado pelo mesário' THEN event_timestamp ELSE NULL END \n", 175 | " ) AS timestamp_titulo_digitado\n", 176 | " ''',\n", 177 | " f'''\n", 178 | " MAX(\n", 179 | " CASE WHEN event_description = 'O voto do eleitor foi computado' THEN event_timestamp ELSE NULL END \n", 180 | " ) AS timestamp_voto_computado\n", 181 | " '''\n", 182 | "]" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 36, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "VOTE_EVENTS = [\n", 192 | " 'Voto confirmado para [Conselheiro Distrital]',\n", 193 | " 'Voto confirmado para [Deputado Distrital]',\n", 194 | " 'Voto confirmado para [Deputado Estadual]',\n", 195 | " 'Voto confirmado para [Deputado Federal]',\n", 196 | " 'Voto confirmado para [Governador]',\n", 197 | " 'Voto confirmado para [Prefeito]',\n", 198 | " 'Voto confirmado para [Presidente]',\n", 199 | " 'Voto confirmado para [Senador]',\n", 200 | "]\n", 201 | "\n", 202 | "timestamp_vote_events = [\n", 203 | " f'''\n", 204 | " MAX(\n", 205 | " CASE WHEN event_description = \\'{event}\\' THEN event_timestamp ELSE NULL END \n", 206 | " ) AS timestamp_voto_{event.replace(\"Voto confirmado para [\", \"\").replace(\"]\", \"\").lower().replace(' ', '_')}\n", 207 | " '''\n", 208 | " for event in VOTE_EVENTS\n", 209 | "]" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 37, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "BIOMETRIA_TENTATIVAS = [\n", 219 | " 'Solicita digital. Tentativa [1] de [4]',\n", 220 | " 'Solicita digital. Tentativa [2] de [4]',\n", 221 | " 'Solicita digital. Tentativa [3] de [4]',\n", 222 | " 'Solicita digital. 
Tentativa [4] de [4]',\n", 223 | " 'Solicitação de dado pessoal do eleitor para habilitação manual',\n", 224 | " 'Eleitor foi habilitado'\n", 225 | "]\n", 226 | "\n", 227 | "timestamp_biometria_tentativas = [\n", 228 | " f'''\n", 229 | " MAX(\n", 230 | " CASE WHEN event_description = \\'{event}\\' THEN event_timestamp ELSE NULL END \n", 231 | " ) AS timestamp_biometria_{event.replace(\"Solicita digital. Tentativa [\", \"\").replace(\"] de [4]\", \"\").lower()}\n", 232 | " '''\n", 233 | " for event in BIOMETRIA_TENTATIVAS\n", 234 | " if event.startswith('Solicita digital')\n", 235 | "] + [\n", 236 | " f'''\n", 237 | " MAX(\n", 238 | " CASE WHEN event_description = \\'{BIOMETRIA_TENTATIVAS[-2]}\\' THEN event_timestamp ELSE NULL END \n", 239 | " ) AS timestamp_biometria_manual\n", 240 | " '''\n", 241 | "] + [\n", 242 | " f'''\n", 243 | " MAX(\n", 244 | " CASE WHEN event_description = \\'{BIOMETRIA_TENTATIVAS[-1]}\\' THEN event_timestamp ELSE NULL END \n", 245 | " ) AS timestamp_habilitacao_eleitor\n", 246 | " '''\n", 247 | "]\n", 248 | " " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 38, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "query_pivot_timestamps = f\"\"\"(\n", 258 | " SELECT\n", 259 | " event_date, uf, filename, vote_id,\n", 260 | " \n", 261 | " MAX(city_code) AS city_code,\n", 262 | " MAX(zone_code) AS zone_code,\n", 263 | " MAX(zone_group) AS zone_group,\n", 264 | " MAX(section_code) AS section_code,\n", 265 | "\n", 266 | " SUM( (event_description='O voto do eleitor foi computado')::INT ) AS quantidade_votos_computados,\n", 267 | " SUM( (event_description ILIKE 'Solicita digital%')::INT ) AS quantidade_solicitacoes_biometria,\n", 268 | " SUM( (event_description ILIKE 'Voto confirmado para%')::INT ) AS quantidade_cargos_votados,\n", 269 | " MAX( (event_description='Solicitação de dado pessoal do eleitor para habilitação manual')::INT ) AS biometria_nao_funcionou,\n", 270 | "\n", 271 | " MIN( event_timestamp ) AS timestamp_primeiro_evento,\n", 272 | "\n", 273 | " {', '.join(timestamp_vote_events+timestamp_biometria_tentativas+timestamp_inicio_fim_voto)}\n", 274 | " \n", 275 | " FROM {query_create_id}\n", 276 | " GROUP BY event_date, uf, filename, vote_id\n", 277 | ")\n", 278 | "\"\"\"" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "### Construindo e Executando a query" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL por duas razões:\n", 293 | "\n", 294 | " - Facilitar a leitura dos dados posteriormente\n", 295 | " - Permitir a execução da query em partes, evitando a sobrecarga de memória ao processar todos os dados de uma vez\n", 296 | "\n", 297 | "As ZONAS foram agrupadas em grupos de 100, esse número é empírico, pensado para abarcar a grande maioria das UFs em um único grupo, já que a grande maioria dos estados não pssui mais de 100 zonas eleitorais, e dividir as UFs mais populosas em grupos menores." 
298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "ACCEPTED_DATES = [\n", 307 | " '2022-10-02', '2022-10-30', \n", 308 | " '2022-10-03', '2022-10-31',\n", 309 | "]\n", 310 | "UFS = [\n", 311 | " 'AC', 'AL', 'AM', 'AP', \n", 312 | " 'BA', \n", 313 | " 'CE', 'DF', 'ES', 'GO', \n", 314 | " 'MT', 'PA', 'PB', 'PE', \n", 315 | " 'MA',\n", 316 | " \n", 317 | " 'MG', 'MS', \n", 318 | " 'PI', 'PR', 'RJ', 'RN', \n", 319 | " 'RO', 'RR', 'RS', 'SC', \n", 320 | " 'SE', 'SP', 'TO', 'ZZ'\n", 321 | "]\n", 322 | "\n", 323 | "PROCESSING_TIMES = []\n", 324 | "\n", 325 | "for uf in UFS:\n", 326 | " for date in ACCEPTED_DATES:\n", 327 | " for zone_group in ZONE_GROUPS:\n", 328 | "\n", 329 | " \n", 330 | " query = F\"\"\"\n", 331 | " COPY \n", 332 | " {\n", 333 | " query_pivot_timestamps\n", 334 | " .replace('', uf)\n", 335 | " .replace('', date)\n", 336 | " .replace('', str(zone_group[0]))\n", 337 | " .replace('', str(zone_group[1]))\n", 338 | " } \n", 339 | " TO 'VOTES.parquet' \n", 340 | " (FORMAT 'parquet', PARTITION_BY (event_date, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n", 341 | " \"\"\"\n", 342 | " \n", 343 | " print(\"Processing \", uf, date)\n", 344 | " tic = time.time()\n", 345 | " cursor.execute(query)\n", 346 | " toc = time.time()\n", 347 | " print(F\"Time for {uf} {date} {zone_group}: {toc-tic}\")\n", 348 | "\n", 349 | " PROCESSING_TIMES.append({\n", 350 | " 'uf': uf,\n", 351 | " 'date': date,\n", 352 | " 'zone_group': zone_group,\n", 353 | " 'time': toc-tic\n", 354 | " })" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "Salvando o resultado dos tempos de processamento." 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 42, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "PROCESSING_TIMES\n", 371 | "\n", 372 | "# convert to pandas and save as csv\n", 373 | "df_processing_times = pd.DataFrame(PROCESSING_TIMES)\n", 374 | "df_processing_times.to_csv('processing_times.csv', index=False)" 375 | ] 376 | } 377 | ], 378 | "metadata": { 379 | "kernelspec": { 380 | "display_name": "base", 381 | "language": "python", 382 | "name": "python3" 383 | }, 384 | "language_info": { 385 | "codemirror_mode": { 386 | "name": "ipython", 387 | "version": 3 388 | }, 389 | "file_extension": ".py", 390 | "mimetype": "text/x-python", 391 | "name": "python", 392 | "nbconvert_exporter": "python", 393 | "pygments_lexer": "ipython3", 394 | "version": "3.11.5" 395 | } 396 | }, 397 | "nbformat": 4, 398 | "nbformat_minor": 2 399 | } 400 | -------------------------------------------------------------------------------- /streamlit/app/widgets.py: -------------------------------------------------------------------------------- 1 | 2 | import geopandas as gpd 3 | import pandas as pd 4 | import datetime 5 | import re 6 | import io 7 | import streamlit as st 8 | import seaborn as sns 9 | 10 | from maps import add_ufs_and_links_to_map, load_brazil_simplified_map, load_ufs_city_simplified_map 11 | from data import DuckDBConnector 12 | import numpy as np 13 | 14 | import matplotlib.pyplot as plt 15 | from matplotlib.colors import LinearSegmentedColormap 16 | 17 | @st.cache_resource 18 | def get_duckdb_connector(): 19 | return DuckDBConnector.get_instance() 20 | 21 | PRIMARY_COLOR = "#0B1D51" 22 | HIGHLIGHT_COLOR = "#F08902" 23 | 24 | # Seaborn set theme 25 | # no grid 26 | # gray background 27 | sns.set_style("whitegrid") 28 | 
sns.set_theme(style='whitegrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None) 29 | 30 | def format_number_mi_mil(number): 31 | number_mi = number//1e6 32 | number_mil = (number - number_mi*1e6) / 1e3 33 | 34 | number_formatted = f"{number_mi:.0f} Mihão" if number_mi > 0 else '' 35 | if number_mil > 0: 36 | number_formatted += f" {number_mil:.0f} Mil" 37 | elif number_mil > 0: 38 | number_formatted = str(number_mil).replace('.', ',') 39 | number_formatted = number_formatted[:number_formatted.index(',')+2] + ' Mil' 40 | number_formatted = number_formatted.strip() 41 | return number_formatted 42 | 43 | 44 | def format_time(time_in_seconds): 45 | 46 | years = time_in_seconds // (365 * 24 * 3600) 47 | time_in_seconds = time_in_seconds % (365 * 24 * 3600) 48 | months = time_in_seconds // (30 * 24 * 3600) 49 | time_in_seconds = time_in_seconds % (30 * 24 * 3600) 50 | days = time_in_seconds // (24 * 3600) 51 | time_in_seconds = time_in_seconds % (24 * 3600) 52 | hours = time_in_seconds // 3600 53 | time_in_seconds %= 3600 54 | minutes = time_in_seconds // 60 55 | seconds = time_in_seconds % 60 56 | 57 | days = int(days) 58 | hours = int(hours) 59 | minutes = int(minutes) 60 | seconds = int(seconds) 61 | 62 | time_formated = "" 63 | if seconds > 0: 64 | time_formated += f"{seconds:.0f}s" 65 | if minutes > 0: 66 | time_formated = f"{minutes:.0f}m " + time_formated 67 | if hours > 0: 68 | time_formated = f"{hours:.0f}h " + time_formated 69 | if days > 0: 70 | time_formated = f"{days:.0f} dias " + time_formated 71 | if months > 0: 72 | time_formated = f"{months:.0f} Meses " + time_formated 73 | if months == 1: 74 | time_formated = time_formated.replace('Meses', 'Mês') 75 | if years > 0: 76 | time_formated = f"{years:.0f} Anos " + time_formated 77 | if years == 1: 78 | time_formated = time_formated.replace('Anos', 'Ano') 79 | 80 | # Remover horas, minutos e segundos 81 | time_formated = re.sub(r'\d+[hms]', '', time_formated) 82 | 83 | return time_formated 84 | 85 | 86 | def format_number(number): 87 | return ( 88 | f"{number//1e6:.0f} Mi" 89 | if number >= 1e6 else f"{number//1e3:.0f} Mil" 90 | if number >= 1e3 else f"{number:.0f}" 91 | ) 92 | 93 | 94 | def widget_numero_votos_intervalo_5min(container, turno, uf, zona, secao): 95 | 96 | metrics_df = get_duckdb_connector().get_metrics_over_time(uf, turno, zona, secao) 97 | metrics_df['timestamp_voto_computado_5min'] = pd.to_datetime(metrics_df['timestamp_voto_computado_5min']) 98 | metrics_df = metrics_df.sort_values('timestamp_voto_computado_5min') 99 | metrics_df = metrics_df.fillna( pd.NaT ) 100 | 101 | # define x and y 102 | y_metric = metrics_df['total_votos'].astype(int) 103 | 104 | # Get the maximum value of y 105 | # and the corresponding x value 106 | # ------------------------------ 107 | x_value_max_y, max_y = metrics_df.loc[y_metric.idxmax(), ['timestamp_voto_computado_5min', 'total_votos']] 108 | x_value_max_y_formatted = x_value_max_y.strftime('%H:%M') 109 | max_y_formatted = format_number_mi_mil(max_y) 110 | 111 | # lineplot with time series 112 | FIGSIZE = (10, 5) 113 | fig, ax = plt.subplots( figsize=FIGSIZE ) 114 | 115 | # pegar só horas fechadas e 30min 116 | x_axis_values = ( 117 | metrics_df 118 | .query("timestamp_voto_computado_5min.dt.minute == 0") 119 | ['timestamp_voto_computado_5min'] 120 | ) 121 | x_axis_labels = x_axis_values.dt.strftime('%H:%M') 122 | 123 | if uf in ['ALL', 'SP', 'MG']: 124 | y_axis_values = [ 5e4, 1e5, 2.5e5, 5e5, 7.5e5, 1e6 ] 125 | else: 126 | y_axis_values = [ 1e3, 3e3, 
5e3, 1e4, 1.5e4, 2e4, 5e4, 1e5, 5e5 ] 127 | y_axis_labels = [format_number(y) for y in y_axis_values] 128 | 129 | sns.lineplot( 130 | x=metrics_df['timestamp_voto_computado_5min'], 131 | y=y_metric, 132 | ax=ax, 133 | color=PRIMARY_COLOR 134 | ) 135 | 136 | # Fill area under the line 137 | # ------------------------ 138 | ax.fill_between( 139 | metrics_df['timestamp_voto_computado_5min'], 140 | y_metric, 141 | 0, 142 | zorder=0, 143 | alpha=0.5, 144 | color=PRIMARY_COLOR 145 | ) 146 | 147 | # Add vertical line at the maximum value 148 | # -------------------------------------- 149 | ax.axvline( 150 | x=metrics_df.loc[y_metric.idxmax(), 'timestamp_voto_computado_5min'], 151 | color=HIGHLIGHT_COLOR, 152 | ymin=0, 153 | ymax=1, 154 | linestyle='-', 155 | linewidth=2 156 | ) 157 | 158 | # Add a box in the line with the maximum value 159 | # left aligned 160 | # -------------------------------------------- 161 | ax.text( 162 | x_value_max_y, 163 | 0.9*max_y, 164 | f"{max_y_formatted}", 165 | color='white', 166 | fontsize=10, 167 | ha='left', 168 | va='center', 169 | bbox=dict(facecolor=HIGHLIGHT_COLOR, alpha=1) 170 | ) 171 | 172 | 173 | ax.set_xticks(x_axis_values) 174 | ax.set_xticklabels(x_axis_labels, rotation=45, ha='right', fontsize=10) 175 | 176 | # remove right and top spines 177 | ax.spines['right'].set_visible(False) 178 | ax.spines['left'].set_visible(False) 179 | ax.spines['bottom'].set_visible(False) 180 | ax.spines['top'].set_visible(False) 181 | 182 | ax.set_yticks(y_axis_values) 183 | ax.set_yticklabels(y_axis_labels, fontsize=10) 184 | # add horizontal grid lines on the y axis 185 | # in the background 186 | ax.yaxis.grid(True, linestyle='-', alpha=1) 187 | # remove x grid lines 188 | ax.xaxis.grid(False) 189 | # remove x and y labels 190 | ax.set_xlabel('') 191 | ax.set_ylabel('') 192 | # set y limit 193 | ax.set_ylim(0, max_y) 194 | 195 | container.markdown('#### Número de votos efetuados a cada 5min') 196 | container.pyplot(fig) 197 | container.markdown(f'#### Às {x_value_max_y_formatted}, houve o pico de votos, com **{max_y_formatted}** computados em 5 minutos!') 198 | 199 | 200 | def widget_tempo_medio_voto(container, turno, uf, zona, secao): 201 | 202 | if uf=='ALL': 203 | widget_heatmap_tempo_medio_voto_mapa(container, turno, uf, zona, secao) 204 | elif zona=='ALL': 205 | widget_tabela_tempo_medio_zonas(container, turno, uf, zona, secao) 206 | 207 | 208 | def widget_tabela_tempo_medio_zonas( container, turno, uf, zona, secao ): 209 | 210 | map_gdf = load_brazil_simplified_map() 211 | map_gdf = map_gdf.query(f"SIGLA_UF == '{uf}'") 212 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 213 | metrics_df = metrics_df[ ['zone_group', 'zone_code', 'total_votos', 'tempo_voto_medio'] ] 214 | metrics_df_all_zones = metrics_df.query("zone_code == 'ALL'") 215 | metrics_df = metrics_df.query("zone_code != 'ALL'") 216 | 217 | unique_zone_groups = list(metrics_df['zone_group'].unique()) 218 | unique_zone_groups.sort( key=lambda x: int(x.split('-')[0]) ) 219 | 220 | # plot a small map with the selected UF 221 | fig, ax = plt.subplots( figsize=(1, 1) ) 222 | map_gdf.plot(ax=ax, color=HIGHLIGHT_COLOR) 223 | ax.axis('off') 224 | # add the sigla of the UF 225 | ax.text( 226 | map_gdf.centroid.x.values[0], 227 | map_gdf.centroid.y.values[0], 228 | uf, 229 | fontsize=8, 230 | weight='bold', 231 | ha='center', 232 | va='center', 233 | color='white' 234 | ) 235 | 236 | x=.15 237 | col_map_uf, col_title = container.columns( [x, 1-x] ) 238 | col_map_uf.pyplot(fig, 
use_container_width=True) 239 | col_title.markdown(f"### Detalhamento por Zona \n") 240 | 241 | zone_gorup_tabs = container.tabs( unique_zone_groups ) 242 | for zone_group, zone_group_tab in zip(unique_zone_groups, zone_gorup_tabs): 243 | 244 | metrics_df_zone_group = metrics_df.query(f"zone_group == '{zone_group}'") 245 | top_3_most_last_zones = metrics_df_zone_group.sort_values('tempo_voto_medio', ascending=False).head(3)['zone_code'].values 246 | 247 | metrics_df_zone_group = metrics_df_zone_group.sort_values('zone_code') 248 | metrics_df_zone_group['tempo_voto_medio'] = metrics_df_zone_group['tempo_voto_medio'].apply(format_time) 249 | metrics_df_zone_group['total_votos'] = metrics_df_zone_group['total_votos'].apply(format_number) 250 | 251 | # add medals to the top 3 most last zones 252 | # in the tempo_voto_medio column 253 | 254 | for medal, zone in zip(['🥇', '🥈', '🥉'], top_3_most_last_zones): 255 | metrics_df_zone_group.loc[metrics_df_zone_group['zone_code'] == zone, 'tempo_voto_medio'] = medal \ 256 | + ' ' + metrics_df_zone_group.loc[metrics_df_zone_group['zone_code'] == zone, 'tempo_voto_medio'] 257 | 258 | 259 | metrics_df_zone_group = metrics_df_zone_group.rename( 260 | columns={ 261 | 'zone_code': 'Zona', 262 | 'total_votos': 'Votos', 263 | 'tempo_voto_medio': 'Tempo Médio' 264 | } 265 | ).drop(columns='zone_group') 266 | 267 | zone_group_tab.dataframe( 268 | metrics_df_zone_group 269 | .style 270 | .apply( 271 | lambda x: 272 | [ 273 | f'background-color: {HIGHLIGHT_COLOR}; color: white; font-weight: bold; font-size: 15px' 274 | if x['Zona'] in top_3_most_last_zones else '', 275 | ]*len(x), 276 | axis=1 277 | ), 278 | height=400, 279 | use_container_width = True, 280 | hide_index=True 281 | ) 282 | 283 | 284 | def widget_heatmap_tempo_medio_voto_mapa( container, turno, uf, zona, secao ): 285 | COLORMAP = 'coolwarm' 286 | RANGE_SECONDS_PLOT = 15 287 | FIGSIZE = (6, 6) 288 | 289 | map_gdf = load_brazil_simplified_map() 290 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 291 | map_gdf = map_gdf.merge(metrics_df, left_on='SIGLA_UF', right_on='uf', how='left') 292 | map_gdf = gpd.GeoDataFrame(map_gdf) 293 | 294 | tempo_voto_medio_ALL = metrics_df.query(f"uf == 'ALL'")['tempo_voto_medio'].max() 295 | map_gdf['tempo_voto_medio'] = map_gdf['tempo_voto_medio'] - tempo_voto_medio_ALL 296 | 297 | fig = plt.figure(figsize=FIGSIZE) 298 | ax = fig.add_subplot(1, 1, 1) 299 | ax.axis('off') 300 | UFS = map_gdf['uf'].unique() 301 | 302 | for uf in UFS: 303 | ( 304 | map_gdf 305 | .query(f"uf == '{uf}'") 306 | .plot( 307 | column='tempo_voto_medio', 308 | ax=ax, 309 | cmap=COLORMAP, 310 | legend=False, 311 | vmin=-RANGE_SECONDS_PLOT, 312 | vmax=+RANGE_SECONDS_PLOT, 313 | gid=uf 314 | ) 315 | ) 316 | 317 | # add a horizontal colorbar 318 | sm = plt.cm.ScalarMappable( 319 | cmap=COLORMAP, 320 | norm=plt.Normalize(vmin=-RANGE_SECONDS_PLOT, vmax=+RANGE_SECONDS_PLOT) 321 | ) 322 | 323 | cbar = fig.colorbar(sm, ax=ax, orientation='horizontal', pad=0.01, aspect=20, fraction=0.035) 324 | cbar.set_label('Segundos abaixo/acima da média', fontsize=10) 325 | cbar.ax.tick_params(labelsize=8) 326 | 327 | # save svg image to buffer 328 | svg_image_buffer = io.StringIO() 329 | plt.savefig(svg_image_buffer, format='svg') 330 | plt.close(fig) 331 | 332 | svg_image_with_links = add_ufs_and_links_to_map(svg_image_buffer.getvalue()) 333 | 334 | container.markdown('#### Tempo Médio de Votação por UF') 335 | container.markdown(':point_down: Clique no Mapa para detalhes') 336 | 
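# The choropleth is embedded as raw SVG (instead of container.pyplot) so that
# add_ufs_and_links_to_map from maps.py can attach a per-state hyperlink,
# presumably by matching each UF group via the gid=uf set on the plot calls
# above. unsafe_allow_html=True lets Streamlit render that markup, which is
# what makes the map clickable.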
container.markdown(svg_image_with_links, unsafe_allow_html=True) 337 | 338 | 339 | def widget_bignumber_votos( container, turno, uf, zona, secao ): 340 | 341 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 342 | if uf == 'ALL': 343 | votos = metrics_df.query(f"uf == 'ALL'")['total_votos'].max() 344 | else: 345 | votos = metrics_df['total_votos'].max() 346 | 347 | votos_formatado = f"{votos:,}".replace(',', ' ') 348 | container.metric(label=':white_check_mark: Votos', value=votos_formatado) 349 | 350 | 351 | def widget_bignumber_secoes( container, turno, uf, zona, secao ): 352 | 353 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 354 | if uf == 'ALL': 355 | secoes = metrics_df.query(f"uf == 'ALL'")['total_secoes'].max() 356 | else: 357 | secoes = metrics_df['total_secoes'].max() 358 | 359 | section_formatado = f"{secoes:,}".replace(',', ' ') 360 | container.metric(label=':pushpin: Seções', value=section_formatado) 361 | 362 | 363 | def widget_big_number_tempo_medio( container, turno, uf, zona, secao ): 364 | 365 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 366 | if uf == 'ALL': 367 | tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_voto_medio'].max() 368 | else: 369 | tempo_medio = metrics_df['tempo_voto_medio'].max() 370 | 371 | tempo_medio_formatado = format_time(tempo_medio) 372 | container.metric(label=':stopwatch: Tempo Médio', value=tempo_medio_formatado) 373 | 374 | 375 | def widget_big_number_tempo_medio_bio( container, turno, uf, zona, secao ): 376 | 377 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 378 | if uf == 'ALL': 379 | tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_biometria_medio'].max() 380 | else: 381 | tempo_medio = metrics_df['tempo_biometria_medio'].max() 382 | 383 | tempo_medio_formatado = format_time(tempo_medio) 384 | container.metric(label=':point_up: Tempo Médio Biometria', value=tempo_medio_formatado) 385 | 386 | 387 | def widget_big_number_tempo_total_voto( container, turno, uf, zona, secao ): 388 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 389 | 390 | if uf == 'ALL': 391 | tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_voto_soma'].max() 392 | else: 393 | tempo_medio = metrics_df['tempo_voto_soma'].max() 394 | 395 | tempo_medio_anos = tempo_medio / (365 * 24 * 3600) 396 | if tempo_medio_anos < 5: 397 | icon = ':baby:' 398 | elif tempo_medio_anos < 10: 399 | icon = ':boy:' 400 | elif tempo_medio_anos < 15: 401 | icon = ':child:' 402 | elif tempo_medio_anos < 30: 403 | icon = ':man:' 404 | elif tempo_medio_anos < 60: 405 | icon = ':older_adult:' 406 | else: 407 | icon = ':older_man:' 408 | 409 | 410 | tempo_medio_formatado = format_time(tempo_medio) 411 | container.metric(label=f'{icon} Tempo Total Gasto', value=tempo_medio_formatado) 412 | 413 | 414 | def widget_qtd_votos_intervalo_tempo( container, turno, uf, zona, secao ): 415 | 416 | metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao) 417 | if uf == 'ALL': 418 | metrics_df = metrics_df.query(f"uf == 'ALL'") 419 | 420 | format_time = lambda x: f"{x // 60}:{x % 60:02d}" 421 | # format number in Mi, Mil, and integer 422 | format_number = lambda number : ( 423 | f"{number//1e6:.0f} Mi" 424 | if number >= 1e6 else f"{number//1e3:.0f} Mil" 425 | if number >= 1e3 else f"{number:.0f}" 426 | ) 427 | 428 | extrair_intervalo_superior_segundos = lambda col: int(col.split('_')[-2]) 429 | 
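# The interval columns follow the pattern 'votos_<lower>_<upper>_segundos',
# e.g. 'votos_90_120_segundos'.split('_') == ['votos', '90', '120', 'segundos'],
# so index -2 yields the upper bound (120) and index -3 the lower bound (90).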
extrair_intervalo_inferior_segundos = lambda col: int(col.split('_')[-3]) 430 | 431 | colunas_qtd_votos_intervalo = [ 432 | 'votos_0_30_segundos', 'votos_30_60_segundos', 'votos_60_90_segundos', 433 | 'votos_90_120_segundos', 'votos_120_150_segundos', 434 | 'votos_150_180_segundos', 'votos_180_210_segundos', 435 | 'votos_210_300_segundos', 'votos_300_9999_segundos' 436 | ] 437 | 438 | valores_qtd_votos_intervalo = [ 439 | ( 440 | format_time(extrair_intervalo_inferior_segundos(col)) + " a " + 441 | format_time(extrair_intervalo_superior_segundos(col)), 442 | col, 443 | metrics_df[col].max() 444 | ) 445 | if col != 'votos_300_9999_segundos' and col != 'votos_0_30_segundos' 446 | else ("mais de 5:00", col, metrics_df[col].max()) 447 | if col == 'votos_300_9999_segundos' 448 | else ("até 0:30", col, metrics_df[col].max()) 449 | for col in colunas_qtd_votos_intervalo 450 | ] 451 | # revert order 452 | valores_qtd_votos_intervalo = valores_qtd_votos_intervalo[::-1] 453 | 454 | df_valores_qtd_votos_intervalo = pd.DataFrame( 455 | valores_qtd_votos_intervalo, 456 | columns=['intervalo', 'coluna', 'valor'] 457 | ) 458 | 459 | container.markdown('#### Em quantos minutos as pessoas votam?') 460 | 461 | # plot horizontal bar chart 462 | fig, ax = plt.subplots( figsize=(5, 12) ) 463 | # df_valores_qtd_votos_intervalo.plot.barh(x='intervalo', y='valor', legend=False, width=.8, ax=ax) 464 | 465 | # make the barplot with seaborn 466 | sns.barplot( 467 | x='valor', 468 | y='intervalo', 469 | data=df_valores_qtd_votos_intervalo, 470 | color=PRIMARY_COLOR, 471 | ax=ax 472 | ) 473 | fig.gca().invert_yaxis() 474 | 475 | # make the biggest bar red 476 | max_value = df_valores_qtd_votos_intervalo['valor'].max() 477 | max_value_index = df_valores_qtd_votos_intervalo['valor'].idxmax() 478 | ax.patches[max_value_index].set_facecolor(HIGHLIGHT_COLOR) 479 | # add the % inside the biggest bar 480 | max_value_percent = max_value / df_valores_qtd_votos_intervalo['valor'].sum() 481 | ax.text( 482 | max_value - 0.05 * max_value, 483 | max_value_index, 484 | f"{max_value_percent:.1%}", 485 | color='white', 486 | ha = 'right', 487 | va = 'center', 488 | size=20 489 | ) 490 | 491 | ax.set_xlabel('Quantidade de Votos') 492 | ax.set_ylabel('') 493 | # ax.set_title('Em quanto tempo as pessoas votam?\n', fontsize=20) 494 | 495 | # remover linha superior, direita e inferior 496 | ax.spines['top'].set_visible(False) 497 | ax.spines['right'].set_visible(False) 498 | ax.spines['bottom'].set_visible(False) 499 | 500 | # remove x axis 501 | ax.xaxis.set_visible(False) 502 | 503 | # increase y axis font size 504 | ax.tick_params(axis='y', labelsize=20) 505 | 506 | # adicionar número no final de cada barra 507 | maior_valor = df_valores_qtd_votos_intervalo['valor'].max() 508 | offset = 0.05 * maior_valor 509 | for i, valor in enumerate(df_valores_qtd_votos_intervalo['valor']): 510 | ax.text(valor+offset, i, format_number(valor), color='black', va='center', fontsize=18) 511 | 512 | container.pyplot(fig) 513 | 514 | 515 | 516 | -------------------------------------------------------------------------------- /src/03A_calcular_metricas_tempo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calcular Métricas - Tempo de Votação, Biometria, etc." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Importing libraries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import duckdb\n", 31 | "import pandas as pd\n", 32 | "import time" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Importing Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "cursor = duckdb.connect()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Dados Brutos" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "TABLE = \"read_parquet('VOTES.parquet/*/*/*/*.parquet', hive_partitioning=True)\"\n", 65 | "ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Adicionar TURNO e Timestamp final de Biometria" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "source = F\"\"\"\n", 82 | "(\n", 83 | " SELECT \n", 84 | " *,\n", 85 | " CASE event_date\n", 86 | " WHEN '2022-10-02' THEN 1\n", 87 | " WHEN '2022-10-03' THEN 1\n", 88 | " WHEN '2022-10-30' THEN 2\n", 89 | " WHEN '2022-10-31' THEN 2\n", 90 | " ELSE NULL\n", 91 | " END::INT AS turno,\n", 92 | " COALESCE(\n", 93 | " timestamp_biometria_manual,\n", 94 | " timestamp_biometria_4,\n", 95 | " timestamp_biometria_3,\n", 96 | " timestamp_biometria_2,\n", 97 | " timestamp_biometria_1\n", 98 | " ) AS timestamp_biometria_final\n", 99 | " FROM \n", 100 | " {TABLE}\n", 101 | ") _\n", 102 | "\"\"\"" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Preparinga Data" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Méticas no Cubo OLAP - Turno, UF, Zona, Seção.\n", 117 | "\n", 118 | "- Número de Votos\n", 119 | "- Número de Seções Eleitorais\n", 120 | "- Média, Soma, q50%, q90% do Tempo total de Voto, Tempo de Biometria, Tempo Total\n", 121 | "\n", 122 | "- Quantidade de Votos efetuados em até 30s, 1min, 1min30s, 2min, 2min30s, 3min+\n", 123 | "- Taxa de Sucesso da Biometria em 1 tentativa, 2 tentativas, 3 tentativas, 4 tentativas, Falha\n", 124 | "- Quantidade de Teclas Pressionadas\n", 125 | "- Quantidade de Cargos Distintos Votados" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "**Definição das métricas de tempo**" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "tempo_voto_total = \"EXTRACT(EPOCH FROM (timestamp_voto_computado - timestamp_titulo_digitado))\"\n", 142 | "tempo_voto = \"EXTRACT(EPOCH FROM (timestamp_voto_computado - timestamp_habilitacao_eleitor))\"\n", 143 | "tempo_biometria = \"EXTRACT(EPOCH FROM (timestamp_biometria_final - timestamp_titulo_digitado))\"\n", 144 | "\n", 145 | "intervalos_tempo_segundos_votos = [0, 30, 60, 90, 120, 150, 180, 210, 300, 9999]\n", 146 | "contagem_de_votos_em_intervalos_de_tempo = \", \".join([\n", 147 | " F\"\"\"\n", 148 | " SUM( \n", 149 
| " CASE WHEN \n", 150 | " {tempo_voto} >= {intervalos_tempo_segundos_votos[i]} \n", 151 | " AND {tempo_voto} < {intervalos_tempo_segundos_votos[i+1]}\n", 152 | " THEN 1 ELSE 0 END \n", 153 | " ) AS votos_{intervalos_tempo_segundos_votos[i]}_{intervalos_tempo_segundos_votos[i+1]}_segundos\n", 154 | " \"\"\"\n", 155 | " for i in range(0, len(intervalos_tempo_segundos_votos)-1)\n", 156 | "])" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "**Contagem de cargos distintos votados e número total de teclas pressionadas**" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Aproximação a partir do número de digitos de cada cargo + 1 (CONFIRMA)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "COLUNAS_VOTOS_CARGOS_NR_TECLAS = [\n", 180 | " # 2 digitos\n", 181 | " ('timestamp_voto_prefeito', 2), \n", 182 | " ('timestamp_voto_presidente', 2),\n", 183 | " ('timestamp_voto_governador', 2),\n", 184 | " \n", 185 | " # 3 digitos\n", 186 | " ('timestamp_voto_senador', 3),\n", 187 | "\n", 188 | " # 4 digitos\n", 189 | " ('timestamp_voto_deputado_distrital', 4), \n", 190 | " ('timestamp_voto_deputado_federal', 4),\n", 191 | "\n", 192 | " # 5 digitos\n", 193 | " ('timestamp_voto_deputado_estadual', 5),\n", 194 | "]\n", 195 | "\n", 196 | "nr_total_cargos_votados = \" + \".join([\n", 197 | " F\"({coluna} IS NOT NULL)::INT\"\n", 198 | " for coluna, _ in COLUNAS_VOTOS_CARGOS_NR_TECLAS\n", 199 | "])\n", 200 | "\n", 201 | "nr_total_teclas_digitadas = \" + \".join([\n", 202 | " F\"({coluna} IS NOT NULL)::INT*({teclas}+1)\"\n", 203 | " for coluna, teclas in COLUNAS_VOTOS_CARGOS_NR_TECLAS\n", 204 | "])" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "fix_null_values = lambda column: F\"COALESCE({column}::VARCHAR(10), 'ALL')\"" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 8, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "query_metrics = F\"\"\"\n", 223 | " SELECT\n", 224 | " {fix_null_values('turno') } AS turno,\n", 225 | " {fix_null_values('uf') } AS uf,\n", 226 | " {fix_null_values('zone_code') } AS zone_code,\n", 227 | " {fix_null_values('section_code') } AS section_code,\n", 228 | "\n", 229 | " COUNT(*) AS total_votos,\n", 230 | " COUNT( DISTINCT uf || zone_code || section_code ) AS total_secoes,\n", 231 | "\n", 232 | " SUM( {tempo_voto} ) AS tempo_voto_soma,\n", 233 | " AVG( {tempo_voto} ) AS tempo_voto_medio,\n", 234 | " --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_voto}) AS tempo_voto_mediana,\n", 235 | " --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_voto}) AS tempo_voto_90percentil,\n", 236 | "\n", 237 | " SUM( {tempo_biometria} ) AS tempo_biometria_soma,\n", 238 | " AVG( {tempo_biometria} ) AS tempo_biometria_medio,\n", 239 | " --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_biometria}) AS tempo_biometria_mediana,\n", 240 | " --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_biometria}) AS tempo_biometria_90percentil,\n", 241 | "\n", 242 | " SUM( {tempo_voto_total} ) AS tempo_voto_total_soma,\n", 243 | " AVG( {tempo_voto_total} ) AS tempo_voto_total_medio,\n", 244 | " --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_voto_total}) AS tempo_voto_total_mediana,\n", 245 | " --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_voto_total}) AS 
tempo_voto_total_90percentil,\n", 246 | " \n", 247 | " {contagem_de_votos_em_intervalos_de_tempo},\n", 248 | " 1-AVG(biometria_nao_funcionou::INT) AS tx_sucesso_biometria,\n", 249 | "\n", 250 | " MAX({nr_total_cargos_votados}) AS nr_total_cargos_votados,\n", 251 | " SUM({nr_total_teclas_digitadas}) AS nr_total_teclas_digitadas\n", 252 | "\n", 253 | " FROM\n", 254 | " {source}\n", 255 | " WHERE quantidade_votos_computados = 1\n", 256 | " GROUP BY ROLLUP(turno, uf, zone_code, section_code)\n", 257 | "\"\"\"" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL para agilizar a leitura dos dados pelo Dashboard.\n", 265 | "\n", 266 | "As ZONAS foram agrupadas em grupos de 20, esse número é empírico." 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 9, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "query_metrics_with_zone_group = F\"\"\"\n", 276 | " SELECT\n", 277 | " *,\n", 278 | " CASE\n", 279 | " {\n", 280 | " \"\".join(\n", 281 | " [\n", 282 | " f\"WHEN zone_code!='ALL' AND zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n", 283 | " for min_zone, max_zone in ZONE_GROUPS\n", 284 | " ]\n", 285 | " )\n", 286 | " }\n", 287 | " ELSE zone_code\n", 288 | " END AS zone_group\n", 289 | " FROM (\n", 290 | " {query_metrics}\n", 291 | " ) _\n", 292 | "\"\"\"" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 10, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "query = F\"\"\"\n", 302 | " COPY (\n", 303 | " {\n", 304 | " query_metrics_with_zone_group\n", 305 | " } )\n", 306 | " TO 'VOTES_TIME_METRICS.parquet' \n", 307 | " (FORMAT 'parquet', PARTITION_BY (turno, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n", 308 | "\"\"\"" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 11, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "application/vnd.jupyter.widget-view+json": { 319 | "model_id": "df1f82e654f446ccb9e0f3171cf3edef", 320 | "version_major": 2, 321 | "version_minor": 0 322 | }, 323 | "text/plain": [ 324 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 325 | ] 326 | }, 327 | "metadata": {}, 328 | "output_type": "display_data" 329 | }, 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "" 334 | ] 335 | }, 336 | "execution_count": 11, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "cursor.execute(query)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 12, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/html": [ 353 | "
\n", 354 | "\n", 367 | "\n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 
639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | "
turnoufzone_codesection_codetotal_votostotal_secoestempo_voto_somatempo_voto_mediotempo_biometria_somatempo_biometria_medio...votos_90_120_segundosvotos_120_150_segundosvotos_150_180_segundosvotos_180_210_segundosvotos_210_300_segundosvotos_300_9999_segundostx_sucesso_biometrianr_total_cargos_votadosnr_total_teclas_digitadaszone_group
01DF0014ALL893183084345747.048.654773869184.010.492576...2633.0906.0409.0237.0276.0174.00.95361551745890.00-20
11DF0015ALL1347445056785043.050.3550661171645.09.482934...4310.01623.0837.0425.0494.0244.00.96377652667519.00-20
21DF0017ALL1002403785899118.058.8499401344604.014.337855...5206.02062.01012.0587.0793.0364.00.93199351997143.00-20
31DF00050050294117810.060.5782313067.011.193431...23.08.04.01.03.01.00.95578255880.00-20
41DF00150229281114992.053.3523132154.08.381323...16.05.01.00.01.00.00.97508955620.00-20
..................................................................
66251DF00210082253113693.054.1225302232.09.073171...16.06.01.01.02.00.00.98419055060.020-40
66261DF00210332234112897.055.1153852474.010.850877...5.06.00.01.02.03.00.95726554680.020-40
66271DF00210318312117853.057.2211543879.013.104730...14.05.04.01.05.01.00.97115456240.020-40
66281DF00210185315116997.053.9587302877.09.558140...15.07.07.00.00.00.00.94603256300.020-40
66291DFALLALL1779224661099817162.056.10151518036165.010.941314...81643.033515.016930.09692.011890.05609.00.952647535370312.0ALL
\n", 661 | "

6630 rows × 25 columns

\n", 662 | "
" 663 | ], 664 | "text/plain": [ 665 | " turno uf zone_code section_code total_votos total_secoes \\\n", 666 | "0 1 DF 0014 ALL 89318 308 \n", 667 | "1 1 DF 0015 ALL 134744 505 \n", 668 | "2 1 DF 0017 ALL 100240 378 \n", 669 | "3 1 DF 0005 0050 294 1 \n", 670 | "4 1 DF 0015 0229 281 1 \n", 671 | "... ... .. ... ... ... ... \n", 672 | "6625 1 DF 0021 0082 253 1 \n", 673 | "6626 1 DF 0021 0332 234 1 \n", 674 | "6627 1 DF 0021 0318 312 1 \n", 675 | "6628 1 DF 0021 0185 315 1 \n", 676 | "6629 1 DF ALL ALL 1779224 6610 \n", 677 | "\n", 678 | " tempo_voto_soma tempo_voto_medio tempo_biometria_soma \\\n", 679 | "0 4345747.0 48.654773 869184.0 \n", 680 | "1 6785043.0 50.355066 1171645.0 \n", 681 | "2 5899118.0 58.849940 1344604.0 \n", 682 | "3 17810.0 60.578231 3067.0 \n", 683 | "4 14992.0 53.352313 2154.0 \n", 684 | "... ... ... ... \n", 685 | "6625 13693.0 54.122530 2232.0 \n", 686 | "6626 12897.0 55.115385 2474.0 \n", 687 | "6627 17853.0 57.221154 3879.0 \n", 688 | "6628 16997.0 53.958730 2877.0 \n", 689 | "6629 99817162.0 56.101515 18036165.0 \n", 690 | "\n", 691 | " tempo_biometria_medio ... votos_90_120_segundos \\\n", 692 | "0 10.492576 ... 2633.0 \n", 693 | "1 9.482934 ... 4310.0 \n", 694 | "2 14.337855 ... 5206.0 \n", 695 | "3 11.193431 ... 23.0 \n", 696 | "4 8.381323 ... 16.0 \n", 697 | "... ... ... ... \n", 698 | "6625 9.073171 ... 16.0 \n", 699 | "6626 10.850877 ... 5.0 \n", 700 | "6627 13.104730 ... 14.0 \n", 701 | "6628 9.558140 ... 15.0 \n", 702 | "6629 10.941314 ... 81643.0 \n", 703 | "\n", 704 | " votos_120_150_segundos votos_150_180_segundos votos_180_210_segundos \\\n", 705 | "0 906.0 409.0 237.0 \n", 706 | "1 1623.0 837.0 425.0 \n", 707 | "2 2062.0 1012.0 587.0 \n", 708 | "3 8.0 4.0 1.0 \n", 709 | "4 5.0 1.0 0.0 \n", 710 | "... ... ... ... \n", 711 | "6625 6.0 1.0 1.0 \n", 712 | "6626 6.0 0.0 1.0 \n", 713 | "6627 5.0 4.0 1.0 \n", 714 | "6628 7.0 7.0 0.0 \n", 715 | "6629 33515.0 16930.0 9692.0 \n", 716 | "\n", 717 | " votos_210_300_segundos votos_300_9999_segundos tx_sucesso_biometria \\\n", 718 | "0 276.0 174.0 0.953615 \n", 719 | "1 494.0 244.0 0.963776 \n", 720 | "2 793.0 364.0 0.931993 \n", 721 | "3 3.0 1.0 0.955782 \n", 722 | "4 1.0 0.0 0.975089 \n", 723 | "... ... ... ... \n", 724 | "6625 2.0 0.0 0.984190 \n", 725 | "6626 2.0 3.0 0.957265 \n", 726 | "6627 5.0 1.0 0.971154 \n", 727 | "6628 0.0 0.0 0.946032 \n", 728 | "6629 11890.0 5609.0 0.952647 \n", 729 | "\n", 730 | " nr_total_cargos_votados nr_total_teclas_digitadas zone_group \n", 731 | "0 5 1745890.0 0-20 \n", 732 | "1 5 2667519.0 0-20 \n", 733 | "2 5 1997143.0 0-20 \n", 734 | "3 5 5880.0 0-20 \n", 735 | "4 5 5620.0 0-20 \n", 736 | "... ... ... ... 
\n", 737 | "6625 5 5060.0 20-40 \n", 738 | "6626 5 4680.0 20-40 \n", 739 | "6627 5 6240.0 20-40 \n", 740 | "6628 5 6300.0 20-40 \n", 741 | "6629 5 35370312.0 ALL \n", 742 | "\n", 743 | "[6630 rows x 25 columns]" 744 | ] 745 | }, 746 | "execution_count": 12, 747 | "metadata": {}, 748 | "output_type": "execute_result" 749 | } 750 | ], 751 | "source": [ 752 | "table = \"\"\"\n", 753 | " read_parquet(\n", 754 | " 'VOTES_TIME_METRICS.parquet/*/*/*/*.parquet', \n", 755 | " hive_partitioning=True,\n", 756 | " hive_types_autocast=0\n", 757 | " )\n", 758 | " \"\"\"\n", 759 | "turno = 1\n", 760 | "uf = 'DF'\n", 761 | "zone_group = 'ALL'\n", 762 | "zone = 1\n", 763 | "\n", 764 | "\n", 765 | "query = f\"\"\"\n", 766 | " SELECT *\n", 767 | " FROM {table}\n", 768 | " WHERE 1=1\n", 769 | " AND turno = '{turno}'\n", 770 | " AND uf = '{uf}'\n", 771 | " -- AND zone_group = '{zone_group}'\n", 772 | " -- AND zone_code = {zone}\n", 773 | " \"\"\"\n", 774 | "\n", 775 | "df = cursor.execute(query).df()\n", 776 | "df" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 13, 782 | "metadata": {}, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "Index(['turno', 'uf', 'zone_code', 'section_code', 'total_votos',\n", 788 | " 'total_secoes', 'tempo_voto_soma', 'tempo_voto_medio',\n", 789 | " 'tempo_biometria_soma', 'tempo_biometria_medio',\n", 790 | " 'tempo_voto_total_soma', 'tempo_voto_total_medio',\n", 791 | " 'votos_0_30_segundos', 'votos_30_60_segundos', 'votos_60_90_segundos',\n", 792 | " 'votos_90_120_segundos', 'votos_120_150_segundos',\n", 793 | " 'votos_150_180_segundos', 'votos_180_210_segundos',\n", 794 | " 'votos_210_300_segundos', 'votos_300_9999_segundos',\n", 795 | " 'tx_sucesso_biometria', 'nr_total_cargos_votados',\n", 796 | " 'nr_total_teclas_digitadas', 'zone_group'],\n", 797 | " dtype='object')" 798 | ] 799 | }, 800 | "execution_count": 13, 801 | "metadata": {}, 802 | "output_type": "execute_result" 803 | } 804 | ], 805 | "source": [ 806 | "df.columns" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": {}, 813 | "outputs": [], 814 | "source": [] 815 | } 816 | ], 817 | "metadata": { 818 | "kernelspec": { 819 | "display_name": "base", 820 | "language": "python", 821 | "name": "python3" 822 | }, 823 | "language_info": { 824 | "codemirror_mode": { 825 | "name": "ipython", 826 | "version": 3 827 | }, 828 | "file_extension": ".py", 829 | "mimetype": "text/x-python", 830 | "name": "python", 831 | "nbconvert_exporter": "python", 832 | "pygments_lexer": "ipython3", 833 | "version": "3.11.5" 834 | } 835 | }, 836 | "nbformat": 4, 837 | "nbformat_minor": 2 838 | } 839 | -------------------------------------------------------------------------------- /src/test-basic-queries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Performance das Consultas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Importando Bibliotecas" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import duckdb\n", 24 | "import time" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Conectando à base de dados" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 
| "source": [ 40 | "cursor = duckdb.connect()\n", 41 | "DATABASE = '*.parquet'" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 14, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def execute_query_and_calculate_time(cursor, query, return_df=False):\n", 51 | " \n", 52 | " if return_df:\n", 53 | " tic = time.time()\n", 54 | " cursor.execute(query)\n", 55 | " df = cursor.df()\n", 56 | " toc = time.time()\n", 57 | " return df, toc - tic\n", 58 | " else:\n", 59 | " tic = time.time()\n", 60 | " cursor.execute(query)\n", 61 | " toc = time.time()\n", 62 | " return toc - tic" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Consultas" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Quantidade de Registros" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "query = f\"SELECT COUNT(*) FROM '{DATABASE}'\"" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "Number of rows: 4,283,329,488\n", 98 | "Time: 1.47s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "tic = time.time()\n", 104 | "\n", 105 | "cursor.execute(query)\n", 106 | "n_rows = cursor.fetchone()\n", 107 | "\n", 108 | "toc = time.time()\n", 109 | "\n", 110 | "print(f\"Number of rows: {n_rows[0]:,}\")\n", 111 | "print(f\"Time: {toc - tic:.2f}s\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### Primeiros Registros" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Time: 0.82s\n" 131 | ] 132 | }, 133 | { 134 | "data": { 135 | "text/html": [ 136 | "
\n", 137 | "\n", 150 | "\n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | "
event_timestampevent_typesome_idevent_systemevent_descriptionevent_idfilename
02022-10-26 10:39:36INFO67305985LOGDInício das operações do logdE2C58C3021D6DB87/data/logs/2_AC/o00407-0100700090001_new.csv
12022-10-26 10:39:36INFO67305985LOGDUrna ligada em 26/10/2022 às 10:38:20DFBD462E26E8F1EA/data/logs/2_AC/o00407-0100700090001_new.csv
22022-10-26 10:39:36INFO67305985SCUEIniciando aplicação - Oficial - 1º turnoB8E2CBFADB3EF46B/data/logs/2_AC/o00407-0100700090001_new.csv
32022-10-26 10:39:36INFO67305985SCUEVersão da aplicação: 8.26.0.0 - Onça-pintadaAC76A5B17419CB2E/data/logs/2_AC/o00407-0100700090001_new.csv
42022-10-26 10:39:38INFO67305985SCUEUrna operando com rede elétricaED0703CBF6110D2C/data/logs/2_AC/o00407-0100700090001_new.csv
\n", 216 | "
" 217 | ], 218 | "text/plain": [ 219 | " event_timestamp event_type some_id event_system \\\n", 220 | "0 2022-10-26 10:39:36 INFO 67305985 LOGD \n", 221 | "1 2022-10-26 10:39:36 INFO 67305985 LOGD \n", 222 | "2 2022-10-26 10:39:36 INFO 67305985 SCUE \n", 223 | "3 2022-10-26 10:39:36 INFO 67305985 SCUE \n", 224 | "4 2022-10-26 10:39:38 INFO 67305985 SCUE \n", 225 | "\n", 226 | " event_description event_id \\\n", 227 | "0 Início das operações do logd E2C58C3021D6DB87 \n", 228 | "1 Urna ligada em 26/10/2022 às 10:38:20 DFBD462E26E8F1EA \n", 229 | "2 Iniciando aplicação - Oficial - 1º turno B8E2CBFADB3EF46B \n", 230 | "3 Versão da aplicação: 8.26.0.0 - Onça-pintada AC76A5B17419CB2E \n", 231 | "4 Urna operando com rede elétrica ED0703CBF6110D2C \n", 232 | "\n", 233 | " filename \n", 234 | "0 /data/logs/2_AC/o00407-0100700090001_new.csv \n", 235 | "1 /data/logs/2_AC/o00407-0100700090001_new.csv \n", 236 | "2 /data/logs/2_AC/o00407-0100700090001_new.csv \n", 237 | "3 /data/logs/2_AC/o00407-0100700090001_new.csv \n", 238 | "4 /data/logs/2_AC/o00407-0100700090001_new.csv " 239 | ] 240 | }, 241 | "execution_count": 6, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "query = f\"\"\"\n", 248 | " SELECT \n", 249 | " *\n", 250 | " FROM '{DATABASE}' LIMIT 5\n", 251 | "\"\"\"\n", 252 | "\n", 253 | "tic = time.time()\n", 254 | "cursor.execute(query)\n", 255 | "df_result = cursor.df()\n", 256 | "toc = time.time()\n", 257 | "\n", 258 | "print(f\"Time: {toc - tic:.2f}s\")\n", 259 | "df_result" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### Primeiros registros + filtro RN" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 7, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "Time: 69.65s\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "query = f\"\"\"\n", 284 | " SELECT \n", 285 | " *\n", 286 | " FROM '{DATABASE}'\n", 287 | " WHERE filename ILIKE '%RN%'\n", 288 | " LIMIT 500\n", 289 | "\"\"\"\n", 290 | "\n", 291 | "tic = time.time()\n", 292 | "cursor.execute(query)\n", 293 | "df_result = cursor.df()\n", 294 | "toc = time.time()\n", 295 | "\n", 296 | "print(f\"Time: {toc - tic:.2f}s\")" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 11, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "Time: 91.12s\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "query = f\"\"\"\n", 314 | " SELECT \n", 315 | " *\n", 316 | " FROM '{DATABASE}'\n", 317 | " WHERE filename ILIKE '%SP%'\n", 318 | " LIMIT 500\n", 319 | "\"\"\"\n", 320 | "\n", 321 | "tic = time.time()\n", 322 | "cursor.execute(query)\n", 323 | "df_result = cursor.df()\n", 324 | "toc = time.time()\n", 325 | "\n", 326 | "print(f\"Time: {toc - tic:.2f}s\")" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### Distinct" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "event_type" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 12, 346 | "metadata": {}, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "Time: 5.69s\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "query = f\"\"\"\n", 358 | " SELECT DISTINCT\n", 359 | " event_type\n", 360 | " FROM 
'{DATABASE}'\n", 361 | "\"\"\"\n", 362 | "\n", 363 | "tic = time.time()\n", 364 | "cursor.execute(query)\n", 365 | "df_result = cursor.df()\n", 366 | "toc = time.time()\n", 367 | "\n", 368 | "print(f\"Time: {toc - tic:.2f}s\")" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "event_description" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 14, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "Time: 29.33s\n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "query = f\"\"\"\n", 393 | " SELECT DISTINCT\n", 394 | " event_description\n", 395 | " FROM '{DATABASE}'\n", 396 | "\"\"\"\n", 397 | "\n", 398 | "tic = time.time()\n", 399 | "cursor.execute(query)\n", 400 | "df_result = cursor.df()\n", 401 | "toc = time.time()\n", 402 | "\n", 403 | "print(f\"Time: {toc - tic:.2f}s\")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "### Group By" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 3, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "application/vnd.jupyter.widget-view+json": { 421 | "model_id": "2941ff10abd0446cb443aafd4e0fc77c", 422 | "version_major": 2, 423 | "version_minor": 0 424 | }, 425 | "text/plain": [ 426 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 427 | ] 428 | }, 429 | "metadata": {}, 430 | "output_type": "display_data" 431 | }, 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Time: 6.77s\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "query = f\"\"\"\n", 442 | " SELECT \n", 443 | " event_system,\n", 444 | " COUNT(*) AS qtd_linhas\n", 445 | " FROM '{DATABASE}'\n", 446 | " GROUP BY event_system\n", 447 | "\"\"\"\n", 448 | "\n", 449 | "tic = time.time()\n", 450 | "cursor.execute(query) \n", 451 | "df_result = cursor.df()\n", 452 | "toc = time.time()\n", 453 | "\n", 454 | "print(f\"Time: {toc - tic:.2f}s\")" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 4, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/html": [ 465 | "
\n", 466 | "\n", 479 | "\n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | "
event_systemqtd_linhas
0INITJE3044304
1VERIFICADOR37931
2STE394
3LOGD17978454
4ADH5188
5SA784
6SCUE39756883
7VPP223388
8VO\u0014A1
9VOTA3879701660
10RED76691
11GAP262715525
12ATUE79788285
\n", 555 | "
" 556 | ], 557 | "text/plain": [ 558 | " event_system qtd_linhas\n", 559 | "0 INITJE 3044304\n", 560 | "1 VERIFICADOR 37931\n", 561 | "2 STE 394\n", 562 | "3 LOGD 17978454\n", 563 | "4 ADH 5188\n", 564 | "5 SA 784\n", 565 | "6 SCUE 39756883\n", 566 | "7 VPP 223388\n", 567 | "8 VO\u0014A 1\n", 568 | "9 VOTA 3879701660\n", 569 | "10 RED 76691\n", 570 | "11 GAP 262715525\n", 571 | "12 ATUE 79788285" 572 | ] 573 | }, 574 | "execution_count": 4, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "df_result" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "### Group By + Filtro" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 9, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "data": { 597 | "application/vnd.jupyter.widget-view+json": { 598 | "model_id": "fde6505a3b484b28880584a0d5f7bb84", 599 | "version_major": 2, 600 | "version_minor": 0 601 | }, 602 | "text/plain": [ 603 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 604 | ] 605 | }, 606 | "metadata": {}, 607 | "output_type": "display_data" 608 | }, 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "Time: 7.98s\n" 614 | ] 615 | } 616 | ], 617 | "source": [ 618 | "query = f\"\"\"\n", 619 | " SELECT \n", 620 | " event_type,\n", 621 | " COUNT(*) AS qtd_linhas\n", 622 | " FROM '{DATABASE}'\n", 623 | " WHERE event_system='VOTA' OR event_system='RED'\n", 624 | " GROUP BY event_type\n", 625 | "\"\"\"\n", 626 | "\n", 627 | "tic = time.time()\n", 628 | "cursor.execute(query) \n", 629 | "df_result = cursor.df()\n", 630 | "toc = time.time()\n", 631 | "\n", 632 | "print(f\"Time: {toc - tic:.2f}s\")" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 10, 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/html": [ 643 | "
\n", 644 | "\n", 657 | "\n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | "
event_typeqtd_linhas
0ALERTA50460553
1ERRO1024682
2INFO3828293116
\n", 683 | "
" 684 | ], 685 | "text/plain": [ 686 | " event_type qtd_linhas\n", 687 | "0 ALERTA 50460553\n", 688 | "1 ERRO 1024682\n", 689 | "2 INFO 3828293116" 690 | ] 691 | }, 692 | "execution_count": 10, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "df_result" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "### Verificar se event_id é unico" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "[WIP] Descrição básica da razão de cada consulta, qual sua função e como ela é utilizada no dia a dia" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "1 - Usando GroupBy" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 4, 725 | "metadata": {}, 726 | "outputs": [ 727 | { 728 | "data": { 729 | "application/vnd.jupyter.widget-view+json": { 730 | "model_id": "db88550cf5cf4fc7a508e50d1839f168", 731 | "version_major": 2, 732 | "version_minor": 0 733 | }, 734 | "text/plain": [ 735 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 736 | ] 737 | }, 738 | "metadata": {}, 739 | "output_type": "display_data" 740 | }, 741 | { 742 | "ename": "", 743 | "evalue": "", 744 | "output_type": "error", 745 | "traceback": [ 746 | "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", 747 | "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", 748 | "\u001b[1;31mClick here for more info. \n", 749 | "\u001b[1;31mView Jupyter log for further details." 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "query = f\"\"\"\n", 755 | " SELECT\n", 756 | " COUNT(*) \n", 757 | " FROM (\n", 758 | " SELECT \n", 759 | " event_id,\n", 760 | " COUNT(*)\n", 761 | " FROM '{DATABASE}'\n", 762 | " GROUP BY event_id\n", 763 | " HAVING COUNT(*) > 1\n", 764 | " )\n", 765 | "\"\"\"\n", 766 | "\n", 767 | "tic = time.time()\n", 768 | "cursor.execute(query) \n", 769 | "df_result = cursor.df()\n", 770 | "toc = time.time()\n", 771 | "\n", 772 | "print(f\"Time: {toc - tic:.2f}s\")" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "2 - Usando Windows Function" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 3, 785 | "metadata": {}, 786 | "outputs": [ 787 | { 788 | "data": { 789 | "application/vnd.jupyter.widget-view+json": { 790 | "model_id": "f529e0b4f1e149e6bdab3ade2e1d665f", 791 | "version_major": 2, 792 | "version_minor": 0 793 | }, 794 | "text/plain": [ 795 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 796 | ] 797 | }, 798 | "metadata": {}, 799 | "output_type": "display_data" 800 | } 801 | ], 802 | "source": [ 803 | "query = f\"\"\"\n", 804 | " SELECT\n", 805 | " event_id, qtd_linhas\n", 806 | " FROM (\n", 807 | " SELECT \n", 808 | " event_id,\n", 809 | " COUNT(*) OVER( PARTITION BY event_id ) AS qtd_linhas\n", 810 | " FROM '{DATABASE}'\n", 811 | " ) _\n", 812 | " WHERE qtd_linhas > 1\n", 813 | " LIMIT 1\n", 814 | "\"\"\"\n", 815 | "\n", 816 | "tic = time.time()\n", 817 | "cursor.execute(query) \n", 818 | "df_result = cursor.df()\n", 819 | "toc = time.time()\n", 820 | "\n", 821 | "print(f\"Time: {toc - tic:.2f}s\")" 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "3 - Usando Count Distinct" 829 | ] 
830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 5, 834 | "metadata": {}, 835 | "outputs": [ 836 | { 837 | "data": { 838 | "application/vnd.jupyter.widget-view+json": { 839 | "model_id": "71e66f8947c24b12b66fb5b11196bf63", 840 | "version_major": 2, 841 | "version_minor": 0 842 | }, 843 | "text/plain": [ 844 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 845 | ] 846 | }, 847 | "metadata": {}, 848 | "output_type": "display_data" 849 | } 850 | ], 851 | "source": [ 852 | "query = f\"\"\"\n", 853 | " SELECT COUNT(*)\n", 854 | " FROM (\n", 855 | " SELECT DISTINCT event_id \n", 856 | " FROM '{DATABASE}'\n", 857 | " ) _\n", 858 | "\"\"\"\n", 859 | "\n", 860 | "tic = time.time()\n", 861 | "cursor.execute(query) \n", 862 | "df_result = cursor.df()\n", 863 | "toc = time.time()\n", 864 | "\n", 865 | "print(f\"Time: {toc - tic:.2f}s\")" 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "metadata": {}, 871 | "source": [ 872 | "4 - Usando Distinct + write to disk" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 4, 878 | "metadata": {}, 879 | "outputs": [ 880 | { 881 | "data": { 882 | "application/vnd.jupyter.widget-view+json": { 883 | "model_id": "f70e43b1e2f24f38adebfcd80815d560", 884 | "version_major": 2, 885 | "version_minor": 0 886 | }, 887 | "text/plain": [ 888 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 889 | ] 890 | }, 891 | "metadata": {}, 892 | "output_type": "display_data" 893 | }, 894 | { 895 | "ename": "", 896 | "evalue": "", 897 | "output_type": "error", 898 | "traceback": [ 899 | "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", 900 | "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", 901 | "\u001b[1;31mClick here for more info. \n", 902 | "\u001b[1;31mView Jupyter log for further details." 
903 | ] 904 | } 905 | ], 906 | "source": [ 907 | "query = f\"\"\"\n", 908 | " COPY (\n", 909 | " SELECT DISTINCT event_id \n", 910 | " FROM '{DATABASE}'\n", 911 | " ) TO 'event_id.parquet' \n", 912 | " (FORMAT 'parquet')\n", 913 | "\"\"\"\n", 914 | "\n", 915 | "tic = time.time()\n", 916 | "cursor.execute(query) \n", 917 | "df_result = cursor.df()\n", 918 | "toc = time.time()\n", 919 | "\n", 920 | "print(f\"Time: {toc - tic:.2f}s\")" 921 | ] 922 | }, 923 | { 924 | "cell_type": "markdown", 925 | "metadata": {}, 926 | "source": [ 927 | "### Distinct mensagens" 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": {}, 933 | "source": [ 934 | "Primeira aproximação" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": 15, 940 | "metadata": {}, 941 | "outputs": [ 942 | { 943 | "data": { 944 | "application/vnd.jupyter.widget-view+json": { 945 | "model_id": "f37594cbd3e24d858c8a99e7f2841d9e", 946 | "version_major": 2, 947 | "version_minor": 0 948 | }, 949 | "text/plain": [ 950 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 951 | ] 952 | }, 953 | "metadata": {}, 954 | "output_type": "display_data" 955 | }, 956 | { 957 | "name": "stdout", 958 | "output_type": "stream", 959 | "text": [ 960 | "Time: 159.81s\n" 961 | ] 962 | }, 963 | { 964 | "data": { 965 | "text/plain": [ 966 | "159.71511435508728" 967 | ] 968 | }, 969 | "execution_count": 15, 970 | "metadata": {}, 971 | "output_type": "execute_result" 972 | } 973 | ], 974 | "source": [ 975 | "query = f\"\"\"\n", 976 | " SELECT DISTINCT\n", 977 | " regexp_replace(event_description, '[0-9]', 'X', 'g') AS event_description\n", 978 | " FROM '{DATABASE}'\n", 979 | "\"\"\"\n", 980 | "\n", 981 | "duration, df_result = execute_query_and_calculate_time(cursor, query, return_df=True)\n", 982 | "print(f\"Time: {toc - tic:.2f}s\")\n", 983 | "df_result" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": 22, 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "data": { 993 | "application/vnd.jupyter.widget-view+json": { 994 | "model_id": "34112b0f74864cbf812b851009072faf", 995 | "version_major": 2, 996 | "version_minor": 0 997 | }, 998 | "text/plain": [ 999 | "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" 1000 | ] 1001 | }, 1002 | "metadata": {}, 1003 | "output_type": "display_data" 1004 | } 1005 | ], 1006 | "source": [ 1007 | "# Identificador da mídia de carga\n", 1008 | "# Serial da MI copiada da MV da urna original\n", 1009 | "# Serial de votação da MV\n", 1010 | "# Verificação de assinatura de dado por etapa\n", 1011 | "\n", 1012 | "query = f\"\"\"\n", 1013 | " SELECT DISTINCT\n", 1014 | " CASE\n", 1015 | " WHEN event_description ILIKE 'Identificador da mídia de carga%' \n", 1016 | " THEN 'Identificador da mídia de carga'\n", 1017 | "\n", 1018 | " WHEN event_description ILIKE 'Serial da MI copiada da MV da urna original%' \n", 1019 | " THEN 'Serial da MI copiada da MV da urna original'\n", 1020 | "\n", 1021 | " WHEN event_description ILIKE 'Serial de votação da MV%' \n", 1022 | " THEN 'Serial de votação da MV'\n", 1023 | "\n", 1024 | " WHEN event_description ILIKE 'Verificação de assinatura de dado por etapa%' \n", 1025 | " THEN 'Verificação de assinatura de dado por etapa'\n", 1026 | "\n", 1027 | " WHEN event_description ILIKE 'Número de série da MR%'\n", 1028 | " THEN 'Número de série da MR'\n", 1029 | " \n", 1030 | " ELSE regexp_replace(event_description, '[0-9]', 'X', 'g') \n", 1031 | " 
END AS event_description\n", 1032 | " FROM '{DATABASE}'\n", 1033 | "\"\"\"\n", 1034 | "\n", 1035 | "df_result, duration = execute_query_and_calculate_time(cursor, query, return_df=True)" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": 23, 1041 | "metadata": {}, 1042 | "outputs": [ 1043 | { 1044 | "name": "stdout", 1045 | "output_type": "stream", 1046 | "text": [ 1047 | "Time: 478.24s\n", 1048 | "Number of rows: 1,391\n" 1049 | ] 1050 | } 1051 | ], 1052 | "source": [ 1053 | "print(f\"Time: {duration:.2f}s\")\n", 1054 | "print(f\"Number of rows: {df_result.shape[0]:,}\")\n", 1055 | "df_result.to_csv('event_description.csv', index=False)" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": null, 1061 | "metadata": {}, 1062 | "outputs": [], 1063 | "source": [] 1064 | } 1065 | ], 1066 | "metadata": { 1067 | "kernelspec": { 1068 | "display_name": "Python 3 (ipykernel)", 1069 | "language": "python", 1070 | "name": "python3" 1071 | }, 1072 | "language_info": { 1073 | "codemirror_mode": { 1074 | "name": "ipython", 1075 | "version": 3 1076 | }, 1077 | "file_extension": ".py", 1078 | "mimetype": "text/x-python", 1079 | "name": "python", 1080 | "nbconvert_exporter": "python", 1081 | "pygments_lexer": "ipython3", 1082 | "version": "3.11.5" 1083 | } 1084 | }, 1085 | "nbformat": 4, 1086 | "nbformat_minor": 2 1087 | } 1088 | --------------------------------------------------------------------------------