├── data
    └── logs
    │   ├── .gitkeep
    │   ├── .gitignore
    │   └── calculate_lines.sh
├── src
    ├── .gitignore
    ├── tests
    │   ├── test-create-table.py
    │   └── test-convert-to-parquet.py
    ├── convert_state_csv_data_to_parquet.py
    ├── count_number_of_lines.py
    ├── 01_extrair_eventos_relacionados_a_votos.ipynb
    ├── 03B_calcular_metricas_temporais.ipynb
    ├── 02_isolar_timestamps_eventos.ipynb
    ├── 03A_calcular_metricas_tempo.ipynb
    └── test-basic-queries.ipynb
├── duckdb
    ├── requirements.txt
    └── Dockerfile
├── streamlit
    ├── .gitignore
    ├── app
    │   ├── requirements.txt
    │   ├── maps
    │   │   └── donwload_files.txt
    │   ├── maps.py
    │   ├── data.py
    │   ├── main.py
    │   └── widgets.py
    └── Dockerfile
├── docker-compose.yaml
├── convert_encoding_from_files.py
├── extract_log_files.py
├── download_log_urnas.py
└── README.md


/data/logs/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.parquet
2 | *.csv
3 | .tmp


--------------------------------------------------------------------------------
/data/logs/.gitignore:
--------------------------------------------------------------------------------
1 | *.zip
2 | *.logjez
3 | *.csv


--------------------------------------------------------------------------------
/duckdb/requirements.txt:
--------------------------------------------------------------------------------
1 | duckdb
2 | pandas==2.2.1


--------------------------------------------------------------------------------
/streamlit/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.svg
3 | *.zip


--------------------------------------------------------------------------------
/streamlit/app/requirements.txt:
--------------------------------------------------------------------------------
1 | altair
2 | pandas
3 | duckdb
4 | streamlit
5 | matplotlib
6 | geopandas
7 | seaborn


--------------------------------------------------------------------------------
/duckdb/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM jupyter/minimal-notebook
 2 | 
 3 | COPY requirements.txt /app/requirements.txt
 4 | WORKDIR /app
 5 | 
 6 | RUN pip install -r requirements.txt --upgrade
 7 | WORKDIR /src
 8 | 
 9 | # EXPOSE JUPYTER PORT
10 | EXPOSE 8888
11 | 


--------------------------------------------------------------------------------
/streamlit/app/maps/donwload_files.txt:
--------------------------------------------------------------------------------
1 | https://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2022/Brasil/BR/BR_UF_2022.zip
2 | https://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2022/Brasil/BR/BR_Municipios_2022.zip
3 | 


--------------------------------------------------------------------------------
/streamlit/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9-slim
 2 | WORKDIR /app
 3 | COPY ./app/requirements.txt /app/requirements.txt
 4 | 
 5 | RUN pip3 install -r requirements.txt
 6 | 
 7 | EXPOSE 8500
 8 | 
 9 | HEALTHCHECK CMD curl --fail http://localhost:8500/_stcore/health
10 | 
11 | ENTRYPOINT ["streamlit", "run", "main.py", "--server.port=8500", "--server.address=0.0.0.0"]


--------------------------------------------------------------------------------
/src/tests/test-create-table.py:
--------------------------------------------------------------------------------
 1 | import duckdb
 2 | 
 3 | cursor = duckdb.connect('test.db')
 4 | print(cursor.execute(
 5 |     """
 6 |         CREATE OR REPLACE TABLE test_zz AS
 7 |         SELECT 
 8 |             * 
 9 |         FROM
10 |         read_csv('/data/logs/2_ZZ/*_new.csv', filename=True)
11 |     """
12 | ))
13 | 
14 | # Select the data from the table LIMIT 10


--------------------------------------------------------------------------------
/src/tests/test-convert-to-parquet.py:
--------------------------------------------------------------------------------
 1 | import duckdb
 2 | import time
 3 | cursor = duckdb.connect('test.db')
 4 | 
 5 | tic = time.time()
 6 | cursor.execute(
 7 |     """
 8 |         COPY (
 9 |             SELECT 
10 |                 * 
11 |             FROM read_csv('/data/logs/2_ZZ/*_new.csv', filename=True)
12 |         ) TO 'test_zz.parquet' (FORMAT 'parquet');
13 |     """
14 | )
15 | toc = time.time()
16 | print(f"Time taken: {toc - tic} seconds")


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | services:
 3 |   duckdb:
 4 |     build: ./duckdb
 5 |     volumes:
 6 |       - ./data:/data
 7 |       - ./src:/src
 8 |     ports:
 9 |       - "8888:8888"
10 |     command: start-notebook.sh --NotebookApp.token='1234'
11 |   streamlit:
12 |     build: ./streamlit
13 |     volumes:
14 |       - ./src:/src
15 |       - ./streamlit/app/:/app
16 |     ports:
17 |       - "8600:8500"
18 |     command: streamlit run /src/app.py
19 | 


--------------------------------------------------------------------------------
/data/logs/calculate_lines.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Array of Brazilian states
 4 | states=("AC" "AL" "AP" "AM" "BA" "CE" "DF" "ES" "GO" "MA" "MT" "MS" "MG" "PA" "PB" "PR" "PE" "PI" "RJ" "RN" "RS" "RO" "RR" "SC" "SP" "SE" "TO")
 5 | 
 6 | # Iterate over each state
 7 | for state in "${states[@]}"
 8 | do
 9 |     # Concatenate "2_" in front of the state
10 |     state_with_prefix="2_$state"
11 |     echo "Calculating total lines for $state_with_prefix"
12 |     find "./$state_with_prefix" -type f -exec wc -l {} + | awk -v st="$state_with_prefix" '{total += $1} END {print "Total lines in " st ":", total}'
13 | done
14 | 


--------------------------------------------------------------------------------
/src/convert_state_csv_data_to_parquet.py:
--------------------------------------------------------------------------------
 1 | import duckdb
 2 | import time
 3 | import sys
 4 | 
 5 | 
 6 | STATES = [
 7 |     "AC", "AL", "AM", "AP", "BA", "CE", 
 8 |     "DF", "ES", "GO", "MA", "MG", "MS", 
 9 |     "MT", "PA", "PB", "PE", "PI", "PR", 
10 |     "RJ", "RN", "RO", "RR", "RS", "SC", 
11 |     "SE", "SP", "TO", "ZZ"]
12 | 
13 | if __name__ == "__main__":
14 |     # get the first sys arg
15 |     uf = sys.argv[1]
16 | 
17 |     # if sys arg not in the brazilian states
18 |     if uf not in STATES:
19 |         print("Invalid state")
20 |         sys.exit(1)
21 | 
22 |     tic = time.time()
23 |     cursor = duckdb.connect("")
24 |     query = f"""
25 |         COPY (
26 |             SELECT 
27 |                 * 
28 |             FROM read_csv('/data/logs/2_{uf}/*.csv', filename=True)
29 |         ) TO '{uf}.parquet' (FORMAT 'parquet');
30 |     """
31 |     
32 |     cursor.execute(query)
33 |     toc = time.time()
34 |     print(f"Time taken to convert {uf} to parquet: {toc - tic} seconds")
35 | 


--------------------------------------------------------------------------------
/src/count_number_of_lines.py:
--------------------------------------------------------------------------------
 1 | import duckdb
 2 | import time
 3 | import sys
 4 | 
 5 | 
 6 | STATES = [
 7 |     "AC", "AL", "AM", "AP", "BA", "CE", 
 8 |     "DF", "ES", "GO", "MA", "MG", "MS", 
 9 |     "MT", "PA", "PB", "PE", "PI", "PR", 
10 |     "RJ", "RN", "RO", "RR", "RS", "SC", 
11 |     "SE", "SP", "TO", "ZZ", "ALL"]
12 | 
13 | if __name__ == "__main__":
14 |     # get the first sys arg
15 |     uf = sys.argv[1]
16 | 
17 |     # if sys arg not in the brazilian states
18 |     if uf not in STATES:
19 |         print("Invalid state")
20 |         sys.exit(1)
21 | 
22 |     tic = time.time()
23 |     cursor = duckdb.connect("")
24 | 
25 |     if uf == "ALL":
26 |         query = f"""
27 |             SELECT 
28 |                 COUNT(*) 
29 |             FROM '*.parquet'
30 |         """
31 |     else:
32 |         query = f"""
33 |             SELECT 
34 |                 COUNT(*) 
35 |             FROM '{uf}.parquet'
36 |         """
37 |     
38 |     cursor.execute(query)
39 |     toc = time.time()
40 |     print(f"Time taken to count number of lines in {uf}: {toc - tic} seconds")
41 |     print(cursor.fetchall())
42 | 


--------------------------------------------------------------------------------
/convert_encoding_from_files.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tqdm
 3 | import time
 4 | 
 5 | 
 6 | if __name__ == "__main__":
 7 | 
 8 |     BASE_LOGS_PATH = "./data/logs"
 9 |     # list all directories in the base path
10 |     directories = os.listdir(BASE_LOGS_PATH)
11 |     command = "touch {} && iconv -f ISO-8859-1 -t UTF-8//TRANSLIT {} > {} && rm {}"
12 | 
13 |     tic = time.time()
14 |     for directory in directories:
15 |         path = BASE_LOGS_PATH + "/" + directory
16 |         if not os.path.isdir(path):
17 |             continue
18 | 
19 |         files = os.listdir(path)
20 |         files = [file for file in files if file.endswith(".csv") and not file.endswith("_new.csv")]
21 |         print(f"Processing directory {directory} with {len(files)} files")
22 | 
23 |         for file in tqdm.tqdm(files):
24 |             # convert the encoding of the file
25 |             filename = file.split(".")[0]
26 |             new_filename = filename + "_new.csv"
27 |             
28 |             path_old_file = path + "/" + file
29 |             path_new_file = path + "/" + new_filename
30 | 
31 |             os.system(command.format(path_new_file, path_old_file, path_new_file, path_old_file))
32 |     toc = time.time()
33 | 
34 |     print(f"Conversion took {toc - tic} seconds")


--------------------------------------------------------------------------------
/extract_log_files.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | BASE_PATH = './data/logs'
 4 | 
 5 | 
 6 | def unzip_log_files(zip_file):
 7 | 
 8 |     # Each ZIP file contains MULTIPLE *.logjez files
 9 |     # Each A.logjez file contains a logd.dat file that is the LOG file
10 |     # This code extract all A.logjez file and rename its logd.dat to A.csv
11 | 
12 |     # unzip file
13 |     # extracting only the .logjez files
14 |     filepath = zip_file[:-4]
15 |     os.system(f'7z e {zip_file} -o{filepath} *.logjez -r')
16 | 
17 |     # Remove unnecessary files
18 |     os.system(f'rm {zip_file}')  # Zip file
19 | 
20 |     # list all files in the directory
21 |     files = os.listdir(filepath)
22 | 
23 |     for file in files:
24 |         # extract .logjez files
25 |         # and rename to .csv
26 |         if file.endswith('.logjez'):
27 |             new_filename = file[:-7]
28 |             os.system(
29 |                 f'7z e {filepath}/{file} -y -o{filepath}/{new_filename} \
30 |                 > /dev/null'
31 |             )
32 |             os.system(
33 |                 f'mv \
34 |                 {filepath}/{new_filename}/logd.dat \
35 |                 {filepath}/{new_filename}.csv'
36 |             )
37 |             os.system(
38 |                 f'rm -r {filepath}/{new_filename}'
39 |             )
40 | 
41 |     os.system(f'chmod 777 -R {filepath}')
42 |     os.system(f'rm {filepath}/*.logjez')
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     for file in os.listdir(BASE_PATH):
47 |         if file.endswith('.zip'):
48 |             unzip_log_files(os.path.join(BASE_PATH, file))


--------------------------------------------------------------------------------
/streamlit/app/maps.py:
--------------------------------------------------------------------------------
 1 | import geopandas as gpd
 2 | import matplotlib.pyplot as plt
 3 | import re
 4 | import streamlit as st  
 5 | 
 6 | @st.cache_data()
 7 | def load_brazil_simplified_map():
 8 |     """
 9 |     Load the simplified map of Brazil.
10 |     The simplification is done to reduce the file size
11 |     and improve performance on the streamlit app.
12 | 
13 |     Returns:
14 |         gpd.GeoDataFrame: GeoDataFrame with the simplified map of Brazil.
15 |     """
16 |     
17 |     map_ufs = './maps/BR_UF_2022.zip'
18 | 
19 |     gdf = gpd.read_file(map_ufs)
20 |     gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.01)
21 | 
22 |     return gdf
23 | 
24 | @st.cache_data()
25 | def load_ufs_city_simplified_map():
26 |     """
27 |     Load the simplified map of Brazil with cities.
28 |     """
29 |     map_municipios = './maps/BR_Municipios_2022.zip'
30 | 
31 |     gdf = gpd.read_file(map_municipios)
32 |     gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.01)
33 | 
34 |     return gdf
35 | 
36 | def add_ufs_and_links_to_map(svg_image_buffer):
37 |     """
38 |     Generate links for each UF in the SVG image buffer.
39 |     And make the map clickable.
40 | 
41 |     Args:
42 |         svg_image_buffer (str): SVG image buffer.
43 | 
44 |     Returns:
45 |         str: SVG image buffer with links for each UF.
46 |     """
47 | 
48 |     re_uf_map_pattern = r'(<g id="([A-Z][A-Z])">((.|\s)*?)</g>)'
49 |     image_with_links = re.sub(
50 |         re_uf_map_pattern, 
51 |         r"<a target=\"_self\" rel=\"noopener noreferrer\" href=?uf=\2>\1</a>", 
52 |         svg_image_buffer
53 |     )
54 |     return image_with_links
55 | 


--------------------------------------------------------------------------------
/download_log_urnas.py:
--------------------------------------------------------------------------------
 1 | import threading
 2 | import queue
 3 | import os
 4 | import sys
 5 | import logging
 6 | from itertools import product
 7 | 
 8 | BASE_URL = (
 9 |     'https://cdn.tse.jus.br/estatistica/sead/eleicoes/' +
10 |     'eleicoes2022/arqurnatot/bu_imgbu_logjez_rdv_vscmr_2022_{}t_{}.zip'
11 | )
12 | 
13 | UFS_BR = [
14 |     'AC', 'AL', 'AP', 'AM',
15 |     'BA', 'CE', 'DF', 'ES',
16 |     'GO', 'MA', 'MT', 'MS',
17 |     'MG', 'PA', 'PB', 'PR',
18 |     'PE', 'PI', 'RJ', 'RN',
19 |     'RS', 'RO', 'RR', 'SC',
20 |     'SP', 'SE', 'TO', 'ZZ'
21 | ]
22 | # TURNOS = [1, 2]
23 | TURNOS = [2]
24 | 
25 | 
26 | NUM_TRHEADS = 4
27 | 
28 | # Create a queue to communicate with the worker threads
29 | turnos_uf_queue = queue.Queue()
30 | 
31 | # Configure logging
32 | logging.basicConfig(
33 |     level=logging.INFO,
34 |     format='[%(asctime)s] %(message)s',
35 |     datefmt='%d/%m/%y %H:%M:%S'
36 | )
37 | 
38 | 
39 | def download_file():
40 |     uf_turno = turnos_uf_queue.get()
41 |     url = BASE_URL.format(*uf_turno)
42 |     path = os.path.join('data', 'logs', f'{uf_turno[0]}_{uf_turno[1]}.zip')
43 | 
44 |     logging.info(f'Downloading {url} to {path}')
45 | 
46 |     logging.info(f'Iniciando download de {url}')
47 |     try:
48 |         os.system(f'wget -O {path} {url}')
49 |     except Exception as e:
50 |         logging.error(f"Erro ao tentar baixar o arquivo {url}")
51 |         logging.error(e)
52 |         return
53 | 
54 |     logging.info(f'Finalizado download de {url}')
55 | 
56 |     if turnos_uf_queue.empty():
57 |         logging.info('All downloads finished')
58 |     else:
59 |         logging.info(f'{turnos_uf_queue.qsize()} downloads remaining')
60 |         download_file()
61 | 
62 |     turnos_uf_queue.task_done()
63 |     return
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     ufs_br_download = UFS_BR
68 |     if len(sys.argv) > 1:
69 |         ufs_br_download = sys.argv[1:]
70 | 
71 |     logging.info(f'Iniciando download de {len(ufs_br_download)} arquivos')
72 |     logging.info(f'UFs:    {ufs_br_download}')
73 |     logging.info(f'Turnos: {TURNOS}')
74 | 
75 |     for uf_br, turno in product(ufs_br_download, TURNOS):
76 |         turnos_uf_queue.put((turno, uf_br))
77 | 
78 |     for i in range(NUM_TRHEADS):
79 |         worker = threading.Thread(
80 |             target=download_file,
81 |             daemon=True
82 |         )
83 |         worker.start()
84 | 
85 |     turnos_uf_queue.join()
86 |     logging.info("Done")
87 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Processing Logs of Electronic Ballot Boxes
 3 | This repository contains Python + DuckDB scripts for processing logs from [Brazilian Electronic Ballot Boxes](https://international.tse.jus.br/en/electronic-ballot-box/presentation?set_language=en) to compute several time-related metrics (mean vote time, number of votes computed in 5min, percentage of biometric identification success).
 4 | 
 5 | ## The Data
 6 | The logs from the voting machines can be directly downloaded from the [TSE open data website](https://dadosabertos.tse.jus.br/dataset/resultados-2022-arquivos-transmitidos-para-totalizacao). This repository contains Python scripts that automatically download and extract the logs.
 7 | 
 8 | ## What are the logs of the Electronic Ballot Boxes?
 9 | Files that contain all operations performed on the machine, from the initial setup to the end of voting in the second round (if applicable). The files are stored in plain text, with each line representing an event. See an example below:
10 | 
11 | ```
12 | 21/09/2022 17:21:41	INFO	67305985	LOGD	Start of logd operations	                  FDE9B0FC7A079096
13 | 21/09/2022 17:21:41	INFO	67305985	LOGD	Machine turned on on 21/09/2022 at 17:20:16	B637C17E565B039B
14 | 21/09/2022 17:21:41	INFO	67305985	SCUE	Starting application - Official - 1st round	F82E007ACCAF93A5
15 | 21/09/2022 17:21:41	INFO	67305985	SCUE	Application version: 8.26.0.0 - Jaguar	    D499E9A173814A70
16 | ```
17 | With these logs, it is possible to extract numerous pieces of information about the electoral process. Due to their verbosity, the logs of the Ballot Boxes are very heavy. In their original format, the set of log files for a single Brazilian state can range from 2GB to over 50GB, with all the files combined reaching 450GB! Therefore, robust processing tools and optimized file formats are indispensable.
18 | 
19 | ## Note on Approximations and Errors
20 | Processing the logs of the voting machines is not a simple task.
21 | Although they are easy to read, defining a process that perfectly isolates each vote is a complex task because numerous situations can occur during the voting process.
22 | 
23 | The scripts coded here attempt to be as generic and simple as possible, to facilitate understanding, maintenance, and reduce the computational cost of processing. Therefore, they may occasionally not capture ALL votes perfectly. The error rate (uncaptured votes) considering the official count from the TSE is ~3% (experiment conducted with RN data).
24 | 


--------------------------------------------------------------------------------
/streamlit/app/data.py:
--------------------------------------------------------------------------------
  1 | import duckdb
  2 | 
  3 | ZONE_GROUPS = [ f'{x}-{x+20}' for x in range(0, 800, 20) ]
  4 | UFS = [
  5 |     "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS",
  6 |     "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RS", "RO", "RR", "SC",
  7 |     "SP", "SE", "TO", "ZZ", "ALL"
  8 | ]
  9 | 
 10 | class DuckDBConnector:
 11 |     # Connect to the database
 12 |     # singleton pattern
 13 |     _instance = None
 14 |     def __init__(self) -> None:
 15 |         self.connect()
 16 | 
 17 |     @staticmethod
 18 |     def get_instance():
 19 |         if DuckDBConnector._instance is None:
 20 |             DuckDBConnector._instance = DuckDBConnector()
 21 |         return DuckDBConnector._instance
 22 |     
 23 |     
 24 |     def connect(self):
 25 |         self.cursor = duckdb.connect()
 26 | 
 27 | 
 28 |     def calculate_zone_group(self, zone):
 29 |         if zone == 'ALL':
 30 |             return zone
 31 |         
 32 |         zone = int(zone)
 33 |         ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]
 34 |         for group in ZONE_GROUPS:
 35 |             if zone >= group[0] and zone < group[1]:
 36 |                 return f"{group[0]}-{group[1]}"
 37 | 
 38 | 
 39 |     def get_vote_time_metrics(self, uf, turno, zone, section):
 40 |         table = """
 41 |             read_parquet(
 42 |                 '/src/VOTES_TIME_METRICS.parquet/*/*/*/*.parquet', 
 43 |                 hive_partitioning=True,
 44 |                 hive_types_autocast=0
 45 |             )
 46 |         """
 47 |         zone_group = self.calculate_zone_group(zone)
 48 |         zone = F"{int(zone):04d}" if zone != 'ALL' else zone
 49 |         section = F"{int(section):04d}" if section != 'ALL' else section
 50 | 
 51 |         zone_filter = f"AND zone_code = '{zone}' AND zone_group = '{zone_group}'"
 52 |         if uf == 'ALL':
 53 |             uf = "','".join(UFS)
 54 |         elif zone == 'ALL':
 55 |             zone = "','".join(ZONE_GROUPS)
 56 |             zone_filter = f"AND zone_group in ('{zone}', 'ALL')"
 57 |             
 58 |         query = f"""
 59 |             SELECT *
 60 |             FROM {table}
 61 |             WHERE 1=1
 62 |             AND turno = '{turno}'
 63 |             AND uf in ('{uf}')
 64 |             {zone_filter}
 65 |             AND section_code = '{section}'
 66 |         """
 67 | 
 68 |         data = self.cursor.execute(query).df()
 69 |         return data
 70 |     
 71 | 
 72 |     def get_metrics_over_time(self, uf, turno, zone, section):
 73 |         table = """
 74 |             read_parquet(
 75 |                 '/src/VOTES_TIME_CUMULATIVE_METRICS_OVER_TIME.parquet/*/*/*/*.parquet', 
 76 |                 hive_partitioning=True,
 77 |                 hive_types_autocast=0
 78 |             )
 79 |         """
 80 |         zone_group = self.calculate_zone_group(zone)
 81 |         zone = F"{int(zone):04d}" if zone != 'ALL' else zone
 82 |         section = F"{int(section):04d}" if section != 'ALL' else section
 83 | 
 84 |         fix_zone_code = """
 85 |             CASE WHEN zone_code IS NULL THEN 'ALL'
 86 |                 ELSE zone_code
 87 |             END
 88 |         """
 89 | 
 90 |         query = f"""
 91 |             SELECT *
 92 |             FROM {table}
 93 |             WHERE 1=1
 94 |             AND turno = '{turno}'
 95 |             AND uf in ('{uf}')
 96 |             AND zone_group = '{zone_group}'
 97 |             AND {fix_zone_code} = '{zone}'
 98 |             AND section_code = '{section}'
 99 |             AND timestamp_voto_computado_5min != 'ALL'
100 |         """
101 | 
102 |         data = self.cursor.execute(query).df()
103 |         return data


--------------------------------------------------------------------------------
/streamlit/app/main.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | 
  3 | from widgets import (
  4 |     widget_bignumber_votos, widget_bignumber_secoes, 
  5 |     widget_big_number_tempo_medio, widget_big_number_tempo_medio_bio,
  6 |     widget_big_number_tempo_total_voto,
  7 |     widget_tempo_medio_voto, widget_qtd_votos_intervalo_tempo,
  8 |     widget_numero_votos_intervalo_5min
  9 | )
 10 | 
 11 | UFS = [
 12 |     "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", "MT", "MS",
 13 |     "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", "RS", "RO", "RR", "SC",
 14 |     "SP", "SE", "TO", "ZZ", "ALL"
 15 | ]
 16 | TURNOS = ['1', '2']
 17 | 
 18 | def get_parameters_from_http_query_params():
 19 |     query_parameters = st.query_params
 20 |     select_parameters = lambda x, default, accepted: (
 21 |         default 
 22 |         if x not in query_parameters 
 23 |         else query_parameters[x] if query_parameters[x] in accepted
 24 |         else default
 25 |     )
 26 |     nr_zonas_secoes = [str(x) for x in range(0, 800)]
 27 | 
 28 |     uf =    select_parameters('uf',    'ALL', UFS            )
 29 |     turno = select_parameters('turno',   '1', TURNOS         )
 30 |     zona =  select_parameters('zona',  'ALL', nr_zonas_secoes)
 31 |     secao = select_parameters('secao', 'ALL', nr_zonas_secoes)
 32 |     
 33 |     return uf, turno, zona, secao
 34 | 
 35 | if __name__ == "__main__":
 36 |     st.set_page_config(layout="wide")
 37 | 
 38 |     uf, turno, zona, secao = get_parameters_from_http_query_params()
 39 |     
 40 |     st.title(f'Eleições em Números - Tempo de Votação')
 41 |     subtitulo = ''
 42 |     subtitulo = subtitulo + f' - {uf}' if uf != 'ALL' else subtitulo + " - Brasil"
 43 |     subtitulo = subtitulo + f' - Zona {zona}' if zona != 'ALL' else subtitulo
 44 |     subtitulo = subtitulo + f', Seção {secao}' if secao != 'ALL' else subtitulo
 45 | 
 46 |     col_subtitle, col_change_turn = st.columns([4, 1])
 47 |     # col_subtitle.markdown( subtitulo )
 48 |     # add button to change the turn
 49 | 
 50 |     outro_turno = '1' if turno == '2' else '2'
 51 |     query_parameters = f"?turno={outro_turno}&uf={uf}&zona={zona}&secao={secao}"
 52 |     st.components.v1.html(
 53 |         f"""
 54 |         <div>
 55 |             <a href="{query_parameters}" class="btn btn-primary" role="button" target="_blank" style="text-decoration: none;">
 56 |                 <button 
 57 |                     style="background-color: #F08902; 
 58 |                     color: white; 
 59 |                     padding: 10px 20px; 
 60 |                     font-size: 32px; 
 61 |                     border: none; 
 62 |                     cursor: pointer; 
 63 |                     font-family: sans-serif; 
 64 |                     font-weight: bold;"
 65 |                     height="70px";
 66 |                 >
 67 |                 {turno}º Turno
 68 |                 </button>
 69 |             </a>
 70 |             <p style="font-size: 32px; margin-top: 10px; margin-left: 5px;
 71 |             font-family: sans-serif; font-weight: bold; display: inline-block;">
 72 |                 {subtitulo}
 73 |             </p>
 74 |         </div>
 75 |         """,
 76 |         height=70
 77 |     )
 78 | 
 79 |     # ============================
 80 |     # Big Number Widgets
 81 |     # ============================
 82 | 
 83 |     col_bignumber_votos, col_bignumber_secoes, col_bignumber_tmedio, col_bignumber_tmedio_bio, col_bignumber_tempo_total = st.columns(5)
 84 |     widget_bignumber_votos(col_bignumber_votos, turno, uf, zona, secao)
 85 |     widget_bignumber_secoes(col_bignumber_secoes, turno, uf, zona, secao)
 86 |     widget_big_number_tempo_medio(col_bignumber_tmedio, turno, uf, zona, secao)
 87 |     widget_big_number_tempo_medio_bio(col_bignumber_tmedio_bio, turno, uf, zona, secao)
 88 |     widget_big_number_tempo_total_voto(col_bignumber_tempo_total, turno, uf, zona, secao)
 89 |     st.divider()
 90 | 
 91 |     # =================================
 92 |     # Heatmap and Histogram Widgets
 93 |     # =================================
 94 |     col_map, col_histogram, col_temporal_series = st.columns( [.3, .2, .5] )
 95 |     widget_tempo_medio_voto(col_map, turno, uf, zona, secao)
 96 |     widget_qtd_votos_intervalo_tempo(col_histogram, turno, uf, zona, secao)
 97 |     widget_numero_votos_intervalo_5min(col_temporal_series, turno, uf, zona, secao)
 98 | 
 99 |     st.divider()
100 | 
101 |     # =================================
102 |     # Foot note. Author: João Pedro. Data gathered from TSE Open Data Portal. All code available at github.
103 |     # =================================
104 | 
105 |     st.text('Author: João Pedro. Dados coletados do Portal de Dados Abertos do TSE. All code available at Github.')
106 |     st.text('O projeto é complexo. Os podem não ser 100% precisos.')
107 | 


--------------------------------------------------------------------------------
/src/01_extrair_eventos_relacionados_a_votos.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Extrair apenas eventos relacionados a votos e metadados"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Como o Log das urnas compila todo o histórico de eventos que ocorreu em uma unna, incluindo testes, carga, preparação, etc, para garantir uma consulta mais eficiente, são extraídos apenas os eventos relacionados aos votos em si.\n",
 15 |     "\n"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "metadata": {},
 21 |    "source": [
 22 |     "## Importing libraries"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 1,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import duckdb\n",
 32 |     "import pandas as pd\n",
 33 |     "import time"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "## Importing Data"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 2,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "DATASET = 'ALL_UFS.parquet'"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 3,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "cursor = duckdb.connect()"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "## Preparing Data"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "### Definindo os filtros"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "Colunas relacionadas a metadados da Seção Eleitoral"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "METADATA = [\n",
 89 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Zona Eleitoral%'\",\n",
 90 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Seção Eleitoral%'\",\n",
 91 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Município%'\",\n",
 92 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Local de Votação%'\",\n",
 93 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Turno da UE%'\",\n",
 94 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Identificação do Modelo de Urna%'\"\n",
 95 |     "]"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 4,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "COLUMN_EVENT_DESCRIPTION = 'event_description'\n",
105 |     "\n",
106 |     "EVENTS_DESCRIPTIONS = [\n",
107 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Urna pronta para receber vot%'\",\n",
108 |     "]\n",
109 |     "\n",
110 |     "VOTES_DESCRIPTIONS = [\n",
111 |     "    # VOTOS\n",
112 |     "    F\"{COLUMN_EVENT_DESCRIPTION} = 'Aguardando digitação do título'\",\n",
113 |     "    F\"{COLUMN_EVENT_DESCRIPTION} = 'Título digitado pelo mesário'\",\n",
114 |     "    F\"{COLUMN_EVENT_DESCRIPTION} = 'Eleitor foi habilitado'\",\n",
115 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Voto confirmado par%'\",\n",
116 |     "    F\"{COLUMN_EVENT_DESCRIPTION} = 'O voto do eleitor foi computado'\",\n",
117 |     "    \n",
118 |     "    # BIOMETRIA\n",
119 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE '%Digital%' \",\n",
120 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Dedo reconhecido%' \",\n",
121 |     "    F\"{COLUMN_EVENT_DESCRIPTION} ILIKE 'Solicita digital%' \",\n",
122 |     "    F\"{COLUMN_EVENT_DESCRIPTION} = 'Solicitação de dado pessoal do eleitor para habilitação manual' \",\n",
123 |     "]\n",
124 |     "\n",
125 |     "ACCEPTED_DATES = [\n",
126 |     "    '2022-10-02', '2022-10-30', # Data constitucional da eleição\n",
127 |     "    '2022-10-03', '2022-10-31', # No caso da seção 'virar a noite' e acabar depois da meia noite, imagino que sejam casos RARÍSSIMOS\n",
128 |     "]\n",
129 |     "\n",
130 |     "ALL_FILTERS = METADATA + EVENTS_DESCRIPTIONS + VOTES_DESCRIPTIONS"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "### Construindo e Executando a query"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "**Notas:** \n",
145 |     "\n",
146 |     "**1. Extração de metadados a partir do nome dos arquivos.**\n",
147 |     "    \n",
148 |     "Cada arquivo TSV possui informações de uma Seção Eleitoral (que é a mesma coisa de uma Urna), e o nome do arquivo é a concatenação dos metadados da Seção Eleitoral:\n",
149 |     "\n",
150 |     "    - Os 5 Primeiros Dígitos são o código do Município\n",
151 |     "    - Os 4 Dígitos seguintes são o código da Zona Eleitoral\n",
152 |     "    - Os 4 Dígitos seguintes são o código da Seção Eleitoral\n",
153 |     "\n",
154 |     "**2. Data da Eleição**\n",
155 |     "\n",
156 |     "A Data em que os eventos aconteceram é uma ótima forma de aproximar ainda mais os eventos que têm haver com a votação, uma vez que a votação no Brasil acontece em um único dia - aprende aí EUA ;)"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 5,
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "query = F\"\"\"\n",
166 |     "    SELECT \n",
167 |     "        *\n",
168 |     "    FROM (\n",
169 |     "        SELECT\n",
170 |     "            event_timestamp,\n",
171 |     "            event_timestamp::date AS event_date,\n",
172 |     "            event_type,\n",
173 |     "            some_id,\n",
174 |     "            event_system,\n",
175 |     "            event_description,\n",
176 |     "            event_id,\n",
177 |     "                \n",
178 |     "            REPLACE(SPLIT_PART(filename, '/', 5), '_new.csv', '') AS filename,\n",
179 |     "            \n",
180 |     "            -- Metadata from filename\n",
181 |     "            SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2),  1, 5 ) AS city_code,\n",
182 |     "            SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2),  6, 4 ) AS zone_code,\n",
183 |     "            SUBSTRING( SPLIT_PART(SPLIT_PART(filename, '/', 5), '-', 2), 10, 4 ) AS section_code,\n",
184 |     "            REPLACE(SPLIT_PART(filename, '/', 4), '2_', '') AS uf\n",
185 |     "        FROM\n",
186 |     "            {DATASET}\n",
187 |     "        WHERE 1=1\n",
188 |     "            AND ( {' OR '.join(ALL_FILTERS)} )\n",
189 |     "    ) _\n",
190 |     "    WHERE 1=1\n",
191 |     "    AND event_date IN ({', '.join([F\"'{date}'\" for date in ACCEPTED_DATES])})\n",
192 |     "\"\"\""
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "Para facilitar consultas, os arquivos parquet são particionados por DATA DO EVENTO e UF."
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 6,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "query = F\"\"\"\n",
209 |     "    COPY ({query}) TO 'UFS_VOTE_EVENTS.parquet' (FORMAT 'parquet', PARTITION_BY (event_date, uf), OVERWRITE_OR_IGNORE 1);\n",
210 |     "\"\"\""
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": 7,
216 |    "metadata": {},
217 |    "outputs": [
218 |     {
219 |      "data": {
220 |       "application/vnd.jupyter.widget-view+json": {
221 |        "model_id": "ca3b1617f6524b85b061c9579b6cc506",
222 |        "version_major": 2,
223 |        "version_minor": 0
224 |       },
225 |       "text/plain": [
226 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
227 |       ]
228 |      },
229 |      "metadata": {},
230 |      "output_type": "display_data"
231 |     },
232 |     {
233 |      "name": "stdout",
234 |      "output_type": "stream",
235 |      "text": [
236 |       "Time 1027.0729978084564s\n"
237 |      ]
238 |     }
239 |    ],
240 |    "source": [
241 |     "tic = time.time()\n",
242 |     "cursor.execute(query)\n",
243 |     "toc = time.time()\n",
244 |     "\n",
245 |     "print(F\"Time {toc - tic}s\")"
246 |    ]
247 |   }
248 |  ],
249 |  "metadata": {
250 |   "kernelspec": {
251 |    "display_name": "base",
252 |    "language": "python",
253 |    "name": "python3"
254 |   },
255 |   "language_info": {
256 |    "codemirror_mode": {
257 |     "name": "ipython",
258 |     "version": 3
259 |    },
260 |    "file_extension": ".py",
261 |    "mimetype": "text/x-python",
262 |    "name": "python",
263 |    "nbconvert_exporter": "python",
264 |    "pygments_lexer": "ipython3",
265 |    "version": "3.11.5"
266 |   }
267 |  },
268 |  "nbformat": 4,
269 |  "nbformat_minor": 2
270 | }
271 | 


--------------------------------------------------------------------------------
/src/03B_calcular_metricas_temporais.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Calcular Métricas - Tempo de Votação, Biometria, etc."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "---"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "## Importing libraries"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 7,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import duckdb\n",
 31 |     "import pandas as pd\n",
 32 |     "import time"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "## Importing Data"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 8,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "cursor = duckdb.connect()"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "Dados Brutos"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 9,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "TABLE = \"read_parquet('VOTES.parquet/*/*/*/*.parquet', hive_partitioning=True)\"\n",
 65 |     "ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "Adicionar TURNO e Timestamp final de Biometria"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 4,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "source = F\"\"\"\n",
 82 |     "(\n",
 83 |     "    SELECT \n",
 84 |     "        *,\n",
 85 |     "        \n",
 86 |     "        CASE event_date\n",
 87 |     "            WHEN '2022-10-02' THEN 1\n",
 88 |     "            WHEN '2022-10-03' THEN 1\n",
 89 |     "            WHEN '2022-10-30' THEN 2\n",
 90 |     "            WHEN '2022-10-31' THEN 2\n",
 91 |     "            ELSE NULL\n",
 92 |     "        END::INT AS turno,\n",
 93 |     "\n",
 94 |     "        COALESCE(\n",
 95 |     "            timestamp_biometria_1,\n",
 96 |     "            timestamp_biometria_2,\n",
 97 |     "            timestamp_biometria_3,\n",
 98 |     "            timestamp_biometria_4,\n",
 99 |     "            timestamp_biometria_manual\n",
100 |     "        ) AS timestamp_biometria_final,\n",
101 |     "\n",
102 |     "        strftime( '%Y-%m-%d %H:', timestamp_voto_computado )\n",
103 |     "        || (EXTRACT(MINUTE FROM timestamp_voto_computado)//5)*5 + 5\n",
104 |     "        || ':00' AS timestamp_voto_computado_5min\n",
105 |     "        \n",
106 |     "    FROM \n",
107 |     "        {TABLE}\n",
108 |     "    -- WHERE uf='DF'\n",
109 |     ") _\n",
110 |     "\"\"\""
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "## Preparinga Data"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "Méticas no Cubo OLAP - Turno, UF, Zona, Seção.\n",
125 |     "\n",
126 |     "- Número de Votos\n",
127 |     "- Número de Seções Eleitorais\n",
128 |     "- Média, Soma, q50%, q90% do Tempo total de Voto, Tempo de Biometria, Tempo Total\n",
129 |     "\n",
130 |     "- Quantidade de Votos efetuados em até 30s, 1min, 1min30s, 2min, 2min30s, 3min+\n",
131 |     "- Taxa de Sucesso da Biometria em 1 tentativa, 2 tentativas, 3 tentativas, 4 tentativas, Falha\n",
132 |     "- Quantidade de Teclas Pressionadas\n",
133 |     "- Quantidade de Cargos Distintos Votados"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "**Definição das métricas de tempo**"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 5,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "tempo_voto_total = \"EXTRACT(EPOCH FROM (timestamp_voto_computado  - timestamp_titulo_digitado))\"\n",
150 |     "tempo_voto       = \"EXTRACT(EPOCH FROM (timestamp_voto_computado  - timestamp_habilitacao_eleitor))\"\n",
151 |     "tempo_biometria  = \"EXTRACT(EPOCH FROM (timestamp_biometria_final - timestamp_titulo_digitado))\"\n",
152 |     "\n",
153 |     "fix_null_values = lambda column: F\"COALESCE({column}::VARCHAR(20), 'ALL')\""
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 6,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "query_metrics = F\"\"\"\n",
163 |     "    SELECT\n",
164 |     "        {fix_null_values('turno') } AS turno,\n",
165 |     "        {fix_null_values('timestamp_voto_computado_5min') } AS timestamp_voto_computado_5min,\n",
166 |     "        {fix_null_values('uf') } AS uf,\n",
167 |     "        zone_code,\n",
168 |     "        {fix_null_values('section_code') } AS section_code,\n",
169 |     "\n",
170 |     "        COUNT(*) AS total_votos,\n",
171 |     "        SUM( {tempo_voto} ) AS tempo_voto_soma,\n",
172 |     "        SUM( {tempo_biometria} ) AS tempo_biometria_soma,\n",
173 |     "        SUM( {tempo_voto_total} ) AS tempo_voto_total_soma,\n",
174 |     "        \n",
175 |     "    FROM\n",
176 |     "        {source}\n",
177 |     "    WHERE quantidade_votos_computados = 1\n",
178 |     "    GROUP BY ROLLUP(turno, timestamp_voto_computado_5min, uf, zone_code, section_code)\n",
179 |     "\"\"\""
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {},
185 |    "source": [
186 |     "**Salvar resultado intermediário**"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 7,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "query = F\"\"\"\n",
196 |     "    COPY (\n",
197 |     "    {\n",
198 |     "        query_metrics\n",
199 |     "    } )\n",
200 |     "    TO 'VOTES_TIME_METRICS_OVER_TIME.parquet' \n",
201 |     "    (FORMAT 'parquet', PARTITION_BY (turno, uf), OVERWRITE_OR_IGNORE 1);\n",
202 |     "\"\"\""
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 8,
208 |    "metadata": {},
209 |    "outputs": [
210 |     {
211 |      "data": {
212 |       "application/vnd.jupyter.widget-view+json": {
213 |        "model_id": "a86d242b27054c0683c1dca6f79697d6",
214 |        "version_major": 2,
215 |        "version_minor": 0
216 |       },
217 |       "text/plain": [
218 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
219 |       ]
220 |      },
221 |      "metadata": {},
222 |      "output_type": "display_data"
223 |     }
224 |    ],
225 |    "source": [
226 |     "cursor.execute(query)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {},
232 |    "source": [
233 |     "Cálculo cumulativo das métricas"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 11,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "TABLE_METRICS = \"\"\"read_parquet(\n",
243 |     "    'VOTES_TIME_METRICS_OVER_TIME.parquet/*/*/*.parquet', \n",
244 |     "    hive_partitioning=True,\n",
245 |     "    hive_types_autocast =0\n",
246 |     ")\n",
247 |     "\"\"\""
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": 12,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "query_cumulative_metrics = F\"\"\"\n",
257 |     "    SELECT\n",
258 |     "        *,\n",
259 |     "        SUM(total_votos) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS total_votos_cumulativo,\n",
260 |     "        SUM(tempo_voto_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_voto_soma_cumulativo,\n",
261 |     "        SUM(tempo_biometria_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_biometria_soma_cumulativo,\n",
262 |     "        SUM(tempo_voto_total_soma) OVER (PARTITION BY turno, uf, zone_code, section_code ORDER BY timestamp_voto_computado_5min) AS tempo_voto_total_soma_cumulativo\n",
263 |     "    FROM\n",
264 |     "        {TABLE_METRICS}\n",
265 |     "\"\"\""
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {},
271 |    "source": [
272 |     "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL para agilizar a leitura dos dados pelo Dashboard.\n",
273 |     "\n",
274 |     "As ZONAS foram agrupadas em grupos de 20, esse número é empírico."
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 13,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "query_metrics_with_zone_group = F\"\"\"\n",
284 |     "    SELECT\n",
285 |     "    *,\n",
286 |     "    CASE\n",
287 |     "        {\n",
288 |     "            \"\".join(\n",
289 |     "                [\n",
290 |     "                    f\"WHEN zone_code IS NOT NULL AND zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n",
291 |     "                    for min_zone, max_zone in ZONE_GROUPS\n",
292 |     "                ]\n",
293 |     "            )\n",
294 |     "        }\n",
295 |     "        ELSE 'ALL'\n",
296 |     "    END AS zone_group\n",
297 |     "    FROM (\n",
298 |     "        {query_cumulative_metrics}\n",
299 |     "    ) _\n",
300 |     "\"\"\""
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 14,
306 |    "metadata": {},
307 |    "outputs": [],
308 |    "source": [
309 |     "query = F\"\"\"\n",
310 |     "    COPY (\n",
311 |     "    {\n",
312 |     "        query_metrics_with_zone_group\n",
313 |     "    } )\n",
314 |     "    TO 'VOTES_TIME_CUMULATIVE_METRICS_OVER_TIME.parquet' \n",
315 |     "    (FORMAT 'parquet', PARTITION_BY (turno, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n",
316 |     "\"\"\""
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 15,
322 |    "metadata": {},
323 |    "outputs": [
324 |     {
325 |      "data": {
326 |       "application/vnd.jupyter.widget-view+json": {
327 |        "model_id": "b9ea3c1f88764a56a54bce624acaf93e",
328 |        "version_major": 2,
329 |        "version_minor": 0
330 |       },
331 |       "text/plain": [
332 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
333 |       ]
334 |      },
335 |      "metadata": {},
336 |      "output_type": "display_data"
337 |     },
338 |     {
339 |      "data": {
340 |       "text/plain": [
341 |        "<duckdb.duckdb.DuckDBPyConnection at 0x7f6ba1b9ed30>"
342 |       ]
343 |      },
344 |      "execution_count": 15,
345 |      "metadata": {},
346 |      "output_type": "execute_result"
347 |     }
348 |    ],
349 |    "source": [
350 |     "cursor.execute(query)"
351 |    ]
352 |   }
353 |  ],
354 |  "metadata": {
355 |   "kernelspec": {
356 |    "display_name": "base",
357 |    "language": "python",
358 |    "name": "python3"
359 |   },
360 |   "language_info": {
361 |    "codemirror_mode": {
362 |     "name": "ipython",
363 |     "version": 3
364 |    },
365 |    "file_extension": ".py",
366 |    "mimetype": "text/x-python",
367 |    "name": "python",
368 |    "nbconvert_exporter": "python",
369 |    "pygments_lexer": "ipython3",
370 |    "version": "3.11.5"
371 |   }
372 |  },
373 |  "nbformat": 4,
374 |  "nbformat_minor": 2
375 | }
376 | 


--------------------------------------------------------------------------------
/src/02_isolar_timestamps_eventos.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Calcular o Tempo de Voto"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Este notebook tem como objetivo calcular o tempo de voto de um eleitor. \n",
 15 |     "O primeiro passo é definir exatamente o que é um voto, dado que o log das urnas contém apenas uma sequência de eventos.\n",
 16 |     "\n",
 17 |     "Na sequência, os votos são individualizados (um por linha) e o tempo de cada evento relevante é calculado."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Importing libraries"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 28,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "import duckdb\n",
 34 |     "import pandas as pd\n",
 35 |     "import time"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "## Importing Data"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 29,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "cursor = duckdb.connect()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 31,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "TABLE = \"read_parquet('UFS_VOTE_EVENTS.parquet/*/*/*.parquet', hive_partitioning=True)\""
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 32,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "source_data = f\"\"\"\n",
 70 |     "    (\n",
 71 |     "        SELECT\n",
 72 |     "        *\n",
 73 |     "        FROM {TABLE}\n",
 74 |     "    ) AS source\n",
 75 |     "\"\"\""
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "metadata": {},
 81 |    "source": [
 82 |     "## Preparinga Data"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "### Criando um ID único para cada voto"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "Como heurística, vamos criar um id único para cada voto, que será determinado a partir de uma operação 'âncora'.\n",
 97 |     "\n",
 98 |     "A operação servirá como marcação de que um voto foi iniciado e, todas as linhas entre uma operação âncora e a próxima, serão consideradas como um único voto."
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 33,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "ANCHOR_OPERATION = 'Aguardando digitação do título'\n",
108 |     "ZONE_GROUPS = [ (0, 100), (101, 200), (201, 300), (301, 400), (401, 500) ]"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "Após uma exploração dos LOGS, a operação escolhida foi 'AGUARDANDO DIGITAÇÃO DO TÍTULO', exatamente por ser o PRIMEIRO e OBRIAGTÓRIO passo para que um voto seja autorizado."
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 34,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "query_create_id = f\"\"\"\n",
125 |     "    (\n",
126 |     "        SELECT\n",
127 |     "            *,\n",
128 |     "            SUM(CASE WHEN event_description = '{ANCHOR_OPERATION}' THEN 1 ELSE 0 END) \n",
129 |     "            OVER (PARTITION BY event_date, uf, filename ORDER BY event_timestamp) AS vote_id,\n",
130 |     "            \n",
131 |     "            CASE\n",
132 |     "                {\n",
133 |     "                    \"\".join(\n",
134 |     "                        [\n",
135 |     "                            f\"WHEN zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n",
136 |     "                            for min_zone, max_zone in ZONE_GROUPS\n",
137 |     "                        ]\n",
138 |     "                    )\n",
139 |     "                }\n",
140 |     "            END AS zone_group\n",
141 |     "\n",
142 |     "        FROM {source_data}\n",
143 |     "        WHERE \n",
144 |     "        uf = '<uf>' \n",
145 |     "        AND event_date = '<event_date>'\n",
146 |     "        AND zone_code::INT BETWEEN <zone_id_min> AND <zone_id_max>\n",
147 |     "    ) AS query_vote_id\n",
148 |     "\"\"\""
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "### Pivotando Timestamp dos eventos por id"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {},
161 |    "source": [
162 |     "Para calcular o tempo dos votos e dos eventos individuais que o compõem (biometria, voto) é necessário extrair o timestamp de cada evento."
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 35,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "timestamp_inicio_fim_voto = [\n",
172 |     "    f'''\n",
173 |     "        MAX(\n",
174 |     "            CASE WHEN event_description = 'Título digitado pelo mesário' THEN event_timestamp ELSE NULL END \n",
175 |     "        ) AS timestamp_titulo_digitado\n",
176 |     "    ''',\n",
177 |     "    f'''\n",
178 |     "        MAX(\n",
179 |     "            CASE WHEN event_description = 'O voto do eleitor foi computado' THEN event_timestamp ELSE NULL END \n",
180 |     "        ) AS timestamp_voto_computado\n",
181 |     "    '''\n",
182 |     "]"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 36,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "VOTE_EVENTS = [\n",
192 |     "    'Voto confirmado para [Conselheiro Distrital]',\n",
193 |     "    'Voto confirmado para [Deputado Distrital]',\n",
194 |     "    'Voto confirmado para [Deputado Estadual]',\n",
195 |     "    'Voto confirmado para [Deputado Federal]',\n",
196 |     "    'Voto confirmado para [Governador]',\n",
197 |     "    'Voto confirmado para [Prefeito]',\n",
198 |     "    'Voto confirmado para [Presidente]',\n",
199 |     "    'Voto confirmado para [Senador]',\n",
200 |     "]\n",
201 |     "\n",
202 |     "timestamp_vote_events = [\n",
203 |     "    f'''\n",
204 |     "        MAX(\n",
205 |     "            CASE WHEN event_description = \\'{event}\\' THEN event_timestamp ELSE NULL END \n",
206 |     "        ) AS timestamp_voto_{event.replace(\"Voto confirmado para [\", \"\").replace(\"]\", \"\").lower().replace(' ', '_')}\n",
207 |     "    '''\n",
208 |     "    for event in VOTE_EVENTS\n",
209 |     "]"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 37,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "BIOMETRIA_TENTATIVAS = [\n",
219 |     "    'Solicita digital. Tentativa [1] de [4]',\n",
220 |     "    'Solicita digital. Tentativa [2] de [4]',\n",
221 |     "    'Solicita digital. Tentativa [3] de [4]',\n",
222 |     "    'Solicita digital. Tentativa [4] de [4]',\n",
223 |     "    'Solicitação de dado pessoal do eleitor para habilitação manual',\n",
224 |     "    'Eleitor foi habilitado'\n",
225 |     "]\n",
226 |     "\n",
227 |     "timestamp_biometria_tentativas = [\n",
228 |     "    f'''\n",
229 |     "        MAX(\n",
230 |     "            CASE WHEN event_description = \\'{event}\\' THEN event_timestamp ELSE NULL END \n",
231 |     "        ) AS timestamp_biometria_{event.replace(\"Solicita digital. Tentativa [\", \"\").replace(\"] de [4]\", \"\").lower()}\n",
232 |     "    '''\n",
233 |     "    for event in BIOMETRIA_TENTATIVAS\n",
234 |     "    if event.startswith('Solicita digital')\n",
235 |     "] + [\n",
236 |     "    f'''\n",
237 |     "        MAX(\n",
238 |     "            CASE WHEN event_description = \\'{BIOMETRIA_TENTATIVAS[-2]}\\' THEN event_timestamp ELSE NULL END \n",
239 |     "        ) AS timestamp_biometria_manual\n",
240 |     "    '''\n",
241 |     "] + [\n",
242 |     "    f'''\n",
243 |     "        MAX(\n",
244 |     "            CASE WHEN event_description = \\'{BIOMETRIA_TENTATIVAS[-1]}\\' THEN event_timestamp ELSE NULL END \n",
245 |     "        ) AS timestamp_habilitacao_eleitor\n",
246 |     "    '''\n",
247 |     "]\n",
248 |     "    "
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": 38,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "query_pivot_timestamps = f\"\"\"(\n",
258 |     "    SELECT\n",
259 |     "        event_date, uf, filename, vote_id,\n",
260 |     "        \n",
261 |     "        MAX(city_code) AS city_code,\n",
262 |     "        MAX(zone_code) AS zone_code,\n",
263 |     "        MAX(zone_group) AS zone_group,\n",
264 |     "        MAX(section_code) AS section_code,\n",
265 |     "\n",
266 |     "        SUM( (event_description='O voto do eleitor foi computado')::INT ) AS quantidade_votos_computados,\n",
267 |     "        SUM( (event_description ILIKE 'Solicita digital%')::INT ) AS quantidade_solicitacoes_biometria,\n",
268 |     "        SUM( (event_description ILIKE 'Voto confirmado para%')::INT ) AS quantidade_cargos_votados,\n",
269 |     "        MAX( (event_description='Solicitação de dado pessoal do eleitor para habilitação manual')::INT ) AS biometria_nao_funcionou,\n",
270 |     "\n",
271 |     "        MIN( event_timestamp ) AS timestamp_primeiro_evento,\n",
272 |     "\n",
273 |     "        {', '.join(timestamp_vote_events+timestamp_biometria_tentativas+timestamp_inicio_fim_voto)}\n",
274 |     "        \n",
275 |     "    FROM {query_create_id}\n",
276 |     "    GROUP BY event_date, uf, filename, vote_id\n",
277 |     ")\n",
278 |     "\"\"\""
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "### Construindo e Executando a query"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL por duas razões:\n",
293 |     "\n",
294 |     "    - Facilitar a leitura dos dados posteriormente\n",
295 |     "    - Permitir a execução da query em partes, evitando a sobrecarga de memória ao processar todos os dados de uma vez\n",
296 |     "\n",
297 |     "As ZONAS foram agrupadas em grupos de 100, esse número é empírico, pensado para abarcar a grande maioria das UFs em um único grupo, já que a grande maioria dos estados não pssui mais de 100 zonas eleitorais, e dividir as UFs mais populosas em grupos menores."
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": null,
303 |    "metadata": {},
304 |    "outputs": [],
305 |    "source": [
306 |     "ACCEPTED_DATES = [\n",
307 |     "    '2022-10-02', '2022-10-30', \n",
308 |     "    '2022-10-03', '2022-10-31',\n",
309 |     "]\n",
310 |     "UFS = [\n",
311 |     "    'AC', 'AL', 'AM', 'AP', \n",
312 |     "    'BA', \n",
313 |     "    'CE', 'DF', 'ES', 'GO', \n",
314 |     "    'MT', 'PA', 'PB', 'PE', \n",
315 |     "    'MA',\n",
316 |     "    \n",
317 |     "    'MG', 'MS', \n",
318 |     "    'PI', 'PR', 'RJ', 'RN', \n",
319 |     "    'RO', 'RR', 'RS', 'SC', \n",
320 |     "    'SE', 'SP', 'TO', 'ZZ'\n",
321 |     "]\n",
322 |     "\n",
323 |     "PROCESSING_TIMES = []\n",
324 |     "\n",
325 |     "for uf in UFS:\n",
326 |     "    for date in ACCEPTED_DATES:\n",
327 |     "        for zone_group in ZONE_GROUPS:\n",
328 |     "\n",
329 |     "            \n",
330 |     "            query = F\"\"\"\n",
331 |     "                COPY \n",
332 |     "                {\n",
333 |     "                    query_pivot_timestamps\n",
334 |     "                    .replace('<uf>', uf)\n",
335 |     "                    .replace('<event_date>', date)\n",
336 |     "                    .replace('<zone_id_min>', str(zone_group[0]))\n",
337 |     "                    .replace('<zone_id_max>', str(zone_group[1]))\n",
338 |     "                } \n",
339 |     "                TO 'VOTES.parquet' \n",
340 |     "                (FORMAT 'parquet', PARTITION_BY (event_date, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n",
341 |     "            \"\"\"\n",
342 |     "            \n",
343 |     "            print(\"Processing \", uf, date)\n",
344 |     "            tic = time.time()\n",
345 |     "            cursor.execute(query)\n",
346 |     "            toc = time.time()\n",
347 |     "            print(F\"Time for {uf} {date} {zone_group}: {toc-tic}\")\n",
348 |     "\n",
349 |     "            PROCESSING_TIMES.append({\n",
350 |     "                'uf': uf,\n",
351 |     "                'date': date,\n",
352 |     "                'zone_group': zone_group,\n",
353 |     "                'time': toc-tic\n",
354 |     "            })"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "markdown",
359 |    "metadata": {},
360 |    "source": [
361 |     "Salvando o resultado dos tempos de processamento."
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": 42,
367 |    "metadata": {},
368 |    "outputs": [],
369 |    "source": [
370 |     "PROCESSING_TIMES\n",
371 |     "\n",
372 |     "# convert to pandas and save as csv\n",
373 |     "df_processing_times = pd.DataFrame(PROCESSING_TIMES)\n",
374 |     "df_processing_times.to_csv('processing_times.csv', index=False)"
375 |    ]
376 |   }
377 |  ],
378 |  "metadata": {
379 |   "kernelspec": {
380 |    "display_name": "base",
381 |    "language": "python",
382 |    "name": "python3"
383 |   },
384 |   "language_info": {
385 |    "codemirror_mode": {
386 |     "name": "ipython",
387 |     "version": 3
388 |    },
389 |    "file_extension": ".py",
390 |    "mimetype": "text/x-python",
391 |    "name": "python",
392 |    "nbconvert_exporter": "python",
393 |    "pygments_lexer": "ipython3",
394 |    "version": "3.11.5"
395 |   }
396 |  },
397 |  "nbformat": 4,
398 |  "nbformat_minor": 2
399 | }
400 | 


--------------------------------------------------------------------------------
/streamlit/app/widgets.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import geopandas as gpd
  3 | import pandas as pd
  4 | import datetime
  5 | import re
  6 | import io
  7 | import streamlit as st
  8 | import seaborn as sns
  9 | 
 10 | from maps import add_ufs_and_links_to_map, load_brazil_simplified_map, load_ufs_city_simplified_map
 11 | from data import DuckDBConnector
 12 | import numpy as np
 13 | 
 14 | import matplotlib.pyplot as plt
 15 | from matplotlib.colors import LinearSegmentedColormap
 16 | 
 17 | @st.cache_resource
 18 | def get_duckdb_connector():
 19 |     return DuckDBConnector.get_instance()
 20 | 
 21 | PRIMARY_COLOR    = "#0B1D51"
 22 | HIGHLIGHT_COLOR  = "#F08902"
 23 | 
 24 | # Seaborn set theme
 25 | # no grid
 26 | # gray background
 27 | sns.set_style("whitegrid")
 28 | sns.set_theme(style='whitegrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None)
 29 | 
 30 | def format_number_mi_mil(number):
 31 |     number_mi = number//1e6
 32 |     number_mil = (number - number_mi*1e6) / 1e3
 33 | 
 34 |     number_formatted = f"{number_mi:.0f} Mihão" if number_mi > 0 else ''
 35 |     if number_mil > 0:
 36 |         number_formatted  += f" {number_mil:.0f} Mil"
 37 |     elif number_mil > 0:
 38 |         number_formatted = str(number_mil).replace('.', ',')
 39 |         number_formatted = number_formatted[:number_formatted.index(',')+2] + ' Mil'
 40 |     number_formatted = number_formatted.strip()
 41 |     return number_formatted
 42 | 
 43 | 
 44 | def format_time(time_in_seconds):
 45 | 
 46 |     years = time_in_seconds // (365 * 24 * 3600)
 47 |     time_in_seconds = time_in_seconds % (365 * 24 * 3600)
 48 |     months = time_in_seconds // (30 * 24 * 3600)
 49 |     time_in_seconds = time_in_seconds % (30 * 24 * 3600)
 50 |     days = time_in_seconds // (24 * 3600)
 51 |     time_in_seconds = time_in_seconds % (24 * 3600)
 52 |     hours = time_in_seconds // 3600
 53 |     time_in_seconds %= 3600
 54 |     minutes = time_in_seconds // 60
 55 |     seconds = time_in_seconds % 60
 56 | 
 57 |     days = int(days)
 58 |     hours = int(hours)
 59 |     minutes = int(minutes)
 60 |     seconds = int(seconds)
 61 | 
 62 |     time_formated = ""
 63 |     if seconds > 0:
 64 |         time_formated += f"{seconds:.0f}s"
 65 |     if minutes > 0:
 66 |         time_formated = f"{minutes:.0f}m " + time_formated
 67 |     if hours > 0:
 68 |         time_formated = f"{hours:.0f}h " + time_formated
 69 |     if days > 0:
 70 |         time_formated = f"{days:.0f} dias " + time_formated
 71 |     if months > 0:
 72 |         time_formated = f"{months:.0f} Meses " + time_formated
 73 |         if months == 1:
 74 |             time_formated = time_formated.replace('Meses', 'Mês')
 75 |     if years > 0:
 76 |         time_formated = f"{years:.0f} Anos " + time_formated
 77 |         if years == 1:
 78 |             time_formated = time_formated.replace('Anos', 'Ano')
 79 | 
 80 |         # Remover horas, minutos e segundos
 81 |         time_formated = re.sub(r'\d+[hms]', '', time_formated)
 82 | 
 83 |     return time_formated
 84 | 
 85 | 
 86 | def format_number(number):
 87 |     return (
 88 |         f"{number//1e6:.0f} Mi" 
 89 |         if number >= 1e6 else f"{number//1e3:.0f} Mil" 
 90 |         if number >= 1e3 else f"{number:.0f}"
 91 |     )
 92 | 
 93 | 
 94 | def widget_numero_votos_intervalo_5min(container, turno, uf, zona, secao):
 95 |     
 96 |     metrics_df = get_duckdb_connector().get_metrics_over_time(uf, turno, zona, secao)
 97 |     metrics_df['timestamp_voto_computado_5min'] = pd.to_datetime(metrics_df['timestamp_voto_computado_5min'])
 98 |     metrics_df = metrics_df.sort_values('timestamp_voto_computado_5min')
 99 |     metrics_df = metrics_df.fillna( pd.NaT )
100 | 
101 |     # define x and y
102 |     y_metric = metrics_df['total_votos'].astype(int)
103 | 
104 |     # Get the maximum value of y
105 |     # and the corresponding x value
106 |     # ------------------------------
107 |     x_value_max_y, max_y = metrics_df.loc[y_metric.idxmax(), ['timestamp_voto_computado_5min', 'total_votos']]
108 |     x_value_max_y_formatted = x_value_max_y.strftime('%H:%M')
109 |     max_y_formatted = format_number_mi_mil(max_y)
110 | 
111 |     # lineplot with time series
112 |     FIGSIZE = (10, 5)
113 |     fig, ax = plt.subplots( figsize=FIGSIZE )
114 | 
115 |     # pegar só horas fechadas e 30min
116 |     x_axis_values = (
117 |         metrics_df
118 |         .query("timestamp_voto_computado_5min.dt.minute == 0")
119 |         ['timestamp_voto_computado_5min']
120 |     )
121 |     x_axis_labels = x_axis_values.dt.strftime('%H:%M')
122 | 
123 |     if uf in ['ALL', 'SP', 'MG']:
124 |         y_axis_values = [ 5e4, 1e5, 2.5e5, 5e5, 7.5e5, 1e6 ]
125 |     else:
126 |         y_axis_values = [ 1e3, 3e3, 5e3, 1e4, 1.5e4, 2e4, 5e4, 1e5, 5e5 ]
127 |     y_axis_labels = [format_number(y) for y in y_axis_values]
128 | 
129 |     sns.lineplot(
130 |         x=metrics_df['timestamp_voto_computado_5min'],
131 |         y=y_metric,
132 |         ax=ax,
133 |         color=PRIMARY_COLOR
134 |     )
135 | 
136 |     # Fill area under the line
137 |     # ------------------------
138 |     ax.fill_between(
139 |         metrics_df['timestamp_voto_computado_5min'],
140 |         y_metric,
141 |         0,
142 |         zorder=0,
143 |         alpha=0.5,
144 |         color=PRIMARY_COLOR
145 |     )
146 | 
147 |     # Add vertical line at the maximum value
148 |     # --------------------------------------
149 |     ax.axvline(
150 |         x=metrics_df.loc[y_metric.idxmax(), 'timestamp_voto_computado_5min'],
151 |         color=HIGHLIGHT_COLOR,
152 |         ymin=0,
153 |         ymax=1,
154 |         linestyle='-',
155 |         linewidth=2
156 |     )
157 | 
158 |     # Add a box in the line with the maximum value
159 |     # left aligned
160 |     # --------------------------------------------
161 |     ax.text(
162 |         x_value_max_y,
163 |         0.9*max_y,
164 |         f"{max_y_formatted}",
165 |         color='white',
166 |         fontsize=10,
167 |         ha='left',
168 |         va='center',
169 |         bbox=dict(facecolor=HIGHLIGHT_COLOR, alpha=1)
170 |     )
171 | 
172 | 
173 |     ax.set_xticks(x_axis_values)
174 |     ax.set_xticklabels(x_axis_labels, rotation=45, ha='right', fontsize=10)
175 | 
176 |     # remove right and top spines
177 |     ax.spines['right'].set_visible(False)
178 |     ax.spines['left'].set_visible(False)
179 |     ax.spines['bottom'].set_visible(False)
180 |     ax.spines['top'].set_visible(False)
181 | 
182 |     ax.set_yticks(y_axis_values)
183 |     ax.set_yticklabels(y_axis_labels, fontsize=10)
184 |     # add horizontal grid lines on the y axis
185 |     # in the background
186 |     ax.yaxis.grid(True, linestyle='-', alpha=1)
187 |     # remove x grid lines
188 |     ax.xaxis.grid(False)
189 |     # remove x and y labels
190 |     ax.set_xlabel('')
191 |     ax.set_ylabel('')
192 |     # set y limit
193 |     ax.set_ylim(0, max_y)
194 |     
195 |     container.markdown('#### Número de votos efetuados a cada 5min')
196 |     container.pyplot(fig)
197 |     container.markdown(f'#### Às {x_value_max_y_formatted}, houve o pico de votos, com **{max_y_formatted}** computados em 5 minutos!')
198 | 
199 | 
200 | def widget_tempo_medio_voto(container, turno, uf, zona, secao):
201 | 
202 |     if uf=='ALL':
203 |         widget_heatmap_tempo_medio_voto_mapa(container, turno, uf, zona, secao)
204 |     elif zona=='ALL':
205 |         widget_tabela_tempo_medio_zonas(container, turno, uf, zona, secao)
206 | 
207 | 
208 | def widget_tabela_tempo_medio_zonas( container, turno, uf, zona, secao ):
209 | 
210 |     map_gdf = load_brazil_simplified_map()
211 |     map_gdf = map_gdf.query(f"SIGLA_UF == '{uf}'")
212 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
213 |     metrics_df = metrics_df[ ['zone_group', 'zone_code', 'total_votos', 'tempo_voto_medio'] ]
214 |     metrics_df_all_zones = metrics_df.query("zone_code == 'ALL'")
215 |     metrics_df = metrics_df.query("zone_code != 'ALL'")
216 | 
217 |     unique_zone_groups = list(metrics_df['zone_group'].unique())
218 |     unique_zone_groups.sort( key=lambda x: int(x.split('-')[0]) )
219 | 
220 |     # plot a small map with the selected UF
221 |     fig, ax = plt.subplots( figsize=(1, 1) )
222 |     map_gdf.plot(ax=ax, color=HIGHLIGHT_COLOR)
223 |     ax.axis('off')
224 |     # add the sigla of the UF
225 |     ax.text(
226 |         map_gdf.centroid.x.values[0],
227 |         map_gdf.centroid.y.values[0],
228 |         uf,
229 |         fontsize=8,
230 |         weight='bold',
231 |         ha='center',
232 |         va='center',
233 |         color='white'
234 |     )
235 | 
236 |     x=.15
237 |     col_map_uf, col_title = container.columns( [x, 1-x] )
238 |     col_map_uf.pyplot(fig, use_container_width=True)
239 |     col_title.markdown(f"### Detalhamento por Zona \n")
240 |     
241 |     zone_gorup_tabs = container.tabs( unique_zone_groups )
242 |     for zone_group, zone_group_tab in zip(unique_zone_groups, zone_gorup_tabs):
243 | 
244 |         metrics_df_zone_group = metrics_df.query(f"zone_group == '{zone_group}'")
245 |         top_3_most_last_zones = metrics_df_zone_group.sort_values('tempo_voto_medio', ascending=False).head(3)['zone_code'].values
246 | 
247 |         metrics_df_zone_group = metrics_df_zone_group.sort_values('zone_code')
248 |         metrics_df_zone_group['tempo_voto_medio'] = metrics_df_zone_group['tempo_voto_medio'].apply(format_time)
249 |         metrics_df_zone_group['total_votos'] = metrics_df_zone_group['total_votos'].apply(format_number)
250 | 
251 |         # add medals to the top 3 most last zones
252 |         # in the tempo_voto_medio column
253 | 
254 |         for medal, zone in zip(['🥇', '🥈', '🥉'], top_3_most_last_zones):
255 |             metrics_df_zone_group.loc[metrics_df_zone_group['zone_code'] == zone, 'tempo_voto_medio'] = medal \
256 |             + ' ' + metrics_df_zone_group.loc[metrics_df_zone_group['zone_code'] == zone, 'tempo_voto_medio']
257 |             
258 |         
259 |         metrics_df_zone_group = metrics_df_zone_group.rename(
260 |             columns={
261 |                 'zone_code': 'Zona',
262 |                 'total_votos': 'Votos',
263 |                 'tempo_voto_medio': 'Tempo Médio'
264 |             }
265 |         ).drop(columns='zone_group')
266 | 
267 |         zone_group_tab.dataframe(
268 |             metrics_df_zone_group
269 |             .style
270 |             .apply(
271 |                 lambda x: 
272 |                     [
273 |                         f'background-color: {HIGHLIGHT_COLOR}; color: white; font-weight: bold; font-size: 15px'
274 |                         if x['Zona'] in top_3_most_last_zones else '',
275 |                     ]*len(x),
276 |                 axis=1
277 |             ),
278 |             height=400, 
279 |             use_container_width = True,
280 |             hide_index=True
281 |         )
282 | 
283 | 
284 | def widget_heatmap_tempo_medio_voto_mapa( container, turno, uf, zona, secao ):
285 |     COLORMAP = 'coolwarm'
286 |     RANGE_SECONDS_PLOT = 15
287 |     FIGSIZE = (6, 6)
288 |     
289 |     map_gdf = load_brazil_simplified_map()
290 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
291 |     map_gdf = map_gdf.merge(metrics_df, left_on='SIGLA_UF', right_on='uf', how='left') 
292 |     map_gdf = gpd.GeoDataFrame(map_gdf)
293 | 
294 |     tempo_voto_medio_ALL = metrics_df.query(f"uf == 'ALL'")['tempo_voto_medio'].max()
295 |     map_gdf['tempo_voto_medio'] = map_gdf['tempo_voto_medio'] - tempo_voto_medio_ALL
296 |     
297 |     fig = plt.figure(figsize=FIGSIZE)
298 |     ax = fig.add_subplot(1, 1, 1)
299 |     ax.axis('off')
300 |     UFS = map_gdf['uf'].unique()
301 | 
302 |     for uf in UFS:
303 |         (
304 |             map_gdf
305 |             .query(f"uf == '{uf}'")
306 |             .plot(
307 |                 column='tempo_voto_medio', 
308 |                 ax=ax, 
309 |                 cmap=COLORMAP,
310 |                 legend=False,
311 |                 vmin=-RANGE_SECONDS_PLOT,
312 |                 vmax=+RANGE_SECONDS_PLOT,
313 |                 gid=uf
314 |             )
315 |         )
316 | 
317 |     # add a horizontal colorbar
318 |     sm = plt.cm.ScalarMappable(
319 |         cmap=COLORMAP,
320 |         norm=plt.Normalize(vmin=-RANGE_SECONDS_PLOT, vmax=+RANGE_SECONDS_PLOT)
321 |     )
322 | 
323 |     cbar = fig.colorbar(sm, ax=ax, orientation='horizontal', pad=0.01, aspect=20, fraction=0.035)
324 |     cbar.set_label('Segundos abaixo/acima da média', fontsize=10)
325 |     cbar.ax.tick_params(labelsize=8)
326 |     
327 |     # save svg image to buffer
328 |     svg_image_buffer = io.StringIO()
329 |     plt.savefig(svg_image_buffer, format='svg')
330 |     plt.close(fig)
331 | 
332 |     svg_image_with_links = add_ufs_and_links_to_map(svg_image_buffer.getvalue())
333 | 
334 |     container.markdown('#### Tempo Médio de Votação por UF')
335 |     container.markdown(':point_down: Clique no Mapa para detalhes')
336 |     container.markdown(svg_image_with_links, unsafe_allow_html=True)
337 | 
338 | 
339 | def widget_bignumber_votos( container, turno, uf, zona, secao ):
340 |     
341 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
342 |     if uf == 'ALL':
343 |         votos = metrics_df.query(f"uf == 'ALL'")['total_votos'].max()
344 |     else:
345 |         votos = metrics_df['total_votos'].max()
346 |     
347 |     votos_formatado = f"{votos:,}".replace(',', ' ')
348 |     container.metric(label=':white_check_mark: Votos', value=votos_formatado)
349 | 
350 | 
351 | def widget_bignumber_secoes( container, turno, uf, zona, secao ):
352 |     
353 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
354 |     if uf == 'ALL':
355 |         secoes = metrics_df.query(f"uf == 'ALL'")['total_secoes'].max()
356 |     else:
357 |         secoes = metrics_df['total_secoes'].max()
358 | 
359 |     section_formatado = f"{secoes:,}".replace(',', ' ')
360 |     container.metric(label=':pushpin: Seções', value=section_formatado)
361 | 
362 | 
363 | def widget_big_number_tempo_medio( container, turno, uf, zona, secao ):
364 |     
365 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
366 |     if uf == 'ALL':
367 |         tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_voto_medio'].max()
368 |     else:
369 |         tempo_medio = metrics_df['tempo_voto_medio'].max()
370 | 
371 |     tempo_medio_formatado = format_time(tempo_medio)
372 |     container.metric(label=':stopwatch: Tempo Médio', value=tempo_medio_formatado)
373 | 
374 | 
375 | def widget_big_number_tempo_medio_bio( container, turno, uf, zona, secao ):
376 | 
377 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
378 |     if uf == 'ALL':
379 |         tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_biometria_medio'].max()
380 |     else:
381 |         tempo_medio = metrics_df['tempo_biometria_medio'].max()
382 | 
383 |     tempo_medio_formatado = format_time(tempo_medio)
384 |     container.metric(label=':point_up: Tempo Médio Biometria', value=tempo_medio_formatado)
385 | 
386 | 
387 | def widget_big_number_tempo_total_voto( container, turno, uf, zona, secao ):
388 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
389 | 
390 |     if uf == 'ALL':
391 |         tempo_medio = metrics_df.query(f"uf == 'ALL'")['tempo_voto_soma'].max()
392 |     else:
393 |         tempo_medio = metrics_df['tempo_voto_soma'].max()
394 |     
395 |     tempo_medio_anos = tempo_medio / (365 * 24 * 3600)
396 |     if tempo_medio_anos < 5:
397 |         icon = ':baby:'
398 |     elif tempo_medio_anos < 10:
399 |         icon = ':boy:'
400 |     elif tempo_medio_anos < 15:
401 |         icon = ':child:'
402 |     elif tempo_medio_anos < 30:
403 |         icon = ':man:'
404 |     elif tempo_medio_anos < 60:
405 |         icon = ':older_adult:'
406 |     else:
407 |         icon = ':older_man:'
408 |     
409 | 
410 |     tempo_medio_formatado = format_time(tempo_medio)
411 |     container.metric(label=f'{icon} Tempo Total Gasto', value=tempo_medio_formatado)
412 | 
413 | 
414 | def widget_qtd_votos_intervalo_tempo( container, turno, uf, zona, secao ):
415 | 
416 |     metrics_df = get_duckdb_connector().get_vote_time_metrics(uf, turno, zona, secao)
417 |     if uf == 'ALL':
418 |         metrics_df = metrics_df.query(f"uf == 'ALL'")
419 | 
420 |     format_time = lambda x: f"{x // 60}:{x % 60:02d}"
421 |     # format number in Mi, Mil, and integer 
422 |     format_number = lambda number : (
423 |         f"{number//1e6:.0f} Mi" 
424 |         if number >= 1e6 else f"{number//1e3:.0f} Mil" 
425 |         if number >= 1e3 else f"{number:.0f}"
426 |     )
427 | 
428 |     extrair_intervalo_superior_segundos = lambda col: int(col.split('_')[-2])
429 |     extrair_intervalo_inferior_segundos = lambda col: int(col.split('_')[-3])
430 | 
431 |     colunas_qtd_votos_intervalo = [
432 |         'votos_0_30_segundos', 'votos_30_60_segundos', 'votos_60_90_segundos',
433 |         'votos_90_120_segundos', 'votos_120_150_segundos',
434 |         'votos_150_180_segundos', 'votos_180_210_segundos',
435 |         'votos_210_300_segundos', 'votos_300_9999_segundos'
436 |     ]
437 | 
438 |     valores_qtd_votos_intervalo = [
439 |         (
440 |          format_time(extrair_intervalo_inferior_segundos(col)) + " a " +
441 |          format_time(extrair_intervalo_superior_segundos(col)),
442 |          col, 
443 |          metrics_df[col].max()
444 |         )
445 |         if col != 'votos_300_9999_segundos' and col != 'votos_0_30_segundos'
446 |         else ("mais de 5:00", col, metrics_df[col].max())
447 |         if col == 'votos_300_9999_segundos'
448 |         else ("até 0:30", col, metrics_df[col].max())
449 |         for col in colunas_qtd_votos_intervalo
450 |     ]
451 |     # revert order
452 |     valores_qtd_votos_intervalo = valores_qtd_votos_intervalo[::-1]
453 | 
454 |     df_valores_qtd_votos_intervalo = pd.DataFrame(
455 |         valores_qtd_votos_intervalo,
456 |         columns=['intervalo', 'coluna', 'valor']
457 |     )
458 | 
459 |     container.markdown('#### Em quantos minutos as pessoas votam?')
460 | 
461 |     # plot horizontal bar chart
462 |     fig, ax = plt.subplots( figsize=(5, 12) )
463 |     # df_valores_qtd_votos_intervalo.plot.barh(x='intervalo', y='valor', legend=False, width=.8, ax=ax)
464 | 
465 |     # make the barplot with seaborn
466 |     sns.barplot(
467 |         x='valor', 
468 |         y='intervalo', 
469 |         data=df_valores_qtd_votos_intervalo, 
470 |         color=PRIMARY_COLOR,
471 |         ax=ax
472 |     )
473 |     fig.gca().invert_yaxis()
474 | 
475 |     # make the biggest bar red
476 |     max_value = df_valores_qtd_votos_intervalo['valor'].max()
477 |     max_value_index = df_valores_qtd_votos_intervalo['valor'].idxmax()
478 |     ax.patches[max_value_index].set_facecolor(HIGHLIGHT_COLOR)
479 |     # add the % inside the biggest bar
480 |     max_value_percent = max_value / df_valores_qtd_votos_intervalo['valor'].sum()
481 |     ax.text(
482 |         max_value - 0.05 * max_value, 
483 |         max_value_index, 
484 |         f"{max_value_percent:.1%}",
485 |         color='white', 
486 |         ha = 'right', 
487 |         va = 'center',
488 |         size=20
489 |     )
490 | 
491 |     ax.set_xlabel('Quantidade de Votos')
492 |     ax.set_ylabel('')
493 |     # ax.set_title('Em quanto tempo as pessoas votam?\n', fontsize=20)
494 | 
495 |     # remover linha superior, direita e inferior
496 |     ax.spines['top'].set_visible(False)
497 |     ax.spines['right'].set_visible(False)
498 |     ax.spines['bottom'].set_visible(False)
499 |     
500 |     # remove x axis
501 |     ax.xaxis.set_visible(False)
502 | 
503 |     # increase y axis font size
504 |     ax.tick_params(axis='y', labelsize=20)
505 | 
506 |     # adicionar número no final de cada barra
507 |     maior_valor = df_valores_qtd_votos_intervalo['valor'].max()
508 |     offset = 0.05 * maior_valor
509 |     for i, valor in enumerate(df_valores_qtd_votos_intervalo['valor']):
510 |         ax.text(valor+offset, i, format_number(valor), color='black', va='center', fontsize=18)
511 | 
512 |     container.pyplot(fig)
513 | 
514 | 
515 | 
516 | 


--------------------------------------------------------------------------------
/src/03A_calcular_metricas_tempo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Calcular Métricas - Tempo de Votação, Biometria, etc."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "---"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "## Importing libraries"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import duckdb\n",
 31 |     "import pandas as pd\n",
 32 |     "import time"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "## Importing Data"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 2,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "cursor = duckdb.connect()"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "Dados Brutos"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 3,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "TABLE = \"read_parquet('VOTES.parquet/*/*/*/*.parquet', hive_partitioning=True)\"\n",
 65 |     "ZONE_GROUPS = [ (x, x+20) for x in range(0, 800, 20) ]"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "Adicionar TURNO e Timestamp final de Biometria"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 4,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "source = F\"\"\"\n",
 82 |     "(\n",
 83 |     "    SELECT \n",
 84 |     "        *,\n",
 85 |     "        CASE event_date\n",
 86 |     "            WHEN '2022-10-02' THEN 1\n",
 87 |     "            WHEN '2022-10-03' THEN 1\n",
 88 |     "            WHEN '2022-10-30' THEN 2\n",
 89 |     "            WHEN '2022-10-31' THEN 2\n",
 90 |     "            ELSE NULL\n",
 91 |     "        END::INT AS turno,\n",
 92 |     "        COALESCE(\n",
 93 |     "            timestamp_biometria_manual,\n",
 94 |     "            timestamp_biometria_4,\n",
 95 |     "            timestamp_biometria_3,\n",
 96 |     "            timestamp_biometria_2,\n",
 97 |     "            timestamp_biometria_1\n",
 98 |     "        ) AS timestamp_biometria_final\n",
 99 |     "    FROM \n",
100 |     "        {TABLE}\n",
101 |     ") _\n",
102 |     "\"\"\""
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "## Preparinga Data"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "Méticas no Cubo OLAP - Turno, UF, Zona, Seção.\n",
117 |     "\n",
118 |     "- Número de Votos\n",
119 |     "- Número de Seções Eleitorais\n",
120 |     "- Média, Soma, q50%, q90% do Tempo total de Voto, Tempo de Biometria, Tempo Total\n",
121 |     "\n",
122 |     "- Quantidade de Votos efetuados em até 30s, 1min, 1min30s, 2min, 2min30s, 3min+\n",
123 |     "- Taxa de Sucesso da Biometria em 1 tentativa, 2 tentativas, 3 tentativas, 4 tentativas, Falha\n",
124 |     "- Quantidade de Teclas Pressionadas\n",
125 |     "- Quantidade de Cargos Distintos Votados"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "**Definição das métricas de tempo**"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 5,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "tempo_voto_total = \"EXTRACT(EPOCH FROM (timestamp_voto_computado  - timestamp_titulo_digitado))\"\n",
142 |     "tempo_voto       = \"EXTRACT(EPOCH FROM (timestamp_voto_computado  - timestamp_habilitacao_eleitor))\"\n",
143 |     "tempo_biometria  = \"EXTRACT(EPOCH FROM (timestamp_biometria_final - timestamp_titulo_digitado))\"\n",
144 |     "\n",
145 |     "intervalos_tempo_segundos_votos = [0, 30, 60, 90, 120, 150, 180, 210, 300, 9999]\n",
146 |     "contagem_de_votos_em_intervalos_de_tempo = \", \".join([\n",
147 |     "    F\"\"\"\n",
148 |     "    SUM( \n",
149 |     "        CASE WHEN \n",
150 |     "        {tempo_voto} >= {intervalos_tempo_segundos_votos[i]} \n",
151 |     "        AND {tempo_voto} < {intervalos_tempo_segundos_votos[i+1]}\n",
152 |     "        THEN 1 ELSE 0 END \n",
153 |     "    ) AS votos_{intervalos_tempo_segundos_votos[i]}_{intervalos_tempo_segundos_votos[i+1]}_segundos\n",
154 |     "    \"\"\"\n",
155 |     "    for i in range(0, len(intervalos_tempo_segundos_votos)-1)\n",
156 |     "])"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "**Contagem de cargos distintos votados e número total de teclas pressionadas**"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "Aproximação a partir do número de digitos de cada cargo + 1 (CONFIRMA)"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 6,
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "COLUNAS_VOTOS_CARGOS_NR_TECLAS = [\n",
180 |     "    # 2 digitos\n",
181 |     "    ('timestamp_voto_prefeito', 2), \n",
182 |     "    ('timestamp_voto_presidente', 2),\n",
183 |     "    ('timestamp_voto_governador', 2),\n",
184 |     "    \n",
185 |     "    # 3 digitos\n",
186 |     "    ('timestamp_voto_senador', 3),\n",
187 |     "\n",
188 |     "    # 4 digitos\n",
189 |     "    ('timestamp_voto_deputado_distrital', 4), \n",
190 |     "    ('timestamp_voto_deputado_federal', 4),\n",
191 |     "\n",
192 |     "    # 5 digitos\n",
193 |     "    ('timestamp_voto_deputado_estadual', 5),\n",
194 |     "]\n",
195 |     "\n",
196 |     "nr_total_cargos_votados = \" + \".join([\n",
197 |     "    F\"({coluna} IS NOT NULL)::INT\"\n",
198 |     "    for coluna, _ in COLUNAS_VOTOS_CARGOS_NR_TECLAS\n",
199 |     "])\n",
200 |     "\n",
201 |     "nr_total_teclas_digitadas = \" + \".join([\n",
202 |     "    F\"({coluna} IS NOT NULL)::INT*({teclas}+1)\"\n",
203 |     "    for coluna, teclas in COLUNAS_VOTOS_CARGOS_NR_TECLAS\n",
204 |     "])"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 7,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "fix_null_values = lambda column: F\"COALESCE({column}::VARCHAR(10), 'ALL')\""
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 8,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "query_metrics = F\"\"\"\n",
223 |     "    SELECT\n",
224 |     "        {fix_null_values('turno') } AS turno,\n",
225 |     "        {fix_null_values('uf') } AS uf,\n",
226 |     "        {fix_null_values('zone_code') } AS zone_code,\n",
227 |     "        {fix_null_values('section_code') } AS section_code,\n",
228 |     "\n",
229 |     "        COUNT(*) AS total_votos,\n",
230 |     "        COUNT( DISTINCT uf || zone_code || section_code ) AS total_secoes,\n",
231 |     "\n",
232 |     "        SUM( {tempo_voto} ) AS tempo_voto_soma,\n",
233 |     "        AVG( {tempo_voto} ) AS tempo_voto_medio,\n",
234 |     "        --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_voto}) AS tempo_voto_mediana,\n",
235 |     "        --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_voto}) AS tempo_voto_90percentil,\n",
236 |     "\n",
237 |     "        SUM( {tempo_biometria} ) AS tempo_biometria_soma,\n",
238 |     "        AVG( {tempo_biometria} ) AS tempo_biometria_medio,\n",
239 |     "        --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_biometria}) AS tempo_biometria_mediana,\n",
240 |     "        --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_biometria}) AS tempo_biometria_90percentil,\n",
241 |     "\n",
242 |     "        SUM( {tempo_voto_total} ) AS tempo_voto_total_soma,\n",
243 |     "        AVG( {tempo_voto_total} ) AS tempo_voto_total_medio,\n",
244 |     "        --PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {tempo_voto_total}) AS tempo_voto_total_mediana,\n",
245 |     "        --PERCENTILE_CONT(0.9) WITHIN GROUP(ORDER BY {tempo_voto_total}) AS tempo_voto_total_90percentil,\n",
246 |     "        \n",
247 |     "        {contagem_de_votos_em_intervalos_de_tempo},\n",
248 |     "        1-AVG(biometria_nao_funcionou::INT) AS tx_sucesso_biometria,\n",
249 |     "\n",
250 |     "        MAX({nr_total_cargos_votados})   AS nr_total_cargos_votados,\n",
251 |     "        SUM({nr_total_teclas_digitadas}) AS nr_total_teclas_digitadas\n",
252 |     "\n",
253 |     "    FROM\n",
254 |     "        {source}\n",
255 |     "    WHERE quantidade_votos_computados = 1\n",
256 |     "    GROUP BY ROLLUP(turno, uf, zone_code, section_code)\n",
257 |     "\"\"\""
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "markdown",
262 |    "metadata": {},
263 |    "source": [
264 |     "Os arquivos parquet são particionados por DATA DO EVENTO, UF e GRUPO DE ZONA ELEITORAL para agilizar a leitura dos dados pelo Dashboard.\n",
265 |     "\n",
266 |     "As ZONAS foram agrupadas em grupos de 20, esse número é empírico."
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 9,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "query_metrics_with_zone_group = F\"\"\"\n",
276 |     "    SELECT\n",
277 |     "    *,\n",
278 |     "    CASE\n",
279 |     "        {\n",
280 |     "            \"\".join(\n",
281 |     "                [\n",
282 |     "                    f\"WHEN zone_code!='ALL' AND zone_code::INT BETWEEN {min_zone} AND {max_zone} THEN '{min_zone}-{max_zone}' \" \n",
283 |     "                    for min_zone, max_zone in ZONE_GROUPS\n",
284 |     "                ]\n",
285 |     "            )\n",
286 |     "        }\n",
287 |     "        ELSE zone_code\n",
288 |     "    END AS zone_group\n",
289 |     "    FROM (\n",
290 |     "        {query_metrics}\n",
291 |     "    ) _\n",
292 |     "\"\"\""
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 10,
298 |    "metadata": {},
299 |    "outputs": [],
300 |    "source": [
301 |     "query = F\"\"\"\n",
302 |     "    COPY (\n",
303 |     "    {\n",
304 |     "        query_metrics_with_zone_group\n",
305 |     "    } )\n",
306 |     "    TO 'VOTES_TIME_METRICS.parquet' \n",
307 |     "    (FORMAT 'parquet', PARTITION_BY (turno, uf, zone_group), OVERWRITE_OR_IGNORE 1);\n",
308 |     "\"\"\""
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": 11,
314 |    "metadata": {},
315 |    "outputs": [
316 |     {
317 |      "data": {
318 |       "application/vnd.jupyter.widget-view+json": {
319 |        "model_id": "df1f82e654f446ccb9e0f3171cf3edef",
320 |        "version_major": 2,
321 |        "version_minor": 0
322 |       },
323 |       "text/plain": [
324 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
325 |       ]
326 |      },
327 |      "metadata": {},
328 |      "output_type": "display_data"
329 |     },
330 |     {
331 |      "data": {
332 |       "text/plain": [
333 |        "<duckdb.duckdb.DuckDBPyConnection at 0x7fe591ffb2b0>"
334 |       ]
335 |      },
336 |      "execution_count": 11,
337 |      "metadata": {},
338 |      "output_type": "execute_result"
339 |     }
340 |    ],
341 |    "source": [
342 |     "cursor.execute(query)"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "code",
347 |    "execution_count": 12,
348 |    "metadata": {},
349 |    "outputs": [
350 |     {
351 |      "data": {
352 |       "text/html": [
353 |        "<div>\n",
354 |        "<style scoped>\n",
355 |        "    .dataframe tbody tr th:only-of-type {\n",
356 |        "        vertical-align: middle;\n",
357 |        "    }\n",
358 |        "\n",
359 |        "    .dataframe tbody tr th {\n",
360 |        "        vertical-align: top;\n",
361 |        "    }\n",
362 |        "\n",
363 |        "    .dataframe thead th {\n",
364 |        "        text-align: right;\n",
365 |        "    }\n",
366 |        "</style>\n",
367 |        "<table border=\"1\" class=\"dataframe\">\n",
368 |        "  <thead>\n",
369 |        "    <tr style=\"text-align: right;\">\n",
370 |        "      <th></th>\n",
371 |        "      <th>turno</th>\n",
372 |        "      <th>uf</th>\n",
373 |        "      <th>zone_code</th>\n",
374 |        "      <th>section_code</th>\n",
375 |        "      <th>total_votos</th>\n",
376 |        "      <th>total_secoes</th>\n",
377 |        "      <th>tempo_voto_soma</th>\n",
378 |        "      <th>tempo_voto_medio</th>\n",
379 |        "      <th>tempo_biometria_soma</th>\n",
380 |        "      <th>tempo_biometria_medio</th>\n",
381 |        "      <th>...</th>\n",
382 |        "      <th>votos_90_120_segundos</th>\n",
383 |        "      <th>votos_120_150_segundos</th>\n",
384 |        "      <th>votos_150_180_segundos</th>\n",
385 |        "      <th>votos_180_210_segundos</th>\n",
386 |        "      <th>votos_210_300_segundos</th>\n",
387 |        "      <th>votos_300_9999_segundos</th>\n",
388 |        "      <th>tx_sucesso_biometria</th>\n",
389 |        "      <th>nr_total_cargos_votados</th>\n",
390 |        "      <th>nr_total_teclas_digitadas</th>\n",
391 |        "      <th>zone_group</th>\n",
392 |        "    </tr>\n",
393 |        "  </thead>\n",
394 |        "  <tbody>\n",
395 |        "    <tr>\n",
396 |        "      <th>0</th>\n",
397 |        "      <td>1</td>\n",
398 |        "      <td>DF</td>\n",
399 |        "      <td>0014</td>\n",
400 |        "      <td>ALL</td>\n",
401 |        "      <td>89318</td>\n",
402 |        "      <td>308</td>\n",
403 |        "      <td>4345747.0</td>\n",
404 |        "      <td>48.654773</td>\n",
405 |        "      <td>869184.0</td>\n",
406 |        "      <td>10.492576</td>\n",
407 |        "      <td>...</td>\n",
408 |        "      <td>2633.0</td>\n",
409 |        "      <td>906.0</td>\n",
410 |        "      <td>409.0</td>\n",
411 |        "      <td>237.0</td>\n",
412 |        "      <td>276.0</td>\n",
413 |        "      <td>174.0</td>\n",
414 |        "      <td>0.953615</td>\n",
415 |        "      <td>5</td>\n",
416 |        "      <td>1745890.0</td>\n",
417 |        "      <td>0-20</td>\n",
418 |        "    </tr>\n",
419 |        "    <tr>\n",
420 |        "      <th>1</th>\n",
421 |        "      <td>1</td>\n",
422 |        "      <td>DF</td>\n",
423 |        "      <td>0015</td>\n",
424 |        "      <td>ALL</td>\n",
425 |        "      <td>134744</td>\n",
426 |        "      <td>505</td>\n",
427 |        "      <td>6785043.0</td>\n",
428 |        "      <td>50.355066</td>\n",
429 |        "      <td>1171645.0</td>\n",
430 |        "      <td>9.482934</td>\n",
431 |        "      <td>...</td>\n",
432 |        "      <td>4310.0</td>\n",
433 |        "      <td>1623.0</td>\n",
434 |        "      <td>837.0</td>\n",
435 |        "      <td>425.0</td>\n",
436 |        "      <td>494.0</td>\n",
437 |        "      <td>244.0</td>\n",
438 |        "      <td>0.963776</td>\n",
439 |        "      <td>5</td>\n",
440 |        "      <td>2667519.0</td>\n",
441 |        "      <td>0-20</td>\n",
442 |        "    </tr>\n",
443 |        "    <tr>\n",
444 |        "      <th>2</th>\n",
445 |        "      <td>1</td>\n",
446 |        "      <td>DF</td>\n",
447 |        "      <td>0017</td>\n",
448 |        "      <td>ALL</td>\n",
449 |        "      <td>100240</td>\n",
450 |        "      <td>378</td>\n",
451 |        "      <td>5899118.0</td>\n",
452 |        "      <td>58.849940</td>\n",
453 |        "      <td>1344604.0</td>\n",
454 |        "      <td>14.337855</td>\n",
455 |        "      <td>...</td>\n",
456 |        "      <td>5206.0</td>\n",
457 |        "      <td>2062.0</td>\n",
458 |        "      <td>1012.0</td>\n",
459 |        "      <td>587.0</td>\n",
460 |        "      <td>793.0</td>\n",
461 |        "      <td>364.0</td>\n",
462 |        "      <td>0.931993</td>\n",
463 |        "      <td>5</td>\n",
464 |        "      <td>1997143.0</td>\n",
465 |        "      <td>0-20</td>\n",
466 |        "    </tr>\n",
467 |        "    <tr>\n",
468 |        "      <th>3</th>\n",
469 |        "      <td>1</td>\n",
470 |        "      <td>DF</td>\n",
471 |        "      <td>0005</td>\n",
472 |        "      <td>0050</td>\n",
473 |        "      <td>294</td>\n",
474 |        "      <td>1</td>\n",
475 |        "      <td>17810.0</td>\n",
476 |        "      <td>60.578231</td>\n",
477 |        "      <td>3067.0</td>\n",
478 |        "      <td>11.193431</td>\n",
479 |        "      <td>...</td>\n",
480 |        "      <td>23.0</td>\n",
481 |        "      <td>8.0</td>\n",
482 |        "      <td>4.0</td>\n",
483 |        "      <td>1.0</td>\n",
484 |        "      <td>3.0</td>\n",
485 |        "      <td>1.0</td>\n",
486 |        "      <td>0.955782</td>\n",
487 |        "      <td>5</td>\n",
488 |        "      <td>5880.0</td>\n",
489 |        "      <td>0-20</td>\n",
490 |        "    </tr>\n",
491 |        "    <tr>\n",
492 |        "      <th>4</th>\n",
493 |        "      <td>1</td>\n",
494 |        "      <td>DF</td>\n",
495 |        "      <td>0015</td>\n",
496 |        "      <td>0229</td>\n",
497 |        "      <td>281</td>\n",
498 |        "      <td>1</td>\n",
499 |        "      <td>14992.0</td>\n",
500 |        "      <td>53.352313</td>\n",
501 |        "      <td>2154.0</td>\n",
502 |        "      <td>8.381323</td>\n",
503 |        "      <td>...</td>\n",
504 |        "      <td>16.0</td>\n",
505 |        "      <td>5.0</td>\n",
506 |        "      <td>1.0</td>\n",
507 |        "      <td>0.0</td>\n",
508 |        "      <td>1.0</td>\n",
509 |        "      <td>0.0</td>\n",
510 |        "      <td>0.975089</td>\n",
511 |        "      <td>5</td>\n",
512 |        "      <td>5620.0</td>\n",
513 |        "      <td>0-20</td>\n",
514 |        "    </tr>\n",
515 |        "    <tr>\n",
516 |        "      <th>...</th>\n",
517 |        "      <td>...</td>\n",
518 |        "      <td>...</td>\n",
519 |        "      <td>...</td>\n",
520 |        "      <td>...</td>\n",
521 |        "      <td>...</td>\n",
522 |        "      <td>...</td>\n",
523 |        "      <td>...</td>\n",
524 |        "      <td>...</td>\n",
525 |        "      <td>...</td>\n",
526 |        "      <td>...</td>\n",
527 |        "      <td>...</td>\n",
528 |        "      <td>...</td>\n",
529 |        "      <td>...</td>\n",
530 |        "      <td>...</td>\n",
531 |        "      <td>...</td>\n",
532 |        "      <td>...</td>\n",
533 |        "      <td>...</td>\n",
534 |        "      <td>...</td>\n",
535 |        "      <td>...</td>\n",
536 |        "      <td>...</td>\n",
537 |        "      <td>...</td>\n",
538 |        "    </tr>\n",
539 |        "    <tr>\n",
540 |        "      <th>6625</th>\n",
541 |        "      <td>1</td>\n",
542 |        "      <td>DF</td>\n",
543 |        "      <td>0021</td>\n",
544 |        "      <td>0082</td>\n",
545 |        "      <td>253</td>\n",
546 |        "      <td>1</td>\n",
547 |        "      <td>13693.0</td>\n",
548 |        "      <td>54.122530</td>\n",
549 |        "      <td>2232.0</td>\n",
550 |        "      <td>9.073171</td>\n",
551 |        "      <td>...</td>\n",
552 |        "      <td>16.0</td>\n",
553 |        "      <td>6.0</td>\n",
554 |        "      <td>1.0</td>\n",
555 |        "      <td>1.0</td>\n",
556 |        "      <td>2.0</td>\n",
557 |        "      <td>0.0</td>\n",
558 |        "      <td>0.984190</td>\n",
559 |        "      <td>5</td>\n",
560 |        "      <td>5060.0</td>\n",
561 |        "      <td>20-40</td>\n",
562 |        "    </tr>\n",
563 |        "    <tr>\n",
564 |        "      <th>6626</th>\n",
565 |        "      <td>1</td>\n",
566 |        "      <td>DF</td>\n",
567 |        "      <td>0021</td>\n",
568 |        "      <td>0332</td>\n",
569 |        "      <td>234</td>\n",
570 |        "      <td>1</td>\n",
571 |        "      <td>12897.0</td>\n",
572 |        "      <td>55.115385</td>\n",
573 |        "      <td>2474.0</td>\n",
574 |        "      <td>10.850877</td>\n",
575 |        "      <td>...</td>\n",
576 |        "      <td>5.0</td>\n",
577 |        "      <td>6.0</td>\n",
578 |        "      <td>0.0</td>\n",
579 |        "      <td>1.0</td>\n",
580 |        "      <td>2.0</td>\n",
581 |        "      <td>3.0</td>\n",
582 |        "      <td>0.957265</td>\n",
583 |        "      <td>5</td>\n",
584 |        "      <td>4680.0</td>\n",
585 |        "      <td>20-40</td>\n",
586 |        "    </tr>\n",
587 |        "    <tr>\n",
588 |        "      <th>6627</th>\n",
589 |        "      <td>1</td>\n",
590 |        "      <td>DF</td>\n",
591 |        "      <td>0021</td>\n",
592 |        "      <td>0318</td>\n",
593 |        "      <td>312</td>\n",
594 |        "      <td>1</td>\n",
595 |        "      <td>17853.0</td>\n",
596 |        "      <td>57.221154</td>\n",
597 |        "      <td>3879.0</td>\n",
598 |        "      <td>13.104730</td>\n",
599 |        "      <td>...</td>\n",
600 |        "      <td>14.0</td>\n",
601 |        "      <td>5.0</td>\n",
602 |        "      <td>4.0</td>\n",
603 |        "      <td>1.0</td>\n",
604 |        "      <td>5.0</td>\n",
605 |        "      <td>1.0</td>\n",
606 |        "      <td>0.971154</td>\n",
607 |        "      <td>5</td>\n",
608 |        "      <td>6240.0</td>\n",
609 |        "      <td>20-40</td>\n",
610 |        "    </tr>\n",
611 |        "    <tr>\n",
612 |        "      <th>6628</th>\n",
613 |        "      <td>1</td>\n",
614 |        "      <td>DF</td>\n",
615 |        "      <td>0021</td>\n",
616 |        "      <td>0185</td>\n",
617 |        "      <td>315</td>\n",
618 |        "      <td>1</td>\n",
619 |        "      <td>16997.0</td>\n",
620 |        "      <td>53.958730</td>\n",
621 |        "      <td>2877.0</td>\n",
622 |        "      <td>9.558140</td>\n",
623 |        "      <td>...</td>\n",
624 |        "      <td>15.0</td>\n",
625 |        "      <td>7.0</td>\n",
626 |        "      <td>7.0</td>\n",
627 |        "      <td>0.0</td>\n",
628 |        "      <td>0.0</td>\n",
629 |        "      <td>0.0</td>\n",
630 |        "      <td>0.946032</td>\n",
631 |        "      <td>5</td>\n",
632 |        "      <td>6300.0</td>\n",
633 |        "      <td>20-40</td>\n",
634 |        "    </tr>\n",
635 |        "    <tr>\n",
636 |        "      <th>6629</th>\n",
637 |        "      <td>1</td>\n",
638 |        "      <td>DF</td>\n",
639 |        "      <td>ALL</td>\n",
640 |        "      <td>ALL</td>\n",
641 |        "      <td>1779224</td>\n",
642 |        "      <td>6610</td>\n",
643 |        "      <td>99817162.0</td>\n",
644 |        "      <td>56.101515</td>\n",
645 |        "      <td>18036165.0</td>\n",
646 |        "      <td>10.941314</td>\n",
647 |        "      <td>...</td>\n",
648 |        "      <td>81643.0</td>\n",
649 |        "      <td>33515.0</td>\n",
650 |        "      <td>16930.0</td>\n",
651 |        "      <td>9692.0</td>\n",
652 |        "      <td>11890.0</td>\n",
653 |        "      <td>5609.0</td>\n",
654 |        "      <td>0.952647</td>\n",
655 |        "      <td>5</td>\n",
656 |        "      <td>35370312.0</td>\n",
657 |        "      <td>ALL</td>\n",
658 |        "    </tr>\n",
659 |        "  </tbody>\n",
660 |        "</table>\n",
661 |        "<p>6630 rows × 25 columns</p>\n",
662 |        "</div>"
663 |       ],
664 |       "text/plain": [
665 |        "     turno  uf zone_code section_code  total_votos  total_secoes  \\\n",
666 |        "0        1  DF      0014          ALL        89318           308   \n",
667 |        "1        1  DF      0015          ALL       134744           505   \n",
668 |        "2        1  DF      0017          ALL       100240           378   \n",
669 |        "3        1  DF      0005         0050          294             1   \n",
670 |        "4        1  DF      0015         0229          281             1   \n",
671 |        "...    ...  ..       ...          ...          ...           ...   \n",
672 |        "6625     1  DF      0021         0082          253             1   \n",
673 |        "6626     1  DF      0021         0332          234             1   \n",
674 |        "6627     1  DF      0021         0318          312             1   \n",
675 |        "6628     1  DF      0021         0185          315             1   \n",
676 |        "6629     1  DF       ALL          ALL      1779224          6610   \n",
677 |        "\n",
678 |        "      tempo_voto_soma  tempo_voto_medio  tempo_biometria_soma  \\\n",
679 |        "0           4345747.0         48.654773              869184.0   \n",
680 |        "1           6785043.0         50.355066             1171645.0   \n",
681 |        "2           5899118.0         58.849940             1344604.0   \n",
682 |        "3             17810.0         60.578231                3067.0   \n",
683 |        "4             14992.0         53.352313                2154.0   \n",
684 |        "...               ...               ...                   ...   \n",
685 |        "6625          13693.0         54.122530                2232.0   \n",
686 |        "6626          12897.0         55.115385                2474.0   \n",
687 |        "6627          17853.0         57.221154                3879.0   \n",
688 |        "6628          16997.0         53.958730                2877.0   \n",
689 |        "6629       99817162.0         56.101515            18036165.0   \n",
690 |        "\n",
691 |        "      tempo_biometria_medio  ...  votos_90_120_segundos  \\\n",
692 |        "0                 10.492576  ...                 2633.0   \n",
693 |        "1                  9.482934  ...                 4310.0   \n",
694 |        "2                 14.337855  ...                 5206.0   \n",
695 |        "3                 11.193431  ...                   23.0   \n",
696 |        "4                  8.381323  ...                   16.0   \n",
697 |        "...                     ...  ...                    ...   \n",
698 |        "6625               9.073171  ...                   16.0   \n",
699 |        "6626              10.850877  ...                    5.0   \n",
700 |        "6627              13.104730  ...                   14.0   \n",
701 |        "6628               9.558140  ...                   15.0   \n",
702 |        "6629              10.941314  ...                81643.0   \n",
703 |        "\n",
704 |        "      votos_120_150_segundos  votos_150_180_segundos  votos_180_210_segundos  \\\n",
705 |        "0                      906.0                   409.0                   237.0   \n",
706 |        "1                     1623.0                   837.0                   425.0   \n",
707 |        "2                     2062.0                  1012.0                   587.0   \n",
708 |        "3                        8.0                     4.0                     1.0   \n",
709 |        "4                        5.0                     1.0                     0.0   \n",
710 |        "...                      ...                     ...                     ...   \n",
711 |        "6625                     6.0                     1.0                     1.0   \n",
712 |        "6626                     6.0                     0.0                     1.0   \n",
713 |        "6627                     5.0                     4.0                     1.0   \n",
714 |        "6628                     7.0                     7.0                     0.0   \n",
715 |        "6629                 33515.0                 16930.0                  9692.0   \n",
716 |        "\n",
717 |        "      votos_210_300_segundos  votos_300_9999_segundos  tx_sucesso_biometria  \\\n",
718 |        "0                      276.0                    174.0              0.953615   \n",
719 |        "1                      494.0                    244.0              0.963776   \n",
720 |        "2                      793.0                    364.0              0.931993   \n",
721 |        "3                        3.0                      1.0              0.955782   \n",
722 |        "4                        1.0                      0.0              0.975089   \n",
723 |        "...                      ...                      ...                   ...   \n",
724 |        "6625                     2.0                      0.0              0.984190   \n",
725 |        "6626                     2.0                      3.0              0.957265   \n",
726 |        "6627                     5.0                      1.0              0.971154   \n",
727 |        "6628                     0.0                      0.0              0.946032   \n",
728 |        "6629                 11890.0                   5609.0              0.952647   \n",
729 |        "\n",
730 |        "      nr_total_cargos_votados  nr_total_teclas_digitadas  zone_group  \n",
731 |        "0                           5                  1745890.0        0-20  \n",
732 |        "1                           5                  2667519.0        0-20  \n",
733 |        "2                           5                  1997143.0        0-20  \n",
734 |        "3                           5                     5880.0        0-20  \n",
735 |        "4                           5                     5620.0        0-20  \n",
736 |        "...                       ...                        ...         ...  \n",
737 |        "6625                        5                     5060.0       20-40  \n",
738 |        "6626                        5                     4680.0       20-40  \n",
739 |        "6627                        5                     6240.0       20-40  \n",
740 |        "6628                        5                     6300.0       20-40  \n",
741 |        "6629                        5                 35370312.0         ALL  \n",
742 |        "\n",
743 |        "[6630 rows x 25 columns]"
744 |       ]
745 |      },
746 |      "execution_count": 12,
747 |      "metadata": {},
748 |      "output_type": "execute_result"
749 |     }
750 |    ],
751 |    "source": [
752 |     "table = \"\"\"\n",
753 |     "            read_parquet(\n",
754 |     "                'VOTES_TIME_METRICS.parquet/*/*/*/*.parquet', \n",
755 |     "                hive_partitioning=True,\n",
756 |     "                hive_types_autocast=0\n",
757 |     "            )\n",
758 |     "        \"\"\"\n",
759 |     "turno = 1\n",
760 |     "uf = 'DF'\n",
761 |     "zone_group = 'ALL'\n",
762 |     "zone = 1\n",
763 |     "\n",
764 |     "\n",
765 |     "query = f\"\"\"\n",
766 |     "            SELECT *\n",
767 |     "            FROM {table}\n",
768 |     "            WHERE 1=1\n",
769 |     "            AND turno = '{turno}'\n",
770 |     "            AND uf = '{uf}'\n",
771 |     "            -- AND zone_group = '{zone_group}'\n",
772 |     "            -- AND zone_code = {zone}\n",
773 |     "        \"\"\"\n",
774 |     "\n",
775 |     "df = cursor.execute(query).df()\n",
776 |     "df"
777 |    ]
778 |   },
779 |   {
780 |    "cell_type": "code",
781 |    "execution_count": 13,
782 |    "metadata": {},
783 |    "outputs": [
784 |     {
785 |      "data": {
786 |       "text/plain": [
787 |        "Index(['turno', 'uf', 'zone_code', 'section_code', 'total_votos',\n",
788 |        "       'total_secoes', 'tempo_voto_soma', 'tempo_voto_medio',\n",
789 |        "       'tempo_biometria_soma', 'tempo_biometria_medio',\n",
790 |        "       'tempo_voto_total_soma', 'tempo_voto_total_medio',\n",
791 |        "       'votos_0_30_segundos', 'votos_30_60_segundos', 'votos_60_90_segundos',\n",
792 |        "       'votos_90_120_segundos', 'votos_120_150_segundos',\n",
793 |        "       'votos_150_180_segundos', 'votos_180_210_segundos',\n",
794 |        "       'votos_210_300_segundos', 'votos_300_9999_segundos',\n",
795 |        "       'tx_sucesso_biometria', 'nr_total_cargos_votados',\n",
796 |        "       'nr_total_teclas_digitadas', 'zone_group'],\n",
797 |        "      dtype='object')"
798 |       ]
799 |      },
800 |      "execution_count": 13,
801 |      "metadata": {},
802 |      "output_type": "execute_result"
803 |     }
804 |    ],
805 |    "source": [
806 |     "df.columns"
807 |    ]
808 |   },
809 |   {
810 |    "cell_type": "code",
811 |    "execution_count": null,
812 |    "metadata": {},
813 |    "outputs": [],
814 |    "source": []
815 |   }
816 |  ],
817 |  "metadata": {
818 |   "kernelspec": {
819 |    "display_name": "base",
820 |    "language": "python",
821 |    "name": "python3"
822 |   },
823 |   "language_info": {
824 |    "codemirror_mode": {
825 |     "name": "ipython",
826 |     "version": 3
827 |    },
828 |    "file_extension": ".py",
829 |    "mimetype": "text/x-python",
830 |    "name": "python",
831 |    "nbconvert_exporter": "python",
832 |    "pygments_lexer": "ipython3",
833 |    "version": "3.11.5"
834 |   }
835 |  },
836 |  "nbformat": 4,
837 |  "nbformat_minor": 2
838 | }
839 | 


--------------------------------------------------------------------------------
/src/test-basic-queries.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Performance das Consultas"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "## Importando Bibliotecas"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "code",
  19 |    "execution_count": 1,
  20 |    "metadata": {},
  21 |    "outputs": [],
  22 |    "source": [
  23 |     "import duckdb\n",
  24 |     "import time"
  25 |    ]
  26 |   },
  27 |   {
  28 |    "cell_type": "markdown",
  29 |    "metadata": {},
  30 |    "source": [
  31 |     "## Conectando à base de dados"
  32 |    ]
  33 |   },
  34 |   {
  35 |    "cell_type": "code",
  36 |    "execution_count": 2,
  37 |    "metadata": {},
  38 |    "outputs": [],
  39 |    "source": [
  40 |     "cursor = duckdb.connect()\n",
  41 |     "DATABASE = '*.parquet'"
  42 |    ]
  43 |   },
  44 |   {
  45 |    "cell_type": "code",
  46 |    "execution_count": 14,
  47 |    "metadata": {},
  48 |    "outputs": [],
  49 |    "source": [
  50 |     "def execute_query_and_calculate_time(cursor, query, return_df=False):\n",
  51 |     "    \n",
  52 |     "    if return_df:\n",
  53 |     "        tic = time.time()\n",
  54 |     "        cursor.execute(query)\n",
  55 |     "        df = cursor.df()\n",
  56 |     "        toc = time.time()\n",
  57 |     "        return df, toc - tic\n",
  58 |     "    else:\n",
  59 |     "        tic = time.time()\n",
  60 |     "        cursor.execute(query)\n",
  61 |     "        toc = time.time()\n",
  62 |     "        return toc - tic"
  63 |    ]
  64 |   },
  65 |   {
  66 |    "cell_type": "markdown",
  67 |    "metadata": {},
  68 |    "source": [
  69 |     "## Consultas"
  70 |    ]
  71 |   },
  72 |   {
  73 |    "cell_type": "markdown",
  74 |    "metadata": {},
  75 |    "source": [
  76 |     "### Quantidade de Registros"
  77 |    ]
  78 |   },
  79 |   {
  80 |    "cell_type": "code",
  81 |    "execution_count": 3,
  82 |    "metadata": {},
  83 |    "outputs": [],
  84 |    "source": [
  85 |     "query = f\"SELECT COUNT(*) FROM '{DATABASE}'\""
  86 |    ]
  87 |   },
  88 |   {
  89 |    "cell_type": "code",
  90 |    "execution_count": 4,
  91 |    "metadata": {},
  92 |    "outputs": [
  93 |     {
  94 |      "name": "stdout",
  95 |      "output_type": "stream",
  96 |      "text": [
  97 |       "Number of rows: 4,283,329,488\n",
  98 |       "Time: 1.47s\n"
  99 |      ]
 100 |     }
 101 |    ],
 102 |    "source": [
 103 |     "tic = time.time()\n",
 104 |     "\n",
 105 |     "cursor.execute(query)\n",
 106 |     "n_rows = cursor.fetchone()\n",
 107 |     "\n",
 108 |     "toc = time.time()\n",
 109 |     "\n",
 110 |     "print(f\"Number of rows: {n_rows[0]:,}\")\n",
 111 |     "print(f\"Time: {toc - tic:.2f}s\")"
 112 |    ]
 113 |   },
 114 |   {
 115 |    "cell_type": "markdown",
 116 |    "metadata": {},
 117 |    "source": [
 118 |     "### Primeiros Registros"
 119 |    ]
 120 |   },
 121 |   {
 122 |    "cell_type": "code",
 123 |    "execution_count": 6,
 124 |    "metadata": {},
 125 |    "outputs": [
 126 |     {
 127 |      "name": "stdout",
 128 |      "output_type": "stream",
 129 |      "text": [
 130 |       "Time: 0.82s\n"
 131 |      ]
 132 |     },
 133 |     {
 134 |      "data": {
 135 |       "text/html": [
 136 |        "<div>\n",
 137 |        "<style scoped>\n",
 138 |        "    .dataframe tbody tr th:only-of-type {\n",
 139 |        "        vertical-align: middle;\n",
 140 |        "    }\n",
 141 |        "\n",
 142 |        "    .dataframe tbody tr th {\n",
 143 |        "        vertical-align: top;\n",
 144 |        "    }\n",
 145 |        "\n",
 146 |        "    .dataframe thead th {\n",
 147 |        "        text-align: right;\n",
 148 |        "    }\n",
 149 |        "</style>\n",
 150 |        "<table border=\"1\" class=\"dataframe\">\n",
 151 |        "  <thead>\n",
 152 |        "    <tr style=\"text-align: right;\">\n",
 153 |        "      <th></th>\n",
 154 |        "      <th>event_timestamp</th>\n",
 155 |        "      <th>event_type</th>\n",
 156 |        "      <th>some_id</th>\n",
 157 |        "      <th>event_system</th>\n",
 158 |        "      <th>event_description</th>\n",
 159 |        "      <th>event_id</th>\n",
 160 |        "      <th>filename</th>\n",
 161 |        "    </tr>\n",
 162 |        "  </thead>\n",
 163 |        "  <tbody>\n",
 164 |        "    <tr>\n",
 165 |        "      <th>0</th>\n",
 166 |        "      <td>2022-10-26 10:39:36</td>\n",
 167 |        "      <td>INFO</td>\n",
 168 |        "      <td>67305985</td>\n",
 169 |        "      <td>LOGD</td>\n",
 170 |        "      <td>Início das operações do logd</td>\n",
 171 |        "      <td>E2C58C3021D6DB87</td>\n",
 172 |        "      <td>/data/logs/2_AC/o00407-0100700090001_new.csv</td>\n",
 173 |        "    </tr>\n",
 174 |        "    <tr>\n",
 175 |        "      <th>1</th>\n",
 176 |        "      <td>2022-10-26 10:39:36</td>\n",
 177 |        "      <td>INFO</td>\n",
 178 |        "      <td>67305985</td>\n",
 179 |        "      <td>LOGD</td>\n",
 180 |        "      <td>Urna ligada em 26/10/2022 às 10:38:20</td>\n",
 181 |        "      <td>DFBD462E26E8F1EA</td>\n",
 182 |        "      <td>/data/logs/2_AC/o00407-0100700090001_new.csv</td>\n",
 183 |        "    </tr>\n",
 184 |        "    <tr>\n",
 185 |        "      <th>2</th>\n",
 186 |        "      <td>2022-10-26 10:39:36</td>\n",
 187 |        "      <td>INFO</td>\n",
 188 |        "      <td>67305985</td>\n",
 189 |        "      <td>SCUE</td>\n",
 190 |        "      <td>Iniciando aplicação - Oficial - 1º turno</td>\n",
 191 |        "      <td>B8E2CBFADB3EF46B</td>\n",
 192 |        "      <td>/data/logs/2_AC/o00407-0100700090001_new.csv</td>\n",
 193 |        "    </tr>\n",
 194 |        "    <tr>\n",
 195 |        "      <th>3</th>\n",
 196 |        "      <td>2022-10-26 10:39:36</td>\n",
 197 |        "      <td>INFO</td>\n",
 198 |        "      <td>67305985</td>\n",
 199 |        "      <td>SCUE</td>\n",
 200 |        "      <td>Versão da aplicação: 8.26.0.0 - Onça-pintada</td>\n",
 201 |        "      <td>AC76A5B17419CB2E</td>\n",
 202 |        "      <td>/data/logs/2_AC/o00407-0100700090001_new.csv</td>\n",
 203 |        "    </tr>\n",
 204 |        "    <tr>\n",
 205 |        "      <th>4</th>\n",
 206 |        "      <td>2022-10-26 10:39:38</td>\n",
 207 |        "      <td>INFO</td>\n",
 208 |        "      <td>67305985</td>\n",
 209 |        "      <td>SCUE</td>\n",
 210 |        "      <td>Urna operando com rede elétrica</td>\n",
 211 |        "      <td>ED0703CBF6110D2C</td>\n",
 212 |        "      <td>/data/logs/2_AC/o00407-0100700090001_new.csv</td>\n",
 213 |        "    </tr>\n",
 214 |        "  </tbody>\n",
 215 |        "</table>\n",
 216 |        "</div>"
 217 |       ],
 218 |       "text/plain": [
 219 |        "      event_timestamp event_type   some_id event_system  \\\n",
 220 |        "0 2022-10-26 10:39:36       INFO  67305985         LOGD   \n",
 221 |        "1 2022-10-26 10:39:36       INFO  67305985         LOGD   \n",
 222 |        "2 2022-10-26 10:39:36       INFO  67305985         SCUE   \n",
 223 |        "3 2022-10-26 10:39:36       INFO  67305985         SCUE   \n",
 224 |        "4 2022-10-26 10:39:38       INFO  67305985         SCUE   \n",
 225 |        "\n",
 226 |        "                              event_description          event_id  \\\n",
 227 |        "0                  Início das operações do logd  E2C58C3021D6DB87   \n",
 228 |        "1         Urna ligada em 26/10/2022 às 10:38:20  DFBD462E26E8F1EA   \n",
 229 |        "2      Iniciando aplicação - Oficial - 1º turno  B8E2CBFADB3EF46B   \n",
 230 |        "3  Versão da aplicação: 8.26.0.0 - Onça-pintada  AC76A5B17419CB2E   \n",
 231 |        "4               Urna operando com rede elétrica  ED0703CBF6110D2C   \n",
 232 |        "\n",
 233 |        "                                       filename  \n",
 234 |        "0  /data/logs/2_AC/o00407-0100700090001_new.csv  \n",
 235 |        "1  /data/logs/2_AC/o00407-0100700090001_new.csv  \n",
 236 |        "2  /data/logs/2_AC/o00407-0100700090001_new.csv  \n",
 237 |        "3  /data/logs/2_AC/o00407-0100700090001_new.csv  \n",
 238 |        "4  /data/logs/2_AC/o00407-0100700090001_new.csv  "
 239 |       ]
 240 |      },
 241 |      "execution_count": 6,
 242 |      "metadata": {},
 243 |      "output_type": "execute_result"
 244 |     }
 245 |    ],
 246 |    "source": [
 247 |     "query = f\"\"\"\n",
 248 |     "    SELECT \n",
 249 |     "        *\n",
 250 |     "    FROM '{DATABASE}' LIMIT 5\n",
 251 |     "\"\"\"\n",
 252 |     "\n",
 253 |     "tic = time.time()\n",
 254 |     "cursor.execute(query)\n",
 255 |     "df_result = cursor.df()\n",
 256 |     "toc = time.time()\n",
 257 |     "\n",
 258 |     "print(f\"Time: {toc - tic:.2f}s\")\n",
 259 |     "df_result"
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "markdown",
 264 |    "metadata": {},
 265 |    "source": [
 266 |     "### Primeiros registros + filtro RN"
 267 |    ]
 268 |   },
 269 |   {
 270 |    "cell_type": "code",
 271 |    "execution_count": 7,
 272 |    "metadata": {},
 273 |    "outputs": [
 274 |     {
 275 |      "name": "stdout",
 276 |      "output_type": "stream",
 277 |      "text": [
 278 |       "Time: 69.65s\n"
 279 |      ]
 280 |     }
 281 |    ],
 282 |    "source": [
 283 |     "query = f\"\"\"\n",
 284 |     "    SELECT \n",
 285 |     "        *\n",
 286 |     "    FROM '{DATABASE}'\n",
 287 |     "    WHERE filename ILIKE '%RN%'\n",
 288 |     "    LIMIT 500\n",
 289 |     "\"\"\"\n",
 290 |     "\n",
 291 |     "tic = time.time()\n",
 292 |     "cursor.execute(query)\n",
 293 |     "df_result = cursor.df()\n",
 294 |     "toc = time.time()\n",
 295 |     "\n",
 296 |     "print(f\"Time: {toc - tic:.2f}s\")"
 297 |    ]
 298 |   },
 299 |   {
 300 |    "cell_type": "code",
 301 |    "execution_count": 11,
 302 |    "metadata": {},
 303 |    "outputs": [
 304 |     {
 305 |      "name": "stdout",
 306 |      "output_type": "stream",
 307 |      "text": [
 308 |       "Time: 91.12s\n"
 309 |      ]
 310 |     }
 311 |    ],
 312 |    "source": [
 313 |     "query = f\"\"\"\n",
 314 |     "    SELECT \n",
 315 |     "        *\n",
 316 |     "    FROM '{DATABASE}'\n",
 317 |     "    WHERE filename ILIKE '%SP%'\n",
 318 |     "    LIMIT 500\n",
 319 |     "\"\"\"\n",
 320 |     "\n",
 321 |     "tic = time.time()\n",
 322 |     "cursor.execute(query)\n",
 323 |     "df_result = cursor.df()\n",
 324 |     "toc = time.time()\n",
 325 |     "\n",
 326 |     "print(f\"Time: {toc - tic:.2f}s\")"
 327 |    ]
 328 |   },
 329 |   {
 330 |    "cell_type": "markdown",
 331 |    "metadata": {},
 332 |    "source": [
 333 |     "### Distinct"
 334 |    ]
 335 |   },
 336 |   {
 337 |    "cell_type": "markdown",
 338 |    "metadata": {},
 339 |    "source": [
 340 |     "event_type"
 341 |    ]
 342 |   },
 343 |   {
 344 |    "cell_type": "code",
 345 |    "execution_count": 12,
 346 |    "metadata": {},
 347 |    "outputs": [
 348 |     {
 349 |      "name": "stdout",
 350 |      "output_type": "stream",
 351 |      "text": [
 352 |       "Time: 5.69s\n"
 353 |      ]
 354 |     }
 355 |    ],
 356 |    "source": [
 357 |     "query = f\"\"\"\n",
 358 |     "    SELECT DISTINCT\n",
 359 |     "        event_type\n",
 360 |     "    FROM '{DATABASE}'\n",
 361 |     "\"\"\"\n",
 362 |     "\n",
 363 |     "tic = time.time()\n",
 364 |     "cursor.execute(query)\n",
 365 |     "df_result = cursor.df()\n",
 366 |     "toc = time.time()\n",
 367 |     "\n",
 368 |     "print(f\"Time: {toc - tic:.2f}s\")"
 369 |    ]
 370 |   },
 371 |   {
 372 |    "cell_type": "markdown",
 373 |    "metadata": {},
 374 |    "source": [
 375 |     "event_description"
 376 |    ]
 377 |   },
 378 |   {
 379 |    "cell_type": "code",
 380 |    "execution_count": 14,
 381 |    "metadata": {},
 382 |    "outputs": [
 383 |     {
 384 |      "name": "stdout",
 385 |      "output_type": "stream",
 386 |      "text": [
 387 |       "Time: 29.33s\n"
 388 |      ]
 389 |     }
 390 |    ],
 391 |    "source": [
 392 |     "query = f\"\"\"\n",
 393 |     "    SELECT DISTINCT\n",
 394 |     "        event_description\n",
 395 |     "    FROM '{DATABASE}'\n",
 396 |     "\"\"\"\n",
 397 |     "\n",
 398 |     "tic = time.time()\n",
 399 |     "cursor.execute(query)\n",
 400 |     "df_result = cursor.df()\n",
 401 |     "toc = time.time()\n",
 402 |     "\n",
 403 |     "print(f\"Time: {toc - tic:.2f}s\")"
 404 |    ]
 405 |   },
 406 |   {
 407 |    "cell_type": "markdown",
 408 |    "metadata": {},
 409 |    "source": [
 410 |     "### Group By"
 411 |    ]
 412 |   },
 413 |   {
 414 |    "cell_type": "code",
 415 |    "execution_count": 3,
 416 |    "metadata": {},
 417 |    "outputs": [
 418 |     {
 419 |      "data": {
 420 |       "application/vnd.jupyter.widget-view+json": {
 421 |        "model_id": "2941ff10abd0446cb443aafd4e0fc77c",
 422 |        "version_major": 2,
 423 |        "version_minor": 0
 424 |       },
 425 |       "text/plain": [
 426 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
 427 |       ]
 428 |      },
 429 |      "metadata": {},
 430 |      "output_type": "display_data"
 431 |     },
 432 |     {
 433 |      "name": "stdout",
 434 |      "output_type": "stream",
 435 |      "text": [
 436 |       "Time: 6.77s\n"
 437 |      ]
 438 |     }
 439 |    ],
 440 |    "source": [
 441 |     "query = f\"\"\"\n",
 442 |     "    SELECT \n",
 443 |     "        event_system,\n",
 444 |     "        COUNT(*) AS qtd_linhas\n",
 445 |     "    FROM '{DATABASE}'\n",
 446 |     "    GROUP BY event_system\n",
 447 |     "\"\"\"\n",
 448 |     "\n",
 449 |     "tic = time.time()\n",
 450 |     "cursor.execute(query) \n",
 451 |     "df_result = cursor.df()\n",
 452 |     "toc = time.time()\n",
 453 |     "\n",
 454 |     "print(f\"Time: {toc - tic:.2f}s\")"
 455 |    ]
 456 |   },
 457 |   {
 458 |    "cell_type": "code",
 459 |    "execution_count": 4,
 460 |    "metadata": {},
 461 |    "outputs": [
 462 |     {
 463 |      "data": {
 464 |       "text/html": [
 465 |        "<div>\n",
 466 |        "<style scoped>\n",
 467 |        "    .dataframe tbody tr th:only-of-type {\n",
 468 |        "        vertical-align: middle;\n",
 469 |        "    }\n",
 470 |        "\n",
 471 |        "    .dataframe tbody tr th {\n",
 472 |        "        vertical-align: top;\n",
 473 |        "    }\n",
 474 |        "\n",
 475 |        "    .dataframe thead th {\n",
 476 |        "        text-align: right;\n",
 477 |        "    }\n",
 478 |        "</style>\n",
 479 |        "<table border=\"1\" class=\"dataframe\">\n",
 480 |        "  <thead>\n",
 481 |        "    <tr style=\"text-align: right;\">\n",
 482 |        "      <th></th>\n",
 483 |        "      <th>event_system</th>\n",
 484 |        "      <th>qtd_linhas</th>\n",
 485 |        "    </tr>\n",
 486 |        "  </thead>\n",
 487 |        "  <tbody>\n",
 488 |        "    <tr>\n",
 489 |        "      <th>0</th>\n",
 490 |        "      <td>INITJE</td>\n",
 491 |        "      <td>3044304</td>\n",
 492 |        "    </tr>\n",
 493 |        "    <tr>\n",
 494 |        "      <th>1</th>\n",
 495 |        "      <td>VERIFICADOR</td>\n",
 496 |        "      <td>37931</td>\n",
 497 |        "    </tr>\n",
 498 |        "    <tr>\n",
 499 |        "      <th>2</th>\n",
 500 |        "      <td>STE</td>\n",
 501 |        "      <td>394</td>\n",
 502 |        "    </tr>\n",
 503 |        "    <tr>\n",
 504 |        "      <th>3</th>\n",
 505 |        "      <td>LOGD</td>\n",
 506 |        "      <td>17978454</td>\n",
 507 |        "    </tr>\n",
 508 |        "    <tr>\n",
 509 |        "      <th>4</th>\n",
 510 |        "      <td>ADH</td>\n",
 511 |        "      <td>5188</td>\n",
 512 |        "    </tr>\n",
 513 |        "    <tr>\n",
 514 |        "      <th>5</th>\n",
 515 |        "      <td>SA</td>\n",
 516 |        "      <td>784</td>\n",
 517 |        "    </tr>\n",
 518 |        "    <tr>\n",
 519 |        "      <th>6</th>\n",
 520 |        "      <td>SCUE</td>\n",
 521 |        "      <td>39756883</td>\n",
 522 |        "    </tr>\n",
 523 |        "    <tr>\n",
 524 |        "      <th>7</th>\n",
 525 |        "      <td>VPP</td>\n",
 526 |        "      <td>223388</td>\n",
 527 |        "    </tr>\n",
 528 |        "    <tr>\n",
 529 |        "      <th>8</th>\n",
 530 |        "      <td>VO\u0014A</td>\n",
 531 |        "      <td>1</td>\n",
 532 |        "    </tr>\n",
 533 |        "    <tr>\n",
 534 |        "      <th>9</th>\n",
 535 |        "      <td>VOTA</td>\n",
 536 |        "      <td>3879701660</td>\n",
 537 |        "    </tr>\n",
 538 |        "    <tr>\n",
 539 |        "      <th>10</th>\n",
 540 |        "      <td>RED</td>\n",
 541 |        "      <td>76691</td>\n",
 542 |        "    </tr>\n",
 543 |        "    <tr>\n",
 544 |        "      <th>11</th>\n",
 545 |        "      <td>GAP</td>\n",
 546 |        "      <td>262715525</td>\n",
 547 |        "    </tr>\n",
 548 |        "    <tr>\n",
 549 |        "      <th>12</th>\n",
 550 |        "      <td>ATUE</td>\n",
 551 |        "      <td>79788285</td>\n",
 552 |        "    </tr>\n",
 553 |        "  </tbody>\n",
 554 |        "</table>\n",
 555 |        "</div>"
 556 |       ],
 557 |       "text/plain": [
 558 |        "   event_system  qtd_linhas\n",
 559 |        "0        INITJE     3044304\n",
 560 |        "1   VERIFICADOR       37931\n",
 561 |        "2           STE         394\n",
 562 |        "3          LOGD    17978454\n",
 563 |        "4           ADH        5188\n",
 564 |        "5            SA         784\n",
 565 |        "6          SCUE    39756883\n",
 566 |        "7           VPP      223388\n",
 567 |        "8          VO\u0014A           1\n",
 568 |        "9          VOTA  3879701660\n",
 569 |        "10          RED       76691\n",
 570 |        "11          GAP   262715525\n",
 571 |        "12         ATUE    79788285"
 572 |       ]
 573 |      },
 574 |      "execution_count": 4,
 575 |      "metadata": {},
 576 |      "output_type": "execute_result"
 577 |     }
 578 |    ],
 579 |    "source": [
 580 |     "df_result"
 581 |    ]
 582 |   },
 583 |   {
 584 |    "cell_type": "markdown",
 585 |    "metadata": {},
 586 |    "source": [
 587 |     "### Group By + Filtro"
 588 |    ]
 589 |   },
 590 |   {
 591 |    "cell_type": "code",
 592 |    "execution_count": 9,
 593 |    "metadata": {},
 594 |    "outputs": [
 595 |     {
 596 |      "data": {
 597 |       "application/vnd.jupyter.widget-view+json": {
 598 |        "model_id": "fde6505a3b484b28880584a0d5f7bb84",
 599 |        "version_major": 2,
 600 |        "version_minor": 0
 601 |       },
 602 |       "text/plain": [
 603 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
 604 |       ]
 605 |      },
 606 |      "metadata": {},
 607 |      "output_type": "display_data"
 608 |     },
 609 |     {
 610 |      "name": "stdout",
 611 |      "output_type": "stream",
 612 |      "text": [
 613 |       "Time: 7.98s\n"
 614 |      ]
 615 |     }
 616 |    ],
 617 |    "source": [
 618 |     "query = f\"\"\"\n",
 619 |     "    SELECT \n",
 620 |     "        event_type,\n",
 621 |     "        COUNT(*) AS qtd_linhas\n",
 622 |     "    FROM '{DATABASE}'\n",
 623 |     "    WHERE event_system='VOTA' OR event_system='RED'\n",
 624 |     "    GROUP BY event_type\n",
 625 |     "\"\"\"\n",
 626 |     "\n",
 627 |     "tic = time.time()\n",
 628 |     "cursor.execute(query) \n",
 629 |     "df_result = cursor.df()\n",
 630 |     "toc = time.time()\n",
 631 |     "\n",
 632 |     "print(f\"Time: {toc - tic:.2f}s\")"
 633 |    ]
 634 |   },
 635 |   {
 636 |    "cell_type": "code",
 637 |    "execution_count": 10,
 638 |    "metadata": {},
 639 |    "outputs": [
 640 |     {
 641 |      "data": {
 642 |       "text/html": [
 643 |        "<div>\n",
 644 |        "<style scoped>\n",
 645 |        "    .dataframe tbody tr th:only-of-type {\n",
 646 |        "        vertical-align: middle;\n",
 647 |        "    }\n",
 648 |        "\n",
 649 |        "    .dataframe tbody tr th {\n",
 650 |        "        vertical-align: top;\n",
 651 |        "    }\n",
 652 |        "\n",
 653 |        "    .dataframe thead th {\n",
 654 |        "        text-align: right;\n",
 655 |        "    }\n",
 656 |        "</style>\n",
 657 |        "<table border=\"1\" class=\"dataframe\">\n",
 658 |        "  <thead>\n",
 659 |        "    <tr style=\"text-align: right;\">\n",
 660 |        "      <th></th>\n",
 661 |        "      <th>event_type</th>\n",
 662 |        "      <th>qtd_linhas</th>\n",
 663 |        "    </tr>\n",
 664 |        "  </thead>\n",
 665 |        "  <tbody>\n",
 666 |        "    <tr>\n",
 667 |        "      <th>0</th>\n",
 668 |        "      <td>ALERTA</td>\n",
 669 |        "      <td>50460553</td>\n",
 670 |        "    </tr>\n",
 671 |        "    <tr>\n",
 672 |        "      <th>1</th>\n",
 673 |        "      <td>ERRO</td>\n",
 674 |        "      <td>1024682</td>\n",
 675 |        "    </tr>\n",
 676 |        "    <tr>\n",
 677 |        "      <th>2</th>\n",
 678 |        "      <td>INFO</td>\n",
 679 |        "      <td>3828293116</td>\n",
 680 |        "    </tr>\n",
 681 |        "  </tbody>\n",
 682 |        "</table>\n",
 683 |        "</div>"
 684 |       ],
 685 |       "text/plain": [
 686 |        "  event_type  qtd_linhas\n",
 687 |        "0     ALERTA    50460553\n",
 688 |        "1       ERRO     1024682\n",
 689 |        "2       INFO  3828293116"
 690 |       ]
 691 |      },
 692 |      "execution_count": 10,
 693 |      "metadata": {},
 694 |      "output_type": "execute_result"
 695 |     }
 696 |    ],
 697 |    "source": [
 698 |     "df_result"
 699 |    ]
 700 |   },
 701 |   {
 702 |    "cell_type": "markdown",
 703 |    "metadata": {},
 704 |    "source": [
 705 |     "### Verificar se event_id é unico"
 706 |    ]
 707 |   },
 708 |   {
 709 |    "cell_type": "markdown",
 710 |    "metadata": {},
 711 |    "source": [
 712 |     "[WIP] Descrição básica da razão de cada consulta, qual sua função e como ela é utilizada no dia a dia"
 713 |    ]
 714 |   },
 715 |   {
 716 |    "cell_type": "markdown",
 717 |    "metadata": {},
 718 |    "source": [
 719 |     "1 - Usando GroupBy"
 720 |    ]
 721 |   },
 722 |   {
 723 |    "cell_type": "code",
 724 |    "execution_count": 4,
 725 |    "metadata": {},
 726 |    "outputs": [
 727 |     {
 728 |      "data": {
 729 |       "application/vnd.jupyter.widget-view+json": {
 730 |        "model_id": "db88550cf5cf4fc7a508e50d1839f168",
 731 |        "version_major": 2,
 732 |        "version_minor": 0
 733 |       },
 734 |       "text/plain": [
 735 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
 736 |       ]
 737 |      },
 738 |      "metadata": {},
 739 |      "output_type": "display_data"
 740 |     },
 741 |     {
 742 |      "ename": "",
 743 |      "evalue": "",
 744 |      "output_type": "error",
 745 |      "traceback": [
 746 |       "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
 747 |       "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
 748 |       "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
 749 |       "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
 750 |      ]
 751 |     }
 752 |    ],
 753 |    "source": [
 754 |     "query = f\"\"\"\n",
 755 |     "    SELECT\n",
 756 |     "        COUNT(*) \n",
 757 |     "    FROM (\n",
 758 |     "        SELECT \n",
 759 |     "            event_id,\n",
 760 |     "            COUNT(*)\n",
 761 |     "        FROM '{DATABASE}'\n",
 762 |     "        GROUP BY event_id\n",
 763 |     "        HAVING COUNT(*) > 1\n",
 764 |     "    )\n",
 765 |     "\"\"\"\n",
 766 |     "\n",
 767 |     "tic = time.time()\n",
 768 |     "cursor.execute(query) \n",
 769 |     "df_result = cursor.df()\n",
 770 |     "toc = time.time()\n",
 771 |     "\n",
 772 |     "print(f\"Time: {toc - tic:.2f}s\")"
 773 |    ]
 774 |   },
 775 |   {
 776 |    "cell_type": "markdown",
 777 |    "metadata": {},
 778 |    "source": [
 779 |     "2 - Usando Windows Function"
 780 |    ]
 781 |   },
 782 |   {
 783 |    "cell_type": "code",
 784 |    "execution_count": 3,
 785 |    "metadata": {},
 786 |    "outputs": [
 787 |     {
 788 |      "data": {
 789 |       "application/vnd.jupyter.widget-view+json": {
 790 |        "model_id": "f529e0b4f1e149e6bdab3ade2e1d665f",
 791 |        "version_major": 2,
 792 |        "version_minor": 0
 793 |       },
 794 |       "text/plain": [
 795 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
 796 |       ]
 797 |      },
 798 |      "metadata": {},
 799 |      "output_type": "display_data"
 800 |     }
 801 |    ],
 802 |    "source": [
 803 |     "query = f\"\"\"\n",
 804 |     "    SELECT\n",
 805 |     "        event_id, qtd_linhas\n",
 806 |     "    FROM (\n",
 807 |     "        SELECT \n",
 808 |     "            event_id,\n",
 809 |     "            COUNT(*) OVER( PARTITION BY event_id ) AS qtd_linhas\n",
 810 |     "        FROM '{DATABASE}'\n",
 811 |     "    ) _\n",
 812 |     "    WHERE qtd_linhas > 1\n",
 813 |     "    LIMIT 1\n",
 814 |     "\"\"\"\n",
 815 |     "\n",
 816 |     "tic = time.time()\n",
 817 |     "cursor.execute(query) \n",
 818 |     "df_result = cursor.df()\n",
 819 |     "toc = time.time()\n",
 820 |     "\n",
 821 |     "print(f\"Time: {toc - tic:.2f}s\")"
 822 |    ]
 823 |   },
 824 |   {
 825 |    "cell_type": "markdown",
 826 |    "metadata": {},
 827 |    "source": [
 828 |     "3 - Usando Count Distinct"
 829 |    ]
 830 |   },
 831 |   {
 832 |    "cell_type": "code",
 833 |    "execution_count": 5,
 834 |    "metadata": {},
 835 |    "outputs": [
 836 |     {
 837 |      "data": {
 838 |       "application/vnd.jupyter.widget-view+json": {
 839 |        "model_id": "71e66f8947c24b12b66fb5b11196bf63",
 840 |        "version_major": 2,
 841 |        "version_minor": 0
 842 |       },
 843 |       "text/plain": [
 844 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
 845 |       ]
 846 |      },
 847 |      "metadata": {},
 848 |      "output_type": "display_data"
 849 |     }
 850 |    ],
 851 |    "source": [
 852 |     "query = f\"\"\"\n",
 853 |     "    SELECT COUNT(*)\n",
 854 |     "    FROM (\n",
 855 |     "        SELECT DISTINCT event_id \n",
 856 |     "        FROM '{DATABASE}'\n",
 857 |     "    ) _\n",
 858 |     "\"\"\"\n",
 859 |     "\n",
 860 |     "tic = time.time()\n",
 861 |     "cursor.execute(query) \n",
 862 |     "df_result = cursor.df()\n",
 863 |     "toc = time.time()\n",
 864 |     "\n",
 865 |     "print(f\"Time: {toc - tic:.2f}s\")"
 866 |    ]
 867 |   },
 868 |   {
 869 |    "cell_type": "markdown",
 870 |    "metadata": {},
 871 |    "source": [
 872 |     "4 - Usando Distinct + write to disk"
 873 |    ]
 874 |   },
 875 |   {
 876 |    "cell_type": "code",
 877 |    "execution_count": 4,
 878 |    "metadata": {},
 879 |    "outputs": [
 880 |     {
 881 |      "data": {
 882 |       "application/vnd.jupyter.widget-view+json": {
 883 |        "model_id": "f70e43b1e2f24f38adebfcd80815d560",
 884 |        "version_major": 2,
 885 |        "version_minor": 0
 886 |       },
 887 |       "text/plain": [
 888 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
 889 |       ]
 890 |      },
 891 |      "metadata": {},
 892 |      "output_type": "display_data"
 893 |     },
 894 |     {
 895 |      "ename": "",
 896 |      "evalue": "",
 897 |      "output_type": "error",
 898 |      "traceback": [
 899 |       "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
 900 |       "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
 901 |       "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
 902 |       "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
 903 |      ]
 904 |     }
 905 |    ],
 906 |    "source": [
 907 |     "query = f\"\"\"\n",
 908 |     "    COPY (\n",
 909 |     "        SELECT DISTINCT event_id \n",
 910 |     "        FROM '{DATABASE}'\n",
 911 |     "    ) TO 'event_id.parquet' \n",
 912 |     "    (FORMAT 'parquet')\n",
 913 |     "\"\"\"\n",
 914 |     "\n",
 915 |     "tic = time.time()\n",
 916 |     "cursor.execute(query) \n",
 917 |     "df_result = cursor.df()\n",
 918 |     "toc = time.time()\n",
 919 |     "\n",
 920 |     "print(f\"Time: {toc - tic:.2f}s\")"
 921 |    ]
 922 |   },
 923 |   {
 924 |    "cell_type": "markdown",
 925 |    "metadata": {},
 926 |    "source": [
 927 |     "### Distinct mensagens"
 928 |    ]
 929 |   },
 930 |   {
 931 |    "cell_type": "markdown",
 932 |    "metadata": {},
 933 |    "source": [
 934 |     "Primeira aproximação"
 935 |    ]
 936 |   },
 937 |   {
 938 |    "cell_type": "code",
 939 |    "execution_count": 15,
 940 |    "metadata": {},
 941 |    "outputs": [
 942 |     {
 943 |      "data": {
 944 |       "application/vnd.jupyter.widget-view+json": {
 945 |        "model_id": "f37594cbd3e24d858c8a99e7f2841d9e",
 946 |        "version_major": 2,
 947 |        "version_minor": 0
 948 |       },
 949 |       "text/plain": [
 950 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
 951 |       ]
 952 |      },
 953 |      "metadata": {},
 954 |      "output_type": "display_data"
 955 |     },
 956 |     {
 957 |      "name": "stdout",
 958 |      "output_type": "stream",
 959 |      "text": [
 960 |       "Time: 159.81s\n"
 961 |      ]
 962 |     },
 963 |     {
 964 |      "data": {
 965 |       "text/plain": [
 966 |        "159.71511435508728"
 967 |       ]
 968 |      },
 969 |      "execution_count": 15,
 970 |      "metadata": {},
 971 |      "output_type": "execute_result"
 972 |     }
 973 |    ],
 974 |    "source": [
 975 |     "query = f\"\"\"\n",
 976 |     "    SELECT DISTINCT\n",
 977 |     "        regexp_replace(event_description, '[0-9]', 'X', 'g') AS event_description\n",
 978 |     "    FROM '{DATABASE}'\n",
 979 |     "\"\"\"\n",
 980 |     "\n",
 981 |     "duration, df_result = execute_query_and_calculate_time(cursor, query, return_df=True)\n",
 982 |     "print(f\"Time: {toc - tic:.2f}s\")\n",
 983 |     "df_result"
 984 |    ]
 985 |   },
 986 |   {
 987 |    "cell_type": "code",
 988 |    "execution_count": 22,
 989 |    "metadata": {},
 990 |    "outputs": [
 991 |     {
 992 |      "data": {
 993 |       "application/vnd.jupyter.widget-view+json": {
 994 |        "model_id": "34112b0f74864cbf812b851009072faf",
 995 |        "version_major": 2,
 996 |        "version_minor": 0
 997 |       },
 998 |       "text/plain": [
 999 |        "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
1000 |       ]
1001 |      },
1002 |      "metadata": {},
1003 |      "output_type": "display_data"
1004 |     }
1005 |    ],
1006 |    "source": [
1007 |     "# Identificador da mídia de carga\n",
1008 |     "# Serial da MI copiada da MV da urna original\n",
1009 |     "# Serial de votação da MV\n",
1010 |     "# Verificação de assinatura de dado por etapa\n",
1011 |     "\n",
1012 |     "query = f\"\"\"\n",
1013 |     "    SELECT DISTINCT\n",
1014 |     "        CASE\n",
1015 |     "            WHEN event_description ILIKE 'Identificador da mídia de carga%' \n",
1016 |     "            THEN 'Identificador da mídia de carga'\n",
1017 |     "\n",
1018 |     "            WHEN event_description ILIKE 'Serial da MI copiada da MV da urna original%' \n",
1019 |     "            THEN 'Serial da MI copiada da MV da urna original'\n",
1020 |     "\n",
1021 |     "            WHEN event_description ILIKE 'Serial de votação da MV%' \n",
1022 |     "            THEN 'Serial de votação da MV'\n",
1023 |     "\n",
1024 |     "            WHEN event_description ILIKE 'Verificação de assinatura de dado por etapa%' \n",
1025 |     "            THEN 'Verificação de assinatura de dado por etapa'\n",
1026 |     "\n",
1027 |     "            WHEN event_description ILIKE 'Número de série da MR%'\n",
1028 |     "            THEN 'Número de série da MR'\n",
1029 |     "            \n",
1030 |     "            ELSE regexp_replace(event_description, '[0-9]', 'X', 'g') \n",
1031 |     "        END AS event_description\n",
1032 |     "    FROM '{DATABASE}'\n",
1033 |     "\"\"\"\n",
1034 |     "\n",
1035 |     "df_result, duration = execute_query_and_calculate_time(cursor, query, return_df=True)"
1036 |    ]
1037 |   },
1038 |   {
1039 |    "cell_type": "code",
1040 |    "execution_count": 23,
1041 |    "metadata": {},
1042 |    "outputs": [
1043 |     {
1044 |      "name": "stdout",
1045 |      "output_type": "stream",
1046 |      "text": [
1047 |       "Time: 478.24s\n",
1048 |       "Number of rows: 1,391\n"
1049 |      ]
1050 |     }
1051 |    ],
1052 |    "source": [
1053 |     "print(f\"Time: {duration:.2f}s\")\n",
1054 |     "print(f\"Number of rows: {df_result.shape[0]:,}\")\n",
1055 |     "df_result.to_csv('event_description.csv', index=False)"
1056 |    ]
1057 |   },
1058 |   {
1059 |    "cell_type": "code",
1060 |    "execution_count": null,
1061 |    "metadata": {},
1062 |    "outputs": [],
1063 |    "source": []
1064 |   }
1065 |  ],
1066 |  "metadata": {
1067 |   "kernelspec": {
1068 |    "display_name": "Python 3 (ipykernel)",
1069 |    "language": "python",
1070 |    "name": "python3"
1071 |   },
1072 |   "language_info": {
1073 |    "codemirror_mode": {
1074 |     "name": "ipython",
1075 |     "version": 3
1076 |    },
1077 |    "file_extension": ".py",
1078 |    "mimetype": "text/x-python",
1079 |    "name": "python",
1080 |    "nbconvert_exporter": "python",
1081 |    "pygments_lexer": "ipython3",
1082 |    "version": "3.11.5"
1083 |   }
1084 |  },
1085 |  "nbformat": 4,
1086 |  "nbformat_minor": 2
1087 | }
1088 | 


--------------------------------------------------------------------------------