├── .gitignore
├── ANAC-data-engineering-project
    ├── ANAC-data-engineering-project.md
    ├── Dockerfile.airflow
    ├── dags
    │   ├── __pycache__
    │   │   └── dag_anac_pipeline.cpython-310.pyc
    │   ├── config
    │   │   ├── __pycache__
    │   │   │   └── params.cpython-310.pyc
    │   │   └── params.py
    │   ├── dag_anac_pipeline.py
    │   └── tasks
    │   │   ├── __pycache__
    │   │       └── download_csv.cpython-310.pyc
    │   │   ├── bronze_para_silver.py
    │   │   ├── download_csv.py
    │   │   └── silver_para_gold.py
    ├── docker-compose.yml
    ├── logs
    │   ├── dag_id=anac_pipeline_dag
    │   │   ├── run_id=manual__2025-04-07T013752.587480+0000
    │   │   │   └── task_id=baixar_csv_anac
    │   │   │   │   ├── attempt=1.log
    │   │   │   │   ├── attempt=2.log
    │   │   │   │   ├── attempt=3.log
    │   │   │   │   ├── attempt=4.log
    │   │   │   │   └── attempt=5.log
    │   │   └── run_id=scheduled__2024-01-01T000000+0000
    │   │   │   └── task_id=baixar_csv_anac
    │   │   │       └── attempt=1.log
    │   ├── dag_processor_manager
    │   │   └── dag_processor_manager.log
    │   └── scheduler
    │   │   └── 2025-04-07
    │   │       └── dag_anac_pipeline.py.log
    └── requirements.txt
├── ApacheHopProject
    ├── Apache Hop Project Description.md
    ├── infrastructure
    │   └── jenkins
    │   │   ├── docker-compose.yml
    │   │   └── jenkins_hop_project.dockerfile
    ├── pipelines
    │   ├── pipeline1_load_date_parameters.hpl
    │   ├── pipeline2_reading_xls_to_database_error_handling.hpl
    │   ├── pipeline3_worldwide_wines_training.hpl
    │   ├── pipeline4_table_input_lookup.hpl
    │   └── pipeline5_basic_api.hpl
    ├── scripts_py
    │   └── fake_py.py
    ├── scripts_sql
    │   └── skeam_stage_parameters_dates_auxiliary.sql
    └── source_files
    │   ├── fake_data.xlsx
    │   ├── source_data_pipeline_2.xlsx
    │   └── worldwide_wines.csv
├── Apache_Airflow_Marc_Lamberti
    ├── The Complete Hands-On Introduction to Apache Airflow.md
    └── dags
    │   ├── consumer.py
    │   ├── group_dag.py
    │   ├── group_dag_subdags.py
    │   ├── groups
    │       ├── group_downloads.py
    │       └── group_transforms.py
    │   ├── producer.py
    │   ├── subdags
    │       ├── subdag_downloads.py
    │       └── subdag_transforms.py
    │   └── user_processing.py
├── Books
    └── FundamentalsOfDataEngineering
    │   ├── Fundamentals of Data Engineering -  Joe Reis & Matt Housley  (PT-BR).md
    │   └── Fundamentals of Data Engineering -  Joe Reis & Matt Housley (ENG).md
├── DIO-GitHub-Certification-Formation
    └── DIO-GitHub-Certification-Formation.md
├── Data_Engineering_Course_XPE
    ├── Data Engineering Course XPE.md
    └── python
    │   └── numpy_testing_bootcamp_jpmuller.ipynb
├── Hive_Impala
    ├── Hive and Impala.md
    └── Scripts
    │   ├── hive_commands.sh
    │   └── hiveql_samples.sql
├── IHateTechReview
    └── README.md
├── LICENSE
├── README.md
├── SQLReviewCoursewithChatGPT
    ├── SQL Review Course with ChatGPT.md
    └── SQL Scripts
    │   └── 100 SQL Junior Questions + Solutions.sql
└── dbt-pokemon-project
    ├── .gitignore
    ├── dbt_pkmn
        ├── .gitignore
        ├── README.md
        ├── analyses
        │   └── .gitkeep
        ├── dbt_project.yml
        ├── dev.duckdb
        ├── duckdb_config
        │   ├── fact_sales_data.csv
        │   ├── import_data_duckdb.py
        │   ├── import_dim_time.py
        │   ├── import_fact_sales.py
        │   └── selecting_tables.py
        ├── macros
        │   └── .gitkeep
        ├── models
        │   ├── dimCustomersView.sql
        │   ├── dimProdCategoriesView.sql
        │   ├── dimProductsView.sql
        │   ├── dimTimeView.sql
        │   ├── factSalesView.sql
        │   ├── monthlySales.sql
        │   ├── mostQuantitySold.sql
        │   ├── mostSoldCategories.sql
        │   ├── mostSoldProducts.sql
        │   ├── salesConsolidated.sql
        │   ├── sources.yml
        │   └── topBuyers.sql
        ├── seeds
        │   └── .gitkeep
        ├── snapshots
        │   └── .gitkeep
        ├── temp.py
        └── tests
        │   └── .gitkeep
    └── pokemon-dbt-testing-project.md


/.gitignore:
--------------------------------------------------------------------------------
1 | pics


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/ANAC-data-engineering-project.md:
--------------------------------------------------------------------------------
  1 | ![GitHub](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)
  2 | ![Airflow](https://img.shields.io/badge/Airflow-017CEE?style=for-the-badge&logo=Apache%20Airflow&logoColor=white)
  3 | ![Jupyter](https://img.shields.io/badge/Jupyter-F37626.svg?&style=for-the-badge&logo=Jupyter&logoColor=white)
  4 | ![Python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)
  5 | ![Docker](https://img.shields.io/badge/Docker-2CA5E0?style=for-the-badge&logo=docker&logoColor=white)
  6 | ![Spark](https://img.shields.io/badge/Apache_Spark-FFFFFF?style=for-the-badge&logo=apachespark&logoColor=#E35A16)
  7 | ![PBI](https://img.shields.io/badge/PowerBI-F2C811?style=for-the-badge&logo=Power%20BI&logoColor=white)
  8 | ![GPT](https://img.shields.io/badge/ChatGPT%204o-000000?style=for-the-badge&logo=openai&logoColor=white&label=)
  9 | 
 10 | <div align="left">
 11 |   <img src="https://github.com/user-attachments/assets/50e1afad-2495-481b-ac3b-a07d3e8fbfda" alt="Badge" width="150">
 12 | </div>
 13 | 
 14 | <div align="right">
 15 |   <img src="https://github.com/user-attachments/assets/a0d71de3-788b-474a-8369-ec40f44901a5" alt="Badge" width="150">
 16 | </div>
 17 | 
 18 | # Projeto ANAC - Engenharia de Dados
 19 | 
 20 | Fala, pessoal, tudo certinho? Aqui é o João (aka `Shamslux`). Gostaria de compartilhar com vocês um pouco sobre esse mais novo projeto
 21 | que criei. Nele eu utilizei dados abertos do governo federal, em especial, dados sobre operações aéreas sobre território nacional, ao
 22 | longo de 25 anos. 
 23 | 
 24 | Antes de prosseguir em mais explicações, contudo, gostaria de inormar que subi um vídeo no YouTube para facilitar a explicação do projeto.
 25 | Sei que nem todos vão ter o tempo (ou paciência, rs) para ler a documentação aqui, então podem assistir ao meu vídeo (em 2x, de preferência).
 26 | 
 27 | Segue o link abaixo (só clicar nessa imagem que ela vai levá-lo ao YouTube:
 28 | 
 29 | [![Apresentação do Projeto ANAC](https://img.youtube.com/vi/NVXTRQ6NOS4/0.jpg)](https://youtu.be/NVXTRQ6NOS4)
 30 | 
 31 | # Tecnologias, metodologias e ferramentas
 32 | 
 33 | De forma suscinta, utilizei:
 34 | 
 35 | - Airflow (orquestrador dos pipelines) + PySpark (para rodar direto no container do Airflow)
 36 | - Spark + Jupyter (para testar com notebooks as futuras tasks da DAG)
 37 | - Python (base para o Airflow, Jupyter e PySpark)
 38 | - Docker (basicamente nossa infraestrutura toda nele)
 39 | 
 40 | A ideia foi criar um pequeno Lakehouse, organizado em camadas (bronze, silver e gold) para que o processo de extração dos dados da ANAC seguisse uma lógica
 41 | próxima ao que vemos nos pipelines do mundo real, isto é, uma extração do CSV da base da ANAC, tratamentos realizados em código e aplicação da metodologia
 42 | de inteligência de negócios (BI) para obtenção das informações e insights possíveis.
 43 | 
 44 | # DataOps - A infraestrutura do nosso projeto
 45 | 
 46 | O projeto todo usou o recurso do Docker em minha máquina local. Aqui estão os arquivos usados:
 47 | 
 48 | ## Docker Compose
 49 | 
 50 | ```yaml
 51 | version: "3.8"
 52 | 
 53 | #########################
 54 | #------ POSTGRES -------#
 55 | #########################
 56 | services:
 57 | 
 58 |   postgres:
 59 |     image: postgres:14
 60 |     container_name: pg-anac
 61 |     restart: always
 62 |     environment:
 63 |       POSTGRES_USER: airflow
 64 |       POSTGRES_PASSWORD: airflow
 65 |       POSTGRES_DB: airflow
 66 |     ports:
 67 |       - "5432:5432"
 68 |     volumes:
 69 |       - pgdata:/var/lib/postgresql/data
 70 | 
 71 | #########################
 72 | #-------- AIRFLOW -------#
 73 | #########################
 74 |   airflow:
 75 |     build:
 76 |       context: .
 77 |       dockerfile: Dockerfile.airflow
 78 |     container_name: airflow-anac
 79 |     depends_on:
 80 |       - postgres
 81 |     restart: always
 82 |     user: "0:0"  
 83 |     ports:
 84 |       - "8080:8080"
 85 |     environment:
 86 |       AIRFLOW__CORE__EXECUTOR: LocalExecutor
 87 |       AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
 88 |       AIRFLOW__CORE__FERNET_KEY: ''
 89 |       AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'False'
 90 |       AIRFLOW__CORE__LOAD_EXAMPLES: 'False'
 91 |     volumes:
 92 |       - ./dags:/opt/airflow/dags
 93 |       - ./logs:/opt/airflow/logs
 94 |       - ./plugins:/opt/airflow/plugins
 95 |       - ./data:/opt/airflow/data
 96 |     command: >
 97 |       bash -c "
 98 |         airflow db upgrade &&
 99 |         airflow users create --username admin --firstname Admin --lastname User --role Admin --password admin --email admin@example.com &&
100 |         airflow scheduler & 
101 |         exec airflow webserver"
102 | 
103 | #########################
104 | #-------- SPARK --------#
105 | #########################
106 |   spark:
107 |     image: bitnami/spark:latest
108 |     container_name: spark-anac
109 |     ports:
110 |       - "4040:4040"  
111 |     volumes:
112 |       - ./data:/data
113 |     environment:
114 |       - SPARK_MODE=master
115 | 
116 | #########################
117 | #------- JUPYTER -------#
118 | #########################
119 |   jupyter:
120 |     image: jupyter/pyspark-notebook
121 |     container_name: jupyter-anac
122 |     ports:
123 |       - "8888:8888"
124 |     volumes:
125 |       - ./data:/home/jovyan/data
126 |       - ./notebooks:/home/jovyan/work
127 |     environment:
128 |       - PYSPARK_PYTHON=python3
129 |       - PYSPARK_DRIVER_PYTHON=jupyter
130 |       - PYSPARK_DRIVER_PYTHON_OPTS=notebook
131 |       - SPARK_OPTS=--driver-memory 2g
132 |     depends_on:
133 |       - spark
134 | 
135 | #########################
136 | #------ VOLUMES --------#
137 | #########################
138 | volumes:
139 |   pgdata:
140 | ```
141 | ## Docker File Airflow
142 | ```yaml
143 | FROM apache/airflow:2.8.1-python3.10
144 | 
145 | # Modo root temporariamente para instalar o Java
146 | USER root
147 | 
148 | # Instala o OpenJDK 17 (para compatibilidade com PySpark)
149 | RUN apt-get update && \
150 |     apt-get install -y openjdk-17-jdk && \
151 |     apt-get clean && \
152 |     rm -rf /var/lib/apt/lists/*
153 | 
154 | # Define variáveis de ambiente do Java
155 | ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
156 | ENV PATH="${JAVA_HOME}/bin:${PATH}"
157 | ENV PIP_DEFAULT_TIMEOUT=100
158 | 
159 | # Copia o requirements.txt
160 | COPY requirements.txt /requirements.txt
161 | 
162 | # Volta ao usuário padrão do Airflow
163 | USER airflow
164 | 
165 | # Instala primeiro o PySpark separadamente para evitar timeout
166 | RUN pip install --no-cache-dir pyspark==3.5.0 && \
167 |     pip install --no-cache-dir -r /requirements.txt
168 | ```
169 | ## Requirements
170 | ```txt
171 | requests            
172 | pandas              
173 | pyarrow             
174 | openpyxl            
175 | lxml                
176 | beautifulsoup4      
177 | python-dotenv
178 | pyspark       
179 | holidays
180 | ```
181 | Eu cheguei sim a ter problemas com o ambiente, mas fui buscando soluções. Em geral, usei a IA generativa (GPT 4o) para me auxiliar e fui conduzindo essa IA para resolver os problemas (ela sozinha não dá conta de resolver, deixando logo a dica, tem que guiá-la e prestar atenção).
182 | O bom desses "erros" durante esses projetos de estudo é justamente porque, na vida real, é muito comum a existência de erros e vamos precisar lidar com eles quase diariamente.
183 | 
184 | # Airflow
185 | 
186 | ## DAG
187 | 
188 | Vamos ver agora a estrutura da nossa DAG:
189 | 
190 | ```python
191 | from airflow import DAG
192 | from airflow.operators.python import PythonOperator
193 | from airflow.operators.dummy import DummyOperator
194 | from datetime import datetime
195 | 
196 | from tasks.download_csv import baixar_csv_anac
197 | from tasks.bronze_para_silver import bronze_para_silver
198 | from tasks.silver_para_gold import silver_para_gold
199 | from config.params import URL_INDEX_ANAC, CSV_ANAC
200 | 
201 | # Definição da DAG
202 | with DAG(
203 |     dag_id="anac_pipeline_dag",
204 |     start_date=datetime(2024, 1, 1),
205 |     schedule_interval="@once",
206 |     catchup=False,
207 |     tags=["anac", "dados", "airflow"],
208 |     description="Pipeline de dados da ANAC com Airflow",
209 | ) as dag:
210 | 
211 |     # Marca visual de início
212 |     begin = DummyOperator(task_id="begin")
213 | 
214 |     # Tarefa de download da ANAC
215 |     task_baixar_csv = PythonOperator(
216 |         task_id="baixar_csv_anac",
217 |         python_callable=baixar_csv_anac,
218 |         op_args=[URL_INDEX_ANAC, CSV_ANAC],
219 |     )
220 | 
221 |     # Bronze -> Silver
222 |     task_transformar_para_silver = PythonOperator(
223 |         task_id="bronze_para_silver",
224 |         python_callable=bronze_para_silver,
225 |     )
226 | 
227 |     # Silver -> Gold
228 |     task_transformar_para_gold = PythonOperator(
229 |         task_id="silver_para_gold",
230 |         python_callable=silver_para_gold
231 |     )
232 | 
233 | 
234 |     # Marca visual de fim
235 |     end = DummyOperator(task_id="end")
236 | 
237 |     # Definição da ordem de execução
238 |     begin >> task_baixar_csv >> task_transformar_para_silver >> task_transformar_para_gold >> end
239 | ```
240 | 
241 | ## Tasks
242 | 
243 | Agora, vejamos as tarefas:
244 | 
245 | ### 1ª tarefa - Baixar o CSV da ANAC
246 | 
247 | ```python
248 | import os
249 | import requests
250 | from bs4 import BeautifulSoup
251 | 
252 | def baixar_csv_anac(pagina_index_url: str, caminho_salvar: str):
253 |     response = requests.get(pagina_index_url)
254 |     response.raise_for_status()
255 | 
256 |     soup = BeautifulSoup(response.text, 'html.parser')
257 | 
258 |     link_csv = None
259 |     for link in soup.find_all('a'):
260 |         href = link.get('href')
261 |         if href and href.lower().endswith("dados_estatisticos.csv"):
262 |             link_csv = href
263 |             break
264 | 
265 |     if not link_csv:
266 |         raise Exception("Arquivo CSV não encontrado na página!")
267 | 
268 |     if not pagina_index_url.endswith("/"):
269 |         pagina_index_url += "/"
270 |     url_csv = pagina_index_url + link_csv
271 | 
272 |     print(f"[INFO] Baixando arquivo de: {url_csv}")
273 |     print(f"[DEBUG] Salvando em: {caminho_salvar}")
274 | 
275 |     os.makedirs(os.path.dirname(caminho_salvar), exist_ok=True)
276 | 
277 |     csv_response = requests.get(url_csv, stream=True)
278 |     csv_response.raise_for_status()
279 | 
280 |     with open(caminho_salvar, "wb") as f:
281 |         for chunk in csv_response.iter_content(chunk_size=1048576):  # 1MB
282 |             if chunk:
283 |                 f.write(chunk)
284 | 
285 |     print(f"[SUCESSO] CSV salvo em {caminho_salvar}")
286 | ```
287 | ### 2ª tarefa - Tratar o conjunto de dados
288 | 
289 | ```python
290 | # tasks/bronze_para_silver.py
291 | 
292 | import os
293 | import pandas as pd
294 | from pyspark.sql import SparkSession
295 | from pyspark.sql.functions import col, isnan, when, regexp_replace, trim, to_date, concat_ws, lit, lpad
296 | from pyspark.sql.types import StringType, DoubleType, IntegerType
297 | 
298 | from config.params import BRONZE_DIR, SILVER_DIR
299 | 
300 | def bronze_para_silver():
301 |     print("🚀 Iniciando transformação da camada Bronze para Silver...")
302 | 
303 |     caminho_csv = os.path.join(BRONZE_DIR, "Dados_Estatisticos.csv")
304 | 
305 |     print("📚 Lendo e corrigindo CSV com Pandas (ajuste da primeira linha)...")
306 |     df_pandas = pd.read_csv(caminho_csv, sep=";", skiprows=1)
307 |     csv_corrigido = caminho_csv.replace(".csv", "_limpo.csv")
308 |     df_pandas.to_csv(csv_corrigido, sep=";", index=False)
309 | 
310 |     print("✨ Inicializando SparkSession...")
311 |     spark = SparkSession.builder \
312 |         .appName("Limpeza e transformação - ANAC") \
313 |         .getOrCreate()
314 | 
315 |     print("📂 Lendo CSV corrigido com Spark...")
316 |     df_anac = spark.read.csv(csv_corrigido, sep=";", header=True, inferSchema=True)
317 | 
318 |     print("🧪 Corrigindo tipos de dados e tratando vírgulas como ponto decimal...")
319 |     df_anac = df_anac.withColumn("PASSAGEIROS_PAGOS", col("PASSAGEIROS_PAGOS").cast("int"))
320 |     df_anac = df_anac.withColumn("PASSAGEIROS_GRATIS", col("PASSAGEIROS_GRATIS").cast("int"))
321 |     df_anac = df_anac.withColumn("DECOLAGENS", col("DECOLAGENS").cast("int"))
322 |     df_anac = df_anac.withColumn("HORAS_VOADAS", regexp_replace("HORAS_VOADAS", ",", ".").cast(DoubleType()))
323 | 
324 |     print("🔍 Tratando valores nulos e ausentes...")
325 |     substituicoes = {}
326 |     for field in df_anac.schema.fields:
327 |         if field.nullable:
328 |             if isinstance(field.dataType, StringType):
329 |                 substituicoes[field.name] = "SEM REGISTRO"
330 |             elif isinstance(field.dataType, DoubleType):
331 |                 substituicoes[field.name] = 0.0
332 |             elif isinstance(field.dataType, IntegerType):
333 |                 substituicoes[field.name] = 0
334 | 
335 |     for coluna in substituicoes:
336 |         df_anac = df_anac.withColumn(
337 |             coluna,
338 |             when(isnan(col(coluna)), None).otherwise(col(coluna))
339 |         ).fillna({coluna: substituicoes[coluna]})
340 | 
341 |     print("✂️ Aplicando `trim()` em colunas textuais para remover espaços...")
342 |     for field in df_anac.schema.fields:
343 |         if isinstance(field.dataType, StringType):
344 |             df_anac = df_anac.withColumn(field.name, trim(col(field.name)))
345 | 
346 |     print("📅 Criando coluna de data completa (DATA)...")
347 |     df_anac = df_anac.withColumn(
348 |         "DATA",
349 |         to_date(
350 |             concat_ws("-", col("ANO"), lpad(col("MES").cast("string"), 2, "0"), lit("01")),
351 |             "yyyy-MM-dd"
352 |         )
353 |     )
354 | 
355 |     print("💾 Salvando dados tratados na camada Silver particionada por ANO e MES...")
356 |     df_anac.write.mode("overwrite").partitionBy("ANO", "MES").parquet(
357 |         os.path.join(SILVER_DIR, "operacoes_anac_partitioned")
358 |     )
359 | 
360 |     spark.stop()
361 |     print("✅ Transformação Bronze → Silver concluída com sucesso.")
362 | ```
363 | 
364 | ### 3ª tarefa - Criando dimensões e fatos
365 | 
366 | ```python
367 | # tasks/gold_transform.py
368 | 
369 | from airflow.decorators import task
370 | from pyspark.sql import SparkSession
371 | from pyspark.sql.functions import (
372 |     col, monotonically_increasing_id, to_date, dayofmonth, month, year, quarter,
373 |     date_format, weekofyear, when, lit, udf
374 | )
375 | from pyspark.sql.types import BooleanType, StringType
376 | import holidays
377 | import os
378 | 
379 | BASE_DATA_DIR = "/opt/airflow/data"
380 | SILVER_DIR = os.path.join(BASE_DATA_DIR, "silver")
381 | GOLD_DIR   = os.path.join(BASE_DATA_DIR, "gold")
382 | 
383 | 
384 | def silver_para_gold():
385 |     print("🚀 Inicializando SparkSession...")
386 |     spark = SparkSession.builder.appName("ANAC - Camada Gold").getOrCreate()
387 |     spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
388 | 
389 |     print("📂 Lendo dados da camada Silver...")
390 |     df = spark.read.parquet(os.path.join(SILVER_DIR, "operacoes_anac_partitioned"))
391 | 
392 |     # ───── DIMENSÃO EMPRESA ─────
393 |     print("📌 Criando dimensão EMPRESA...")
394 |     dim_empresa = df.select("EMPRESA_SIGLA", "EMPRESA_NOME", "EMPRESA_NACIONALIDADE").dropDuplicates()
395 |     dim_empresa = dim_empresa.withColumn("ID_EMPRESA", monotonically_increasing_id())
396 |     dim_empresa = dim_empresa.select("ID_EMPRESA", "EMPRESA_SIGLA", "EMPRESA_NOME", "EMPRESA_NACIONALIDADE")
397 |     dim_empresa.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_empresa"))
398 | 
399 |     # ───── DIMENSÃO AEROPORTO ─────
400 |     print("📌 Criando dimensão AEROPORTO...")
401 |     origem = df.select(
402 |         col("AEROPORTO_DE_ORIGEM_SIGLA").alias("AEROPORTO_ICAO"),
403 |         col("AEROPORTO_DE_ORIGEM_NOME").alias("AEROPORTO_NOME"),
404 |         col("AEROPORTO_DE_ORIGEM_UF").alias("UF"),
405 |         col("AEROPORTO_DE_ORIGEM_REGIAO").alias("REGIAO"),
406 |         col("AEROPORTO_DE_ORIGEM_PAIS").alias("PAIS"),
407 |         col("AEROPORTO_DE_ORIGEM_CONTINENTE").alias("CONTINENTE")
408 |     )
409 |     destino = df.select(
410 |         col("AEROPORTO_DE_DESTINO_SIGLA").alias("AEROPORTO_ICAO"),
411 |         col("AEROPORTO_DE_DESTINO_NOME").alias("AEROPORTO_NOME"),
412 |         col("AEROPORTO_DE_DESTINO_UF").alias("UF"),
413 |         col("AEROPORTO_DE_DESTINO_REGIAO").alias("REGIAO"),
414 |         col("AEROPORTO_DE_DESTINO_PAIS").alias("PAIS"),
415 |         col("AEROPORTO_DE_DESTINO_CONTINENTE").alias("CONTINENTE")
416 |     )
417 |     dim_aeroporto = origem.union(destino).dropDuplicates()
418 |     dim_aeroporto = dim_aeroporto.withColumn("ID_AEROPORTO", monotonically_increasing_id())
419 |     dim_aeroporto = dim_aeroporto.select(
420 |         "ID_AEROPORTO", "AEROPORTO_ICAO", "AEROPORTO_NOME", "UF", "REGIAO", "PAIS", "CONTINENTE"
421 |     )
422 |     dim_aeroporto.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_aeroporto"))
423 | 
424 |     # ───── DIMENSÃO TEMPO ─────
425 |     print("📌 Criando dimensão TEMPO...")
426 |     feriados_br = holidays.Brazil()
427 | 
428 |     @udf(BooleanType())
429 |     def is_feriado(data):
430 |         return data in feriados_br if data else False
431 | 
432 |     @udf(StringType())
433 |     def estacao_do_ano(data):
434 |         if not data:
435 |             return None
436 |         d, m = data.day, data.month
437 |         if (m == 12 and d >= 21) or m in [1, 2] or (m == 3 and d < 20):
438 |             return "Verão"
439 |         elif (m == 3 and d >= 20) or m in [4, 5] or (m == 6 and d < 21):
440 |             return "Outono"
441 |         elif (m == 6 and d >= 21) or m in [7, 8] or (m == 9 and d < 23):
442 |             return "Inverno"
443 |         else:
444 |             return "Primavera"
445 | 
446 |     dim_tempo = df.select("DATA").dropDuplicates().withColumn("DATA", to_date("DATA"))
447 |     dim_tempo = dim_tempo.withColumn("DIA", dayofmonth("DATA")) \
448 |                          .withColumn("MES", month("DATA")) \
449 |                          .withColumn("ANO", year("DATA")) \
450 |                          .withColumn("TRIMESTRE", quarter("DATA")) \
451 |                          .withColumn("NOME_DIA_SEMANA", date_format("DATA", "EEEE")) \
452 |                          .withColumn("NOME_MES", date_format("DATA", "MMMM")) \
453 |                          .withColumn("NUM_SEMANA", weekofyear("DATA")) \
454 |                          .withColumn("FIM_DE_SEMANA", when(date_format("DATA", "u").isin("6", "7"), True).otherwise(False)) \
455 |                          .withColumn("FERIADO_BR", is_feriado(col("DATA"))) \
456 |                          .withColumn("ESTACAO", estacao_do_ano(col("DATA"))) \
457 |                          .withColumn("ID_TEMPO", monotonically_increasing_id()) \
458 |                          .select(
459 |                             "ID_TEMPO", "DATA", "DIA", "MES", "NOME_MES", "NUM_SEMANA", "NOME_DIA_SEMANA",
460 |                             "FIM_DE_SEMANA", "FERIADO_BR", "ESTACAO", "TRIMESTRE", "ANO"
461 |                          )
462 |     dim_tempo.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_tempo"))
463 | 
464 |     # ───── DIMENSÃO TIPO DE VOO ─────
465 |     print("📌 Criando dimensão TIPO DE VOO...")
466 |     dim_voo = df.select("NATUREZA", "GRUPO_DE_VOO").dropDuplicates()
467 |     dim_voo = dim_voo.withColumn("ID_TIPO_VOO", monotonically_increasing_id())
468 |     dim_voo = dim_voo.select("ID_TIPO_VOO", "NATUREZA", "GRUPO_DE_VOO")
469 |     dim_voo.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_tipo_voo"))
470 | 
471 |     # ───── FATO VOO ─────
472 |     print("📊 Criando tabela FATO_VOO...")
473 |     dim_empresa = spark.read.parquet(os.path.join(GOLD_DIR, "dim_empresa")).alias("dim_empresa")
474 |     dim_tempo = spark.read.parquet(os.path.join(GOLD_DIR, "dim_tempo")).alias("dim_tempo")
475 |     dim_voo = spark.read.parquet(os.path.join(GOLD_DIR, "dim_tipo_voo")).alias("dim_tipo_voo")
476 |     dim_aeroporto = spark.read.parquet(os.path.join(GOLD_DIR, "dim_aeroporto")).alias("dim_aeroporto")
477 | 
478 |     df_fato = df \
479 |         .join(dim_empresa, on=["EMPRESA_SIGLA", "EMPRESA_NOME", "EMPRESA_NACIONALIDADE"], how="left") \
480 |         .join(dim_tempo, on="DATA", how="left") \
481 |         .join(dim_voo, on=["NATUREZA", "GRUPO_DE_VOO"], how="left") \
482 |         .join(dim_aeroporto.alias("origem"), df["AEROPORTO_DE_ORIGEM_SIGLA"] == col("origem.AEROPORTO_ICAO"), how="left") \
483 |         .join(dim_aeroporto.alias("destino"), df["AEROPORTO_DE_DESTINO_SIGLA"] == col("destino.AEROPORTO_ICAO"), how="left")
484 | 
485 |     fato_voo = df_fato.select(
486 |         monotonically_increasing_id().alias("ID_FATO_VOO"),
487 |         col("dim_empresa.ID_EMPRESA"),
488 |         col("dim_tempo.ID_TEMPO"),
489 |         col("dim_tipo_voo.ID_TIPO_VOO"),
490 |         col("origem.ID_AEROPORTO").alias("ID_AEROPORTO_ORIGEM"),
491 |         col("destino.ID_AEROPORTO").alias("ID_AEROPORTO_DESTINO"),
492 |         "PASSAGEIROS_PAGOS", "PASSAGEIROS_GRATIS",
493 |         "CARGA_PAGA_KG", "CARGA_GRATIS_KG", "CORREIO_KG",
494 |         "ASK", "RPK", "ATK", "RTK",
495 |         "COMBUSTIVEL_LITROS", "DISTANCIA_VOADA_KM",
496 |         "DECOLAGENS", "ASSENTOS", "PAYLOAD",
497 |         "HORAS_VOADAS", "BAGAGEM_KG"
498 |     )
499 | 
500 |     fato_voo.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "fato_voo"))
501 | 
502 |     print("✅ Camada gold criada com sucesso.")
503 | ```
504 | 
505 | # Vendo na prática!
506 | 
507 | Agora, vamos ver como o processo ocorreu na prática!
508 | 
509 | ![airflow-dag-graph-view](https://github.com/user-attachments/assets/41e6b7bc-a12c-4834-a254-a88e3bb0d792)
510 | 
511 | Aqui temos uma visão gráfica da nossa DAG e suas tasks. A primeira task é executada e temos o donwload do arquivo desejado na camada bronze do nosso Lake.
512 | 
513 | ![task1-log](https://github.com/user-attachments/assets/a5c84a30-2a20-414e-a818-c636688829a7)
514 | 
515 | ![bronze-raw-file-1](https://github.com/user-attachments/assets/13130533-313d-4f7a-a40d-0f2ad63116d9)
516 | 
517 | ---
518 | 
519 | Depois dessa primeira tarefa, vemos a segunda em execução que cria o arquivo CSV limpo que vimos na imagem anterior (por causa da primeira linha de atualização, o que atrapalhava a leitura e essa versão limpa é apenas um pré-tratamento antes das transformações mais substanciais). 
520 | 
521 | **Nota: Refletindo agora, penso que talvez volte nesse ponto, apesar de "inocente", caberia uma reflexão se esse arquivo poderia estar na nossa camada bronze ou se outro método poderia ser feito para remediar isso. Mas vou deixar para uma revisitação ao projeto mais pro futuro.**
522 | 
523 | A segunda tarefa faz ajustes nas colunas `PASSAGEIROS_PAGOS`, `PASSAGEIROS_GRATIS`, `DECOLAGENS` (de `double` para `int`, uma vez que são dados discretos) e `HORAS_VOADAS`(remoção de vírgula para uso de ponto e conversão de `string` para `double`, sendo uma coluna de dados contínuos).
524 | Depois ocorre a remoção de valores nulos, ajustes em registros faltantes e o salvamento particionando por ano e mês.
525 | 
526 | ![airflow-task-2](https://github.com/user-attachments/assets/94640667-3fb4-41db-956a-e9cad9f22e10)
527 | 
528 | ![silver](https://github.com/user-attachments/assets/7bd14f6e-6a80-4567-a76d-ce7550ca3a89)
529 | 
530 | Finalmente, a terceira tarefa é responsável por criar as dimensões e a fato na camada gold:
531 | 
532 | ![airflow-task-3](https://github.com/user-attachments/assets/f9ecfa21-6bf4-4a64-998a-632752e20563)
533 | 
534 | ![gold](https://github.com/user-attachments/assets/7bb36634-db1c-4376-9868-5577810ca1fd)
535 | 
536 | ---
537 | 
538 | Terminada a extração, carga e tratamento dos dados, agora que estamos com a última etapa terminada (tratamento/transformação, ELT), vamos verificar como ficou nosso painel final no Power BI:
539 | 
540 | ![pbi](https://github.com/user-attachments/assets/b8121170-b934-4347-9bf7-7f06c6bca91a)
541 | 
542 | Aqui os relacionamentos em nosso modelo estrela (*star schema*:
543 | 
544 | ![star](https://github.com/user-attachments/assets/e9f4cfe6-c35e-49a2-97e2-085635115517)
545 | 
546 | Como os dados vieram bem tratados, apenas precisei criar duas medidas, em geral, envolvendo consolidação de alguns dados (ex.: total de passageiro e carga, pois havia mais de uma coluna com valores):
547 | 
548 | ![medidas](https://github.com/user-attachments/assets/c10136af-fdb2-45ed-a513-5afdc1ad2035)
549 | 
550 | # Últimas Considerações
551 | 
552 | Bem, agradeço a paciência de ler até aqui. Sei que há pontos de melhorias, sempre precisamos melhorar, mas minha ideia era usar esse projeto para praticar, revisar algumas abordagens e demonstrar um pouco das minhas capacidades com o que já trabalhei no mundo real. Claro
553 | que a vida real é mais complexa e os problemas são mais desafiadores, mas posso ousar dizer que algumas coisas são desafios técnicos, mas 90% dos desafios são problemas e resolução de problemas. 😅
554 | 
555 | 
556 | 
557 | 
558 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/Dockerfile.airflow:
--------------------------------------------------------------------------------
 1 | FROM apache/airflow:2.8.1-python3.10
 2 | 
 3 | # Modo root temporariamente para instalar o Java
 4 | USER root
 5 | 
 6 | # Instala o OpenJDK 17 (para compatibilidade com PySpark)
 7 | RUN apt-get update && \
 8 |     apt-get install -y openjdk-17-jdk && \
 9 |     apt-get clean && \
10 |     rm -rf /var/lib/apt/lists/*
11 | 
12 | # Define variáveis de ambiente do Java
13 | ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
14 | ENV PATH="${JAVA_HOME}/bin:${PATH}"
15 | ENV PIP_DEFAULT_TIMEOUT=100
16 | 
17 | # Copia o requirements.txt
18 | COPY requirements.txt /requirements.txt
19 | 
20 | # Volta ao usuário padrão do Airflow
21 | USER airflow
22 | 
23 | # Instala primeiro o PySpark separadamente para evitar timeout
24 | RUN pip install --no-cache-dir pyspark==3.5.0 && \
25 |     pip install --no-cache-dir -r /requirements.txt
26 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/__pycache__/dag_anac_pipeline.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/ANAC-data-engineering-project/dags/__pycache__/dag_anac_pipeline.cpython-310.pyc


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/config/__pycache__/params.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/ANAC-data-engineering-project/dags/config/__pycache__/params.cpython-310.pyc


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/config/params.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | BASE_DATA_DIR = "/opt/airflow/data"
 4 | 
 5 | 
 6 | BRONZE_DIR = os.path.join(BASE_DATA_DIR, "bronze")
 7 | SILVER_DIR = os.path.join(BASE_DATA_DIR, "silver")
 8 | GOLD_DIR   = os.path.join(BASE_DATA_DIR, "gold")
 9 | CSV_ANAC = os.path.join(BRONZE_DIR, "Dados_Estatisticos.csv")
10 | PARQUET_ANAC_2023 = os.path.join(SILVER_DIR, "anac_data.parquet")
11 | 
12 | URL_INDEX_ANAC = "https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20operações%20aéreas/Dados%20Estatísticos%20do%20Transporte%20Aéreo/"
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/dag_anac_pipeline.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.operators.python import PythonOperator
 3 | from airflow.operators.dummy import DummyOperator
 4 | from datetime import datetime
 5 | 
 6 | from tasks.download_csv import baixar_csv_anac
 7 | from tasks.bronze_para_silver import bronze_para_silver
 8 | from tasks.silver_para_gold import silver_para_gold
 9 | from config.params import URL_INDEX_ANAC, CSV_ANAC
10 | 
11 | # Definição da DAG
12 | with DAG(
13 |     dag_id="anac_pipeline_dag",
14 |     start_date=datetime(2024, 1, 1),
15 |     schedule_interval="@once",
16 |     catchup=False,
17 |     tags=["anac", "dados", "airflow"],
18 |     description="Pipeline de dados da ANAC com Airflow",
19 | ) as dag:
20 | 
21 |     # Marca visual de início
22 |     begin = DummyOperator(task_id="begin")
23 | 
24 |     # Tarefa de download da ANAC
25 |     task_baixar_csv = PythonOperator(
26 |         task_id="baixar_csv_anac",
27 |         python_callable=baixar_csv_anac,
28 |         op_args=[URL_INDEX_ANAC, CSV_ANAC],
29 |     )
30 | 
31 |     # Bronze -> Silver
32 |     task_transformar_para_silver = PythonOperator(
33 |         task_id="bronze_para_silver",
34 |         python_callable=bronze_para_silver,
35 |     )
36 | 
37 |     # Silver -> Gold
38 |     task_transformar_para_gold = PythonOperator(
39 |         task_id="silver_para_gold",
40 |         python_callable=silver_para_gold
41 |     )
42 | 
43 | 
44 |     # Marca visual de fim
45 |     end = DummyOperator(task_id="end")
46 | 
47 |     # Definição da ordem de execução
48 |     begin >> task_baixar_csv >> task_transformar_para_silver >> task_transformar_para_gold >> end
49 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/tasks/__pycache__/download_csv.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/ANAC-data-engineering-project/dags/tasks/__pycache__/download_csv.cpython-310.pyc


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/tasks/bronze_para_silver.py:
--------------------------------------------------------------------------------
 1 | # tasks/bronze_para_silver.py
 2 | 
 3 | import os
 4 | import pandas as pd
 5 | from pyspark.sql import SparkSession
 6 | from pyspark.sql.functions import col, isnan, when, regexp_replace, trim, to_date, concat_ws, lit, lpad
 7 | from pyspark.sql.types import StringType, DoubleType, IntegerType
 8 | 
 9 | from config.params import BRONZE_DIR, SILVER_DIR
10 | 
11 | def bronze_para_silver():
12 |     print("🚀 Iniciando transformação da camada Bronze para Silver...")
13 | 
14 |     caminho_csv = os.path.join(BRONZE_DIR, "Dados_Estatisticos.csv")
15 | 
16 |     print("📚 Lendo e corrigindo CSV com Pandas (ajuste da primeira linha)...")
17 |     df_pandas = pd.read_csv(caminho_csv, sep=";", skiprows=1)
18 |     csv_corrigido = caminho_csv.replace(".csv", "_limpo.csv")
19 |     df_pandas.to_csv(csv_corrigido, sep=";", index=False)
20 | 
21 |     print("✨ Inicializando SparkSession...")
22 |     spark = SparkSession.builder \
23 |         .appName("Limpeza e transformação - ANAC") \
24 |         .getOrCreate()
25 | 
26 |     print("📂 Lendo CSV corrigido com Spark...")
27 |     df_anac = spark.read.csv(csv_corrigido, sep=";", header=True, inferSchema=True)
28 | 
29 |     print("🧪 Corrigindo tipos de dados e tratando vírgulas como ponto decimal...")
30 |     df_anac = df_anac.withColumn("PASSAGEIROS_PAGOS", col("PASSAGEIROS_PAGOS").cast("int"))
31 |     df_anac = df_anac.withColumn("PASSAGEIROS_GRATIS", col("PASSAGEIROS_GRATIS").cast("int"))
32 |     df_anac = df_anac.withColumn("DECOLAGENS", col("DECOLAGENS").cast("int"))
33 |     df_anac = df_anac.withColumn("HORAS_VOADAS", regexp_replace("HORAS_VOADAS", ",", ".").cast(DoubleType()))
34 | 
35 |     print("🔍 Tratando valores nulos e ausentes...")
36 |     substituicoes = {}
37 |     for field in df_anac.schema.fields:
38 |         if field.nullable:
39 |             if isinstance(field.dataType, StringType):
40 |                 substituicoes[field.name] = "SEM REGISTRO"
41 |             elif isinstance(field.dataType, DoubleType):
42 |                 substituicoes[field.name] = 0.0
43 |             elif isinstance(field.dataType, IntegerType):
44 |                 substituicoes[field.name] = 0
45 | 
46 |     for coluna in substituicoes:
47 |         df_anac = df_anac.withColumn(
48 |             coluna,
49 |             when(isnan(col(coluna)), None).otherwise(col(coluna))
50 |         ).fillna({coluna: substituicoes[coluna]})
51 | 
52 |     print("✂️ Aplicando `trim()` em colunas textuais para remover espaços...")
53 |     for field in df_anac.schema.fields:
54 |         if isinstance(field.dataType, StringType):
55 |             df_anac = df_anac.withColumn(field.name, trim(col(field.name)))
56 | 
57 |     print("📅 Criando coluna de data completa (DATA)...")
58 |     df_anac = df_anac.withColumn(
59 |         "DATA",
60 |         to_date(
61 |             concat_ws("-", col("ANO"), lpad(col("MES").cast("string"), 2, "0"), lit("01")),
62 |             "yyyy-MM-dd"
63 |         )
64 |     )
65 | 
66 |     print("💾 Salvando dados tratados na camada Silver particionada por ANO e MES...")
67 |     df_anac.write.mode("overwrite").partitionBy("ANO", "MES").parquet(
68 |         os.path.join(SILVER_DIR, "operacoes_anac_partitioned")
69 |     )
70 | 
71 |     spark.stop()
72 |     print("✅ Transformação Bronze → Silver concluída com sucesso.")
73 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/tasks/download_csv.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import requests
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | def baixar_csv_anac(pagina_index_url: str, caminho_salvar: str):
 6 |     response = requests.get(pagina_index_url)
 7 |     response.raise_for_status()
 8 | 
 9 |     soup = BeautifulSoup(response.text, 'html.parser')
10 | 
11 |     link_csv = None
12 |     for link in soup.find_all('a'):
13 |         href = link.get('href')
14 |         if href and href.lower().endswith("dados_estatisticos.csv"):
15 |             link_csv = href
16 |             break
17 | 
18 |     if not link_csv:
19 |         raise Exception("Arquivo CSV não encontrado na página!")
20 | 
21 |     if not pagina_index_url.endswith("/"):
22 |         pagina_index_url += "/"
23 |     url_csv = pagina_index_url + link_csv
24 | 
25 |     print(f"[INFO] Baixando arquivo de: {url_csv}")
26 |     print(f"[DEBUG] Salvando em: {caminho_salvar}")
27 | 
28 |     os.makedirs(os.path.dirname(caminho_salvar), exist_ok=True)
29 | 
30 |     csv_response = requests.get(url_csv, stream=True)
31 |     csv_response.raise_for_status()
32 | 
33 |     with open(caminho_salvar, "wb") as f:
34 |         for chunk in csv_response.iter_content(chunk_size=1048576):  # 1MB
35 |             if chunk:
36 |                 f.write(chunk)
37 | 
38 |     print(f"[SUCESSO] CSV salvo em {caminho_salvar}")
39 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/dags/tasks/silver_para_gold.py:
--------------------------------------------------------------------------------
  1 | # tasks/gold_transform.py
  2 | 
  3 | from airflow.decorators import task
  4 | from pyspark.sql import SparkSession
  5 | from pyspark.sql.functions import (
  6 |     col, monotonically_increasing_id, to_date, dayofmonth, month, year, quarter,
  7 |     date_format, weekofyear, when, lit, udf
  8 | )
  9 | from pyspark.sql.types import BooleanType, StringType
 10 | import holidays
 11 | import os
 12 | 
 13 | BASE_DATA_DIR = "/opt/airflow/data"
 14 | SILVER_DIR = os.path.join(BASE_DATA_DIR, "silver")
 15 | GOLD_DIR   = os.path.join(BASE_DATA_DIR, "gold")
 16 | 
 17 | 
 18 | def silver_para_gold():
 19 |     print("🚀 Inicializando SparkSession...")
 20 |     spark = SparkSession.builder.appName("ANAC - Camada Gold").getOrCreate()
 21 |     spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
 22 | 
 23 |     print("📂 Lendo dados da camada Silver...")
 24 |     df = spark.read.parquet(os.path.join(SILVER_DIR, "operacoes_anac_partitioned"))
 25 | 
 26 |     # ───── DIMENSÃO EMPRESA ─────
 27 |     print("📌 Criando dimensão EMPRESA...")
 28 |     dim_empresa = df.select("EMPRESA_SIGLA", "EMPRESA_NOME", "EMPRESA_NACIONALIDADE").dropDuplicates()
 29 |     dim_empresa = dim_empresa.withColumn("ID_EMPRESA", monotonically_increasing_id())
 30 |     dim_empresa = dim_empresa.select("ID_EMPRESA", "EMPRESA_SIGLA", "EMPRESA_NOME", "EMPRESA_NACIONALIDADE")
 31 |     dim_empresa.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_empresa"))
 32 | 
 33 |     # ───── DIMENSÃO AEROPORTO ─────
 34 |     print("📌 Criando dimensão AEROPORTO...")
 35 |     origem = df.select(
 36 |         col("AEROPORTO_DE_ORIGEM_SIGLA").alias("AEROPORTO_ICAO"),
 37 |         col("AEROPORTO_DE_ORIGEM_NOME").alias("AEROPORTO_NOME"),
 38 |         col("AEROPORTO_DE_ORIGEM_UF").alias("UF"),
 39 |         col("AEROPORTO_DE_ORIGEM_REGIAO").alias("REGIAO"),
 40 |         col("AEROPORTO_DE_ORIGEM_PAIS").alias("PAIS"),
 41 |         col("AEROPORTO_DE_ORIGEM_CONTINENTE").alias("CONTINENTE")
 42 |     )
 43 |     destino = df.select(
 44 |         col("AEROPORTO_DE_DESTINO_SIGLA").alias("AEROPORTO_ICAO"),
 45 |         col("AEROPORTO_DE_DESTINO_NOME").alias("AEROPORTO_NOME"),
 46 |         col("AEROPORTO_DE_DESTINO_UF").alias("UF"),
 47 |         col("AEROPORTO_DE_DESTINO_REGIAO").alias("REGIAO"),
 48 |         col("AEROPORTO_DE_DESTINO_PAIS").alias("PAIS"),
 49 |         col("AEROPORTO_DE_DESTINO_CONTINENTE").alias("CONTINENTE")
 50 |     )
 51 |     dim_aeroporto = origem.union(destino).dropDuplicates()
 52 |     dim_aeroporto = dim_aeroporto.withColumn("ID_AEROPORTO", monotonically_increasing_id())
 53 |     dim_aeroporto = dim_aeroporto.select(
 54 |         "ID_AEROPORTO", "AEROPORTO_ICAO", "AEROPORTO_NOME", "UF", "REGIAO", "PAIS", "CONTINENTE"
 55 |     )
 56 |     dim_aeroporto.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_aeroporto"))
 57 | 
 58 |     # ───── DIMENSÃO TEMPO ─────
 59 |     print("📌 Criando dimensão TEMPO...")
 60 |     feriados_br = holidays.Brazil()
 61 | 
 62 |     @udf(BooleanType())
 63 |     def is_feriado(data):
 64 |         return data in feriados_br if data else False
 65 | 
 66 |     @udf(StringType())
 67 |     def estacao_do_ano(data):
 68 |         if not data:
 69 |             return None
 70 |         d, m = data.day, data.month
 71 |         if (m == 12 and d >= 21) or m in [1, 2] or (m == 3 and d < 20):
 72 |             return "Verão"
 73 |         elif (m == 3 and d >= 20) or m in [4, 5] or (m == 6 and d < 21):
 74 |             return "Outono"
 75 |         elif (m == 6 and d >= 21) or m in [7, 8] or (m == 9 and d < 23):
 76 |             return "Inverno"
 77 |         else:
 78 |             return "Primavera"
 79 | 
 80 |     dim_tempo = df.select("DATA").dropDuplicates().withColumn("DATA", to_date("DATA"))
 81 |     dim_tempo = dim_tempo.withColumn("DIA", dayofmonth("DATA")) \
 82 |                          .withColumn("MES", month("DATA")) \
 83 |                          .withColumn("ANO", year("DATA")) \
 84 |                          .withColumn("TRIMESTRE", quarter("DATA")) \
 85 |                          .withColumn("NOME_DIA_SEMANA", date_format("DATA", "EEEE")) \
 86 |                          .withColumn("NOME_MES", date_format("DATA", "MMMM")) \
 87 |                          .withColumn("NUM_SEMANA", weekofyear("DATA")) \
 88 |                          .withColumn("FIM_DE_SEMANA", when(date_format("DATA", "u").isin("6", "7"), True).otherwise(False)) \
 89 |                          .withColumn("FERIADO_BR", is_feriado(col("DATA"))) \
 90 |                          .withColumn("ESTACAO", estacao_do_ano(col("DATA"))) \
 91 |                          .withColumn("ID_TEMPO", monotonically_increasing_id()) \
 92 |                          .select(
 93 |                             "ID_TEMPO", "DATA", "DIA", "MES", "NOME_MES", "NUM_SEMANA", "NOME_DIA_SEMANA",
 94 |                             "FIM_DE_SEMANA", "FERIADO_BR", "ESTACAO", "TRIMESTRE", "ANO"
 95 |                          )
 96 |     dim_tempo.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_tempo"))
 97 | 
 98 |     # ───── DIMENSÃO TIPO DE VOO ─────
 99 |     print("📌 Criando dimensão TIPO DE VOO...")
100 |     dim_voo = df.select("NATUREZA", "GRUPO_DE_VOO").dropDuplicates()
101 |     dim_voo = dim_voo.withColumn("ID_TIPO_VOO", monotonically_increasing_id())
102 |     dim_voo = dim_voo.select("ID_TIPO_VOO", "NATUREZA", "GRUPO_DE_VOO")
103 |     dim_voo.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "dim_tipo_voo"))
104 | 
105 |     # ───── FATO VOO ─────
106 |     print("📊 Criando tabela FATO_VOO...")
107 |     dim_empresa = spark.read.parquet(os.path.join(GOLD_DIR, "dim_empresa")).alias("dim_empresa")
108 |     dim_tempo = spark.read.parquet(os.path.join(GOLD_DIR, "dim_tempo")).alias("dim_tempo")
109 |     dim_voo = spark.read.parquet(os.path.join(GOLD_DIR, "dim_tipo_voo")).alias("dim_tipo_voo")
110 |     dim_aeroporto = spark.read.parquet(os.path.join(GOLD_DIR, "dim_aeroporto")).alias("dim_aeroporto")
111 | 
112 |     df_fato = df \
113 |         .join(dim_empresa, on=["EMPRESA_SIGLA", "EMPRESA_NOME", "EMPRESA_NACIONALIDADE"], how="left") \
114 |         .join(dim_tempo, on="DATA", how="left") \
115 |         .join(dim_voo, on=["NATUREZA", "GRUPO_DE_VOO"], how="left") \
116 |         .join(dim_aeroporto.alias("origem"), df["AEROPORTO_DE_ORIGEM_SIGLA"] == col("origem.AEROPORTO_ICAO"), how="left") \
117 |         .join(dim_aeroporto.alias("destino"), df["AEROPORTO_DE_DESTINO_SIGLA"] == col("destino.AEROPORTO_ICAO"), how="left")
118 | 
119 |     fato_voo = df_fato.select(
120 |         monotonically_increasing_id().alias("ID_FATO_VOO"),
121 |         col("dim_empresa.ID_EMPRESA"),
122 |         col("dim_tempo.ID_TEMPO"),
123 |         col("dim_tipo_voo.ID_TIPO_VOO"),
124 |         col("origem.ID_AEROPORTO").alias("ID_AEROPORTO_ORIGEM"),
125 |         col("destino.ID_AEROPORTO").alias("ID_AEROPORTO_DESTINO"),
126 |         "PASSAGEIROS_PAGOS", "PASSAGEIROS_GRATIS",
127 |         "CARGA_PAGA_KG", "CARGA_GRATIS_KG", "CORREIO_KG",
128 |         "ASK", "RPK", "ATK", "RTK",
129 |         "COMBUSTIVEL_LITROS", "DISTANCIA_VOADA_KM",
130 |         "DECOLAGENS", "ASSENTOS", "PAYLOAD",
131 |         "HORAS_VOADAS", "BAGAGEM_KG"
132 |     )
133 | 
134 |     fato_voo.write.mode("overwrite").parquet(os.path.join(GOLD_DIR, "fato_voo"))
135 | 
136 |     print("✅ Camada gold criada com sucesso.")
137 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | 
 3 | #########################
 4 | #------ POSTGRES -------#
 5 | #########################
 6 | services:
 7 | 
 8 |   postgres:
 9 |     image: postgres:14
10 |     container_name: pg-anac
11 |     restart: always
12 |     environment:
13 |       POSTGRES_USER: airflow
14 |       POSTGRES_PASSWORD: airflow
15 |       POSTGRES_DB: airflow
16 |     ports:
17 |       - "5432:5432"
18 |     volumes:
19 |       - pgdata:/var/lib/postgresql/data
20 | 
21 | #########################
22 | #-------- AIRFLOW -------#
23 | #########################
24 |   airflow:
25 |     build:
26 |       context: .
27 |       dockerfile: Dockerfile.airflow
28 |     container_name: airflow-anac
29 |     depends_on:
30 |       - postgres
31 |     restart: always
32 |     user: "0:0"  
33 |     ports:
34 |       - "8080:8080"
35 |     environment:
36 |       AIRFLOW__CORE__EXECUTOR: LocalExecutor
37 |       AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
38 |       AIRFLOW__CORE__FERNET_KEY: ''
39 |       AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'False'
40 |       AIRFLOW__CORE__LOAD_EXAMPLES: 'False'
41 |     volumes:
42 |       - ./dags:/opt/airflow/dags
43 |       - ./logs:/opt/airflow/logs
44 |       - ./plugins:/opt/airflow/plugins
45 |       - ./data:/opt/airflow/data
46 |     command: >
47 |       bash -c "
48 |         airflow db upgrade &&
49 |         airflow users create --username admin --firstname Admin --lastname User --role Admin --password admin --email admin@example.com &&
50 |         airflow scheduler & 
51 |         exec airflow webserver"
52 | 
53 | #########################
54 | #-------- SPARK --------#
55 | #########################
56 |   spark:
57 |     image: bitnami/spark:latest
58 |     container_name: spark-anac
59 |     ports:
60 |       - "4040:4040"  
61 |     volumes:
62 |       - ./data:/data
63 |     environment:
64 |       - SPARK_MODE=master
65 | 
66 | #########################
67 | #------- JUPYTER -------#
68 | #########################
69 |   jupyter:
70 |     image: jupyter/pyspark-notebook
71 |     container_name: jupyter-anac
72 |     ports:
73 |       - "8888:8888"
74 |     volumes:
75 |       - ./data:/home/jovyan/data
76 |       - ./notebooks:/home/jovyan/work
77 |     environment:
78 |       - PYSPARK_PYTHON=python3
79 |       - PYSPARK_DRIVER_PYTHON=jupyter
80 |       - PYSPARK_DRIVER_PYTHON_OPTS=notebook
81 |       - SPARK_OPTS=--driver-memory 2g
82 |     depends_on:
83 |       - spark
84 | 
85 | #########################
86 | #------ VOLUMES --------#
87 | #########################
88 | volumes:
89 |   pgdata:
90 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/logs/dag_id=anac_pipeline_dag/run_id=manual__2025-04-07T013752.587480+0000/task_id=baixar_csv_anac/attempt=1.log:
--------------------------------------------------------------------------------
 1 | [2025-04-07T01:37:53.190+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=non-requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 2 | [2025-04-07T01:37:53.197+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 3 | [2025-04-07T01:37:53.198+0000] {taskinstance.py:2170} INFO - Starting attempt 1 of 1
 4 | [2025-04-07T01:37:53.210+0000] {taskinstance.py:2191} INFO - Executing <Task(PythonOperator): baixar_csv_anac> on 2025-04-07 01:37:52.587480+00:00
 5 | [2025-04-07T01:37:53.215+0000] {standard_task_runner.py:60} INFO - Started process 317 to run task
 6 | [2025-04-07T01:37:53.219+0000] {standard_task_runner.py:87} INFO - Running: ['***', 'tasks', 'run', 'anac_pipeline_dag', 'baixar_csv_anac', 'manual__2025-04-07T01:37:52.587480+00:00', '--job-id', '4', '--raw', '--subdir', 'DAGS_FOLDER/dag_anac_pipeline.py', '--cfg-path', '/tmp/tmphjjx312d']
 7 | [2025-04-07T01:37:53.221+0000] {standard_task_runner.py:88} INFO - Job 4: Subtask baixar_csv_anac
 8 | [2025-04-07T01:37:53.240+0000] {logging_mixin.py:188} WARNING - /home/***/.local/lib/python3.10/site-packages/***/settings.py:194 DeprecationWarning: The sql_alchemy_conn option in [core] has been moved to the sql_alchemy_conn option in [database] - the old setting has been used, but please update your config.
 9 | [2025-04-07T01:37:53.280+0000] {task_command.py:423} INFO - Running <TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [running]> on host 8a43643a16f6
10 | [2025-04-07T01:37:53.359+0000] {taskinstance.py:2480} INFO - Exporting env vars: AIRFLOW_CTX_DAG_OWNER='***' AIRFLOW_CTX_DAG_ID='anac_pipeline_dag' AIRFLOW_CTX_TASK_ID='baixar_csv_anac' AIRFLOW_CTX_EXECUTION_DATE='2025-04-07T01:37:52.587480+00:00' AIRFLOW_CTX_TRY_NUMBER='1' AIRFLOW_CTX_DAG_RUN_ID='manual__2025-04-07T01:37:52.587480+00:00'
11 | [2025-04-07T01:37:53.562+0000] {taskinstance.py:2698} ERROR - Task failed with exception
12 | Traceback (most recent call last):
13 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/taskinstance.py", line 433, in _execute_task
14 |     result = execute_callable(context=context, **execute_callable_kwargs)
15 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/operators/python.py", line 199, in execute
16 |     return_value = self.execute_callable()
17 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/operators/python.py", line 216, in execute_callable
18 |     return self.python_callable(*self.op_args, **self.op_kwargs)
19 |   File "/opt/airflow/dags/tasks/download_csv.py", line 7, in baixar_csv_anac
20 |     response.raise_for_status()
21 |   File "/home/airflow/.local/lib/python3.10/site-packages/requests/models.py", line 1021, in raise_for_status
22 |     raise HTTPError(http_error_msg, response=self)
23 | requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20opera%C3%A7%C3%B5es%20a%C3%A9reas/Dados%20Estat%C3%ADsticos%20do%20Transporte%20A%C3%A9reo/2023_Dados_Estatisticos.csv
24 | [2025-04-07T01:37:53.576+0000] {taskinstance.py:1138} INFO - Marking task as FAILED. dag_id=anac_pipeline_dag, task_id=baixar_csv_anac, execution_date=20250407T013752, start_date=20250407T013753, end_date=20250407T013753
25 | [2025-04-07T01:37:53.586+0000] {standard_task_runner.py:107} ERROR - Failed to execute job 4 for task baixar_csv_anac (404 Client Error: Not Found for url: https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20opera%C3%A7%C3%B5es%20a%C3%A9reas/Dados%20Estat%C3%ADsticos%20do%20Transporte%20A%C3%A9reo/2023_Dados_Estatisticos.csv; 317)
26 | [2025-04-07T01:37:53.633+0000] {local_task_job_runner.py:234} INFO - Task exited with return code 1
27 | [2025-04-07T01:37:53.648+0000] {taskinstance.py:3280} INFO - 0 downstream tasks scheduled from follow-on schedule check
28 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/logs/dag_id=anac_pipeline_dag/run_id=manual__2025-04-07T013752.587480+0000/task_id=baixar_csv_anac/attempt=2.log:
--------------------------------------------------------------------------------
 1 | [2025-04-07T01:39:06.364+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=non-requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 2 | [2025-04-07T01:39:06.371+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 3 | [2025-04-07T01:39:06.372+0000] {taskinstance.py:2170} INFO - Starting attempt 2 of 2
 4 | [2025-04-07T01:39:06.382+0000] {taskinstance.py:2191} INFO - Executing <Task(PythonOperator): baixar_csv_anac> on 2025-04-07 01:37:52.587480+00:00
 5 | [2025-04-07T01:39:06.387+0000] {standard_task_runner.py:60} INFO - Started process 329 to run task
 6 | [2025-04-07T01:39:06.391+0000] {standard_task_runner.py:87} INFO - Running: ['***', 'tasks', 'run', 'anac_pipeline_dag', 'baixar_csv_anac', 'manual__2025-04-07T01:37:52.587480+00:00', '--job-id', '5', '--raw', '--subdir', 'DAGS_FOLDER/dag_anac_pipeline.py', '--cfg-path', '/tmp/tmpi3o2htlc']
 7 | [2025-04-07T01:39:06.394+0000] {standard_task_runner.py:88} INFO - Job 5: Subtask baixar_csv_anac
 8 | [2025-04-07T01:39:06.410+0000] {logging_mixin.py:188} WARNING - /home/***/.local/lib/python3.10/site-packages/***/settings.py:194 DeprecationWarning: The sql_alchemy_conn option in [core] has been moved to the sql_alchemy_conn option in [database] - the old setting has been used, but please update your config.
 9 | [2025-04-07T01:39:06.445+0000] {task_command.py:423} INFO - Running <TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [running]> on host 8a43643a16f6
10 | [2025-04-07T01:39:06.516+0000] {taskinstance.py:2480} INFO - Exporting env vars: AIRFLOW_CTX_DAG_OWNER='***' AIRFLOW_CTX_DAG_ID='anac_pipeline_dag' AIRFLOW_CTX_TASK_ID='baixar_csv_anac' AIRFLOW_CTX_EXECUTION_DATE='2025-04-07T01:37:52.587480+00:00' AIRFLOW_CTX_TRY_NUMBER='2' AIRFLOW_CTX_DAG_RUN_ID='manual__2025-04-07T01:37:52.587480+00:00'
11 | [2025-04-07T01:39:06.715+0000] {python.py:201} INFO - Done. Returned value was: None
12 | [2025-04-07T01:39:06.725+0000] {taskinstance.py:1138} INFO - Marking task as SUCCESS. dag_id=anac_pipeline_dag, task_id=baixar_csv_anac, execution_date=20250407T013752, start_date=20250407T013906, end_date=20250407T013906
13 | [2025-04-07T01:39:06.764+0000] {local_task_job_runner.py:234} INFO - Task exited with return code 0
14 | [2025-04-07T01:39:06.781+0000] {taskinstance.py:3280} INFO - 0 downstream tasks scheduled from follow-on schedule check
15 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/logs/dag_id=anac_pipeline_dag/run_id=manual__2025-04-07T013752.587480+0000/task_id=baixar_csv_anac/attempt=3.log:
--------------------------------------------------------------------------------
 1 | [2025-04-07T01:41:54.479+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=non-requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 2 | [2025-04-07T01:41:54.488+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 3 | [2025-04-07T01:41:54.489+0000] {taskinstance.py:2170} INFO - Starting attempt 3 of 3
 4 | [2025-04-07T01:41:54.499+0000] {taskinstance.py:2191} INFO - Executing <Task(PythonOperator): baixar_csv_anac> on 2025-04-07 01:37:52.587480+00:00
 5 | [2025-04-07T01:41:54.503+0000] {standard_task_runner.py:60} INFO - Started process 361 to run task
 6 | [2025-04-07T01:41:54.506+0000] {standard_task_runner.py:87} INFO - Running: ['***', 'tasks', 'run', 'anac_pipeline_dag', 'baixar_csv_anac', 'manual__2025-04-07T01:37:52.587480+00:00', '--job-id', '6', '--raw', '--subdir', 'DAGS_FOLDER/dag_anac_pipeline.py', '--cfg-path', '/tmp/tmpriicwill']
 7 | [2025-04-07T01:41:54.507+0000] {standard_task_runner.py:88} INFO - Job 6: Subtask baixar_csv_anac
 8 | [2025-04-07T01:41:54.524+0000] {logging_mixin.py:188} WARNING - /home/***/.local/lib/python3.10/site-packages/***/settings.py:194 DeprecationWarning: The sql_alchemy_conn option in [core] has been moved to the sql_alchemy_conn option in [database] - the old setting has been used, but please update your config.
 9 | [2025-04-07T01:41:54.561+0000] {task_command.py:423} INFO - Running <TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [running]> on host 8a43643a16f6
10 | [2025-04-07T01:41:54.633+0000] {taskinstance.py:2480} INFO - Exporting env vars: AIRFLOW_CTX_DAG_OWNER='***' AIRFLOW_CTX_DAG_ID='anac_pipeline_dag' AIRFLOW_CTX_TASK_ID='baixar_csv_anac' AIRFLOW_CTX_EXECUTION_DATE='2025-04-07T01:37:52.587480+00:00' AIRFLOW_CTX_TRY_NUMBER='3' AIRFLOW_CTX_DAG_RUN_ID='manual__2025-04-07T01:37:52.587480+00:00'
11 | [2025-04-07T01:41:55.859+0000] {logging_mixin.py:188} INFO - [INFO] Baixando arquivo de: https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20operações%20aéreas/Dados%20Estatísticos%20do%20Transporte%20Aéreo/Dados_Estatisticos.csv
12 | [2025-04-07T01:52:55.784+0000] {python.py:201} INFO - Done. Returned value was: None
13 | [2025-04-07T01:52:55.795+0000] {taskinstance.py:1138} INFO - Marking task as SUCCESS. dag_id=anac_pipeline_dag, task_id=baixar_csv_anac, execution_date=20250407T013752, start_date=20250407T014154, end_date=20250407T015255
14 | [2025-04-07T01:52:55.868+0000] {local_task_job_runner.py:234} INFO - Task exited with return code 0
15 | [2025-04-07T01:52:55.882+0000] {taskinstance.py:3280} INFO - 0 downstream tasks scheduled from follow-on schedule check
16 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/logs/dag_id=anac_pipeline_dag/run_id=manual__2025-04-07T013752.587480+0000/task_id=baixar_csv_anac/attempt=4.log:
--------------------------------------------------------------------------------
 1 | [2025-04-07T01:56:13.258+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=non-requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 2 | [2025-04-07T01:56:13.265+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 3 | [2025-04-07T01:56:13.266+0000] {taskinstance.py:2170} INFO - Starting attempt 4 of 4
 4 | [2025-04-07T01:56:13.277+0000] {taskinstance.py:2191} INFO - Executing <Task(PythonOperator): baixar_csv_anac> on 2025-04-07 01:37:52.587480+00:00
 5 | [2025-04-07T01:56:13.281+0000] {standard_task_runner.py:60} INFO - Started process 503 to run task
 6 | [2025-04-07T01:56:13.284+0000] {standard_task_runner.py:87} INFO - Running: ['***', 'tasks', 'run', 'anac_pipeline_dag', 'baixar_csv_anac', 'manual__2025-04-07T01:37:52.587480+00:00', '--job-id', '7', '--raw', '--subdir', 'DAGS_FOLDER/dag_anac_pipeline.py', '--cfg-path', '/tmp/tmpv2n_vdef']
 7 | [2025-04-07T01:56:13.286+0000] {standard_task_runner.py:88} INFO - Job 7: Subtask baixar_csv_anac
 8 | [2025-04-07T01:56:13.301+0000] {logging_mixin.py:188} WARNING - /home/***/.local/lib/python3.10/site-packages/***/settings.py:194 DeprecationWarning: The sql_alchemy_conn option in [core] has been moved to the sql_alchemy_conn option in [database] - the old setting has been used, but please update your config.
 9 | [2025-04-07T01:56:13.336+0000] {task_command.py:423} INFO - Running <TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [running]> on host 8a43643a16f6
10 | [2025-04-07T01:56:13.408+0000] {taskinstance.py:2480} INFO - Exporting env vars: AIRFLOW_CTX_DAG_OWNER='***' AIRFLOW_CTX_DAG_ID='anac_pipeline_dag' AIRFLOW_CTX_TASK_ID='baixar_csv_anac' AIRFLOW_CTX_EXECUTION_DATE='2025-04-07T01:37:52.587480+00:00' AIRFLOW_CTX_TRY_NUMBER='4' AIRFLOW_CTX_DAG_RUN_ID='manual__2025-04-07T01:37:52.587480+00:00'
11 | [2025-04-07T01:56:16.073+0000] {logging_mixin.py:188} INFO - [INFO] Baixando arquivo de: https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20operações%20aéreas/Dados%20Estatísticos%20do%20Transporte%20Aéreo/Dados_Estatisticos.csv
12 | [2025-04-07T01:56:16.074+0000] {logging_mixin.py:188} INFO - [DEBUG] Salvando em: /opt/***/data/bronze/Dados_Estatisticos.csv
13 | [2025-04-07T01:57:47.873+0000] {taskinstance.py:2698} ERROR - Task failed with exception
14 | Traceback (most recent call last):
15 |   File "/home/airflow/.local/lib/python3.10/site-packages/urllib3/response.py", line 710, in _error_catcher
16 |     yield
17 |   File "/home/airflow/.local/lib/python3.10/site-packages/urllib3/response.py", line 835, in _raw_read
18 |     raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
19 | urllib3.exceptions.IncompleteRead: IncompleteRead(30850815 bytes read, 309703048 more expected)
20 | 
21 | The above exception was the direct cause of the following exception:
22 | 
23 | Traceback (most recent call last):
24 |   File "/home/airflow/.local/lib/python3.10/site-packages/requests/models.py", line 816, in generate
25 |     yield from self.raw.stream(chunk_size, decode_content=True)
26 |   File "/home/airflow/.local/lib/python3.10/site-packages/urllib3/response.py", line 936, in stream
27 |     data = self.read(amt=amt, decode_content=decode_content)
28 |   File "/home/airflow/.local/lib/python3.10/site-packages/urllib3/response.py", line 907, in read
29 |     data = self._raw_read(amt)
30 |   File "/home/airflow/.local/lib/python3.10/site-packages/urllib3/response.py", line 813, in _raw_read
31 |     with self._error_catcher():
32 |   File "/usr/local/lib/python3.10/contextlib.py", line 153, in __exit__
33 |     self.gen.throw(typ, value, traceback)
34 |   File "/home/airflow/.local/lib/python3.10/site-packages/urllib3/response.py", line 727, in _error_catcher
35 |     raise ProtocolError(f"Connection broken: {e!r}", e) from e
36 | urllib3.exceptions.ProtocolError: ('Connection broken: IncompleteRead(30850815 bytes read, 309703048 more expected)', IncompleteRead(30850815 bytes read, 309703048 more expected))
37 | 
38 | During handling of the above exception, another exception occurred:
39 | 
40 | Traceback (most recent call last):
41 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/taskinstance.py", line 433, in _execute_task
42 |     result = execute_callable(context=context, **execute_callable_kwargs)
43 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/operators/python.py", line 199, in execute
44 |     return_value = self.execute_callable()
45 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/operators/python.py", line 216, in execute_callable
46 |     return self.python_callable(*self.op_args, **self.op_kwargs)
47 |   File "/opt/airflow/dags/tasks/download_csv.py", line 34, in baixar_csv_anac
48 |     for chunk in csv_response.iter_content(chunk_size=1048576):  # 1MB
49 |   File "/home/airflow/.local/lib/python3.10/site-packages/requests/models.py", line 818, in generate
50 |     raise ChunkedEncodingError(e)
51 | requests.exceptions.ChunkedEncodingError: ('Connection broken: IncompleteRead(30850815 bytes read, 309703048 more expected)', IncompleteRead(30850815 bytes read, 309703048 more expected))
52 | [2025-04-07T01:57:47.887+0000] {taskinstance.py:1138} INFO - Marking task as FAILED. dag_id=anac_pipeline_dag, task_id=baixar_csv_anac, execution_date=20250407T013752, start_date=20250407T015613, end_date=20250407T015747
53 | [2025-04-07T01:57:47.897+0000] {standard_task_runner.py:107} ERROR - Failed to execute job 7 for task baixar_csv_anac (('Connection broken: IncompleteRead(30850815 bytes read, 309703048 more expected)', IncompleteRead(30850815 bytes read, 309703048 more expected)); 503)
54 | [2025-04-07T01:57:47.908+0000] {local_task_job_runner.py:234} INFO - Task exited with return code 1
55 | [2025-04-07T01:57:47.922+0000] {taskinstance.py:3280} INFO - 0 downstream tasks scheduled from follow-on schedule check
56 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/logs/dag_id=anac_pipeline_dag/run_id=manual__2025-04-07T013752.587480+0000/task_id=baixar_csv_anac/attempt=5.log:
--------------------------------------------------------------------------------
 1 | [2025-04-07T11:21:33.066+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=non-requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 2 | [2025-04-07T11:21:33.075+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [queued]>
 3 | [2025-04-07T11:21:33.076+0000] {taskinstance.py:2170} INFO - Starting attempt 5 of 5
 4 | [2025-04-07T11:21:33.089+0000] {taskinstance.py:2191} INFO - Executing <Task(PythonOperator): baixar_csv_anac> on 2025-04-07 01:37:52.587480+00:00
 5 | [2025-04-07T11:21:33.099+0000] {standard_task_runner.py:60} INFO - Started process 214 to run task
 6 | [2025-04-07T11:21:33.103+0000] {standard_task_runner.py:87} INFO - Running: ['***', 'tasks', 'run', 'anac_pipeline_dag', 'baixar_csv_anac', 'manual__2025-04-07T01:37:52.587480+00:00', '--job-id', '9', '--raw', '--subdir', 'DAGS_FOLDER/dag_anac_pipeline.py', '--cfg-path', '/tmp/tmp5x6_268z']
 7 | [2025-04-07T11:21:33.105+0000] {standard_task_runner.py:88} INFO - Job 9: Subtask baixar_csv_anac
 8 | [2025-04-07T11:21:33.125+0000] {logging_mixin.py:188} WARNING - /home/***/.local/lib/python3.10/site-packages/***/settings.py:194 DeprecationWarning: The sql_alchemy_conn option in [core] has been moved to the sql_alchemy_conn option in [database] - the old setting has been used, but please update your config.
 9 | [2025-04-07T11:21:33.164+0000] {task_command.py:423} INFO - Running <TaskInstance: anac_pipeline_dag.baixar_csv_anac manual__2025-04-07T01:37:52.587480+00:00 [running]> on host 8a43643a16f6
10 | [2025-04-07T11:21:33.242+0000] {taskinstance.py:2480} INFO - Exporting env vars: AIRFLOW_CTX_DAG_OWNER='***' AIRFLOW_CTX_DAG_ID='anac_pipeline_dag' AIRFLOW_CTX_TASK_ID='baixar_csv_anac' AIRFLOW_CTX_EXECUTION_DATE='2025-04-07T01:37:52.587480+00:00' AIRFLOW_CTX_TRY_NUMBER='5' AIRFLOW_CTX_DAG_RUN_ID='manual__2025-04-07T01:37:52.587480+00:00'
11 | [2025-04-07T11:21:33.475+0000] {logging_mixin.py:188} INFO - [INFO] Baixando arquivo de: https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20operações%20aéreas/Dados%20Estatísticos%20do%20Transporte%20Aéreo/Dados_Estatisticos.csv
12 | [2025-04-07T11:21:33.476+0000] {logging_mixin.py:188} INFO - [DEBUG] Salvando em: /opt/***/data/bronze/Dados_Estatisticos.csv
13 | [2025-04-07T11:22:52.666+0000] {logging_mixin.py:188} INFO - [SUCESSO] CSV salvo em /opt/***/data/bronze/Dados_Estatisticos.csv
14 | [2025-04-07T11:22:52.669+0000] {python.py:201} INFO - Done. Returned value was: None
15 | [2025-04-07T11:22:52.679+0000] {taskinstance.py:1138} INFO - Marking task as SUCCESS. dag_id=anac_pipeline_dag, task_id=baixar_csv_anac, execution_date=20250407T013752, start_date=20250407T112133, end_date=20250407T112252
16 | [2025-04-07T11:22:52.732+0000] {local_task_job_runner.py:234} INFO - Task exited with return code 0
17 | [2025-04-07T11:22:52.746+0000] {taskinstance.py:3280} INFO - 0 downstream tasks scheduled from follow-on schedule check
18 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/logs/dag_id=anac_pipeline_dag/run_id=scheduled__2024-01-01T000000+0000/task_id=baixar_csv_anac/attempt=1.log:
--------------------------------------------------------------------------------
 1 | [2025-04-07T01:36:01.379+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=non-requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac scheduled__2024-01-01T00:00:00+00:00 [queued]>
 2 | [2025-04-07T01:36:01.387+0000] {taskinstance.py:1956} INFO - Dependencies all met for dep_context=requeueable deps ti=<TaskInstance: anac_pipeline_dag.baixar_csv_anac scheduled__2024-01-01T00:00:00+00:00 [queued]>
 3 | [2025-04-07T01:36:01.388+0000] {taskinstance.py:2170} INFO - Starting attempt 1 of 1
 4 | [2025-04-07T01:36:01.399+0000] {taskinstance.py:2191} INFO - Executing <Task(PythonOperator): baixar_csv_anac> on 2024-01-01 00:00:00+00:00
 5 | [2025-04-07T01:36:01.404+0000] {standard_task_runner.py:60} INFO - Started process 290 to run task
 6 | [2025-04-07T01:36:01.407+0000] {standard_task_runner.py:87} INFO - Running: ['***', 'tasks', 'run', 'anac_pipeline_dag', 'baixar_csv_anac', 'scheduled__2024-01-01T00:00:00+00:00', '--job-id', '3', '--raw', '--subdir', 'DAGS_FOLDER/dag_anac_pipeline.py', '--cfg-path', '/tmp/tmpb80ina93']
 7 | [2025-04-07T01:36:01.409+0000] {standard_task_runner.py:88} INFO - Job 3: Subtask baixar_csv_anac
 8 | [2025-04-07T01:36:01.426+0000] {logging_mixin.py:188} WARNING - /home/***/.local/lib/python3.10/site-packages/***/settings.py:194 DeprecationWarning: The sql_alchemy_conn option in [core] has been moved to the sql_alchemy_conn option in [database] - the old setting has been used, but please update your config.
 9 | [2025-04-07T01:36:01.465+0000] {task_command.py:423} INFO - Running <TaskInstance: anac_pipeline_dag.baixar_csv_anac scheduled__2024-01-01T00:00:00+00:00 [running]> on host 8a43643a16f6
10 | [2025-04-07T01:36:01.541+0000] {taskinstance.py:2480} INFO - Exporting env vars: AIRFLOW_CTX_DAG_OWNER='***' AIRFLOW_CTX_DAG_ID='anac_pipeline_dag' AIRFLOW_CTX_TASK_ID='baixar_csv_anac' AIRFLOW_CTX_EXECUTION_DATE='2024-01-01T00:00:00+00:00' AIRFLOW_CTX_TRY_NUMBER='1' AIRFLOW_CTX_DAG_RUN_ID='scheduled__2024-01-01T00:00:00+00:00'
11 | [2025-04-07T01:36:01.733+0000] {taskinstance.py:2698} ERROR - Task failed with exception
12 | Traceback (most recent call last):
13 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/models/taskinstance.py", line 433, in _execute_task
14 |     result = execute_callable(context=context, **execute_callable_kwargs)
15 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/operators/python.py", line 199, in execute
16 |     return_value = self.execute_callable()
17 |   File "/home/airflow/.local/lib/python3.10/site-packages/airflow/operators/python.py", line 216, in execute_callable
18 |     return self.python_callable(*self.op_args, **self.op_kwargs)
19 |   File "/opt/airflow/dags/tasks/download_csv.py", line 7, in baixar_csv_anac
20 |     response.raise_for_status()
21 |   File "/home/airflow/.local/lib/python3.10/site-packages/requests/models.py", line 1021, in raise_for_status
22 |     raise HTTPError(http_error_msg, response=self)
23 | requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20opera%C3%A7%C3%B5es%20a%C3%A9reas/Dados%20Estat%C3%ADsticos%20do%20Transporte%20A%C3%A9reo/2023_Dados_Estatisticos.csv
24 | [2025-04-07T01:36:01.748+0000] {taskinstance.py:1138} INFO - Marking task as FAILED. dag_id=anac_pipeline_dag, task_id=baixar_csv_anac, execution_date=20240101T000000, start_date=20250407T013601, end_date=20250407T013601
25 | [2025-04-07T01:36:01.758+0000] {standard_task_runner.py:107} ERROR - Failed to execute job 3 for task baixar_csv_anac (404 Client Error: Not Found for url: https://sistemas.anac.gov.br/dadosabertos/Voos%20e%20opera%C3%A7%C3%B5es%20a%C3%A9reas/Dados%20Estat%C3%ADsticos%20do%20Transporte%20A%C3%A9reo/2023_Dados_Estatisticos.csv; 290)
26 | [2025-04-07T01:36:01.780+0000] {local_task_job_runner.py:234} INFO - Task exited with return code 1
27 | [2025-04-07T01:36:01.797+0000] {taskinstance.py:3280} INFO - 0 downstream tasks scheduled from follow-on schedule check
28 | 


--------------------------------------------------------------------------------
/ANAC-data-engineering-project/requirements.txt:
--------------------------------------------------------------------------------
1 | requests            
2 | pandas              
3 | pyarrow             
4 | openpyxl            
5 | lxml                
6 | beautifulsoup4      
7 | python-dotenv
8 | pyspark       
9 | holidays


--------------------------------------------------------------------------------
/ApacheHopProject/Apache Hop Project Description.md:
--------------------------------------------------------------------------------
 1 | ![Postgres](https://img.shields.io/badge/postgres-%23316192.svg?style=for-the-badge&logo=postgresql&logoColor=white)
 2 | ![ApacheHOP](https://img.shields.io/badge/HOP-ffffff?style=for-the-badge&logo=apache&logoColor=blue)
 3 | ![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
 4 | ![JSON](https://img.shields.io/badge/json-5E5C5C?style=for-the-badge&logo=json&logoColor=white)
 5 | ![EXCEL](https://img.shields.io/badge/Microsoft_Excel-217346?style=for-the-badge&logo=microsoft-excel&logoColor=white)
 6 | 
 7 | ![1](https://github.com/Shamslux/DataEngineering/assets/79280485/9bb5ef1d-0da3-4418-9a8f-3d678d880093)
 8 | ![2](https://github.com/Shamslux/DataEngineering/assets/79280485/98d7f1a3-5f10-4c85-b58f-4d45dcd30691)
 9 | ![3](https://github.com/Shamslux/DataEngineering/assets/79280485/88fa5e5e-58cb-4104-b183-4f4f71d02774)
10 | ![4](https://github.com/Shamslux/DataEngineering/assets/79280485/0286571a-313f-4aa5-b978-38ed7a95519b)
11 | ![5](https://github.com/Shamslux/DataEngineering/assets/79280485/3cd0b75b-50e5-45c2-b5ba-381558eccf0e)
12 | ![6](https://github.com/Shamslux/DataEngineering/assets/79280485/b8dc304a-0039-4558-92a5-2ce3578f8995)
13 | ![7](https://github.com/Shamslux/DataEngineering/assets/79280485/878157e0-83cd-4c8c-8f1a-d76e49d15863)
14 | ![8](https://github.com/Shamslux/DataEngineering/assets/79280485/47da5ff5-d8d8-4ef2-9cf8-5def564534df)
15 | ![9](https://github.com/Shamslux/DataEngineering/assets/79280485/c67a7a08-d3e7-44f6-9249-38e161f0b40e)
16 | ![10](https://github.com/Shamslux/DataEngineering/assets/79280485/b896b9ae-6066-4cf0-8703-a97c93918eda)
17 | ![11](https://github.com/Shamslux/DataEngineering/assets/79280485/b7a0d92f-3023-4235-bdf0-800551eb8173)
18 | ![12](https://github.com/Shamslux/DataEngineering/assets/79280485/27ffff17-2c01-4d99-8669-539abde999ca)
19 | ![13](https://github.com/Shamslux/DataEngineering/assets/79280485/8c7296db-3428-438f-a9a8-92782d330ac2)
20 | 
21 | 
22 | ![in_construction](https://user-images.githubusercontent.com/79280485/235759660-1ef666a2-0c0c-4cf0-a545-5f9b5c5659c2.png)
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/ApacheHopProject/infrastructure/jenkins/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | services:
 3 |   jenkins:
 4 |     build:
 5 |       context: .
 6 |       dockerfile: jenkins_hop_project.dockerfile  # Nome do Dockerfile no mesmo diretório
 7 |     ports:
 8 |       - "8080:8080"
 9 |       - "50000:50000"
10 | 


--------------------------------------------------------------------------------
/ApacheHopProject/infrastructure/jenkins/jenkins_hop_project.dockerfile:
--------------------------------------------------------------------------------
1 | # Jenkins image
2 | FROM jenkins/jenkins:lts
3 | 
4 | # Exponha a porta do Jenkins
5 | EXPOSE 8080
6 | 


--------------------------------------------------------------------------------
/ApacheHopProject/pipelines/pipeline2_reading_xls_to_database_error_handling.hpl:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <pipeline>
  3 |   <info>
  4 |     <name>pipeline2_reading_xls_to_database_error_handling</name>
  5 |     <name_sync_with_filename>Y</name_sync_with_filename>
  6 |     <description/>
  7 |     <extended_description/>
  8 |     <pipeline_version/>
  9 |     <pipeline_type>Normal</pipeline_type>
 10 |     <parameters>
 11 |     </parameters>
 12 |     <capture_transform_performance>N</capture_transform_performance>
 13 |     <transform_performance_capturing_delay>1000</transform_performance_capturing_delay>
 14 |     <transform_performance_capturing_size_limit>100</transform_performance_capturing_size_limit>
 15 |     <created_user>-</created_user>
 16 |     <created_date>2023/03/03 14:40:55.856</created_date>
 17 |     <modified_user>-</modified_user>
 18 |     <modified_date>2023/03/03 14:40:55.856</modified_date>
 19 |     <key_for_session_key>H4sIAAAAAAAAAAMAAAAAAAAAAAA=</key_for_session_key>
 20 |     <is_key_private>N</is_key_private>
 21 |   </info>
 22 |   <notepads>
 23 |     <notepad>
 24 |       <note>In the presence of null values for the sales value,
 25 |  the responsible sector must be contacted!</note>
 26 |       <xloc>240</xloc>
 27 |       <yloc>224</yloc>
 28 |       <width>281</width>
 29 |       <heigth>42</heigth>
 30 |       <fontname>Segoe UI</fontname>
 31 |       <fontsize>9</fontsize>
 32 |       <fontbold>Y</fontbold>
 33 |       <fontitalic>N</fontitalic>
 34 |       <fontcolorred>14</fontcolorred>
 35 |       <fontcolorgreen>58</fontcolorgreen>
 36 |       <fontcolorblue>90</fontcolorblue>
 37 |       <backgroundcolorred>201</backgroundcolorred>
 38 |       <backgroundcolorgreen>232</backgroundcolorgreen>
 39 |       <backgroundcolorblue>251</backgroundcolorblue>
 40 |       <bordercolorred>14</bordercolorred>
 41 |       <bordercolorgreen>58</bordercolorgreen>
 42 |       <bordercolorblue>90</bordercolorblue>
 43 |     </notepad>
 44 |     <notepad>
 45 |       <note>This is just a basic pipeline to demonstrate that it is possible 
 46 | to read xlsx files and write them to the database after the 
 47 | desired processing. Ignore the connection, I just used any 
 48 | local connection!</note>
 49 |       <xloc>944</xloc>
 50 |       <yloc>32</yloc>
 51 |       <width>341</width>
 52 |       <heigth>74</heigth>
 53 |       <fontname>Segoe UI</fontname>
 54 |       <fontsize>9</fontsize>
 55 |       <fontbold>Y</fontbold>
 56 |       <fontitalic>N</fontitalic>
 57 |       <fontcolorred>14</fontcolorred>
 58 |       <fontcolorgreen>58</fontcolorgreen>
 59 |       <fontcolorblue>90</fontcolorblue>
 60 |       <backgroundcolorred>201</backgroundcolorred>
 61 |       <backgroundcolorgreen>232</backgroundcolorgreen>
 62 |       <backgroundcolorblue>251</backgroundcolorblue>
 63 |       <bordercolorred>14</bordercolorred>
 64 |       <bordercolorgreen>58</bordercolorgreen>
 65 |       <bordercolorblue>90</bordercolorblue>
 66 |     </notepad>
 67 |     <notepad>
 68 |       <note>This is a sheet for the responsible sector.
 69 | They need to analyse and correct it for
 70 | further load of these data.</note>
 71 |       <xloc>832</xloc>
 72 |       <yloc>272</yloc>
 73 |       <width>235</width>
 74 |       <heigth>58</heigth>
 75 |       <fontname>Segoe UI</fontname>
 76 |       <fontsize>9</fontsize>
 77 |       <fontbold>Y</fontbold>
 78 |       <fontitalic>N</fontitalic>
 79 |       <fontcolorred>14</fontcolorred>
 80 |       <fontcolorgreen>58</fontcolorgreen>
 81 |       <fontcolorblue>90</fontcolorblue>
 82 |       <backgroundcolorred>201</backgroundcolorred>
 83 |       <backgroundcolorgreen>232</backgroundcolorgreen>
 84 |       <backgroundcolorblue>251</backgroundcolorblue>
 85 |       <bordercolorred>14</bordercolorred>
 86 |       <bordercolorgreen>58</bordercolorgreen>
 87 |       <bordercolorblue>90</bordercolorblue>
 88 |     </notepad>
 89 |   </notepads>
 90 |   <order>
 91 |     <hop>
 92 |       <from>Microsoft Excel input</from>
 93 |       <to>Data type conversion</to>
 94 |       <enabled>Y</enabled>
 95 |     </hop>
 96 |     <hop>
 97 |       <from>Creating IDs</from>
 98 |       <to>Organize columns</to>
 99 |       <enabled>Y</enabled>
100 |     </hop>
101 |     <hop>
102 |       <from>Value is not null?</from>
103 |       <to>Creating IDs</to>
104 |       <enabled>Y</enabled>
105 |     </hop>
106 |     <hop>
107 |       <from>Value is not null?</from>
108 |       <to>Microsoft Excel writer</to>
109 |       <enabled>Y</enabled>
110 |     </hop>
111 |     <hop>
112 |       <from>Data type conversion</from>
113 |       <to>Value is not null?</to>
114 |       <enabled>Y</enabled>
115 |     </hop>
116 |     <hop>
117 |       <from>Organize columns</from>
118 |       <to>Filter rows</to>
119 |       <enabled>Y</enabled>
120 |     </hop>
121 |     <hop>
122 |       <from>Filter rows</from>
123 |       <to>Dummy (do nothing)</to>
124 |       <enabled>Y</enabled>
125 |     </hop>
126 |     <hop>
127 |       <from>Filter rows</from>
128 |       <to>Insert / update</to>
129 |       <enabled>Y</enabled>
130 |     </hop>
131 |   </order>
132 |   <transform>
133 |     <name>Creating IDs</name>
134 |     <type>Sequence</type>
135 |     <description/>
136 |     <distribute>Y</distribute>
137 |     <custom_distribution/>
138 |     <copies>1</copies>
139 |     <partitioning>
140 |       <method>none</method>
141 |       <schema_name/>
142 |     </partitioning>
143 |     <use_counter>Y</use_counter>
144 |     <use_database>N</use_database>
145 |     <increment_by>1</increment_by>
146 |     <max_value>999999999</max_value>
147 |     <seqname>SEQ_</seqname>
148 |     <start_at>1</start_at>
149 |     <valuename>valuename</valuename>
150 |     <attributes/>
151 |     <GUI>
152 |       <xloc>960</xloc>
153 |       <yloc>128</yloc>
154 |     </GUI>
155 |   </transform>
156 |   <transform>
157 |     <name>Data type conversion</name>
158 |     <type>SelectValues</type>
159 |     <description/>
160 |     <distribute>Y</distribute>
161 |     <custom_distribution/>
162 |     <copies>1</copies>
163 |     <partitioning>
164 |       <method>none</method>
165 |       <schema_name/>
166 |     </partitioning>
167 |     <fields>
168 |       <select_unspecified>N</select_unspecified>
169 |       <meta>
170 |         <name>Date</name>
171 |         <rename>Date</rename>
172 |         <type>Date</type>
173 |         <length>-2</length>
174 |         <precision>-2</precision>
175 |         <conversion_mask>yyyy-MM-dd</conversion_mask>
176 |         <date_format_lenient>false</date_format_lenient>
177 |         <date_format_locale/>
178 |         <date_format_timezone/>
179 |         <lenient_string_to_number>false</lenient_string_to_number>
180 |         <encoding/>
181 |         <decimal_symbol/>
182 |         <grouping_symbol/>
183 |         <currency_symbol/>
184 |         <storage_type/>
185 |       </meta>
186 |       <meta>
187 |         <name>Salesperson</name>
188 |         <rename>Salesperson</rename>
189 |         <type>String</type>
190 |         <length>-2</length>
191 |         <precision>-2</precision>
192 |         <conversion_mask/>
193 |         <date_format_lenient>false</date_format_lenient>
194 |         <date_format_locale/>
195 |         <date_format_timezone/>
196 |         <lenient_string_to_number>false</lenient_string_to_number>
197 |         <encoding/>
198 |         <decimal_symbol/>
199 |         <grouping_symbol/>
200 |         <currency_symbol/>
201 |         <storage_type/>
202 |       </meta>
203 |       <meta>
204 |         <name>Value</name>
205 |         <rename>Value</rename>
206 |         <type>Integer</type>
207 |         <length>-2</length>
208 |         <precision>-2</precision>
209 |         <conversion_mask>#</conversion_mask>
210 |         <date_format_lenient>false</date_format_lenient>
211 |         <date_format_locale/>
212 |         <date_format_timezone/>
213 |         <lenient_string_to_number>false</lenient_string_to_number>
214 |         <encoding/>
215 |         <decimal_symbol/>
216 |         <grouping_symbol/>
217 |         <currency_symbol/>
218 |         <storage_type/>
219 |       </meta>
220 |     </fields>
221 |     <attributes/>
222 |     <GUI>
223 |       <xloc>464</xloc>
224 |       <yloc>128</yloc>
225 |     </GUI>
226 |   </transform>
227 |   <transform>
228 |     <name>Dummy (do nothing)</name>
229 |     <type>Dummy</type>
230 |     <description/>
231 |     <distribute>Y</distribute>
232 |     <custom_distribution/>
233 |     <copies>1</copies>
234 |     <partitioning>
235 |       <method>none</method>
236 |       <schema_name/>
237 |     </partitioning>
238 |     <attributes/>
239 |     <GUI>
240 |       <xloc>1360</xloc>
241 |       <yloc>256</yloc>
242 |     </GUI>
243 |   </transform>
244 |   <transform>
245 |     <name>Filter rows</name>
246 |     <type>FilterRows</type>
247 |     <description/>
248 |     <distribute>Y</distribute>
249 |     <custom_distribution/>
250 |     <copies>1</copies>
251 |     <partitioning>
252 |       <method>none</method>
253 |       <schema_name/>
254 |     </partitioning>
255 |     <send_true_to>Insert / update</send_true_to>
256 |     <send_false_to>Dummy (do nothing)</send_false_to>
257 |     <compare>
258 |       <condition>
259 |         <negated>N</negated>
260 |         <leftvalue>PK_SALE</leftvalue>
261 |         <function>IS NOT NULL</function>
262 |         <rightvalue/>
263 |       </condition>
264 |     </compare>
265 |     <attributes/>
266 |     <GUI>
267 |       <xloc>1360</xloc>
268 |       <yloc>128</yloc>
269 |     </GUI>
270 |   </transform>
271 |   <transform>
272 |     <name>Microsoft Excel input</name>
273 |     <type>ExcelInput</type>
274 |     <description/>
275 |     <distribute>Y</distribute>
276 |     <custom_distribution/>
277 |     <copies>1</copies>
278 |     <partitioning>
279 |       <method>none</method>
280 |       <schema_name/>
281 |     </partitioning>
282 |     <header>Y</header>
283 |     <noempty>N</noempty>
284 |     <stoponempty>N</stoponempty>
285 |     <filefield/>
286 |     <sheetfield/>
287 |     <sheetrownumfield/>
288 |     <rownumfield/>
289 |     <sheetfield/>
290 |     <filefield/>
291 |     <limit>0</limit>
292 |     <encoding/>
293 |     <add_to_result_filenames>Y</add_to_result_filenames>
294 |     <accept_filenames>N</accept_filenames>
295 |     <accept_field/>
296 |     <accept_transform_name/>
297 |     <file>
298 |       <name>${PROJECT_HOME}/Training/source_data_pipeline_2.xlsx</name>
299 |       <filemask>source_data.*</filemask>
300 |       <exclude_filemask/>
301 |       <file_required>N</file_required>
302 |       <include_subfolders>N</include_subfolders>
303 |     </file>
304 |     <fields>
305 |       <field>
306 |         <name>Date</name>
307 |         <type>Date</type>
308 |         <length>-1</length>
309 |         <precision>-1</precision>
310 |         <trim_type>both</trim_type>
311 |         <repeat>N</repeat>
312 |         <format>yyyy-MM-dd</format>
313 |         <currency/>
314 |         <decimal/>
315 |         <group/>
316 |       </field>
317 |       <field>
318 |         <name>Salesperson</name>
319 |         <type>String</type>
320 |         <length>-1</length>
321 |         <precision>-1</precision>
322 |         <trim_type>both</trim_type>
323 |         <repeat>N</repeat>
324 |         <format/>
325 |         <currency/>
326 |         <decimal/>
327 |         <group/>
328 |       </field>
329 |       <field>
330 |         <name>Value</name>
331 |         <type>Integer</type>
332 |         <length>-1</length>
333 |         <precision>-1</precision>
334 |         <trim_type>both</trim_type>
335 |         <repeat>N</repeat>
336 |         <format>#</format>
337 |         <currency/>
338 |         <decimal/>
339 |         <group/>
340 |       </field>
341 |     </fields>
342 |     <sheets>
343 |       <sheet>
344 |         <name>sales</name>
345 |         <startrow>0</startrow>
346 |         <startcol>0</startcol>
347 |       </sheet>
348 |     </sheets>
349 |     <strict_types>N</strict_types>
350 |     <error_ignored>N</error_ignored>
351 |     <error_line_skipped>N</error_line_skipped>
352 |     <bad_line_files_destination_directory/>
353 |     <bad_line_files_extension>warning</bad_line_files_extension>
354 |     <error_line_files_destination_directory/>
355 |     <error_line_files_extension>error</error_line_files_extension>
356 |     <line_number_files_destination_directory/>
357 |     <line_number_files_extension>line</line_number_files_extension>
358 |     <shortFileFieldName/>
359 |     <pathFieldName/>
360 |     <hiddenFieldName/>
361 |     <lastModificationTimeFieldName/>
362 |     <uriNameFieldName/>
363 |     <rootUriNameFieldName/>
364 |     <extensionFieldName/>
365 |     <sizeFieldName/>
366 |     <spreadsheet_type>SAX_POI</spreadsheet_type>
367 |     <attributes/>
368 |     <GUI>
369 |       <xloc>208</xloc>
370 |       <yloc>128</yloc>
371 |     </GUI>
372 |   </transform>
373 |   <transform>
374 |     <name>Microsoft Excel writer</name>
375 |     <type>TypeExitExcelWriterTransform</type>
376 |     <description/>
377 |     <distribute>Y</distribute>
378 |     <custom_distribution/>
379 |     <copies>1</copies>
380 |     <partitioning>
381 |       <method>none</method>
382 |       <schema_name/>
383 |     </partitioning>
384 |     <add_to_result_filenames>Y</add_to_result_filenames>
385 |     <appendEmpty>0</appendEmpty>
386 |     <appendLines>N</appendLines>
387 |     <appendOffset>0</appendOffset>
388 |     <appendOmitHeader>N</appendOmitHeader>
389 |     <file>
390 |       <autosizecolums>N</autosizecolums>
391 |       <createParentFolder>Y</createParentFolder>
392 |       <add_date>N</add_date>
393 |       <date_time_format/>
394 |       <do_not_open_newfile_init>N</do_not_open_newfile_init>
395 |       <extension>xlsx</extension>
396 |       <name>error</name>
397 |       <filename_field/>
398 |       <filename_in_field>N</filename_in_field>
399 |       <if_file_exists>reuse</if_file_exists>
400 |       <if_sheet_exists>new</if_sheet_exists>
401 |       <password/>
402 |       <protected_by/>
403 |       <protect_sheet>N</protect_sheet>
404 |       <sheetname>Sheet1</sheetname>
405 |       <SpecifyFormat>N</SpecifyFormat>
406 |       <splitevery>0</splitevery>
407 |       <stream_data>N</stream_data>
408 |       <add_time>N</add_time>
409 |       <split>N</split>
410 |     </file>
411 |     <footer>N</footer>
412 |     <forceFormulaRecalculation>N</forceFormulaRecalculation>
413 |     <header>Y</header>
414 |     <leaveExistingStylesUnchanged>N</leaveExistingStylesUnchanged>
415 |     <makeSheetActive>Y</makeSheetActive>
416 |     <fields>
417 |       <field>
418 |         <commentAuthorField/>
419 |         <commentField/>
420 |         <format/>
421 |         <formula>N</formula>
422 |         <hyperlinkField/>
423 |         <name>Date</name>
424 |         <styleCell/>
425 |         <title>Date</title>
426 |         <titleStyleCell/>
427 |         <type>Date</type>
428 |       </field>
429 |       <field>
430 |         <commentAuthorField/>
431 |         <commentField/>
432 |         <format/>
433 |         <formula>N</formula>
434 |         <hyperlinkField/>
435 |         <name>Salesperson</name>
436 |         <styleCell/>
437 |         <title>Salesperson</title>
438 |         <titleStyleCell/>
439 |         <type>String</type>
440 |       </field>
441 |       <field>
442 |         <commentAuthorField/>
443 |         <commentField/>
444 |         <format/>
445 |         <formula>N</formula>
446 |         <hyperlinkField/>
447 |         <name>Value</name>
448 |         <styleCell/>
449 |         <title>Value</title>
450 |         <titleStyleCell/>
451 |         <type>Integer</type>
452 |       </field>
453 |     </fields>
454 |     <rowWritingMethod>overwrite</rowWritingMethod>
455 |     <startingCell>A1</startingCell>
456 |     <template>
457 |       <enabled>N</enabled>
458 |       <filename>template.xls</filename>
459 |       <sheet_enabled>N</sheet_enabled>
460 |       <hidden>N</hidden>
461 |       <sheetname/>
462 |     </template>
463 |     <attributes/>
464 |     <GUI>
465 |       <xloc>768</xloc>
466 |       <yloc>304</yloc>
467 |     </GUI>
468 |   </transform>
469 |   <transform>
470 |     <name>Organize columns</name>
471 |     <type>SelectValues</type>
472 |     <description/>
473 |     <distribute>Y</distribute>
474 |     <custom_distribution/>
475 |     <copies>1</copies>
476 |     <partitioning>
477 |       <method>none</method>
478 |       <schema_name/>
479 |     </partitioning>
480 |     <fields>
481 |       <select_unspecified>N</select_unspecified>
482 |       <meta>
483 |         <name>valuename</name>
484 |         <rename>PK_SALE</rename>
485 |         <type>Integer</type>
486 |         <length>-2</length>
487 |         <precision>-2</precision>
488 |         <conversion_mask>#</conversion_mask>
489 |         <date_format_lenient>false</date_format_lenient>
490 |         <date_format_locale/>
491 |         <date_format_timezone/>
492 |         <lenient_string_to_number>false</lenient_string_to_number>
493 |         <encoding/>
494 |         <decimal_symbol/>
495 |         <grouping_symbol/>
496 |         <currency_symbol/>
497 |         <storage_type/>
498 |       </meta>
499 |       <meta>
500 |         <name>Date</name>
501 |         <rename>DT_SALE</rename>
502 |         <type>Date</type>
503 |         <length>-2</length>
504 |         <precision>-2</precision>
505 |         <conversion_mask>yyyy-MM-dd</conversion_mask>
506 |         <date_format_lenient>false</date_format_lenient>
507 |         <date_format_locale/>
508 |         <date_format_timezone/>
509 |         <lenient_string_to_number>false</lenient_string_to_number>
510 |         <encoding/>
511 |         <decimal_symbol/>
512 |         <grouping_symbol/>
513 |         <currency_symbol/>
514 |         <storage_type/>
515 |       </meta>
516 |       <meta>
517 |         <name>Salesperson</name>
518 |         <rename>NM_SALESPERSON</rename>
519 |         <type>String</type>
520 |         <length>255</length>
521 |         <precision>-2</precision>
522 |         <conversion_mask/>
523 |         <date_format_lenient>false</date_format_lenient>
524 |         <date_format_locale/>
525 |         <date_format_timezone/>
526 |         <lenient_string_to_number>false</lenient_string_to_number>
527 |         <encoding/>
528 |         <decimal_symbol/>
529 |         <grouping_symbol/>
530 |         <currency_symbol/>
531 |         <storage_type/>
532 |       </meta>
533 |       <meta>
534 |         <name>Value</name>
535 |         <rename>VL_SALE_PRICE</rename>
536 |         <type>Number</type>
537 |         <length>-2</length>
538 |         <precision>-2</precision>
539 |         <conversion_mask/>
540 |         <date_format_lenient>false</date_format_lenient>
541 |         <date_format_locale/>
542 |         <date_format_timezone/>
543 |         <lenient_string_to_number>false</lenient_string_to_number>
544 |         <encoding/>
545 |         <decimal_symbol/>
546 |         <grouping_symbol/>
547 |         <currency_symbol/>
548 |         <storage_type/>
549 |       </meta>
550 |     </fields>
551 |     <attributes/>
552 |     <GUI>
553 |       <xloc>1104</xloc>
554 |       <yloc>128</yloc>
555 |     </GUI>
556 |   </transform>
557 |   <transform>
558 |     <name>Value is not null?</name>
559 |     <type>FilterRows</type>
560 |     <description/>
561 |     <distribute>Y</distribute>
562 |     <custom_distribution/>
563 |     <copies>1</copies>
564 |     <partitioning>
565 |       <method>none</method>
566 |       <schema_name/>
567 |     </partitioning>
568 |     <send_true_to>Creating IDs</send_true_to>
569 |     <send_false_to>Microsoft Excel writer</send_false_to>
570 |     <compare>
571 |       <condition>
572 |         <negated>N</negated>
573 |         <leftvalue>Value</leftvalue>
574 |         <function>IS NOT NULL</function>
575 |         <rightvalue/>
576 |       </condition>
577 |     </compare>
578 |     <attributes/>
579 |     <GUI>
580 |       <xloc>768</xloc>
581 |       <yloc>128</yloc>
582 |     </GUI>
583 |   </transform>
584 |   <transform>
585 |     <name>Insert / update</name>
586 |     <type>InsertUpdate</type>
587 |     <description/>
588 |     <distribute>Y</distribute>
589 |     <custom_distribution/>
590 |     <copies>1</copies>
591 |     <partitioning>
592 |       <method>none</method>
593 |       <schema_name/>
594 |     </partitioning>
595 |     <commit>100</commit>
596 |     <connection>SKEAM_OLTP</connection>
597 |     <lookup>
598 |       <key>
599 |         <condition>=</condition>
600 |         <field>PK_SALE</field>
601 |         <name>PK_SALE</name>
602 |         <name2/>
603 |       </key>
604 |       <schema>SKEAM_SHOP</schema>
605 |       <table>SALES_TESTING</table>
606 |       <value>
607 |         <update>N</update>
608 |         <name>DT_SALE</name>
609 |         <rename>DT_SALE</rename>
610 |       </value>
611 |       <value>
612 |         <update>Y</update>
613 |         <name>NM_SALESPERSON</name>
614 |         <rename>NM_SALESPERSON</rename>
615 |       </value>
616 |       <value>
617 |         <update>Y</update>
618 |         <name>VL_SALE_PRICE</name>
619 |         <rename>VL_SALE_PRICE</rename>
620 |       </value>
621 |       <value>
622 |         <update>Y</update>
623 |         <name>PK_SALE</name>
624 |         <rename>PK_SALE</rename>
625 |       </value>
626 |     </lookup>
627 |     <update_bypassed>N</update_bypassed>
628 |     <attributes/>
629 |     <GUI>
630 |       <xloc>1568</xloc>
631 |       <yloc>128</yloc>
632 |     </GUI>
633 |   </transform>
634 |   <transform_error_handling>
635 |   </transform_error_handling>
636 |   <attributes/>
637 | </pipeline>
638 | 


--------------------------------------------------------------------------------
/ApacheHopProject/pipelines/pipeline5_basic_api.hpl:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <pipeline>
  3 |   <info>
  4 |     <name>pipeline5_basic_api</name>
  5 |     <name_sync_with_filename>Y</name_sync_with_filename>
  6 |     <description/>
  7 |     <extended_description/>
  8 |     <pipeline_version/>
  9 |     <pipeline_type>Normal</pipeline_type>
 10 |     <parameters>
 11 |     </parameters>
 12 |     <capture_transform_performance>N</capture_transform_performance>
 13 |     <transform_performance_capturing_delay>1000</transform_performance_capturing_delay>
 14 |     <transform_performance_capturing_size_limit>100</transform_performance_capturing_size_limit>
 15 |     <created_user>-</created_user>
 16 |     <created_date>2023/05/20 13:26:48.811</created_date>
 17 |     <modified_user>-</modified_user>
 18 |     <modified_date>2023/05/20 13:26:48.811</modified_date>
 19 |   </info>
 20 |   <notepads>
 21 |   </notepads>
 22 |   <order>
 23 |     <hop>
 24 |       <from>Adjusting columns</from>
 25 |       <to>API pattern</to>
 26 |       <enabled>Y</enabled>
 27 |     </hop>
 28 |     <hop>
 29 |       <from>API pattern</from>
 30 |       <to>Concat to API pattern</to>
 31 |       <enabled>Y</enabled>
 32 |     </hop>
 33 |     <hop>
 34 |       <from>Concat to API pattern</from>
 35 |       <to>REST client</to>
 36 |       <enabled>Y</enabled>
 37 |     </hop>
 38 |     <hop>
 39 |       <from>REST client</from>
 40 |       <to>JSON input</to>
 41 |       <enabled>Y</enabled>
 42 |     </hop>
 43 |     <hop>
 44 |       <from>JSON input</from>
 45 |       <to>Final adjustment in the columns</to>
 46 |       <enabled>Y</enabled>
 47 |     </hop>
 48 |     <hop>
 49 |       <from>Final adjustment in the columns</from>
 50 |       <to>Filter rows</to>
 51 |       <enabled>Y</enabled>
 52 |     </hop>
 53 |     <hop>
 54 |       <from>Filter rows</from>
 55 |       <to>Output > Excel for responsible sector check</to>
 56 |       <enabled>Y</enabled>
 57 |     </hop>
 58 |     <hop>
 59 |       <from>Output > Excel for responsible sector check</from>
 60 |       <to>Postal codes not found!</to>
 61 |       <enabled>Y</enabled>
 62 |     </hop>
 63 |     <hop>
 64 |       <from>Input > Fake Data</from>
 65 |       <to>Adjusting columns</to>
 66 |       <enabled>Y</enabled>
 67 |     </hop>
 68 |     <hop>
 69 |       <from>Filter rows</from>
 70 |       <to>If Null</to>
 71 |       <enabled>Y</enabled>
 72 |     </hop>
 73 |     <hop>
 74 |       <from>If Null</from>
 75 |       <to>Output > Clients + Addresses</to>
 76 |       <enabled>Y</enabled>
 77 |     </hop>
 78 |   </order>
 79 |   <transform>
 80 |     <name>API pattern</name>
 81 |     <type>Constant</type>
 82 |     <description/>
 83 |     <distribute>Y</distribute>
 84 |     <custom_distribution/>
 85 |     <copies>1</copies>
 86 |     <partitioning>
 87 |       <method>none</method>
 88 |       <schema_name/>
 89 |     </partitioning>
 90 |     <fields>
 91 |       <field>
 92 |         <length>-1</length>
 93 |         <name>begin</name>
 94 |         <nullif>https://viacep.com.br/ws/</nullif>
 95 |         <precision>-1</precision>
 96 |         <set_empty_string>N</set_empty_string>
 97 |         <type>String</type>
 98 |       </field>
 99 |       <field>
100 |         <length>-1</length>
101 |         <name>end</name>
102 |         <nullif>/json</nullif>
103 |         <precision>-1</precision>
104 |         <set_empty_string>N</set_empty_string>
105 |         <type>String</type>
106 |       </field>
107 |     </fields>
108 |     <attributes/>
109 |     <GUI>
110 |       <xloc>704</xloc>
111 |       <yloc>112</yloc>
112 |     </GUI>
113 |   </transform>
114 |   <transform>
115 |     <name>Adjusting columns</name>
116 |     <type>SelectValues</type>
117 |     <description/>
118 |     <distribute>Y</distribute>
119 |     <custom_distribution/>
120 |     <copies>1</copies>
121 |     <partitioning>
122 |       <method>none</method>
123 |       <schema_name/>
124 |     </partitioning>
125 |     <fields>
126 |       <field>
127 |         <name>Full Name</name>
128 |       </field>
129 |       <field>
130 |         <name>Birthdate</name>
131 |       </field>
132 |       <field>
133 |         <name>Gender</name>
134 |       </field>
135 |       <field>
136 |         <name>CEP</name>
137 |       </field>
138 |       <select_unspecified>N</select_unspecified>
139 |     </fields>
140 |     <attributes/>
141 |     <GUI>
142 |       <xloc>432</xloc>
143 |       <yloc>112</yloc>
144 |     </GUI>
145 |   </transform>
146 |   <transform>
147 |     <name>Concat to API pattern</name>
148 |     <type>Calculator</type>
149 |     <description/>
150 |     <distribute>Y</distribute>
151 |     <custom_distribution/>
152 |     <copies>1</copies>
153 |     <partitioning>
154 |       <method>none</method>
155 |       <schema_name/>
156 |     </partitioning>
157 |     <calculation>
158 |       <calc_type>ADD3</calc_type>
159 |       <field_a>begin</field_a>
160 |       <field_b>CEP</field_b>
161 |       <field_c>end</field_c>
162 |       <field_name>api_final</field_name>
163 |       <remove>N</remove>
164 |       <value_length>-1</value_length>
165 |       <value_precision>-1</value_precision>
166 |       <value_type>String</value_type>
167 |     </calculation>
168 |     <failIfNoFile>Y</failIfNoFile>
169 |     <attributes/>
170 |     <GUI>
171 |       <xloc>848</xloc>
172 |       <yloc>112</yloc>
173 |     </GUI>
174 |   </transform>
175 |   <transform>
176 |     <name>Filter rows</name>
177 |     <type>FilterRows</type>
178 |     <description/>
179 |     <distribute>Y</distribute>
180 |     <custom_distribution/>
181 |     <copies>1</copies>
182 |     <partitioning>
183 |       <method>none</method>
184 |       <schema_name/>
185 |     </partitioning>
186 |     <compare>
187 |       <condition>
188 |         <conditions>
189 | </conditions>
190 |         <function>IS NULL</function>
191 |         <leftvalue>Street</leftvalue>
192 |         <negated>N</negated>
193 |         <operator>-</operator>
194 |       </condition>
195 |     </compare>
196 |     <send_false_to>If Null</send_false_to>
197 |     <send_true_to>Output > Excel for responsible sector check</send_true_to>
198 |     <attributes/>
199 |     <GUI>
200 |       <xloc>384</xloc>
201 |       <yloc>256</yloc>
202 |     </GUI>
203 |   </transform>
204 |   <transform>
205 |     <name>Final adjustment in the columns</name>
206 |     <type>SelectValues</type>
207 |     <description/>
208 |     <distribute>Y</distribute>
209 |     <custom_distribution/>
210 |     <copies>1</copies>
211 |     <partitioning>
212 |       <method>none</method>
213 |       <schema_name/>
214 |     </partitioning>
215 |     <fields>
216 |       <field>
217 |         <name>Full Name</name>
218 |       </field>
219 |       <field>
220 |         <name>CEP</name>
221 |       </field>
222 |       <field>
223 |         <name>logradouro</name>
224 |         <rename>Street</rename>
225 |       </field>
226 |       <field>
227 |         <name>complemento</name>
228 |         <rename>Aditional Info</rename>
229 |       </field>
230 |       <field>
231 |         <name>bairro</name>
232 |         <rename>District</rename>
233 |       </field>
234 |       <field>
235 |         <name>localidade</name>
236 |         <rename>Location</rename>
237 |       </field>
238 |       <field>
239 |         <name>uf</name>
240 |         <rename>State</rename>
241 |       </field>
242 |       <field>
243 |         <name>ibge</name>
244 |         <rename>IBGE Code</rename>
245 |       </field>
246 |       <select_unspecified>N</select_unspecified>
247 |     </fields>
248 |     <attributes/>
249 |     <GUI>
250 |       <xloc>160</xloc>
251 |       <yloc>256</yloc>
252 |     </GUI>
253 |   </transform>
254 |   <transform>
255 |     <name>If Null</name>
256 |     <type>IfNull</type>
257 |     <description/>
258 |     <distribute>Y</distribute>
259 |     <custom_distribution/>
260 |     <copies>1</copies>
261 |     <partitioning>
262 |       <method>none</method>
263 |       <schema_name/>
264 |     </partitioning>
265 |     <fields>
266 |       <field>
267 |         <name>Aditional Info</name>
268 |         <set_empty_string>N</set_empty_string>
269 |         <value>No additional info</value>
270 |       </field>
271 |     </fields>
272 |     <selectFields>Y</selectFields>
273 |     <selectValuesType>N</selectValuesType>
274 |     <setEmptyStringAll>N</setEmptyStringAll>
275 |     <valuetypes>
276 | </valuetypes>
277 |     <attributes/>
278 |     <GUI>
279 |       <xloc>528</xloc>
280 |       <yloc>256</yloc>
281 |     </GUI>
282 |   </transform>
283 |   <transform>
284 |     <name>Input > Fake Data</name>
285 |     <type>ExcelInput</type>
286 |     <description/>
287 |     <distribute>N</distribute>
288 |     <custom_distribution/>
289 |     <copies>1</copies>
290 |     <partitioning>
291 |       <method>none</method>
292 |       <schema_name/>
293 |     </partitioning>
294 |     <accept_filenames>N</accept_filenames>
295 |     <add_to_result_filenames>Y</add_to_result_filenames>
296 |     <bad_line_files_extension>warning</bad_line_files_extension>
297 |     <error_ignored>N</error_ignored>
298 |     <error_line_files_extension>error</error_line_files_extension>
299 |     <error_line_skipped>N</error_line_skipped>
300 |     <fields>
301 |       <field>
302 |         <length>-1</length>
303 |         <name>Full Name</name>
304 |         <precision>-1</precision>
305 |         <repeat>N</repeat>
306 |         <trim_type>both</trim_type>
307 |         <type>String</type>
308 |       </field>
309 |       <field>
310 |         <format>yyyy-MM-dd</format>
311 |         <length>-1</length>
312 |         <name>Birthdate</name>
313 |         <precision>-1</precision>
314 |         <repeat>N</repeat>
315 |         <trim_type>both</trim_type>
316 |         <type>Date</type>
317 |       </field>
318 |       <field>
319 |         <length>-1</length>
320 |         <name>Gender</name>
321 |         <precision>-1</precision>
322 |         <repeat>N</repeat>
323 |         <trim_type>both</trim_type>
324 |         <type>String</type>
325 |       </field>
326 |       <field>
327 |         <format>#</format>
328 |         <length>-1</length>
329 |         <name>CEP</name>
330 |         <precision>-1</precision>
331 |         <repeat>N</repeat>
332 |         <trim_type>both</trim_type>
333 |         <type>String</type>
334 |       </field>
335 |     </fields>
336 |     <file>
337 |       <file_required>N</file_required>
338 |       <include_subfolders>N</include_subfolders>
339 |       <name>${PROJECT_HOME}/Git/source_files/fake_data.xlsx</name>
340 |     </file>
341 |     <header>Y</header>
342 |     <limit>0</limit>
343 |     <line_number_files_extension>line</line_number_files_extension>
344 |     <noempty>Y</noempty>
345 |     <sheets>
346 |       <sheet>
347 |         <name>Sheet1</name>
348 |         <startcol>0</startcol>
349 |         <startrow>0</startrow>
350 |       </sheet>
351 |     </sheets>
352 |     <spreadsheet_type>SAX_POI</spreadsheet_type>
353 |     <stoponempty>N</stoponempty>
354 |     <strict_types>N</strict_types>
355 |     <attributes/>
356 |     <GUI>
357 |       <xloc>160</xloc>
358 |       <yloc>112</yloc>
359 |     </GUI>
360 |   </transform>
361 |   <transform>
362 |     <name>JSON input</name>
363 |     <type>JsonInput</type>
364 |     <description/>
365 |     <distribute>Y</distribute>
366 |     <custom_distribution/>
367 |     <copies>1</copies>
368 |     <partitioning>
369 |       <method>none</method>
370 |       <schema_name/>
371 |     </partitioning>
372 |     <include>N</include>
373 |     <include_field/>
374 |     <rownum>N</rownum>
375 |     <addresultfile>N</addresultfile>
376 |     <readurl>N</readurl>
377 |     <removeSourceField>N</removeSourceField>
378 |     <IsIgnoreEmptyFile>N</IsIgnoreEmptyFile>
379 |     <doNotFailIfNoFile>Y</doNotFailIfNoFile>
380 |     <ignoreMissingPath>Y</ignoreMissingPath>
381 |     <defaultPathLeafToNull>Y</defaultPathLeafToNull>
382 |     <rownum_field/>
383 |     <file>
384 |       <name/>
385 |       <filemask/>
386 |       <exclude_filemask/>
387 |       <file_required>N</file_required>
388 |       <include_subfolders>N</include_subfolders>
389 |     </file>
390 |     <fields>
391 |       <field>
392 |         <name>cep</name>
393 |         <path>$.cep</path>
394 |         <type>String</type>
395 |         <format/>
396 |         <currency/>
397 |         <decimal/>
398 |         <group/>
399 |         <length>-1</length>
400 |         <precision>-1</precision>
401 |         <trim_type>both</trim_type>
402 |         <repeat>N</repeat>
403 |       </field>
404 |       <field>
405 |         <name>logradouro</name>
406 |         <path>$.logradouro</path>
407 |         <type>String</type>
408 |         <format/>
409 |         <currency/>
410 |         <decimal/>
411 |         <group/>
412 |         <length>-1</length>
413 |         <precision>-1</precision>
414 |         <trim_type>both</trim_type>
415 |         <repeat>N</repeat>
416 |       </field>
417 |       <field>
418 |         <name>complemento</name>
419 |         <path>$.complemento</path>
420 |         <type>String</type>
421 |         <format/>
422 |         <currency/>
423 |         <decimal/>
424 |         <group/>
425 |         <length>-1</length>
426 |         <precision>-1</precision>
427 |         <trim_type>both</trim_type>
428 |         <repeat>N</repeat>
429 |       </field>
430 |       <field>
431 |         <name>bairro</name>
432 |         <path>$.bairro</path>
433 |         <type>String</type>
434 |         <format/>
435 |         <currency/>
436 |         <decimal/>
437 |         <group/>
438 |         <length>-1</length>
439 |         <precision>-1</precision>
440 |         <trim_type>both</trim_type>
441 |         <repeat>N</repeat>
442 |       </field>
443 |       <field>
444 |         <name>localidade</name>
445 |         <path>$.localidade</path>
446 |         <type>String</type>
447 |         <format/>
448 |         <currency/>
449 |         <decimal/>
450 |         <group/>
451 |         <length>-1</length>
452 |         <precision>-1</precision>
453 |         <trim_type>both</trim_type>
454 |         <repeat>N</repeat>
455 |       </field>
456 |       <field>
457 |         <name>uf</name>
458 |         <path>$.uf</path>
459 |         <type>String</type>
460 |         <format/>
461 |         <currency/>
462 |         <decimal/>
463 |         <group/>
464 |         <length>-1</length>
465 |         <precision>-1</precision>
466 |         <trim_type>both</trim_type>
467 |         <repeat>N</repeat>
468 |       </field>
469 |       <field>
470 |         <name>ibge</name>
471 |         <path>$.ibge</path>
472 |         <type>String</type>
473 |         <format/>
474 |         <currency/>
475 |         <decimal/>
476 |         <group/>
477 |         <length>-1</length>
478 |         <precision>-1</precision>
479 |         <trim_type>both</trim_type>
480 |         <repeat>N</repeat>
481 |       </field>
482 |     </fields>
483 |     <limit>0</limit>
484 |     <IsInFields>Y</IsInFields>
485 |     <IsAFile>N</IsAFile>
486 |     <valueField>result</valueField>
487 |     <shortFileFieldName/>
488 |     <pathFieldName/>
489 |     <hiddenFieldName/>
490 |     <lastModificationTimeFieldName/>
491 |     <uriNameFieldName/>
492 |     <rootUriNameFieldName/>
493 |     <extensionFieldName/>
494 |     <sizeFieldName/>
495 |     <attributes/>
496 |     <GUI>
497 |       <xloc>1104</xloc>
498 |       <yloc>112</yloc>
499 |     </GUI>
500 |   </transform>
501 |   <transform>
502 |     <name>Output > Clients + Addresses</name>
503 |     <type>TypeExitExcelWriterTransform</type>
504 |     <description/>
505 |     <distribute>Y</distribute>
506 |     <custom_distribution/>
507 |     <copies>1</copies>
508 |     <partitioning>
509 |       <method>none</method>
510 |       <schema_name/>
511 |     </partitioning>
512 |     <add_to_result_filenames>Y</add_to_result_filenames>
513 |     <appendEmpty>0</appendEmpty>
514 |     <appendLines>N</appendLines>
515 |     <appendOffset>0</appendOffset>
516 |     <appendOmitHeader>N</appendOmitHeader>
517 |     <fields>
518 | </fields>
519 |     <file>
520 |       <SpecifyFormat>N</SpecifyFormat>
521 |       <add_date>N</add_date>
522 |       <add_time>N</add_time>
523 |       <autosizecolums>N</autosizecolums>
524 |       <createParentFolder>Y</createParentFolder>
525 |       <do_not_open_newfile_init>Y</do_not_open_newfile_init>
526 |       <extension>xls</extension>
527 |       <filename_in_field>N</filename_in_field>
528 |       <if_file_exists>new</if_file_exists>
529 |       <if_sheet_exists>new</if_sheet_exists>
530 |       <name>file</name>
531 |       <protect_sheet>N</protect_sheet>
532 |       <sheetname>Sheet1</sheetname>
533 |       <split>N</split>
534 |       <splitevery>0</splitevery>
535 |       <stream_data>N</stream_data>
536 |     </file>
537 |     <footer>N</footer>
538 |     <forceFormulaRecalculation>N</forceFormulaRecalculation>
539 |     <header>Y</header>
540 |     <leaveExistingStylesUnchanged>N</leaveExistingStylesUnchanged>
541 |     <makeSheetActive>Y</makeSheetActive>
542 |     <rowWritingMethod>overwrite</rowWritingMethod>
543 |     <startingCell>A1</startingCell>
544 |     <template>
545 |       <enabled>N</enabled>
546 |       <filename>template.xls</filename>
547 |       <hidden>N</hidden>
548 |       <sheet_enabled>N</sheet_enabled>
549 |     </template>
550 |     <attributes/>
551 |     <GUI>
552 |       <xloc>736</xloc>
553 |       <yloc>256</yloc>
554 |     </GUI>
555 |   </transform>
556 |   <transform>
557 |     <name>Output > Excel for responsible sector check</name>
558 |     <type>TypeExitExcelWriterTransform</type>
559 |     <description/>
560 |     <distribute>Y</distribute>
561 |     <custom_distribution/>
562 |     <copies>1</copies>
563 |     <partitioning>
564 |       <method>none</method>
565 |       <schema_name/>
566 |     </partitioning>
567 |     <add_to_result_filenames>Y</add_to_result_filenames>
568 |     <appendEmpty>0</appendEmpty>
569 |     <appendLines>N</appendLines>
570 |     <appendOffset>0</appendOffset>
571 |     <appendOmitHeader>N</appendOmitHeader>
572 |     <fields>
573 | </fields>
574 |     <file>
575 |       <SpecifyFormat>N</SpecifyFormat>
576 |       <add_date>N</add_date>
577 |       <add_time>N</add_time>
578 |       <autosizecolums>N</autosizecolums>
579 |       <createParentFolder>Y</createParentFolder>
580 |       <do_not_open_newfile_init>Y</do_not_open_newfile_init>
581 |       <extension>xls</extension>
582 |       <filename_in_field>N</filename_in_field>
583 |       <if_file_exists>new</if_file_exists>
584 |       <if_sheet_exists>new</if_sheet_exists>
585 |       <name>file</name>
586 |       <protect_sheet>N</protect_sheet>
587 |       <sheetname>Sheet1</sheetname>
588 |       <split>N</split>
589 |       <splitevery>0</splitevery>
590 |       <stream_data>N</stream_data>
591 |     </file>
592 |     <footer>N</footer>
593 |     <forceFormulaRecalculation>N</forceFormulaRecalculation>
594 |     <header>Y</header>
595 |     <leaveExistingStylesUnchanged>N</leaveExistingStylesUnchanged>
596 |     <makeSheetActive>Y</makeSheetActive>
597 |     <rowWritingMethod>overwrite</rowWritingMethod>
598 |     <startingCell>A1</startingCell>
599 |     <template>
600 |       <enabled>N</enabled>
601 |       <filename>template.xls</filename>
602 |       <hidden>N</hidden>
603 |       <sheet_enabled>N</sheet_enabled>
604 |     </template>
605 |     <attributes/>
606 |     <GUI>
607 |       <xloc>384</xloc>
608 |       <yloc>368</yloc>
609 |     </GUI>
610 |   </transform>
611 |   <transform>
612 |     <name>Postal codes not found!</name>
613 |     <type>Mail</type>
614 |     <description/>
615 |     <distribute>Y</distribute>
616 |     <custom_distribution/>
617 |     <copies>1</copies>
618 |     <partitioning>
619 |       <method>none</method>
620 |       <schema_name/>
621 |     </partitioning>
622 |     <include_message_in_output>N</include_message_in_output>
623 |     <message_output_field/>
624 |     <server/>
625 |     <port/>
626 |     <destination/>
627 |     <destinationCc/>
628 |     <destinationBCc/>
629 |     <replyToAddresses/>
630 |     <replyto/>
631 |     <replytoname/>
632 |     <subject/>
633 |     <include_date>N</include_date>
634 |     <include_subfolders>N</include_subfolders>
635 |     <zipFilenameDynamic>N</zipFilenameDynamic>
636 |     <isFilenameDynamic>N</isFilenameDynamic>
637 |     <attachContentFromField>N</attachContentFromField>
638 |     <attachContentField/>
639 |     <attachContentFileNameField/>
640 |     <dynamicFieldname/>
641 |     <dynamicWildcard/>
642 |     <dynamicZipFilename/>
643 |     <sourcefilefoldername/>
644 |     <sourcewildcard/>
645 |     <contact_person/>
646 |     <contact_phone/>
647 |     <comment/>
648 |     <include_files>N</include_files>
649 |     <zip_files>N</zip_files>
650 |     <zip_name/>
651 |     <zip_limit_size>0</zip_limit_size>
652 |     <use_auth>N</use_auth>
653 |     <usexoauth2>N</usexoauth2>
654 |     <use_secure_auth>N</use_secure_auth>
655 |     <auth_user/>
656 |     <auth_password>Encrypted </auth_password>
657 |     <only_comment>N</only_comment>
658 |     <use_HTML>N</use_HTML>
659 |     <use_Priority>N</use_Priority>
660 |     <encoding>UTF-8</encoding>
661 |     <priority>normal</priority>
662 |     <importance>normal</importance>
663 |     <sensitivity>normal</sensitivity>
664 |     <secureconnectiontype>SSL</secureconnectiontype>
665 |     <embeddedimages>
666 |       </embeddedimages>
667 |     <attributes/>
668 |     <GUI>
669 |       <xloc>528</xloc>
670 |       <yloc>448</yloc>
671 |     </GUI>
672 |   </transform>
673 |   <transform>
674 |     <name>REST client</name>
675 |     <type>Rest</type>
676 |     <description/>
677 |     <distribute>Y</distribute>
678 |     <custom_distribution/>
679 |     <copies>1</copies>
680 |     <partitioning>
681 |       <method>none</method>
682 |       <schema_name/>
683 |     </partitioning>
684 |     <applicationType>TEXT PLAIN</applicationType>
685 |     <method>GET</method>
686 |     <url/>
687 |     <urlInField>Y</urlInField>
688 |     <dynamicMethod>N</dynamicMethod>
689 |     <methodFieldName/>
690 |     <urlField>api_final</urlField>
691 |     <bodyField/>
692 |     <httpLogin/>
693 |     <httpPassword>Encrypted </httpPassword>
694 |     <proxyHost/>
695 |     <proxyPort/>
696 |     <preemptive>N</preemptive>
697 |     <trustStoreFile/>
698 |     <trustStorePassword>Encrypted </trustStorePassword>
699 |     <ignoreSsl>N</ignoreSsl>
700 |     <headers>
701 |       </headers>
702 |     <parameters>
703 |       </parameters>
704 |     <matrixParameters>
705 |       </matrixParameters>
706 |     <result>
707 |       <name>result</name>
708 |       <code/>
709 |       <response_time/>
710 |       <response_header/>
711 |     </result>
712 |     <attributes/>
713 |     <GUI>
714 |       <xloc>976</xloc>
715 |       <yloc>112</yloc>
716 |     </GUI>
717 |   </transform>
718 |   <transform_error_handling>
719 |   </transform_error_handling>
720 |   <attributes/>
721 | </pipeline>
722 | 


--------------------------------------------------------------------------------
/ApacheHopProject/scripts_py/fake_py.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from faker import Faker
 3 | import random
 4 | 
 5 | # Create a Faker instance
 6 | fake = Faker('pt_BR')
 7 | 
 8 | # Generate fake data
 9 | def generate_fake_data(num_records):
10 |     data = []
11 |     for _ in range(num_records):
12 |         full_name = fake.name()
13 |         birthdate = fake.date_of_birth(minimum_age=18, maximum_age=90)
14 |         gender = random.choice(['M', 'F'])
15 |         cep = fake.postcode()
16 |         data.append([full_name, birthdate, gender, cep])
17 |     return data
18 | 
19 | # Generate 20 fake records
20 | num_records = 20
21 | fake_data = generate_fake_data(num_records)
22 | 
23 | # Create a DataFrame from the fake data
24 | df = pd.DataFrame(fake_data, columns=['Full Name', 'Birthdate', 'Gender', 'CEP'])
25 | 
26 | # Save the DataFrame as an Excel file in the specified path
27 | save_path = r"C:\Users\jpmul\Downloads\fake_data.xlsx"
28 | df.to_excel(save_path, index=False)
29 | print(f"Fake data saved as '{save_path}'.")
30 | 


--------------------------------------------------------------------------------
/ApacheHopProject/scripts_sql/skeam_stage_parameters_dates_auxiliary.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE SKEAM_SHOP.PROJECT_PARAMETERS (
 2 | 	PK_PARAMETERS		INT 			PRIMARY KEY
 3 | 	, NM_PROJECT		VARCHAR(255)		NOT NULL
 4 | 	, NM_SUBPROJECT		VARCHAR(255) 		NOT NULL
 5 | 	, NM_REFERENCE		VARCHAR(255)		NOT NULL
 6 | 	, TXT_NOTE		VARCHAR(255)
 7 | 	, NM_VALUE		VARCHAR(255) 	
 8 | );
 9 | 
10 | 
11 | 
12 | 
13 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (1, 'Pipe', 'Skeam', 'DIM_CLIENTS - Number of days for ETL load', NULL, '90')
14 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (2, 'Pipe', 'Skeam', 'DIM_CLIENTS - Number of days for ETL load (business hours)', NULL, '7')
15 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (3, 'Pipe', 'Skeam', 'DIM_CLIENTS - Flag of full load', NULL, 'False')
16 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (4, 'Pipe', 'Skeam', 'DIM_CLIENTS - Flag loading taking place during Sunday', NULL, 'False')
17 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (5, 'Pipe', 'Skeam', 'DIM_CLIENTS - Start date for loading on Sundays', NULL, NULL)
18 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (6, 'Pipe', 'Skeam', 'DIM_CLIENTS - Full ELT load start date', NULL, '2013-01-01')
19 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (7, 'Pipe', 'Skeam', 'DIM_CLIENTS - Start date', NULL, '2022-01-01')
20 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (8, 'Pipe', 'Skeam', 'DIM_CLIENTS - Final date (if null, current date)', NULL, NULL)
21 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (9, 'Pipe', 'Skeam', 'DIM_CLIENTS - Email suject', NULL, 'Project Pipe - Skeam - ETL for dimension clients')
22 | INSERT INTO SKEAM_SHOP.PROJECT_PARAMETERS VALUES (10, 'Pipe', 'Skeam', 'DIM_CLIENTS - Email suject', NULL, 'bi.team@emailadress.com')
23 | 
24 | 
25 | SELECT	PK_PARAMETERS
26 | 	, NM_PROJECT
27 | 	, NM_SUBPROJECT
28 | 	, NM_REFERENCE
29 | 	, TXT_NOTE
30 | 	, NM_VALUE
31 | FROM	SKEAM_STG.SKEAM_SHOP.PROJECT_PARAMETERS
32 | WHERE	NM_PROJECT = 'Pipe'
33 | AND	NM_SUBPROJECT = 'Skeam'
34 | AND 	NM_REFERENCE LIKE '%DIM_CLIENTS%'
35 | 


--------------------------------------------------------------------------------
/ApacheHopProject/source_files/fake_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/ApacheHopProject/source_files/fake_data.xlsx


--------------------------------------------------------------------------------
/ApacheHopProject/source_files/source_data_pipeline_2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/ApacheHopProject/source_files/source_data_pipeline_2.xlsx


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/consumer.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG, Dataset
 2 | from airflow.decorators import task
 3 | 
 4 | from datetime import datetime
 5 | 
 6 | my_file = Dataset("/tmp/my_file.txt")
 7 | my_file_2 = Dataset("/tmp/my_file_2.txt")
 8 | 
 9 | with DAG(
10 |     dag_id="consumer",
11 |     schedule=[my_file, my_file_2],
12 |     start_date=datetime(2022, 1, 1),
13 |     catchup=False
14 | ):
15 |     @task
16 |     def read_dataset():
17 |         with open(my_file.uri, "r") as f:
18 |             print(f.read())
19 |     
20 |     read_dataset()


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/group_dag.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.operators.bash import BashOperator
 3 | from groups.group_downloads import download_tasks
 4 | from groups.group_transforms import transform_tasks
 5 | 
 6 |  
 7 | from datetime import datetime
 8 |  
 9 | with DAG('group_dag', start_date=datetime(2022, 1, 1), 
10 |     schedule_interval='@daily', catchup=False) as dag:
11 | 
12 |     args = {'start_date': dag.start_date, 'schedule_interval': dag.schedule_interval, 'catchup': dag.catchup}
13 | 
14 |     downloads = download_tasks()
15 | 
16 |     check_files = BashOperator(
17 |         task_id='check_files',
18 |         bash_command='sleep 10'
19 |     )
20 |  
21 |     transforms = transform_tasks()
22 |  
23 |     downloads >> check_files >> transforms


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/group_dag_subdags.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.operators.bash import BashOperator
 3 | from airflow.operators.subdag import SubDagOperator
 4 | from subdags.subdag_downloads import subdag_downloads
 5 |  
 6 | from datetime import datetime
 7 |  
 8 | with DAG('group_dag', start_date=datetime(2022, 1, 1), 
 9 |     schedule_interval='@daily', catchup=False) as dag:
10 | 
11 |     args = {'start_date': dag.start_date, 'schedule_interval': dag.schedule_interval, 'catchup': dag.catchup}
12 | 
13 |     downloads = SubDagOperator(
14 |         task_id='downloads',
15 |         subdag=subdag_downloads(dag.dag_id, 'downloads', args)
16 |     )
17 | 
18 |     check_files = BashOperator(
19 |         task_id='check_files',
20 |         bash_command='sleep 10'
21 |     )
22 |  
23 |     transforms = SubDagOperator(
24 |         task_id='transforms',
25 |         subdag=subdag_downloads(dag.dag_id, 'transforms', args)
26 |     )
27 |  
28 |     downloads >> check_files >> transforms


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/groups/group_downloads.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.operators.bash import BashOperator
 3 | from airflow.utils.task_group import TaskGroup
 4 | 
 5 | def download_tasks():
 6 | 
 7 |     with TaskGroup("downloads", tooltip="Download tasks") as group:
 8 |         
 9 |         download_a = BashOperator(
10 |             task_id='download_a',
11 |             bash_command='sleep 10'
12 |         )
13 |  
14 |         download_b = BashOperator(
15 |             task_id='download_b',
16 |             bash_command='sleep 10'
17 |         )
18 |  
19 |         download_c = BashOperator(
20 |             task_id='download_c',
21 |             bash_command='sleep 10'
22 |         )
23 | 
24 |         return group


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/groups/group_transforms.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.operators.bash import BashOperator
 3 | from airflow.utils.task_group import TaskGroup
 4 | 
 5 | def transform_tasks():
 6 | 
 7 |     with TaskGroup("transforms", tooltip="Transforms tasks") as group:
 8 |         
 9 |         transform_a = BashOperator(
10 |             task_id='transform_a',
11 |             bash_command='sleep 10'
12 |     )
13 |  
14 |         transform_b = BashOperator(
15 |             task_id='transform_b',
16 |             bash_command='sleep 10'
17 |     )
18 |  
19 |         transform_c = BashOperator(
20 |             task_id='transform_c',
21 |             bash_command='sleep 10'
22 |     )
23 |         
24 |     return group


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/producer.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG, Dataset
 2 | from airflow.decorators import task
 3 | 
 4 | from datetime import datetime
 5 | 
 6 | my_file = Dataset("/tmp/my_file.txt")
 7 | my_file_2 = Dataset("/tmp/my_file_2.txt")
 8 | 
 9 | with DAG(
10 |     dag_id="producer",
11 |     schedule="@daily",
12 |     start_date=datetime(2022, 1, 1),
13 |     catchup=False
14 | ):
15 |     @task(outlets=[my_file])
16 |     def update_dataset():
17 |         with open(my_file.uri, "a+") as f:
18 |             f.write("producer update")
19 | 
20 |     @task(outlets=[my_file_2])
21 |     def update_dataset_2():
22 |         with open(my_file_2.uri, "a+") as f:
23 |             f.write("producer update")
24 |     
25 |     update_dataset() >> update_dataset_2()
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/subdags/subdag_downloads.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.operators.bash import BashOperator
 3 | 
 4 | def subdag_downloads(parent_dag_id, child_dag_id, args):
 5 | 
 6 |     with DAG(f"{parent_dag_id}.{child_dag_id}",
 7 |              start_date=args['start_date'],
 8 |              schedule_interval=args['schedule_interval'],
 9 |              catchup=args['catchup']) as dag:
10 |         
11 |         download_a = BashOperator(
12 |             task_id='download_a',
13 |             bash_command='sleep 10'
14 |         )
15 |  
16 |         download_b = BashOperator(
17 |             task_id='download_b',
18 |             bash_command='sleep 10'
19 |         )
20 |  
21 |         download_c = BashOperator(
22 |             task_id='download_c',
23 |             bash_command='sleep 10'
24 |         )
25 | 
26 |         return dag


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/subdags/subdag_transforms.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.operators.bash import BashOperator
 3 | 
 4 | def subdag_transforms(parent_dag_id, child_dag_id, args):
 5 | 
 6 |     with DAG(f"{parent_dag_id}.{child_dag_id}",
 7 |              start_date=args['start_date'],
 8 |              schedule_interval=args['schedule_interval'],
 9 |              catchup=args['catchup']) as dag:
10 |         
11 |         transform_a = BashOperator(
12 |         task_id='transform_a',
13 |         bash_command='sleep 10'
14 |     )
15 |  
16 |         transform_b = BashOperator(
17 |         task_id='transform_b',
18 |         bash_command='sleep 10'
19 |     )
20 |  
21 |         transform_c = BashOperator(
22 |         task_id='transform_c',
23 |         bash_command='sleep 10'
24 |     )
25 |         
26 |     return dag


--------------------------------------------------------------------------------
/Apache_Airflow_Marc_Lamberti/dags/user_processing.py:
--------------------------------------------------------------------------------
 1 | from airflow import DAG
 2 | from airflow.providers.postgres.operators.postgres import PostgresOperator
 3 | from airflow.providers.http.sensors.http import HttpSensor
 4 | from airflow.providers.http.operators.http import SimpleHttpOperator
 5 | from airflow.operators.python_operator import PythonOperator
 6 | from airflow.providers.postgres.hooks.postgres import PostgresHook
 7 | 
 8 | import json
 9 | import pandas as pd
10 | from datetime import datetime
11 | 
12 | def _process_user(ti):
13 |     user = ti.xcom_pull(task_ids="extract_user")
14 |     user = user['results'][0]
15 |     processed_user = pd.json_normalize({
16 |         'firstname': user['name']['first'],
17 |         'lastname': user['name']['last'],
18 |         'country': user['location']['country'],
19 |         'username': user['login']['username'],
20 |         'password': user['login']['password'],
21 |         'email': user['email']
22 |     })
23 |     processed_user.to_csv('/tmp/processed_user.csv', index=None, header=False)
24 | 
25 | def _store_user():
26 |     hook = PostgresHook(postgres_conn_id='postgres')
27 |     hook.copy_expert(
28 |         sql="COPY users FROM stdin WITH DELIMITER as ','",
29 |         filename='/tmp/processed_user.csv'
30 |     )
31 | 
32 | with DAG('user_processing', start_date=datetime(2022, 1, 1),
33 |          schedule_interval='@daily', catchup=False) as dag:
34 |     
35 |     create_table = PostgresOperator(
36 |         task_id='create_table',
37 |         postgres_conn_id='postgres',
38 |         sql='''
39 |               CREATE TABLE IF NOT EXISTS users(
40 |                     firstname TEXT NOT NULL,
41 |                     lastname  TEXT NOT NULL,
42 |                     country   TEXT NOT NULL,
43 |                     username  TEXT NOT NULL,
44 |                     password  TEXT NOT NULL,
45 |                     email     TEXT NOT NULL
46 |               );
47 |           '''
48 |     )
49 | 
50 |     is_api_available = HttpSensor(
51 |         task_id='is_api_available',
52 |         queue='high_cpu',
53 |         http_conn_id='user_api',
54 |         endpoint='api/'
55 |     )
56 | 
57 |     extract_user = SimpleHttpOperator(
58 |         task_id='extract_user',
59 |         http_conn_id='user_api',
60 |         endpoint='api/',
61 |         method='GET',
62 |         response_filter=lambda response: json.loads(response.text),
63 |         log_response=True
64 |     )
65 | 
66 |     process_user = PythonOperator(
67 |         task_id='process_user',
68 |         python_callable=_process_user,
69 |     )
70 | 
71 |     store_user = PythonOperator(
72 |         task_id='store_user',
73 |         python_callable=_store_user
74 |     )
75 | 
76 | create_table >> is_api_available >> extract_user >> process_user >> store_user


--------------------------------------------------------------------------------
/Data_Engineering_Course_XPE/python/numpy_testing_bootcamp_jpmuller.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": []
  7 |     },
  8 |     "kernelspec": {
  9 |       "name": "python3",
 10 |       "display_name": "Python 3"
 11 |     },
 12 |     "language_info": {
 13 |       "name": "python"
 14 |     }
 15 |   },
 16 |   "cells": [
 17 |     {
 18 |       "cell_type": "markdown",
 19 |       "source": [
 20 |         "# Numpy\n"
 21 |       ],
 22 |       "metadata": {
 23 |         "id": "VF96Hd_a7Koc"
 24 |       }
 25 |     },
 26 |     {
 27 |       "cell_type": "code",
 28 |       "source": [
 29 |         "# Importing module\n",
 30 |         "import numpy as np"
 31 |       ],
 32 |       "metadata": {
 33 |         "id": "c7ZoHYa37QPS"
 34 |       },
 35 |       "execution_count": 1,
 36 |       "outputs": []
 37 |     },
 38 |     {
 39 |       "cell_type": "markdown",
 40 |       "source": [
 41 |         "# Creating Arrays"
 42 |       ],
 43 |       "metadata": {
 44 |         "id": "jxonTElq7XSO"
 45 |       }
 46 |     },
 47 |     {
 48 |       "cell_type": "code",
 49 |       "source": [
 50 |         "help(np.array)"
 51 |       ],
 52 |       "metadata": {
 53 |         "id": "LMjzttOQ7a4b"
 54 |       },
 55 |       "execution_count": null,
 56 |       "outputs": []
 57 |     },
 58 |     {
 59 |       "cell_type": "code",
 60 |       "source": [
 61 |         "# Creating a 1D array: [1, 2, 3]\n",
 62 |         "l = [1, 2, 3]\n",
 63 |         "x = np.array(l)\n",
 64 |         "print(\"x:\", x)\n",
 65 |         "print(\"shape:\", x.shape)"
 66 |       ],
 67 |       "metadata": {
 68 |         "colab": {
 69 |           "base_uri": "https://localhost:8080/"
 70 |         },
 71 |         "id": "X14OOnko9_qF",
 72 |         "outputId": "c25c0405-7fc3-4c59-81de-56ef97e7ed89"
 73 |       },
 74 |       "execution_count": 5,
 75 |       "outputs": [
 76 |         {
 77 |           "output_type": "stream",
 78 |           "name": "stdout",
 79 |           "text": [
 80 |             "x: [1 2 3]\n",
 81 |             "shape: (3,)\n"
 82 |           ]
 83 |         }
 84 |       ]
 85 |     },
 86 |     {
 87 |       "cell_type": "code",
 88 |       "source": [
 89 |         "type(x)"
 90 |       ],
 91 |       "metadata": {
 92 |         "colab": {
 93 |           "base_uri": "https://localhost:8080/"
 94 |         },
 95 |         "id": "FdR62JLF-Fyv",
 96 |         "outputId": "f3b9bd27-e0e7-4880-c7de-83852baf01ab"
 97 |       },
 98 |       "execution_count": 6,
 99 |       "outputs": [
100 |         {
101 |           "output_type": "execute_result",
102 |           "data": {
103 |             "text/plain": [
104 |               "numpy.ndarray"
105 |             ]
106 |           },
107 |           "metadata": {},
108 |           "execution_count": 6
109 |         }
110 |       ]
111 |     },
112 |     {
113 |       "cell_type": "code",
114 |       "source": [
115 |         "# Creating a 2D array: nested lists\n",
116 |         "l = [[1, 2], [3, 4]]\n",
117 |         "x = np.array(l)\n",
118 |         "print(\"x:\\n\", x)\n",
119 |         "print(\"shape:\", x.shape)"
120 |       ],
121 |       "metadata": {
122 |         "colab": {
123 |           "base_uri": "https://localhost:8080/"
124 |         },
125 |         "id": "nu9U3AgKAB4f",
126 |         "outputId": "d77989c6-b731-422a-881c-ac01779e7afe"
127 |       },
128 |       "execution_count": 7,
129 |       "outputs": [
130 |         {
131 |           "output_type": "stream",
132 |           "name": "stdout",
133 |           "text": [
134 |             "x:\n",
135 |             " [[1 2]\n",
136 |             " [3 4]]\n",
137 |             "shape: (2, 2)\n"
138 |           ]
139 |         }
140 |       ]
141 |     },
142 |     {
143 |       "cell_type": "code",
144 |       "source": [
145 |         "# Array with 0's\n",
146 |         "dim = (2,2) # (lines, columns)\n",
147 |         "x = np.zeros(dim)\n",
148 |         "print(\"x:\\n\", x)\n",
149 |         "print(\"shape:\", x.shape)"
150 |       ],
151 |       "metadata": {
152 |         "colab": {
153 |           "base_uri": "https://localhost:8080/"
154 |         },
155 |         "id": "RMzqtAToIkoY",
156 |         "outputId": "e0cff73b-627f-4cac-e200-47016c2e479e"
157 |       },
158 |       "execution_count": 8,
159 |       "outputs": [
160 |         {
161 |           "output_type": "stream",
162 |           "name": "stdout",
163 |           "text": [
164 |             "x:\n",
165 |             " [[0. 0.]\n",
166 |             " [0. 0.]]\n",
167 |             "shape: (2, 2)\n"
168 |           ]
169 |         }
170 |       ]
171 |     },
172 |     {
173 |       "cell_type": "code",
174 |       "source": [
175 |         "# Array with 1's\n",
176 |         "dim = (2,2) # (lines, columns)\n",
177 |         "x = np.ones(dim)\n",
178 |         "print(\"x:\\n\", x)\n",
179 |         "print(\"shape:\", x.shape)"
180 |       ],
181 |       "metadata": {
182 |         "colab": {
183 |           "base_uri": "https://localhost:8080/"
184 |         },
185 |         "id": "CkveLqujJhGd",
186 |         "outputId": "c70b1abd-3826-487a-d5f9-ee31949c92ed"
187 |       },
188 |       "execution_count": 9,
189 |       "outputs": [
190 |         {
191 |           "output_type": "stream",
192 |           "name": "stdout",
193 |           "text": [
194 |             "x:\n",
195 |             " [[1. 1.]\n",
196 |             " [1. 1.]]\n",
197 |             "shape: (2, 2)\n"
198 |           ]
199 |         }
200 |       ]
201 |     },
202 |     {
203 |       "cell_type": "code",
204 |       "source": [
205 |         "# creating values inside an interval\n",
206 |         "# uniform values between 5 and 15\n",
207 |         "\n",
208 |         "x_min, x_max = 5, 15\n",
209 |         "x = np.linspace(start=x_min, stop=x_max, num=6)\n",
210 |         "print(\"x:\", x)\n",
211 |         "print(\"shape:\", x.shape)"
212 |       ],
213 |       "metadata": {
214 |         "colab": {
215 |           "base_uri": "https://localhost:8080/"
216 |         },
217 |         "id": "aXVvPlsJKPtR",
218 |         "outputId": "7546ad98-6824-41fb-92a0-d86bdbbee7e0"
219 |       },
220 |       "execution_count": 11,
221 |       "outputs": [
222 |         {
223 |           "output_type": "stream",
224 |           "name": "stdout",
225 |           "text": [
226 |             "x: [ 5.  7.  9. 11. 13. 15.]\n",
227 |             "shape: (6,)\n"
228 |           ]
229 |         }
230 |       ]
231 |     },
232 |     {
233 |       "cell_type": "code",
234 |       "source": [
235 |         "# creating identity matrix\n",
236 |         "n = 4\n",
237 |         "x = np.eye(n)\n",
238 |         "print(\"x:\\n\", x)\n",
239 |         "print(\"shape:\", x.shape)"
240 |       ],
241 |       "metadata": {
242 |         "colab": {
243 |           "base_uri": "https://localhost:8080/"
244 |         },
245 |         "id": "Wm3q-QJMKv_t",
246 |         "outputId": "26067d78-9b46-4d03-a47c-cdbc552d4a87"
247 |       },
248 |       "execution_count": 12,
249 |       "outputs": [
250 |         {
251 |           "output_type": "stream",
252 |           "name": "stdout",
253 |           "text": [
254 |             "x:\n",
255 |             " [[1. 0. 0. 0.]\n",
256 |             " [0. 1. 0. 0.]\n",
257 |             " [0. 0. 1. 0.]\n",
258 |             " [0. 0. 0. 1.]]\n",
259 |             "shape: (4, 4)\n"
260 |           ]
261 |         }
262 |       ]
263 |     },
264 |     {
265 |       "cell_type": "code",
266 |       "source": [
267 |         "# random values\n",
268 |         "# np.random.seed(10)\n",
269 |         "x = np.random.random(size=(2, 3))\n",
270 |         "print(\"x:\\n\", x)\n",
271 |         "print(\"shape:\", x.shape)"
272 |       ],
273 |       "metadata": {
274 |         "colab": {
275 |           "base_uri": "https://localhost:8080/"
276 |         },
277 |         "id": "qVm5ElkbK_d8",
278 |         "outputId": "b761b7f7-181c-4589-d553-8c5bbc2306ab"
279 |       },
280 |       "execution_count": 13,
281 |       "outputs": [
282 |         {
283 |           "output_type": "stream",
284 |           "name": "stdout",
285 |           "text": [
286 |             "x:\n",
287 |             " [[0.91364277 0.58447993 0.35325854]\n",
288 |             " [0.89223046 0.02100358 0.52700308]]\n",
289 |             "shape: (2, 3)\n"
290 |           ]
291 |         }
292 |       ]
293 |     }
294 |   ]
295 | }


--------------------------------------------------------------------------------
/Hive_Impala/Scripts/hive_commands.sh:
--------------------------------------------------------------------------------
  1 | # Create the HDFS directory to receive the files for the car rental company ("locacao" is the Portuguese word for car rental).
  2 | hdfs dfs -mkdir /user/cloudera/locacao
  3 | 
  4 | # The directory is changed to the Cloudera Downloads folder, where the files downloaded for the project are located. 
  5 | #Files are copied to the directory, namely all files of the CSV type.
  6 | cd /home/cloudera/Downloads
  7 | 
  8 | hdfs dfs -put *.csv /user/cloudera/locacao 
  9 | 
 10 | # Use Beeline which is a client to access Hive.
 11 | beeline
 12 | 
 13 | # Connect to Hive within the Beeline client.
 14 | !connect jdbc:hive2://
 15 | 
 16 | # Creating, showing and dropping a test database.
 17 | create database test;
 18 | 
 19 | show database test;
 20 | 
 21 | drop database test cascade;
 22 | 
 23 | # Creating the locacao (rental) database and using it.
 24 | create database locacao;
 25 | 
 26 | use locacao;
 27 | 
 28 | # Creating the first table in locacao database.
 29 | 
 30 | CREATE EXTERNAL TABLE CLIENTES (
 31 | 	idcliente 		    int
 32 | 	, cnh			    string
 33 | 	, cpf			    string
 34 | 	, validadecnh	    date
 35 | 	, nome			    string
 36 | 	, datacadastro	    date
 37 | 	, datanascimento    date
 38 | 	, telefone		    string
 39 | 	, status		    string)
 40 | 
 41 | row format delimited fields terminated by ',' STORED AS TEXTFILE;
 42 | 
 43 | # "Inserting data" into the table CLIENTES.
 44 | 
 45 | LOAD DATA INPATH '/user/cloudera/locacao/clientes.csv' INTO TABLE CLIENTES;
 46 | 
 47 | # Querying against the table CLIENTES.
 48 | 
 49 | SELECT * FROM CLIENTES;
 50 | 
 51 | # Creating the cars table;
 52 | 
 53 | CREATE EXTERNAL TABLE VEICULOS (
 54 | 	idveiculo           int
 55 |     , dataaquisicao     date
 56 |     , ano               int
 57 |     , modelo            string
 58 |     , placa             string
 59 |     , status            string
 60 |     , diaria            double)
 61 | 
 62 | row format delimited fields terminated by ',' STORED AS TEXTFILE;
 63 | 
 64 | # "Inserting data" into the table VEICULOS.
 65 | 
 66 | LOAD DATA INPATH '/user/cloudera/locacao/veiculos.csv' INTO TABLE VEICULOS;
 67 | 
 68 | # Querying against the table VEICULOS.
 69 | 
 70 | SELECT * FROM VEICULOS;
 71 | 
 72 | # Creating the car rental agents (dispatchers) table;
 73 | 
 74 | CREATE EXTERNAL TABLE DESPACHANTES (
 75 | 	iddespachante		int
 76 | 	, nome 				string
 77 | 	, status 			string
 78 | 	, filial			string)
 79 | 
 80 | row format delimited fields terminated by ',' STORED AS TEXTFILE;
 81 | 
 82 | # "Inserting data" into the table DESPACHANTES.
 83 | 
 84 | LOAD DATA INPATH '/user/cloudera/locacao/despachantes.csv' INTO TABLE DESPACHANTES;
 85 | 
 86 | # Querying against the table DESPACHANTES.
 87 | 
 88 | SELECT * FROM DESPACHANTES;
 89 | 
 90 | # Creating the rental table;
 91 | 
 92 | CREATE EXTERNAL TABLE LOCACAO (
 93 | 	idlocacao			int
 94 | 	, idcliente			int
 95 | 	, iddespachante		int
 96 | 	, idveiculo			int
 97 | 	, idveiculo			int
 98 | 	, datalocacao		date
 99 | 	, dataentrega		date
100 | 	, total				double)
101 | 
102 | row format delimited fields terminated by ',' STORED AS TEXTFILE;
103 | 
104 | # "Inserting data" into the table LOCACAO.
105 | 
106 | LOAD DATA INPATH '/user/cloudera/locacao/locacao.csv' INTO TABLE LOCACAO;
107 | 
108 | # Querying against the table LOCACAO.
109 | 
110 | SELECT * FROM LOCACAO;
111 | 
112 | # Metadata Commands
113 | 
114 | # Exhibiting the database tables 
115 | 
116 | show tables;
117 | 
118 | # Describing the table structure
119 | 
120 | # describe + [table name]
121 | describe clientes;
122 | 
123 | # Describing the formatted table
124 | 
125 | # describe formatted + [table name]
126 | 
127 | describe formatted locacao;
128 | 
129 | # Describing databases
130 | 
131 | #describe database + [database name]
132 | 
133 | describe database locacao;
134 | 
135 | # Accessing the Hive catalog 
136 | 
137 | # On terminal (outside beeline), first type
138 | 
139 | mysql -u root -pcloudera
140 | 
141 | # After entering MySQL, we can enter metastore
142 | show databases;
143 | use metastore;
144 | 
145 | # Now we can exhibit the tables
146 | 
147 | show tables;
148 | 
149 | # Cheking databases on MySQL
150 | 
151 | select * from DBS;
152 | 
153 | # Checking tables inside a MySQL database by its ID
154 | 
155 | select * from TBLS where DB_ID = 3;
156 | 
157 | # Querying to check columns of tables in the previous database query
158 | 
159 | select * from COLUMNS_V2 where CD_ID = 1;
160 | 
161 | # Creating a new table from another table in HiveQL
162 | 
163 | create table locacao2 as select * from locacao where iddespachante = 2;
164 | 
165 | # Ingesting data from one database into another
166 | 
167 | create database teste;
168 | 
169 | create table teste.locacao2 as select * from locacao where iddespachante = 2;
170 | 
171 | select * from teste.locacao2;
172 | #############################################################################
173 | ############################# SQOOP MINI PROJECT ############################
174 | #############################################################################
175 | 
176 | # Firstly, loging into MySQL
177 | 
178 | mysql -u root -pcloudera
179 | 
180 | # Connecting to the sample database that will be used
181 | # for this mini project
182 | 
183 | use retail_db;
184 | 
185 | # Couting the Orders' table (as an example)
186 | 
187 | select count(*) from order_items;
188 | 
189 | # Accessing MySQL with SQOOP and listing the databases
190 | 
191 | sqoop list-databases --connect jdbc:mysql://localhost/ --username root --password cloudera
192 | 
193 | # Now showing the existing tables in retail_db
194 | 
195 | sqoop list-tables --connect jdbc:mysql://localhost/retail_db --username root --password cloudera
196 | 
197 | # Creating the retail_db database in Hive
198 | 
199 | create database retail_db;
200 | 
201 | # Using now SQOOP to import all tables from retail_db (MySQL)
202 | # to retail_db (Hive)
203 | 
204 | sqoop import-all-tables --connect jdbc:mysql://localhost/retail_db --username root --password cloudera --hive-import --hive-overwrite --hive-database retail_db --create-hive-table --m 1
205 | 
206 | # Comparing the count from order_items in Hive with the one in MySQL
207 | 
208 | select count(*) from retail_db.order_items;
209 | 
210 | # Inserting a new data into categories table
211 | 
212 | insert into categories values (59, 8, "Test");
213 | 
214 | # Using the SQOOP command to incremental load
215 | 
216 | sqoop import --connect jdbc:mysql://localhost/retail_db --username root --password cloudera --hive-import --hive-database retail_db --check-column category_id --incremental append --last-value 58 --table categories 
217 | 
218 | # Checking if new line appeared in Hive
219 | 
220 | select * from retail_db.categories;
221 | 
222 | # Saving using HDFS
223 | 
224 | insert overwrite directory '/user/cloudera/locacao2' select * from locacao.locacao;
225 | 
226 | # Checking the created file in HDFS
227 | 
228 | hdfs dfs -ls /user/cloudera/locacao2
229 | 
230 | # Saving as CSV
231 | 
232 | insert overwrite directory '/user/cloudera/locacao2' 
233 | row format delimited fields terminated by ','
234 | select * from locacao.locacao;
235 | 
236 | # Saving as Parquet
237 | 
238 | insert overwrite directory '/user/cloudera/locacao2' 
239 | row format delimited fields terminated by ','
240 | stored as parquet
241 | select * from teste.locacao3;
242 | 
243 | # Changing the default to work with partitioning and bucketing
244 | 
245 | set hive.exec.dynamic.partition.mode;
246 | set hive.exec.dynamic.partition.mode=nonstrict;
247 | 
248 | # Creating table for partitioning
249 | 
250 | create table locacao.locacaoanalitico (
251 | 	cliente string, 
252 | 	despachante string, 
253 | 	datalocacao date,
254 | 	total double
255 | 	) 
256 | partitioned by (veiculo string);
257 | 
258 | # Inserting data into the new column with partitioning
259 | 
260 | insert overwrite table locacao.locacaoanalitico partition (veiculo)
261 | select cli.nome
262 | 	   , des.nome
263 | 	   , loc.datalocacao
264 | 	   , loc.total
265 | 	   , veic.modelo
266 | from locacao loc
267 | join despachantes des 
268 | on (loc.iddespachante = des.iddespachante)
269 | join clientes cli 
270 | on (loc.idcliente = cli.idcliente)
271 | join veiculos veic
272 | on (loc.idveiculo = veic.idveiculo);
273 | 
274 | # Checking files partitionated using HDFS
275 | 
276 | hdfs dfs -ls /user/hive/warehouse/locacao.db/locacaoanalitico
277 | 
278 | 
279 | # Creating a table for bucketing
280 | 
281 | create table locacaoanalitico2 (
282 | 	cliente string,
283 | 	despachante string,
284 | 	datalocacao date,
285 | 	total double,
286 | 	veiculo string
287 | )
288 | clustered by (veiculo) into 4 buckets;
289 | 
290 | # Inserting into locacaoanalitico2 (bucketing)
291 | 
292 | insert overwrite table locacao.locacaoanalitico2
293 | select cli.nome
294 | 	   , des.nome
295 | 	   , loc.datalocacao
296 | 	   , loc.total
297 | 	   , veic.modelo
298 | from locacao loc
299 | join despachantes des 
300 | on (loc.iddespachante = des.iddespachante)
301 | join clientes cli 
302 | on (loc.idcliente = cli.idcliente)
303 | join veiculos veic
304 | on (loc.idveiculo = veic.idveiculo);
305 | 
306 | # Creating a temporary table
307 | 
308 | create temporary table temp_des as select * from despachantes;
309 | 
310 | # Checking data from the temp table
311 | 
312 | select * from temp_des;
313 | 
314 | # Disconnecting from hive
315 | 
316 | !q
317 | 
318 | # Re-entering into Hive through Beeline
319 | 
320 | beeline
321 | 
322 | !connect jdbc:hive2://
323 | 
324 | # Trying to check again the temp table
325 | 
326 | use locacao;
327 | 
328 | select * from temp_des;
329 | 
330 | # Creating a view
331 | 
332 | create view if not exists locacaoview as
333 | select cli.nome as cliente
334 | 	   , des.nome as despachante
335 | 	   , loc.datalocacao as data
336 | 	   , loc.total as total
337 | 	   , veic.modelo as veiculo
338 | from locacao loc
339 | join despachantes des 
340 | on (loc.iddespachante = des.iddespachante)
341 | join clientes cli 
342 | on (loc.idcliente = cli.idcliente)
343 | join veiculos veic
344 | on (loc.idveiculo = veic.idveiculo);
345 | 
346 | # Querying against the view created
347 | 
348 | select * from locacaoview;
349 | 
350 | # Creating table configured for ORC format file
351 | 
352 | create external table clientes_orc (
353 | 	idcliente int,
354 | 	cnh string,
355 | 	cpf string,
356 | 	validadecnh date,
357 | 	nome string,
358 | 	datacadastro date,
359 | 	datanascimento date,
360 | 	telefone string,
361 | 	status string
362 | )
363 | stored as orc;
364 | 
365 | # Ingesting data into clientes_orc
366 | 
367 | insert overwrite table clientes_orc select * from clientes;
368 | 
369 | # Consulting the new ORC table
370 | 
371 | select * from clientes_orc;
372 | 
373 | # Checking how is Hive support for transactions
374 | 
375 | set hive.support.concurrency;
376 | 
377 | # Editing Hive settings for give support to transactions
378 | 
379 | sudo gedit /etc/hive/conf.dist/hive-site.xml
380 | 
381 | # Inserting this below information in the tag <configuration>
382 | 
383 | <property>
384 | 		<name>hive.support.concurrency</name>
385 | 		<value>true</value></property>
386 | <property>
387 | 		<name>hive.txn.manager</name>
388 | 		<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
389 | </property><property>
390 | 		<name>hive.compactor.initiator.on</name>
391 | 		<value>true</value>
392 | </property>
393 | <property>
394 | 		<name>hive.compactor.worker.threads</name>
395 | 		<value>1</value>
396 | </property>
397 | 
398 | # Stopping Hive service (for re-initialization)
399 | 
400 | sudo service hive-server2 stop
401 | 
402 | # Starting Hive service (for re-initialization)
403 | 
404 | sudo service hive-server2 start
405 | 
406 | # Quitting Beeline
407 | 
408 | !q
409 | 
410 | # Checking how is Hive support for transactions (again)
411 | 
412 | set hive.support.concurrency;
413 | 
414 | # Creating new table locacao_orc
415 | 
416 | create external table locacao_orc (
417 | 	idlocacao int, 
418 | 	idcliente int,
419 | 	iddespachante int,
420 | 	idveiculo int,
421 | 	datalocacao date,
422 | 	dataentrega date,
423 | 	total double
424 | ) stored as orc;
425 | 
426 | # Inserting data into the new locacao_orc table
427 | 
428 | insert overwrite table locacao_orc  select * from locacao;
429 | 
430 | # Preparing query with join to test query speed
431 | 
432 | select loc.datalocacao,
433 | 	   loc.total,
434 | 	   cli.nome
435 | from locacao_orc loc
436 | join clientes_orc cli
437 |  on (loc.idcliente = cli.idcliente);
438 | 
439 |  # Checking if vectorization is enabled or not
440 | 
441 |  set hive.vectorized.execution.enabled;
442 | 
443 | # Changing vectorization to "enabled" for testing
444 | 
445 | set hive.vectorized.execution.enabled = true;
446 | 
447 | # Query for getting a sum, this query will be used for the CBO performance test
448 | 
449 | select sum(total) from locacao_orc;
450 | 
451 | # Adjusting the parameters
452 | 
453 | set hive.cbo.enable=true;
454 | set hive.compute.query.using.stats=true;
455 | set hive.stats.fetch.column.stats=true;
456 | set hive.stats.fetch.partition=true;
457 | 
458 | # Configuring the desired table for computing statistics
459 | 
460 | analyze table locacao_orc compute statistics;
461 | 
462 | # Checking current engine
463 | 
464 | set hive.execution.engine;
465 | 
466 | # Using again this query for testing
467 | 
468 | select loc.datalocacao,
469 | 	   loc.total,
470 | 	   cli.nome
471 | from locacao_orc loc
472 | join clientes_orc cli
473 |  on (loc.idcliente = cli.idcliente);
474 | 
475 | 
476 | # Changing the engine for this query
477 | 
478 |  set hive.execution.engine=spark;
479 | 
480 | 
481 | 
482 | 


--------------------------------------------------------------------------------
/Hive_Impala/Scripts/hiveql_samples.sql:
--------------------------------------------------------------------------------
  1 | -- HiveQL Queries
  2 | 
  3 | -- Basic Select
  4 | 
  5 | select	idveiculo
  6 | 		, dataaquisicao
  7 | 		, ano
  8 | 		, modelo
  9 | 		, placa
 10 |         , status
 11 |         , diaria
 12 | from    veiculos;
 13 | 
 14 | -- Basic Select using Disctinct
 15 | 
 16 | select distinct modelo 
 17 | from    veiculos;
 18 | 
 19 | -- Using filter with Where clause
 20 | 
 21 | select  *
 22 | from    veiculos
 23 | where   status <> "Disponivel";
 24 | 
 25 | -- Using Where with Two Conditions
 26 | 
 27 | select  *
 28 | from    veiculos
 29 | where   status = "Disponivel"
 30 | and     diaria >= 1600;
 31 | 
 32 | -- Using Order By
 33 | 
 34 | select  *
 35 | from    locacao
 36 | order by datalocacao;
 37 | 
 38 | -- Using Limit
 39 | 
 40 | select  *
 41 | from    veiculos
 42 | limit 5;
 43 | 
 44 | -- Using Order By and Limit
 45 | 
 46 | select  *
 47 | from    veiculos
 48 | order by dataaquisicao
 49 | limit 5;
 50 | 
 51 | -- Using Max() function
 52 | 
 53 | select  max(total)
 54 | from    locacao;
 55 | 
 56 | -- Using Sum() function
 57 | 
 58 | select  sum(total)
 59 | from    locacao;
 60 | 
 61 | -- Using Count() function
 62 | 
 63 | select  count(*)
 64 | from    locacao;
 65 | 
 66 | -- Using Avg() function
 67 | 
 68 | select  avg(total)
 69 | from    locacao;
 70 | 
 71 | -- Using LIKE
 72 | 
 73 | select	*
 74 | from	veiculos
 75 | where	modelo like 'BMW%';
 76 | 
 77 | select	*
 78 | from	veiculos
 79 | where	modelo like '%T8%';
 80 | 
 81 | -- Using IN
 82 | 
 83 | select	*
 84 | from	despachantes
 85 | where	filial in ('Santa Maria', 'Novo Hamburgo');
 86 | 
 87 | select	*
 88 | from	despachantes
 89 | where	filial not in ('Santa Maria', 'Novo Hamburgo');
 90 | 
 91 | -- Using Between
 92 | 
 93 | select	*
 94 | from	veiculos
 95 | where	diaria between 1400 and 1800;
 96 | 
 97 | -- Using basic Join
 98 | 
 99 | select	loc.idlocacao
100 | 		, loc.idcliente
101 | 		, loc.iddespachante
102 | 		, vec.modelo
103 | 		, loc.datalocacao
104 | 		, loc.dataentrega
105 | 		, loc.total
106 | from	locacao loc
107 | join	veiculos vec
108 | on loc.idveiculo = vec.idveiculo;
109 | 
110 | -- Using sum()
111 | 
112 | select	vec.modelo
113 | 		, sum(loc.total)
114 | from	locacao loc
115 | join	veiculos vec
116 | on loc.idveiculo = vec.idveiculo
117 | group by vec.modelo;
118 | 
119 | -- Using sum() again with dispatchers' table
120 | 
121 | select	vec.modelo
122 | 		, desp.nome
123 | 		, sum(loc.total)
124 | from	locacao loc
125 | join	veiculos vec
126 | on loc.idveiculo = vec.idveiculo
127 | join despachantes desp
128 | on loc.iddespachante = desp.iddespachante
129 | group by vec.modelo, desp.nome;
130 | 
131 | -- Using having with sum()
132 | 
133 | select	vec.modelo
134 | 		, desp.nome
135 | 		, sum(loc.total)
136 | from	locacao loc
137 | join	veiculos vec
138 | on loc.idveiculo = vec.idveiculo
139 | join despachantes desp
140 | on loc.iddespachante = desp.iddespachante
141 | group by vec.modelo, desp.nome
142 | having sum(loc.total) > 10000;
143 | 
144 | -- Using functions
145 | 
146 | select	vec.modelo
147 | 		, desp.nome
148 | 		, sum(loc.total)
149 | from	locacao loc
150 | join	veiculos vec
151 | on loc.idveiculo = vec.idveiculo
152 | join despachantes desp
153 | on loc.iddespachante = desp.iddespachante
154 | where month(loc.datalocacao) = 2
155 | and year(loc.datalocacao) = 2019
156 | group by vec.modelo, desp.nome
157 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Data Engineering
  2 | Repository containing projects and summaries of my studies in the field of Data Engineering.
  3 | 
  4 | # Table of Contents
  5 | 
  6 | - [Big Data Courses and Projects](#big-data-courses-and-projects)
  7 |   - [ANAC Data Engineering Project](#anac-data-engineering-project) <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="15" height="15"><br>
  8 |   - [Hive and Impala Course](#hive-and-impala-course-with-fernando-amaral) <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="15" height="15"><br>
  9 |   - [Apache Airflow Course](#apache-airflow-course-with-marc-lamberti) <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="15" height="15"><br>
 10 |   - [Data Engineering Course by XPE](#data-engineering-course-by-xpe) <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="15" height="15"><br>
 11 |   - [GitHub Certification Course by DIO](#github-certification-course-by-dio) <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="15" height="15"><br>
 12 |   
 13 | - [ETL/ELT Tools](#etlelt-tools)
 14 |   - [Apache Hop](#apache-hop) <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="15" height="15"><br>
 15 |   - [dbt](#dbt) <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="15" height="15"><br>
 16 | - [SQL](#sql)
 17 |   - [SQL Review Course with ChatGPT](#sql-review-course-with-chatgpt) <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="15" height="15"><br>
 18 | - [Books](#books)
 19 |   - [Fundamentals of Data Engineering by Joe Reis and Matt Housley](#fundamentals-of-data-engineering-by-joe-reis-and-matt-housley) <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="15" height="15"><br>
 20 | 
 21 | ## Big Data Courses and Projects
 22 | 
 23 | ### ANAC Data Engineering Project
 24 | ![GitHub](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)
 25 | ![Airflow](https://img.shields.io/badge/Airflow-017CEE?style=for-the-badge&logo=Apache%20Airflow&logoColor=white)
 26 | ![Jupyter](https://img.shields.io/badge/Jupyter-F37626.svg?&style=for-the-badge&logo=Jupyter&logoColor=white)
 27 | ![Python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)
 28 | ![Docker](https://img.shields.io/badge/Docker-2CA5E0?style=for-the-badge&logo=docker&logoColor=white)
 29 | ![Spark](https://img.shields.io/badge/Apache_Spark-FFFFFF?style=for-the-badge&logo=apachespark&logoColor=#E35A16)
 30 | ![PBI](https://img.shields.io/badge/PowerBI-F2C811?style=for-the-badge&logo=Power%20BI&logoColor=white)
 31 | ![GPT](https://img.shields.io/badge/ChatGPT%204o-000000?style=for-the-badge&logo=openai&logoColor=white&label=)
 32 | <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="30" height="30"><br>
 33 | 
 34 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [ANAC Project Description Link](ANAC-data-engineering-project/ANAC-data-engineering-project.md)<br>
 35 | <img src="https://cdn-icons-png.flaticon.com/512/3277/3277524.png" alt="scripts" width="25" height="25"> [ANAC Project Files General Link](ANAC-data-engineering-project)
 36 | 
 37 | ### Hive and Impala Course with Fernando Amaral
 38 | ![RedHat](https://img.shields.io/badge/Red%20Hat-EE0000?style=for-the-badge&logo=redhat&logoColor=white)
 39 | ![Cloudera](https://img.shields.io/badge/Cloudera-0000FF?style=for-the-badge&logo=cloudera&logoColor=white)
 40 | ![Apache](https://img.shields.io/badge/Apache-D22128?style=for-the-badge&logo=Apache&logoColor=white)
 41 | [![Hive](https://img.shields.io/badge/-Hive-orange?logo=apache%20hive&style=for-the-badge&logoColor=white)](https://hive.apache.org/)
 42 | [![Impala](https://img.shields.io/badge/-Impala-black?logo=apache&style=for-the-badge)](https://impala.apache.org/)
 43 | ![SQOOP](https://img.shields.io/badge/Apache_SQOOP-00C300?logo=apache&logoColor=white&style=for-the-badge)
 44 | <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="30" height="30"><br>
 45 | 
 46 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [Hive and Impala Course Description Link](Hive_Impala/Hive%20and%20Impala.md)<br>
 47 | <img src="https://cdn-icons-png.flaticon.com/512/3277/3277524.png" alt="scripts" width="25" height="25"> [Hive and Impala Scripts Link](Hive_Impala/Scripts)
 48 | 
 49 | ### Apache Airflow Course with Marc Lamberti
 50 | ![Airflow](https://img.shields.io/badge/Airflow-017CEE?style=for-the-badge&logo=Apache%20Airflow&logoColor=white)
 51 | <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="30" height="30"><br>
 52 | 
 53 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [Apache Airflow Course Description Link](Apache_Airflow_Marc_Lamberti/The%20Complete%20Hands-On%20Introduction%20to%20Apache%20Airflow.md)<br>
 54 | 
 55 | ### Data Engineering Course by XPE
 56 | ![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
 57 | ![PostgreSQL](https://img.shields.io/badge/PostgreSQL-316192?style=for-the-badge&logo=postgresql&logoColor=white)
 58 | ![Airflow](https://img.shields.io/badge/Airflow-017CEE?style=for-the-badge&logo=Apache%20Airflow&logoColor=white)
 59 | <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="30" height="30"><br>
 60 | 
 61 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [Data Engineering Course by XPE](Data_Engineering_Course_XPE/Data%20Engineering%20Course%20XPE.md)
 62 | 
 63 | ### GitHub Certification Course by DIO
 64 | ![GitHub](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)
 65 | <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="30" height="30"><br>
 66 | 
 67 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [GitHub Certification Course by DIO link](DIO-GitHub-Certification-Formation/DIO-GitHub-Certification-Formation.md)<br>
 68 | 
 69 | ## ETL/ELT Tools
 70 | 
 71 | ### Apache Hop
 72 | ![Postgres](https://img.shields.io/badge/postgres-%23316192.svg?style=for-the-badge&logo=postgresql&logoColor=white)
 73 | ![ApacheHOP](https://img.shields.io/badge/HOP-ffffff?style=for-the-badge&logo=apache&logoColor=blue)
 74 | ![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
 75 | ![JSON](https://img.shields.io/badge/json-5E5C5C?style=for-the-badge&logo=json&logoColor=white)
 76 | ![EXCEL](https://img.shields.io/badge/Microsoft_Excel-217346?style=for-the-badge&logo=microsoft-excel&logoColor=white)
 77 | <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="30" height="30"><br>
 78 | 
 79 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [Apache Hop Project Description](ApacheHopProject/Apache%20Hop%20Project%20Description.md)<br>
 80 | <img src="https://cdn-icons-png.flaticon.com/512/4248/4248443.png" alt="sql scripts" width="25" height="25"> [Apache Hop Project SQL Scripts](ApacheHopProject/scripts_sql)<br>
 81 | <img src="https://cdn.icon-icons.com/icons2/2699/PNG/512/python_vertical_logo_icon_168039.png" alt="py scripts" width="25" height="25"> [Apache Hop Project Python Scripts](ApacheHopProject/scripts_py)<br>
 82 | 
 83 | ### dbt
 84 | ![dbt](https://img.shields.io/badge/dbt-FF694B?style=for-the-badge&logo=dbt&logoColor=white)
 85 | ![duckdb](https://img.shields.io/badge/Duckdb-000000?style=for-the-badge&logo=Duckdb&logoColor=yellow)
 86 | ![python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)
 87 | ![chatgpt](https://img.shields.io/badge/ChatGPT-74aa9c?style=for-the-badge&logo=openai&logoColor=white)
 88 | ![vscode](https://img.shields.io/badge/Visual_Studio_Code-0078D4?style=for-the-badge&logo=visual%20studio%20code&logoColor=white)
 89 | <img src="https://cdn-icons-png.flaticon.com/512/7072/7072946.png" alt="done" width="30" height="30"><br>
 90 | 
 91 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [dbt Project Description](dbt-pokemon-project/pokemon-dbt-testing-project.md)<br>
 92 | <img src="https://cdn-icons-png.flaticon.com/512/4248/4248443.png" alt="sql scripts" width="25" height="25"> [SQL Scripts](dbt-pokemon-project/dbt_pkmn/models)<br>
 93 | <img src="https://cdn.icon-icons.com/icons2/2699/PNG/512/python_vertical_logo_icon_168039.png" alt="py scripts" width="25" height="25"> [Python Scripts (DuckDB)](dbt-pokemon-project/dbt_pkmn/duckdb_config)<br>
 94 | 
 95 | ## SQL
 96 | 
 97 | ### SQL Review Course with ChatGPT
 98 | ![Postgres](https://img.shields.io/badge/postgres-%23316192.svg?style=for-the-badge&logo=postgresql&logoColor=white)
 99 | ![ChatGPT](https://img.shields.io/badge/chatgpt-343434?style=for-the-badge&logo=openai&logoColor=white) 
100 | <img src="https://cdn-icons-png.flaticon.com/512/5307/5307571.png" alt="wip" width="30" height="30"><br>
101 | 
102 | <img src="https://cdn-icons-png.flaticon.com/512/4136/4136043.png" alt="document" width="25" height="25"> [Description](SQLReviewCoursewithChatGPT/SQL%20Review%20Course%20with%20ChatGPT.md)<br>
103 | <img src="https://cdn-icons-png.flaticon.com/512/4248/4248443.png" alt="sql scripts" width="25" height="25"> [SQL Scripts](SQLReviewCoursewithChatGPT/SQL%20Scripts)<br>
104 | 
105 | ## Books
106 | <a href="https://www.goodreads.com/user/show/50697219-jo-o-paulo-m-ller-mamede">
107 |     <img src="https://img.shields.io/badge/Goodreads-372213?style=for-the-badge&logo=goodreads&logoColor=white" alt="Goodreads Badge"/>
108 |   </a>
109 | 
110 | ### Fundamentals of Data Engineering by Joe Reis and Matt Housley
111 | 
112 | <img src="https://cdn-icons-png.flaticon.com/512/197/197374.png" alt="english summary" width="25" height="25"> [English Summary](Books/FundamentalsOfDataEngineering/Fundamentals%20of%20Data%20Engineering%20-%20%20Joe%20Reis%20&%20Matt%20Housley%20(ENG).md)<br>
113 | <img src="https://cdn-icons-png.flaticon.com/512/3909/3909370.png" alt="portuguese summary" width="25" height="25"> [Portuguese Summary (Machine Translation)](Books/FundamentalsOfDataEngineering/Fundamentals%20of%20Data%20Engineering%20-%20%20Joe%20Reis%20%26%20Matt%20Housley%20%20(PT-BR).md)<br>
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/SQLReviewCoursewithChatGPT/SQL Review Course with ChatGPT.md:
--------------------------------------------------------------------------------
  1 | ![Postgres](https://img.shields.io/badge/postgres-%23316192.svg?style=for-the-badge&logo=postgresql&logoColor=white)
  2 | ![ChatGPT](https://img.shields.io/badge/chatgpt-343434?style=for-the-badge&logo=openai&logoColor=white)
  3 | 
  4 | <div align="right">
  5 |   <img src="https://github.com/Shamslux/DataEngineering/assets/79280485/fffa0fdc-4cf8-4845-8e51-9cf7d9bb9900" alt="Badge" width="200">
  6 | </div>
  7 | 
  8 | 
  9 | # SQL Review Course with ChatGPT
 10 | 
 11 | I have a dear friend - who, by the way, helped me get into the Data field - who always reminds me that we should
 12 | practice SQL a lot. Currently, I have 2 years of experience in the field, always working with SQL.
 13 | 
 14 | However, it's always good to review the basic contents. With that in mind, and with a desire to test ChatGPT's ability
 15 | to generate interesting questions to aid human learning, I decided to request some questions from it to practice SQL.
 16 | 
 17 | The database I used is a personal database created for study purposes, created by me last year, to practice Business
 18 | Intelligence and Data Engineering projects. However, I prefer to review the basics and practice with some tools before
 19 | consolidating everything into a large skills demonstration project.
 20 | 
 21 | # Mini Project's Structure
 22 | 
 23 | This small project is based on the fictional dataset of a game store called "Skeam." They sell games from various
 24 | platforms, categories, etc., to users from around the world.
 25 | 
 26 | This training will aim to create 100 questions for junior, mid-level, and senior professional levels.
 27 | 
 28 | In addition to the folders with scripts and solutions, this document will also record the solutions with images (I will
 29 | update as time permits, so there may be a discrepancy between what is in the script and what is updated in this
 30 | document).
 31 | 
 32 | I will not share the database now, but I will share it when I create a more robust project in the future. Feel free to
 33 | adapt the questions used for your personal projects.
 34 | 
 35 | Some questions ended up repeating, probably, even though I asked ChatGPT to avoid this. In fact, the idea was to create
 36 | 500 junior-level questions, but given the limited dataset, everything started to become uncomfortably repetitive, so I
 37 | decided it would be better to limit all questions to just 100 per professional hierarchical level.
 38 | 
 39 | 
 40 | # 100 Junior Questions Solutions
 41 | 
 42 | <div align="right">
 43 |   <img src="https://github.com/Shamslux/DataEngineering/assets/79280485/fa7d8997-be71-40a2-bd8f-899d306a4196" alt="Badge" width="150">
 44 | </div>
 45 | 
 46 | ## 1. List all games from the tb_games table.
 47 | 
 48 | ```sql
 49 | SELECT	nm_title 
 50 | FROM	tb_games;
 51 | ```
 52 | 
 53 | ![s1_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/42007bcd-2a05-4ce4-a942-190d57962de3)
 54 | 
 55 | ## 2. Show the names and release years of games from the tb_games table.
 56 | 
 57 | ```sql
 58 | SELECT	nm_title
 59 | 		, nm_release_year 
 60 | FROM	tb_games;
 61 | ```
 62 | 
 63 | ![s2_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/0dfa032f-655f-4a2c-9660-b2e9d1846c11)
 64 | 
 65 | ## 3. Find all games released in 2010.
 66 | 
 67 | ```sql
 68 | SELECT	nm_title 
 69 | FROM	tb_games
 70 | WHERE	nm_release_year = '2010';
 71 | ```
 72 | 
 73 | ![s3_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/ffe62c0e-f9ab-406f-b979-d0744b1c4e64)
 74 | 
 75 | ## 4. Select all customers from the tb_client table.
 76 | 
 77 | ```sql
 78 | SELECT	nm_client
 79 | FROM	tb_client;
 80 | ```
 81 | ![s4_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/79f73c66-c262-444c-829f-5513b7fcae20)
 82 | 
 83 | ## 4. Select all customers from the tb_client table.
 84 | 
 85 | ```sql
 86 | SELECT	nm_client
 87 | FROM	tb_client
 88 | ```
 89 | 
 90 | ![s4_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/10b8f257-5d6d-406e-a1f6-0608573cc2d4)
 91 | 
 92 | ## 5. List the names of customers and their countries from the tb_client table.
 93 | 
 94 | ```sql
 95 | SELECT	nm_client 
 96 |         , nm_country 
 97 | FROM	tb_client;
 98 | ```
 99 | ![s5_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/c59bd459-3e1f-45a7-a7f8-7ee3889a3654)
100 | 
101 | ## 6. Show the names and emails of customers born before 1990.
102 | 
103 | ```sql
104 | SELECT	nm_client 
105 | 		, txt_email 
106 | FROM	tb_client
107 | WHERE	EXTRACT(YEAR FROM dt_birthdate) < 1990;
108 | ```
109 | 
110 | ![s6_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/fc935535-717b-4f8f-b903-cd68faf4616c)
111 | 
112 | ## 7. Find all games in the 'Action' category.
113 | 
114 | ```sql
115 | SELECT	tg.nm_title 
116 | FROM	tb_games	tg
117 | JOIN	tb_category	tc
118 |  ON tg.fk_category = tc.pk_category 
119 | WHERE	tc.nm_category = 'Action';
120 | ```
121 | ![s7_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/fd5fa160-8c61-4948-9fc4-4c945cd31d84)
122 | 
123 | I apologize for the confusion earlier. Here are the SQL queries from 8 to 100 in the requested format:
124 | 
125 | ## 8. List all games and their corresponding categories.
126 | 
127 | ```sql
128 | SELECT	tg.nm_title 
129 | FROM	tb_games	tg
130 | JOIN	tb_category	tc
131 |  ON tg.fk_category = tc.pk_category 
132 | WHERE	tc.nm_category = 'Action';
133 | ```
134 | 
135 | ![s8_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/28f1a3f3-d021-4ed5-85ff-21b7989f0636)
136 | 
137 | 
138 | ## 9. Display all information about orders made in the tb_sales table.
139 | 
140 | ```sql
141 | SELECT	*
142 | FROM	tb_sales;
143 | ```
144 | 
145 | ![s9_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/fedf87bd-1b41-4a99-b550-70ee2d15eec8)
146 | 
147 | 
148 | ## 10. List game titles and their total sale prices from the tb_sales table.
149 | 
150 | ```sql
151 | SELECT	tg.nm_title 
152 | 		, ts.vl_total 
153 | FROM	tb_sales ts 
154 | JOIN	tb_games tg 
155 |  ON ts.fk_game = tg.pk_game; 
156 | ```
157 | 
158 | ![s10_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/ff36d767-483d-4bea-bd20-7b48f05cb635)
159 | 
160 | 
161 | ## 11. Find games published by "Nintendo".
162 | 
163 | ```sql
164 | SELECT	tg.nm_title 
165 | FROM	tb_games tg
166 | JOIN	tb_publisher tp 
167 |  ON tg.fk_publisher = tp.pk_publisher 
168 | WHERE	tp.nm_publisher = 'Nintendo';
169 | ```
170 | 
171 | ![s10_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/06bf0f52-ad2f-4b30-a0bb-9c7d7d9b7f5c)
172 | 
173 | 
174 | ## 12. List all customers with gender 'F'.
175 | 
176 | ```sql
177 | SELECT	nm_client 
178 | FROM	tb_client
179 | WHERE	nm_gender = 'F';
180 | ```
181 | ![s12_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/1fb37246-abd5-4fa2-a050-92c317179c42)
182 | 
183 | **Note**: The script that I used to generate the fake data could not match a male name with male gender, 
184 | so it is confused like that.
185 | 
186 | 
187 | ## 13. Show the total sales for each game.
188 | 
189 | ```sql
190 | SELECT	tg.nm_title 
191 | 		, SUM(ts.vl_total) AS vl_total_sold
192 | FROM	tb_sales ts 
193 | JOIN	tb_games tg 
194 |  ON ts.fk_game = tg.pk_game 
195 | GROUP BY tg.nm_title 
196 | ORDER BY 2 DESC;
197 | ```
198 | 
199 | ![s13_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/d66b6c74-45f8-4277-8496-2ac65c2c0629)
200 | 
201 | 
202 | ## 14. Find all orders made between 2015 and 2020.
203 | 
204 | ```sql
205 | SELECT	*
206 | FROM	tb_sales
207 | WHERE	EXTRACT(YEAR FROM dt_order_date) BETWEEN 2015 AND 2020;
208 | ```
209 | 
210 | ![s14_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/42d72fe6-360f-4596-a4bb-d8c8e7a89088)
211 | 
212 | 
213 | ## 15. List details of CPUs from the 'Intel' brand.
214 | 
215 | ```sql
216 | SELECT	* 
217 | FROM	tb_cpu
218 | WHERE	nm_brand = 'Intel';
219 | ```
220 | ![s15_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/f3fefe7e-16a2-426c-a88e-b113789aa06f)
221 | 
222 | 
223 | ## 16. Show all GPU models from the 'Nvidia' brand.
224 | 
225 | ```sql
226 | SELECT	*
227 | FROM	tb_gpu
228 | WHERE	nm_brand = 'Nvidia';
229 | ```
230 | ![s16_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/5df7b2c0-a3ea-42eb-9f4f-a8becd6d10cf)
231 | 
232 | **17. Find games published by "Sony Computer Entertainment."**
233 | 
234 | ```sql
235 | SELECT tg.nm_title 
236 | FROM tb_games tg 
237 | JOIN tb_publisher tp 
238 |  ON tg.fk_publisher = tp.pk_publisher 
239 | WHERE tp.nm_publisher = 'Sony Computer Entertainment';
240 | ```
241 | 
242 | ![s17_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/160ad609-a24d-4bf0-ad86-d17b01c8d65b)
243 | 
244 | 
245 | **18. List the names of games released in the year 2000.**
246 | 
247 | ```sql
248 | SELECT nm_title 
249 | FROM tb_games
250 | WHERE nm_release_year = '2000';
251 | ```
252 | 
253 | ![s18_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/ea352a66-dbda-4eef-b3bd-d32d1c17118c)
254 | 
255 | **19. Show all customers born after 1985.**
256 | 
257 | ```sql
258 | SELECT nm_client  
259 | FROM tb_client
260 | WHERE EXTRACT(YEAR FROM dt_birthdate) > 1985;
261 | ```
262 | 
263 | ![s19_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/54c30ff9-732f-411b-a47f-88066293d9b2)
264 | 
265 | 
266 | **20. List game titles and their platforms.**
267 | 
268 | ```sql
269 | SELECT tg.nm_title 
270 | 		, tp.nm_platform 
271 | FROM tb_games tg 
272 | JOIN tb_platform tp 
273 |  ON tg.fk_platform = tp.pk_platform;
274 | ```
275 | ![s20_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/c639c6a8-b820-4cc0-a0e2-0eb6aee82ec3)
276 | 
277 | 
278 | **21. Find all games with an 'E' rating for everyone.**
279 | 
280 | ```sql
281 | SELECT nm_title 
282 | FROM tb_games tg 
283 | JOIN tb_rating tr 
284 |  ON tg.fk_rating = tr.pk_rating 
285 | WHERE tr.nm_rating = 'E';
286 | ```
287 | ![s21_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/de144bc1-05ab-455d-8ae3-012d323a6d62)
288 | 
289 | 
290 | **22. Show the names of customers and the games they purchased.**
291 | 
292 | ```sql
293 | SELECT tc.nm_client 
294 | 		, tc.txt_email 
295 | 		, tg.nm_title 
296 | FROM tb_sales ts 
297 | JOIN tb_client tc 
298 |  ON ts.fk_client = tc.pk_client 
299 | JOIN tb_games tg	
300 |  ON ts.fk_game = tg.pk_game
301 | ORDER BY 1;
302 | ```
303 | 
304 | ![s22_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/de3d519d-143e-4467-8605-9ffb6bf793a1)
305 | 
306 | 
307 | 
308 | **23. List all sales made in the year 2013.**
309 | 
310 | ```sql
311 | SELECT *
312 | FROM tb_sales
313 | WHERE EXTRACT(YEAR FROM dt_order_date) = 2013;
314 | ```
315 | 
316 | ![s23_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/8085a858-5604-4ad7-8d9e-65f0a23701cc)
317 | 
318 | 
319 | **24. Show the names of games that do not have a defined rating.**
320 | 
321 | ```sql
322 | SELECT tg.nm_title
323 | 		, tr.nm_rating
324 | 		, CASE
325 | 			WHEN tr.nm_rating IS NULL THEN 'No rating defined'
326 | 			ELSE NULL 
327 | 		END AS "nm_rating_adjusted"
328 | FROM tb_games tg 
329 | JOIN tb_rating tr 
330 |  ON tg.fk_rating = tr.pk_rating
331 | WHERE tr.pk_rating = 2;
332 | ```
333 | ![s24_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/f47bd1c0-2781-41ea-86a9-0891c79c3381)
334 | 
335 | 
336 | **25. Find the names and emails of customers who bought more than 3 games.**
337 | 
338 | ```sql
339 | SELECT tc.nm_client
340 | 		, tc. txt_email
341 | 		, a.total_games
342 | FROM (SELECT fk_client 
343 |              , count(*) AS total_games
344 | 	  	 FROM tb_sales
345 | 	  	 GROUP BY fk_client
346 |       	 HAVING count(*) > 3
347 |       	 ORDER BY 2 DESC) a
348 | JOIN tb_client tc 
349 |  ON a.fk_client = tc.pk_client
350 | ORDER BY 3 DESC;
351 | ```
352 | 
353 | ![s25_jr](https://github.com/Shamslux/DataEngineering/assets/79280485/78edfc96-f997-4020-a972-c0d388b74a9f)
354 | 
355 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/.gitignore:
--------------------------------------------------------------------------------
1 | logs/
2 | dbt-env/
3 | dev.duckdb


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | target/
3 | dbt_packages/
4 | logs/
5 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/README.md:
--------------------------------------------------------------------------------
 1 | Welcome to your new dbt project!
 2 | 
 3 | ### Using the starter project
 4 | 
 5 | Try running the following commands:
 6 | - dbt run
 7 | - dbt test
 8 | 
 9 | 
10 | ### Resources:
11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
14 | - Find [dbt events](https://events.getdbt.com) near you
15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
16 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/analyses/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/dbt-pokemon-project/dbt_pkmn/analyses/.gitkeep


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/dbt_project.yml:
--------------------------------------------------------------------------------
 1 | # Name your project! Project names should contain only lowercase characters
 2 | # and underscores. A good package name should reflect your organization's
 3 | # name or the intended use of these models
 4 | name: 'dbt_pkmn'
 5 | version: '1.0.0'
 6 | 
 7 | # This setting configures which "profile" dbt uses for this project.
 8 | profile: 'dbt_pkmn'
 9 | 
10 | # Base path for your project using an environment variable in your shell
11 | # You cannot use Jinja templating here. Just mention the folders relative to the project.
12 | model-paths: ["models"]
13 | analysis-paths: ["analyses"]
14 | test-paths: ["tests"]
15 | seed-paths: ["seeds"]
16 | macro-paths: ["macros"]
17 | snapshot-paths: ["snapshots"]
18 | 
19 | clean-targets:         # directories to be removed by `dbt clean`
20 |   - "target"
21 |   - "dbt_packages"
22 | 
23 | # Configuring models
24 | # Full documentation: https://docs.getdbt.com/docs/configuring-models
25 | models:
26 |   dbt_pkmn:
27 |       +materialized: view  # This applies to models in the "models/example/" directory
28 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/dev.duckdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/dbt-pokemon-project/dbt_pkmn/dev.duckdb


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/duckdb_config/import_data_duckdb.py:
--------------------------------------------------------------------------------
  1 | ########################################################################################################################################################################################
  2 | #**************************************************************************************************************************************************************************************#
  3 | # Code created by Shamslux
  4 | # October, 27th 2024
  5 | # This code belongs to a personal study project to practice a little with dbt, since it is begining to be used at my current company. :)
  6 | ########################################################################################################################################################################################
  7 | 
  8 | ########################################################################################################################################################################################
  9 | #******************************************************************************* IMPORTS **********************************************************************************************#
 10 | ########################################################################################################################################################################################
 11 | import duckdb
 12 | import time
 13 | ########################################################################################################################################################################################
 14 | #******************************************************************************* FUNCTIONS ********************************************************************************************#
 15 | ########################################################################################################################################################################################
 16 | 
 17 | def print_with_border(message):
 18 |     border = '*' * (len(message) + 4)
 19 |     print(border)
 20 |     print(f'* {message} *')
 21 |     print(border)
 22 | 
 23 | def connect_to_duckdb(retries=3, delay=2):
 24 |     for attempt in range(retries):
 25 |         try:
 26 |             conn = duckdb.connect('dev.duckdb')
 27 |             print_with_border('Connected to DuckDB')
 28 |             return conn
 29 |         except Exception as e:
 30 |             print(f"Connection failed: {e}. Retrying in {delay} seconds...")
 31 |             time.sleep(delay)
 32 |     raise Exception("Failed to connect to DuckDB after multiple attempts.")
 33 | 
 34 | ########################################################################################################################################################################################
 35 | #******************************************************************************* CODE *************************************************************************************************#
 36 | ########################################################################################################################################################################################
 37 | print_with_border('Connecting to duckdb')
 38 | conn = connect_to_duckdb()
 39 | 
 40 | #dimCustomers
 41 | try:
 42 |     print_with_border('Creating dimCustomers on duckdb')
 43 |     conn.execute('''
 44 |     CREATE TABLE dimCustomers (
 45 |         customerSK INT PRIMARY KEY,
 46 |         customerName VARCHAR(50),
 47 |         customerType VARCHAR(20)
 48 |     );
 49 |     ''')
 50 | except Exception as e:
 51 |     print_with_border(f"Error creating dimCustomers: {e}")
 52 | 
 53 | # Inserting data into dimCustomers
 54 | try:
 55 |     print_with_border('Inserting data into dimCustomers on duckdb')
 56 |     conn.execute('''
 57 |     INSERT INTO dimCustomers (customerSK, customerName, customerType) VALUES
 58 |     (1, 'Red', 'Trainer'),
 59 |     (2, 'Green', 'Trainer'),
 60 |     (3, 'Brock', 'Gym Leader'),
 61 |     (4, 'Misty', 'Gym Leader'),
 62 |     (5, 'Gary', 'Trainer'),
 63 |     (6, 'Tracey', 'Trainer'),
 64 |     (7, 'Professor Oak', 'Professor'),
 65 |     (8, 'Team Rocket Jessie', 'Villain'),
 66 |     (9, 'Team Rocket James', 'Villain'),
 67 |     (10, 'Ash Ketchum', 'Trainer'),
 68 |     (11, 'May', 'Trainer'),
 69 |     (12, 'Dawn', 'Trainer'),
 70 |     (13, 'Cynthia', 'Champion'),
 71 |     (14, 'Professor Elm', 'Professor'),
 72 |     (15, 'Hilda', 'Trainer'),
 73 |     (16, 'N', 'Trainer'),
 74 |     (17, 'Iris', 'Trainer'),
 75 |     (18, 'Serena', 'Trainer'),
 76 |     (19, 'Clemont', 'Gym Leader'),
 77 |     (20, 'Korrina', 'Gym Leader'),
 78 |     (21, 'Roxie', 'Gym Leader'),
 79 |     (22, 'Hilda', 'Trainer'),
 80 |     (23, 'Lysandre', 'Villain'),
 81 |     (24, 'Wallace', 'Champion'),
 82 |     (25, 'Diantha', 'Champion'),
 83 |     (26, 'Professor Sycamore', 'Professor'),
 84 |     (27, 'Mallow', 'Trainer'),
 85 |     (28, 'Lillie', 'Trainer'),
 86 |     (29, 'Kukui', 'Professor'),
 87 |     (30, 'Gladion', 'Trainer'),
 88 |     (31, 'Sabrina', 'Gym Leader'),
 89 |     (32, 'Giovanni', 'Villain'),
 90 |     (33, 'Flannery', 'Gym Leader'),
 91 |     (34, 'Erika', 'Gym Leader'),
 92 |     (35, 'Whitney', 'Gym Leader'),
 93 |     (36, 'Clair', 'Gym Leader'),
 94 |     (37, 'Roxanne', 'Gym Leader'),
 95 |     (38, 'Maylene', 'Gym Leader'),
 96 |     (39, 'Candice', 'Gym Leader'),
 97 |     (40, 'Skyla', 'Gym Leader'),
 98 |     (41, 'Blaine', 'Gym Leader'),
 99 |     (42, 'Janine', 'Gym Leader'),
100 |     (43, 'Falkner', 'Gym Leader'),
101 |     (44, 'Burgundy', 'Trainer'),
102 |     (45, 'Cynthia', 'Champion'),
103 |     (46, 'Sierra', 'Trainer'),
104 |     (47, 'Hilda', 'Trainer'),
105 |     (48, 'Alain', 'Trainer'),
106 |     (49, 'Charon', 'Villain'),
107 |     (50, 'Lyra', 'Trainer');
108 |     ''')
109 | except Exception as e:
110 |     print_with_border(f"Error inserting data into dimCustomers: {e}")
111 | 
112 | 
113 | # dimProducts
114 | try:
115 |     print_with_border('Creating dimProducts on duckdb')
116 |     conn.execute('''
117 |     CREATE TABLE dimProducts (
118 |         productSK INT PRIMARY KEY,
119 |         productNK VARCHAR(50) UNIQUE,
120 |         productName VARCHAR(50),
121 |         categorySK INT,
122 |         price DECIMAL(10, 2)
123 |     );
124 |     ''')
125 | except Exception as e:
126 |     print_with_border(f"Error creating dimProducts: {e}")
127 | 
128 | # Inserting data into dimProducts
129 | try:
130 |     print_with_border('Inserting data into dimProducts on duckdb')
131 |     conn.execute('''
132 |     INSERT INTO dimProducts (productSK, productNK, productName, categorySK, price) VALUES
133 |     (1, 'POTION', 'Potion', 1, 200.00),
134 |     (2, 'SUPER_POTION', 'Super Potion', 1, 600.00),
135 |     (3, 'POKEBALL', 'PokéBall', 2, 300.00),
136 |     (4, 'GREAT_BALL', 'Great Ball', 2, 600.00),
137 |     (5, 'ULTRA_BALL', 'Ultra Ball', 2, 1200.00),
138 |     (6, 'REVIVE', 'Revive', 3, 1500.00),
139 |     (7, 'FULL_RESTORE', 'Full Restore', 1, 3000.00),
140 |     (8, 'MAX_POTION', 'Max Potion', 1, 2500.00),
141 |     (9, 'ANTIDOTE', 'Antidote', 1, 100.00),
142 |     (10, 'BURN_HEAL', 'Burn Heal', 1, 200.00),
143 |     (11, 'ICE_HEAL', 'Ice Heal', 1, 200.00),
144 |     (12, 'PARALYZE_HEAL', 'Paralyze Heal', 1, 200.00),
145 |     (13, 'AWAKENING', 'Awakening', 1, 300.00),
146 |     (14, 'REPEL', 'Repel', 2, 350.00),
147 |     (15, 'SUPER_REPEL', 'Super Repel', 2, 700.00),
148 |     (16, 'MAX_REPEL', 'Max Repel', 2, 1200.00),
149 |     (17, 'HEALTHY_TREAT', 'Healthy Treat', 3, 1500.00),
150 |     (18, 'LURE', 'Lure', 2, 200.00),
151 |     (19, 'NUGGET', 'Nugget', 4, 5000.00),
152 |     (20, 'MYSTIC_WATER', 'Mystic Water', 5, 1500.00);
153 |     ''')
154 | except Exception as e:
155 |     print_with_border(f"Error inserting data into dimProducts: {e}")
156 | 
157 | 
158 | # dimProdCategories
159 | try:
160 |     print_with_border('Creating dimProdCategories on duckdb')
161 |     conn.execute('''
162 |     CREATE TABLE dimProdCategories (
163 |         categorySK INT PRIMARY KEY,
164 |         categoryName VARCHAR(50)
165 |     );
166 |     ''')
167 | except Exception as e:
168 |     print(f"Error creating dimProdCategories: {e}")
169 | 
170 | 
171 | # Inserting data into dimProdCategories
172 | try:
173 |     print_with_border('Inserting data into dimProdCategories on duckdb')
174 |     conn.execute('''
175 |     INSERT INTO dimProdCategories (categorySK, categoryName) VALUES
176 |     (1, 'Medicine'),
177 |     (2, 'Poké Balls'),
178 |     (3, 'Revival Items'),
179 |     (4, 'Accessories'),
180 |     (5, 'Battle Items');
181 |     ''')
182 | except Exception as e:
183 |     print_with_border(f"Error inserting data into dimProdCategories: {e}")
184 | 
185 | print_with_border('Closing connection to duckdb')
186 | # Closing connection
187 | conn.close()
188 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/duckdb_config/import_dim_time.py:
--------------------------------------------------------------------------------
 1 | ########################################################################################################################################################################################
 2 | #**************************************************************************************************************************************************************************************#
 3 | # Code created by Shamslux
 4 | # October, 27th 2024
 5 | # This code belongs to a personal study project to practice a little with dbt, since it is begining to be used at my current company. :)
 6 | ########################################################################################################################################################################################
 7 | 
 8 | ########################################################################################################################################################################################
 9 | #******************************************************************************* IMPORTS **********************************************************************************************#
10 | ########################################################################################################################################################################################
11 | import duckdb
12 | import pandas as pd
13 | 
14 | ########################################################################################################################################################################################
15 | #******************************************************************************* FUNCTIONS ********************************************************************************************#
16 | ########################################################################################################################################################################################
17 | def print_with_border(message):
18 |     border = '*' * (len(message) + 4)
19 |     print(border)
20 |     print(f'* {message} *')
21 |     print(border)
22 |     
23 | ########################################################################################################################################################################################
24 | #******************************************************************************* CODE *************************************************************************************************#
25 | ########################################################################################################################################################################################
26 | try:
27 |     conn = duckdb.connect('dev.duckdb')
28 |     print_with_border('Connected to DuckDB')
29 | except Exception as e:
30 |     print(f"Error connecting to DuckDB: {e}")
31 |     raise
32 | 
33 | 
34 | try:
35 |     print_with_border('Creating dimTime on duckdb')
36 |     conn.execute('''
37 |     CREATE TABLE dimTime (
38 |         timeSK INT PRIMARY KEY,
39 |         saleDate DATE,
40 |         year INT,
41 |         month INT,
42 |         day INT,
43 |         day_of_week VARCHAR(10) -- Nome do dia da semana
44 |     );
45 |     ''')
46 | except Exception as e:
47 |     print(f"Error creating dimTime: {e}")
48 |     conn.close()
49 |     raise
50 | 
51 | 
52 | time_data = []
53 | for date in pd.date_range(start='2024-04-01', end='2024-10-01', freq='D'):
54 |     time_data.append((int(date.strftime('%Y%m%d')), date, date.year, date.month, date.day, date.strftime('%A')))
55 | 
56 | 
57 | try:
58 |     print_with_border('Inserting data into dimTime on duckdb')
59 |     insert_query = '''
60 |     INSERT INTO dimTime (timeSK, saleDate, year, month, day, day_of_week) VALUES
61 |     '''
62 |     insert_values = ', '.join([f'({time[0]}, DATE \'{time[1].date()}\', {time[2]}, {time[3]}, {time[4]}, \'{time[5]}\')' for time in time_data])
63 | 
64 |     conn.execute(insert_query + insert_values)
65 | except Exception as e:
66 |     print(f"Error inserting data into dimTime: {e}")
67 | 
68 | 
69 | print_with_border('Closing connection to duckdb')
70 | conn.close()
71 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/duckdb_config/import_fact_sales.py:
--------------------------------------------------------------------------------
 1 | ########################################################################################################################################################################################
 2 | #**************************************************************************************************************************************************************************************#
 3 | # Code created by Shamslux
 4 | # October, 27th 2024
 5 | # This code belongs to a personal study project to practice a little with dbt, since it is begining to be used at my current company. :)
 6 | ########################################################################################################################################################################################
 7 | ########################################################################################################################################################################################
 8 | #******************************************************************************* IMPORTS **********************************************************************************************#
 9 | ########################################################################################################################################################################################
10 | import duckdb
11 | import pandas as pd
12 | 
13 | ########################################################################################################################################################################################
14 | #******************************************************************************* FUNCTIONS ********************************************************************************************#
15 | ########################################################################################################################################################################################
16 | def print_with_border(message):
17 |     border = '*' * (len(message) + 4)
18 |     print(border)
19 |     print(f'* {message} *')
20 |     print(border)
21 | 
22 | ########################################################################################################################################################################################
23 | #******************************************************************************* CODE *************************************************************************************************#
24 | ########################################################################################################################################################################################
25 | 
26 | try:
27 |     conn = duckdb.connect('dev.duckdb')
28 |     print_with_border('Connected to DuckDB')
29 | except Exception as e:
30 |     print(f"Error connecting to DuckDB: {e}")
31 |     raise
32 | 
33 | 
34 | try:
35 |     print_with_border('Creating factSales table on duckdb')
36 |     conn.execute('''
37 |     CREATE TABLE factSales (
38 |         saleSK INT PRIMARY KEY,
39 |         customerSK INT,
40 |         productSK INT,
41 |         categorySK INT,
42 |         saleDate INT, -- Usando timeSK no formato YYYYMMDD
43 |         quantity INT,
44 |         totalPrice DECIMAL(10, 2)
45 |     );
46 |     ''')
47 | except Exception as e:
48 |     print(f"Error creating factSales table: {e}")
49 |     conn.close()
50 |     raise
51 | 
52 | 
53 | csv_file_path = r'C:\Users\jpmul\OneDrive\Documentos\GitHub\dbt-pokemon-project\dbt_pkmn\duckdb_config\fact_sales_data.csv'  
54 | 
55 | 
56 | try:
57 |     print_with_border('Importing data into factSales from CSV')
58 |     conn.execute(f'''
59 |     COPY factSales FROM '{csv_file_path}' (HEADER, DELIMITER ';', FORMAT 'csv');
60 |     ''')
61 | except Exception as e:
62 |     print(f"Error importing data into factSales: {e}")
63 | 
64 | 
65 | print_with_border('Closing connection to duckdb')
66 | conn.close()
67 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/duckdb_config/selecting_tables.py:
--------------------------------------------------------------------------------
 1 | import duckdb
 2 | 
 3 | # Conectar ao banco de dados DuckDB
 4 | conn = duckdb.connect('dev.duckdb')
 5 | 
 6 | # Função para exibir dados de uma tabela
 7 | def display_table_data(table_name):
 8 |     print(f'\nData from {table_name}:')
 9 |     results = conn.execute(f'SELECT * FROM {table_name};').fetchall()
10 |     for row in results:
11 |         print(row)
12 | 
13 | # Exibir dados de dimCustomers
14 | display_table_data('dimCustomers')
15 | 
16 | # Exibir dados de dimProducts
17 | display_table_data('dimProducts')
18 | 
19 | # Exibir dados de dimProdCategories
20 | display_table_data('dimProdCategories')
21 | 
22 | # Exibir os dados da dimTime
23 | display_table_data('dimTime')
24 | 
25 | # Exibir os dados da factSales
26 | display_table_data('factSales')
27 | 
28 | # Fechar a conexão
29 | conn.close()
30 | 
31 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/dbt-pokemon-project/dbt_pkmn/macros/.gitkeep


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/dimCustomersView.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM {{ source('pokemart', 'dimCustomers') }}
3 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/dimProdCategoriesView.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM {{ source('pokemart', 'dimProdCategories') }}
3 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/dimProductsView.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM {{ source('pokemart', 'dimProducts') }}
3 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/dimTimeView.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM {{ source('pokemart', 'dimTime') }}
3 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/factSalesView.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM {{ source('pokemart', 'factSales') }}
3 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/monthlySales.sql:
--------------------------------------------------------------------------------
 1 | WITH 
 2 | monthly_sales AS (
 3 |     SELECT 
 4 |         EXTRACT(YEAR FROM saleDate) AS year,
 5 |         EXTRACT(MONTH FROM saleDate) AS month,
 6 |         SUM(totalPrice) AS monthlyTotal
 7 |     FROM 
 8 |         {{ ref('salesConsolidated') }}
 9 |     GROUP BY 
10 |         year, month
11 |     ORDER BY 
12 |         year, month
13 | )
14 | 
15 | SELECT 
16 |     year,
17 |     month,
18 |     monthlyTotal,
19 |     SUM(monthlyTotal) OVER (ORDER BY year, month) AS accumulatedSales
20 | FROM 
21 |     monthly_sales
22 | ORDER BY 
23 |     year, month
24 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/mostQuantitySold.sql:
--------------------------------------------------------------------------------
 1 | SELECT 
 2 |     productName,
 3 |     SUM(quantity) AS totalQuantity
 4 | FROM 
 5 |     {{ ref('salesConsolidated') }}
 6 | GROUP BY 
 7 |     productName
 8 | ORDER BY 
 9 |     totalQuantity DESC
10 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/mostSoldCategories.sql:
--------------------------------------------------------------------------------
 1 | SELECT 
 2 |     categoryName,
 3 |     SUM(totalPrice) AS totalSalesValue,
 4 |     SUM(quantity) AS totalQuantitySold
 5 | FROM 
 6 |     {{ ref('salesConsolidated') }}
 7 | GROUP BY 
 8 |     categoryName
 9 | ORDER BY 
10 |     totalSalesValue DESC
11 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/mostSoldProducts.sql:
--------------------------------------------------------------------------------
 1 | WITH 
 2 | most_sold_products AS(
 3 | SELECT 
 4 |     productName,
 5 |     SUM(totalPrice) AS totalRevenue
 6 | FROM 
 7 |     {{ ref('salesConsolidated') }}
 8 | GROUP BY 
 9 |     productName
10 | ORDER BY 
11 |     totalRevenue DESC
12 | )
13 | 
14 | SELECT * FROM most_sold_products


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/salesConsolidated.sql:
--------------------------------------------------------------------------------
 1 | WITH sales_consolidated AS (
 2 |     SELECT
 3 |         fs.saleSK,  
 4 |         p.productName,
 5 |         pc.categoryName,
 6 |         c.customerName,
 7 |         t.saleDate,
 8 |         fs.quantity,
 9 |         fs.totalPrice
10 |     FROM
11 |         {{ ref('factSalesView') }} fs
12 |     JOIN
13 |         {{ ref('dimProductsView') }} p ON fs.productSK = p.productSK  
14 |     JOIN
15 |         {{ ref('dimProdCategoriesView') }} pc ON p.categorySK = pc.categorySK  
16 |     JOIN
17 |         {{ ref('dimCustomersView') }} c ON fs.customerSK = c.customerSK  
18 |     JOIN
19 |         {{ ref('dimTimeView') }} t ON fs.saleDate = t.timeSK  
20 | )
21 | 
22 | SELECT sc.saleSK    
23 |        , sc.productName
24 |        , sc.categoryName
25 |        , sc.customerName
26 |        , CAST(sc.saleDate AS DATE) AS saleDate
27 |        , sc.quantity
28 |        , sc.totalPrice
29 | FROM sales_consolidated sc
30 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/sources.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | sources:
 4 |   - name: pokemart
 5 |     schema: main
 6 |     tables:
 7 |       - name: dimCustomers
 8 |       - name: dimProducts
 9 |       - name: dimProdCategories
10 |       - name: dimTime
11 |       - name: factSales
12 | models:
13 |   - name: salesConsolidated
14 |     description: This is an analytical view of the consolidated fact table (we have the dimension information joined directly into the fact table).
15 |     columns:
16 |       - name: saleSK
17 |         description: SK key for each sale.
18 |         data_type: integer
19 |         quote: true
20 |       - name: productName
21 |         description: Describes the name of the product sold.
22 |         data_type: varchar
23 |         quote: true
24 |       - name: categoryName
25 |         description: Describes the name of the product's category.
26 |         data_type: varchar
27 |         quote: true
28 |       - name: customerName
29 |         description: Describes the name of the person who bought the product.
30 |         data_type: varchar
31 |         quote: true
32 |       - name: saleDate
33 |         description: Describes the date when the sale happened.
34 |         data_type: date
35 |         quote: true
36 |       - name: quantity
37 |         description: Describes the total amount of itens sold.
38 |         data_type: integer
39 |       - name: totalPrice
40 |         description: Describes the total price of the products sold.
41 |         data_type: decimal(10,2)
42 |         quote: true
43 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/models/topBuyers.sql:
--------------------------------------------------------------------------------
 1 | SELECT 
 2 |     customerName,
 3 |     SUM(totalPrice) AS totalSpent,
 4 |     COUNT(DISTINCT saleSK) AS totalPurchases
 5 | FROM 
 6 |     {{ ref('salesConsolidated') }}
 7 | GROUP BY 
 8 |     customerName
 9 | ORDER BY 
10 |     totalSpent DESC


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/dbt-pokemon-project/dbt_pkmn/seeds/.gitkeep


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/snapshots/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/dbt-pokemon-project/dbt_pkmn/snapshots/.gitkeep


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/temp.py:
--------------------------------------------------------------------------------
 1 | import duckdb
 2 | 
 3 | # Conectar ao banco de dados DuckDB
 4 | conn = duckdb.connect('dev.duckdb')  # Ajuste o caminho conforme necessário
 5 | 
 6 | # Consultar todas as tabelas e views do banco de dados
 7 | result = conn.execute("""
 8 |     SELECT table_name, table_type 
 9 |     FROM information_schema.tables;
10 | """).fetchall()
11 | 
12 | # Imprimir os resultados
13 | print("Tabelas e views no banco de dados:")
14 | if result:
15 |     for row in result:
16 |         print(f"Nome: {row[0]}, Tipo: {row[1]}")
17 | else:
18 |     print("Nenhuma tabela ou view encontrada.")
19 | 
20 | # Fechar a conexão
21 | conn.close()
22 | 


--------------------------------------------------------------------------------
/dbt-pokemon-project/dbt_pkmn/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Shamslux/DataEngineering/bb66ade87587979e9e8fe43da335957e87bc76f7/dbt-pokemon-project/dbt_pkmn/tests/.gitkeep


--------------------------------------------------------------------------------
/dbt-pokemon-project/pokemon-dbt-testing-project.md:
--------------------------------------------------------------------------------
  1 | ![dbt](https://img.shields.io/badge/dbt-FF694B?style=for-the-badge&logo=dbt&logoColor=white)
  2 | ![duckdb](https://img.shields.io/badge/Duckdb-000000?style=for-the-badge&logo=Duckdb&logoColor=yellow)
  3 | ![python](https://img.shields.io/badge/Python-FFD43B?style=for-the-badge&logo=python&logoColor=blue)
  4 | ![chatgpt](https://img.shields.io/badge/ChatGPT-74aa9c?style=for-the-badge&logo=openai&logoColor=white)
  5 | ![vscode](https://img.shields.io/badge/Visual_Studio_Code-0078D4?style=for-the-badge&logo=visual%20studio%20code&logoColor=white)
  6 | 
  7 | # Installing dbt
  8 | 
  9 | ## Step-by-Step Guide
 10 | 
 11 | 1. Install Python on your machine.
 12 | 2. Install VSCode.
 13 | 3. Install the VSCode extension called "Power User for dbt Core." See the image below:
 14 | 
 15 | ![power_user_dbt_vscode](https://github.com/user-attachments/assets/f4a9db01-da5b-40af-b899-afce91772c9b)
 16 | 
 17 | 4. Create a local folder in your terminal using: `mkdir dbt_local`. 
 18 | 5. Inside this new directory, use the command: `python -m venv dbt_venv`
 19 | 6. Now, `dbt_venv` will have some folders, including one called `bin`. Inside `bin`, there are several files, such as `activate`.
 20 | 7. We need to activate the virtual environment. Based on what was described in step 6, use the command: `source dbt_venv/bin/activate`. This will activate the environment, and you’ll see `(dbt_venv)` appear in the terminal command line.
 21 | 
 22 | > **Hint**: There might be an issue during the virtual environment creation. I encountered this myself, and `activate` did not exist. If this happens, carefully repeat the steps, deleting the previous directories using the command: `rm -rf dbt_local`.
 23 | 
 24 | ## Integration with VSCode Extension Installed
 25 | 
 26 | 1. Notice that the blue bar in VSCode (at the bottom) will indicate that dbt Core is not installed.
 27 | 2. Click on this message, and a dialog box will appear at the top. See the image below:
 28 | 
 29 | ![extension_tutorial_1](https://github.com/user-attachments/assets/13955c45-6063-4add-b821-444f5b4be85a)
 30 | 
 31 | 3. Select "Setup Extension."
 32 | 4. Now choose the Python interpreter, as shown in the image below:
 33 | 
 34 | ![extension_tutorial_2](https://github.com/user-attachments/assets/29535db5-624a-46b7-8015-b2eee3b79130)
 35 | 
 36 | 5. In the new box that opens, select the path to `dbt_venv` (inside the `bin` folder, choose the `python3` file). After this, the blue bar will look like this:
 37 | 
 38 | ![extension_tutorial_3](https://github.com/user-attachments/assets/acf20a35-b365-4f25-a421-4df42cc078b2)
 39 | 
 40 | # Project Overview
 41 | 
 42 | <div align="left">
 43 |   <img src="https://github.com/user-attachments/assets/e87be0da-b897-420e-ad9a-65fc24d53bcf" alt="Badge" width="500">
 44 | </div>
 45 | 
 46 | This is a personal project I created to practice using dbt. In my company, we recently started using the tool, and I picked up some tips from coworkers 
 47 | and by watching free courses online. Since I always like to create a basic project to get hands-on with tools I work with, I decided to create a little 
 48 | project called PokéMart.
 49 | 
 50 | The simulation in this educational project will involve using data from the PokéMart OLAP database to create some basic analytical views, just to test 
 51 | the fundamentals of dbt (e.g., using CTEs for some data processing, utilizing documentation features, and exploring lineage capabilities).
 52 | 
 53 | # Models
 54 | 
 55 | Below are the model codes I created for this project.
 56 | 
 57 | ![models_path](https://github.com/user-attachments/assets/97a01a6c-7ded-49c4-9f21-774d8b286825)
 58 | 
 59 | Here is how I configured the `sources` file:
 60 | 
 61 | ```yml
 62 | version: 2
 63 | 
 64 | sources:
 65 |   - name: pokemart
 66 |     schema: main
 67 |     tables:
 68 |       - name: dimCustomers
 69 |       - name: dimProducts
 70 |       - name: dimProdCategories
 71 |       - name: dimTime
 72 |       - name: factSales
 73 | ```
 74 | 
 75 | ## dimCustomersView.sql
 76 | 
 77 | ```sql
 78 | SELECT *
 79 | FROM {{ source('pokemart', 'dimCustomers') }}
 80 | ```
 81 | 
 82 | ## dimProductsView.sql
 83 | 
 84 | ```sql
 85 | SELECT *
 86 | FROM {{ source('pokemart', 'dimProducts') }}
 87 | ```
 88 | ## dimProdCategoriesView.sql
 89 | 
 90 | ```sql
 91 | SELECT *
 92 | FROM {{ source('pokemart', 'dimProdCategories') }}
 93 | ```
 94 | ## dimTimeView.sql
 95 | 
 96 | ```sql
 97 | SELECT *
 98 | FROM {{ source('pokemart', 'dimTime') }}
 99 | ```
100 | 
101 | ## factSalesView.sql
102 | 
103 | ```sql
104 | SELECT *
105 | FROM {{ source('pokemart', 'factSales') }}
106 | ```
107 | ## salesConsolidated.sql
108 | 
109 | ```sql
110 | WITH sales_consolidated AS (
111 |     SELECT
112 |         fs.saleSK,  
113 |         p.productName,
114 |         pc.categoryName,
115 |         c.customerName,
116 |         t.saleDate,
117 |         fs.quantity,
118 |         fs.totalPrice
119 |     FROM
120 |         {{ ref('factSalesView') }} fs
121 |     JOIN
122 |         {{ ref('dimProductsView') }} p ON fs.productSK = p.productSK  
123 |     JOIN
124 |         {{ ref('dimProdCategoriesView') }} pc ON p.categorySK = pc.categorySK  
125 |     JOIN
126 |         {{ ref('dimCustomersView') }} c ON fs.customerSK = c.customerSK  
127 |     JOIN
128 |         {{ ref('dimTimeView') }} t ON fs.saleDate = t.timeSK  
129 | )
130 | 
131 | SELECT sc.saleSK    
132 |        , sc.productName
133 |        , sc.categoryName
134 |        , sc.customerName
135 |        , CAST(sc.saleDate AS DATE) AS saleDate
136 |        , sc.quantity
137 |        , sc.totalPrice
138 | FROM sales_consolidated sc
139 | ```
140 | 
141 | ## monthlySales.sql
142 | 
143 | ```sql
144 | WITH 
145 | monthly_sales AS (
146 |     SELECT 
147 |         EXTRACT(YEAR FROM saleDate) AS year,
148 |         EXTRACT(MONTH FROM saleDate) AS month,
149 |         SUM(totalPrice) AS monthlyTotal
150 |     FROM 
151 |         {{ ref('salesConsolidated') }}
152 |     GROUP BY 
153 |         year, month
154 |     ORDER BY 
155 |         year, month
156 | )
157 | 
158 | SELECT 
159 |     year,
160 |     month,
161 |     monthlyTotal,
162 |     SUM(monthlyTotal) OVER (ORDER BY year, month) AS accumulatedSales
163 | FROM 
164 |     monthly_sales
165 | ORDER BY 
166 |     year, month
167 | ```
168 | 
169 | ## mostQuantitySold.sql
170 | 
171 | ```sql
172 | SELECT 
173 |     productName,
174 |     SUM(quantity) AS totalQuantity
175 | FROM 
176 |     {{ ref('salesConsolidated') }}
177 | GROUP BY 
178 |     productName
179 | ORDER BY 
180 |     totalQuantity DESC
181 | ```
182 | 
183 | ## mostSoldCategories.sql
184 | 
185 | ```sql
186 | SELECT 
187 |     categoryName,
188 |     SUM(totalPrice) AS totalSalesValue,
189 |     SUM(quantity) AS totalQuantitySold
190 | FROM 
191 |     {{ ref('salesConsolidated') }}
192 | GROUP BY 
193 |     categoryName
194 | ORDER BY 
195 |     totalSalesValue DESC
196 | ```
197 | 
198 | ## mostSoldProducts.sql
199 | 
200 | ```sql
201 | WITH 
202 | most_sold_products AS(
203 | SELECT 
204 |     productName,
205 |     SUM(totalPrice) AS totalRevenue
206 | FROM 
207 |     {{ ref('salesConsolidated') }}
208 | GROUP BY 
209 |     productName
210 | ORDER BY 
211 |     totalRevenue DESC
212 | )
213 | 
214 | SELECT * FROM most_sold_products
215 | ```
216 | 
217 | ## topBuyers.sql
218 | 
219 | ```sql
220 | SELECT 
221 |     customerName,
222 |     SUM(totalPrice) AS totalSpent,
223 |     COUNT(DISTINCT saleSK) AS totalPurchases
224 | FROM 
225 |     {{ ref('salesConsolidated') }}
226 | GROUP BY 
227 |     customerName
228 | ORDER BY 
229 |     totalSpent DESC
230 | ```
231 | 
232 | # Data Lineage
233 | 
234 | ![lineage](https://github.com/user-attachments/assets/aa253e8b-3210-4c95-a094-efd775860d84)
235 | 
236 | # Query Results
237 | 
238 | ## dimCustomers
239 | 
240 | ![query_dim_customers](https://github.com/user-attachments/assets/36fa4102-846b-4701-821c-ad440b6eb730)
241 | 
242 | ## dimProducts
243 | 
244 | ![query_dim_products](https://github.com/user-attachments/assets/bbf59225-6e2c-4c1d-be52-b986bdfb677f)
245 | 
246 | ## dimProductCategory
247 | 
248 | ![query_dim_product_cateogry](https://github.com/user-attachments/assets/cb3f2fa8-0a89-4924-9f45-776e27923c9d)
249 | 
250 | ## dimTime
251 | 
252 | ![query_dim_time](https://github.com/user-attachments/assets/340d2e04-2290-4833-b930-716ebe46f915)
253 | 
254 | ## factSales
255 | 
256 | ![query_fact_sales](https://github.com/user-attachments/assets/face9a12-66de-4a08-bbfa-91821fcaab1d)
257 | 
258 | ## monthlySales
259 | 
260 | ![query_monthly_sales](https://github.com/user-attachments/assets/7ef0d60b-d018-4722-8fa9-9814bebeb553)
261 | 
262 | ## mostQuantitySold
263 | 
264 | ![query_most_quantity_sold](https://github.com/user-attachments/assets/b1ae3602-9a03-483b-8f0c-6c5893d8b84d)
265 | 
266 | ## mostSoldCategories
267 | 
268 | ![query_most_sold_categories](https://github.com/user-attachments/assets/46e68921-7158-41dc-8bd0-b3cdd6d7f0cf)
269 | 
270 | ## mostSoldProducts
271 | 
272 | ![query_most_sold_products](https://github.com/user-attachments/assets/575a82e8-28d9-4109-ad70-ac4ea17f9799)
273 | 
274 | ## topBuyers
275 | 
276 | ![query_top_buyers](https://github.com/user-attachments/assets/eb49286a-0e55-4f9e-85ac-781f82bced1c)
277 | 
278 | # Documenting
279 | 
280 | ![documentation](https://github.com/user-attachments/assets/b3d63f67-09c4-4626-b7f5-ce00f2200ea3)
281 | 
282 | I documented only one model for educational purposes. The extension I use in VSCode makes documentation creation easier 
283 | (or you can edit it directly in the `sources`). If you have an API key for the AI used, you can use it to automatically 
284 | describe and document the project. Since I don’t use this AI, I documented it manually, but it’s an interesting integration.
285 | 
286 | Now let's see what happened to the `sources.yml` (it must be .yml, mine did not work before because I created as .yaml).
287 | 
288 | ```yml
289 | version: 2
290 | 
291 | sources:
292 |   - name: pokemart
293 |     schema: main
294 |     tables:
295 |       - name: dimCustomers
296 |       - name: dimProducts
297 |       - name: dimProdCategories
298 |       - name: dimTime
299 |       - name: factSales
300 | models:
301 |   - name: salesConsolidated
302 |     description: This is an analytical view of the consolidated fact table (we have the dimension information joined directly into the fact table).
303 |     columns:
304 |       - name: saleSK
305 |         description: SK key for each sale.
306 |         data_type: integer
307 |         quote: true
308 |       - name: productName
309 |         description: Describes the name of the product sold.
310 |         data_type: varchar
311 |         quote: true
312 |       - name: categoryName
313 |         description: Describes the name of the product's category.
314 |         data_type: varchar
315 |         quote: true
316 |       - name: customerName
317 |         description: Describes the name of the person who bought the product.
318 |         data_type: varchar
319 |         quote: true
320 |       - name: saleDate
321 |         description: Describes the date when the sale happened.
322 |         data_type: date
323 |         quote: true
324 |       - name: quantity
325 |         description: Describes the total amount of itens sold.
326 |         data_type: integer
327 |       - name: totalPrice
328 |         description: Describes the total price of the products sold.
329 |         data_type: decimal(10,2)
330 |         quote: true
331 | ```
332 | 
333 | 
334 | 
335 | 
336 | 
337 | 
338 | 
339 | 
340 | 


--------------------------------------------------------------------------------