├── .DS_Store ├── .gitignore ├── .vscode └── settings.json ├── AWS Glue ├── .DS_Store ├── img │ ├── .DS_Store │ ├── aws s3 partitions.png │ └── glue architecture.png └── readme ├── AWS Lambda ├── .DS_Store ├── LatLongLog │ ├── .aws-sam │ │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ │ └── event.json │ ├── hello_world │ │ ├── __init__.py │ │ ├── app.py │ │ └── requirements.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ │ ├── __init__.py │ │ ├── integration │ │ ├── __init__.py │ │ └── test_api_gateway.py │ │ ├── requirements.txt │ │ └── unit │ │ ├── __init__.py │ │ └── test_handler.py ├── Readme.md ├── getTicket │ ├── .aws-sam │ │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ │ └── event.json │ ├── hello_world │ │ ├── __init__.py │ │ ├── app.py │ │ └── requirements.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ │ ├── __init__.py │ │ ├── integration │ │ ├── __init__.py │ │ └── test_api_gateway.py │ │ ├── requirements.txt │ │ └── unit │ │ ├── __init__.py │ │ └── test_handler.py ├── lambdatemplateSAM │ ├── .aws-sam │ │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ │ └── event.json │ ├── hello_world │ │ ├── __init__.py │ │ ├── app.py │ │ ├── datatypes.py │ │ └── requirements.txt │ ├── outputfile.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ │ ├── __init__.py │ │ ├── integration │ │ ├── __init__.py │ │ └── test_api_gateway.py │ │ ├── requirements.txt │ │ └── unit │ │ ├── __init__.py │ │ └── test_handler.py └── orders-api │ ├── .aws-sam │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ └── event.json │ ├── orders_api │ ├── __init__.py │ ├── app.py │ ├── create.py │ ├── read.py │ └── requirements.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ ├── __init__.py │ ├── integration │ ├── __init__.py │ └── test_api_gateway.py │ ├── requirements.txt │ └── unit │ ├── __init__.py │ └── test_handler.py ├── Airflow ├── .DS_Store ├── DAG Authoring Certification │ ├── .DS_Store │ ├── .astro │ │ └── config.yaml │ ├── .dockerignore │ ├── .gitignore │ └── readme.md ├── Fundamentals Certification │ ├── oper_sensors_sample_dag.py │ ├── readme.md │ ├── sensor_operator_dag.py │ └── simple_dag.py ├── GoogleCloudComposer │ └── dag_bq_gcs.py └── img │ ├── .DS_Store │ ├── Chain Operator.png │ ├── Python Branch Operator.png │ ├── celery cluster.jpg │ ├── multi_node.jpg │ ├── one_node.jpg │ └── task_instance_view.PNG ├── Apache Beam ├── .DS_Store ├── Bach Processing │ ├── 4.4 - Batch DirectRunner + GCS.py │ ├── 4.5 - Batch Dataflow + GCS.py │ ├── 4.7 - Batch Dataflow + BigQuery.py │ ├── comando deploy.txt │ └── unnest.py ├── Main Functions │ ├── 3.1 - Setup Colab.py │ ├── 3.10 - ParDo.py │ ├── 3.2 - Create.py │ ├── 3.2 - Read Transform.py │ ├── 3.3 - Write Transform.py │ ├── 3.4 - FlatMap.py │ ├── 3.4 - Map.py │ ├── 3.5 - Filter Lambda.py │ ├── 3.5 - Filter Lista.py │ ├── 3.6 - Flatten.py │ ├── 3.7 - CombinePerKey.py │ ├── 3.8 - Combiners.Count.Perkey.py │ ├── 3.9 - CoGroupByKey.py │ ├── Poem.txt │ └── voos_sample.csv ├── README.md └── Streaming Processing │ ├── 5.2.0 - Data Generator.py │ ├── 5.2.1 - Voos Streaming DF + Pubsub.py │ ├── 5.3 - Voos Streaming DF + BQ.py │ ├── 5.4 - Janelas e Noções de Tempo para Streaming.pptx │ ├── 5.5 - Tumbling Window DF + BQ.py │ ├── 5.6 - Sliding Window DF + BQ.py │ ├── Sliding_processor.py │ └── streaming janelas.xlsx ├── Data-Eng-Bootcamp ├── 17 - Introduction to Relational DB.md ├── 19 - 21 
game.scala ├── 19 - Introduction to Scala.md ├── 2 - Introduction to Data Egineering.md └── 6 - Introduction to Shell Script.md ├── Databricks ├── LAB DATABRICKS FROM A TO Z .md ├── Readme.md ├── SCALA ETL Part 1 - Data Extraction.md ├── SCALA ETL Part 2 - Data Transformation.md ├── SPARK ETL Part 1 - Data Extraction.md ├── SPARK SQL.md └── Scala Exercises │ ├── People_vs_Execises.scala │ ├── exercises.json │ ├── notebook.html │ ├── people.json │ └── readme.md ├── Docker ├── .DS_Store ├── img │ ├── .DS_Store │ ├── Containerrization PID.png │ ├── container-layer.png │ ├── docker-compose-versions.png │ ├── docker-networks.png │ ├── docker-volume.png │ ├── port-mapping.png │ └── voting-app-diagram.png ├── python-sample-app │ ├── Dockerfile │ ├── my-app.py │ └── requirements.txt └── readme.md ├── Fundamentals of Data Engineering ├── .DS_Store ├── 1-Foundations.md ├── 2-DE Lifecycle.md ├── 3-Data Architecture.md ├── 4-Chooing Tech Across DE Lifecycle.md ├── readme.md └── src │ ├── DE_lifecycle.png │ ├── DE_stakeholders.png │ └── monolith_vs_ms_arch.png ├── Git ├── .DS_Store ├── Git CI Fundamentals.md ├── gitlab-ci-chapter-3.yml ├── gitlab-ci-chapter-4.yml ├── gitlab-ci-my-first-pipeline.yml ├── img │ ├── .DS_Store │ └── CI CD Pipeline.png └── readme.md ├── Installing-components.md ├── Kafka └── Apache Kafka Fundamentals.md ├── Kubernetes ├── Kubernetes Up and Running │ ├── 2 │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── package.json │ │ └── server.js │ └── kubernetes_up_and_running.md ├── img │ └── kubernetes-architecture.png ├── k8s_and_data.md ├── k8s_minikube.md └── readme.md ├── README.md ├── Snowflake ├── .DS_Store ├── curated-links-list.md ├── live-sf-getting-started.sql ├── readme.md ├── the-snowflake-series-course │ ├── COPY.sql │ ├── FILE FORMAT.sql │ ├── FLATTEN.sql │ ├── INSERT.sql │ ├── LOAD JSON.sql │ ├── PARSE JSON.sql │ ├── STAGES.sql │ ├── STORAGE INTEGRATION.sql │ ├── TRANSFORMATION WITH COPY.sql │ ├── automacao │ │ ├── SNOWPIPE.sql │ │ ├── TASK TREE.sql │ │ └── TASK.SQL │ └── test.sql └── ws │ ├── extras │ └── extras.sql │ ├── ingestion │ ├── 1-storage-integration.sql │ ├── 2-stage-and-file-format.sql │ ├── 3-copy-command.sql │ └── 4-snowpipe.sql │ └── transformation │ ├── 1-semi-structured.sql │ ├── 2-tasks.sql │ ├── 3-duplicates.sql │ ├── 4-streams.sql │ ├── 5-streams+tasks.sql │ └── 6-refactoring.sql ├── Sql ├── SQL HACKER RANK TRAINING.sql └── WINDOW FUNCTIONS.sql ├── Terraform ├── .DS_Store ├── 05-language-features │ └── README.md ├── 06-organization-and-modules │ ├── .DS_Store │ ├── README.md │ ├── consul │ │ ├── README.md │ │ └── main.tf │ ├── web-app-module │ │ ├── compute.tf │ │ ├── database.tf │ │ ├── dns.tf │ │ ├── main.tf │ │ ├── networking.tf │ │ ├── outputs.tf │ │ ├── storage.tf │ │ └── variables.tf │ └── web-app │ │ └── main.tf ├── 2-first-tf-deployment │ ├── main.tf │ └── terraform.tfstate ├── 3-remote-backend │ ├── main.tf │ └── terraform.tfstate ├── 3-web-app │ ├── .terraform │ │ └── terraform.tfstate │ ├── errored.tfstate │ ├── main.tf │ └── web-app-architecture.png ├── 4-variables-and-outputs │ ├── examples │ │ ├── another-variable-file.tfvars │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── terraform.tfvars │ │ └── variables.tf │ └── web-app │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── terraform.tfstate │ │ ├── terraform.tfvars │ │ └── variables.tf ├── img │ ├── .DS_Store │ └── 2_2_terraform_architecture.png └── readme.md ├── airbyte-dbt-airflow-snowflake ├── .DS_Store ├── .env ├── airbyte-docker-compose.yml ├── flags.yml ├── readme.md └── temporal │ └── 
dynamicconfig │ └── development.yaml ├── dbt ├── .DS_Store ├── dbt-fundamentals-course │ ├── dim_customer-3.sql │ ├── dim_customers-2.sql │ ├── dim_customers.sql │ ├── fct_orders.sql │ ├── img │ │ ├── dbt_and_alatycs_engineer.png │ │ ├── dbt_workflow.png │ │ ├── ineage_graph.png │ │ └── modern_data_platform.png │ ├── insert_sf_dbt_training_data.sql │ ├── jaffle_shop.md │ ├── readme.md │ ├── src_jaffle_shop-2.yml │ ├── src_jaffle_shop-3.yml │ ├── src_jaffle_shop-4.yml │ ├── src_jaffle_shop.yml │ ├── src_stripe.yml │ ├── stg_customers.sql │ ├── stg_jaffle_shop-2.yml │ ├── stg_jaffle_shop.yml │ ├── stg_orders.sql │ └── stg_payments.sql └── dbt-udemy-course │ ├── .DS_Store │ ├── Notes.md │ ├── dbt-course-udemy │ ├── .DS_Store │ ├── .dbt │ │ ├── .user.yml │ │ └── profiles.yml │ ├── dbt_project │ │ ├── .DS_Store │ │ ├── .gitignore │ │ ├── README.md │ │ ├── analyses │ │ │ ├── .gitkeep │ │ │ └── full_moon_no_sleep.sql │ │ ├── assets │ │ │ └── input_schema.png │ │ ├── dbt_project.yml │ │ ├── macros │ │ │ ├── .gitkeep │ │ │ ├── no_nulls_in_columns.sql │ │ │ └── positive_values.sql │ │ ├── models │ │ │ ├── dim │ │ │ │ ├── dim_hosts_cleansed.sql │ │ │ │ ├── dim_listings_cleansed.sql │ │ │ │ └── dim_listings_w_hosts.sql │ │ │ ├── docs.md │ │ │ ├── fct │ │ │ │ └── fct_reviews.sql │ │ │ ├── mart │ │ │ │ └── mart_fullmoon_reviews.sql │ │ │ ├── overview.md │ │ │ ├── schema.yml │ │ │ ├── sources.yml │ │ │ └── src │ │ │ │ ├── src_hosts.sql │ │ │ │ ├── src_listings.sql │ │ │ │ └── src_reviews.sql │ │ ├── package-lock.yml │ │ ├── packages.yml │ │ ├── seeds │ │ │ ├── .gitkeep │ │ │ └── seed_full_moon_dates.csv │ │ ├── snapshots │ │ │ ├── .gitkeep │ │ │ └── scd_raw_listings.sql │ │ └── tests │ │ │ ├── .gitkeep │ │ │ ├── consistent_created_at.sql │ │ │ ├── dim_listings_minimum_nights.sql │ │ │ └── no_nulls_in_dim_linstings.sql │ └── logs │ │ └── dbt.log │ ├── readme.md │ └── setup.md ├── pytest ├── first_test.py ├── gtfs_test.py ├── multiple_tests.py ├── pytest.ini ├── readme.md ├── test_api.py ├── test_fixtures.py └── test_parametrized.py ├── spark_on_google_colab.py └── src └── img ├── 1 - Intro to data Engineering ├── fig 1 - Pipeline.JPG ├── fig 2 - Star Schema.JPG ├── fig 3 - Parallel Computing.JPG ├── fig 4 - DAG Example Air Flow.JPG ├── fig 5 - MPP.JPG └── fig 6 - AirFlow UI.JPG ├── 17 - Introduction to Relational DB ├── 1_n_relationship.jpg ├── ENTITY_MODELS.jpg ├── n_n_relationships.jpg └── postgree_datatypes.jpg ├── 17 - Introduction to Scala ├── 21_game_points.jpg └── Scala_Interpreter.jpg ├── 2 - Streamlined data with pandas ├── fig 1 - Dataframe.JPG ├── fig 2 - Loading Excel.JPG ├── fig 3 - Datetime Table.JPG ├── fig 4 - Datetime Formatting.JPG ├── fig 5 - JSON Object Oriented.JPG ├── fig 6 - JSON Column Oriented.JPG └── fig 7 - Yelp Documentation.JPG ├── 3 - Software Engineering in Python ├── fig 1 - Local package structure.JPG ├── fig 1 - PEP and non PEP codes.JPG ├── fig 3 - Portable package structure.JPG ├── fig 4 - Anatomy of classes.JPG └── fig 5 - Inheritance.JPG ├── 5 - Introduction to Shell Script ├── fig 1 - paste command.JPG ├── fig 2 - wrap up manipulating data.JPG ├── fig 3 - multiple actions loop.JPG └── fig 4 - nano interface.JPG ├── SCALA ETL Part 1 ├── imperative_vs_functional_Programing.jpg ├── spark drop malformed mode.jpg └── spark permissive mode.jpg ├── SCALA ETL Part 2 └── image norrmalization.jpg ├── Weather_Data_Pipeline └── Weather API v1.0.jpg └── kafka_fundamentals ├── broker.jpg ├── kafka architecture.jpg └── topics.jpg /.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hcl 2 | *.backup 3 | *x5 4 | Snowflake/.DS_Store 5 | Snowflake/.DS_Store 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true, 3 | "dbt.queryLimit": 500 4 | } -------------------------------------------------------------------------------- /AWS Glue/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/.DS_Store -------------------------------------------------------------------------------- /AWS Glue/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/img/.DS_Store -------------------------------------------------------------------------------- /AWS Glue/img/aws s3 partitions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/img/aws s3 partitions.png -------------------------------------------------------------------------------- /AWS Glue/img/glue architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/img/glue architecture.png -------------------------------------------------------------------------------- /AWS Glue/readme: -------------------------------------------------------------------------------- 1 | ## What is Glue 2 | * Fully managed ETL service 3 | * Consists of a central metadata catalog repository (the Glue Data Catalog) 4 | * Spark ETL engine 5 | * Flexible scheduler 6 | 7 | # IMAGE glue architecture 8 | 9 | ## AWS Glue Data Catalog 10 | * Persistent metadata store 11 | * Managed service to store, annotate and share metadata which can be used to query and transform data 12 | * One Glue catalog per AWS Region 13 | * IAM policies control access 14 | * Can be used for data governance 15 | * Stores: data location, schema, data types, data classification... 16 | * Database in Glue 17 | * A set of associated table definitions in the Data Catalog, organized into logical groups 18 | * It just organizes the data logically; it doesn't actually move any data 19 | * The metadata definitions that represent your data. Data resides in its original store; this is just a representation of the schema 20 | 21 | ### Create Database Practices 22 | * Organize Glue databases in respective folders in S3 23 | * All files for a database should be under that folder, for organization 24 | 25 | ## AWS Partitions 26 | * Folders where data is stored on S3, which are physical entities, are mapped to partitions, which are logical entities,
i.e., they appear as columns in the Glue table 27 | * Helps to query data more efficiently because we only scan the necessary data 28 | 29 | # IMAGE s3 partitions 30 | Sample folder structure: 31 | * sales 32 | * 2019 33 | * july 34 | * august 35 | * 01 36 | * 02 37 | * 03 38 | 39 | ## AWS Glue Crawler 40 | * A program that connects to a data store (source or target), progresses through a prioritized list of classifiers to determine the schema for your data, and then creates metadata tables in the AWS Glue Data Catalog 41 | * Tables can be added manually to the Data Catalog (column by column, along with their formats) 42 | * Tables can also be added via a crawler 43 | 44 | ## AWS Glue Jobs 45 | * Store the business logic required for ETL 46 | * A job is composed of a transformation script, data sources, and data targets 47 | * Job runs are initiated by triggers that can be scheduled or triggered by events 48 | * Found under the Jobs section 49 | * You can select the engine, the temp folder, and the scripts folder 50 | 51 | ## AWS Triggers 52 | * Create triggers for Glue Jobs 53 | * Triggers can be event based or schedule based 54 | 55 | ## AWS DEV Endpoint 56 | * Creates a cluster to develop and test scripts locally, sending the code to run on this cluster on AWS 57 | * Costs can scale very easily -------------------------------------------------------------------------------- /AWS Lambda/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/.DS_Store -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.2ed2c419-0f5d-4864-8032-6061cf637252] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\LatLongLog\\hello_world" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["LatLongLog"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/events/event.json: -------------------------------------------------------------------------------- 1 | [1.156123,-54.55952] -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/hello_world/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/hello_world/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/hello_world/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # import requests 4 | 5 | 6 | def lambda_handler(event, context): 7 | """Sample pure Lambda function 8 | 9 | Parameters 10 | ---------- 11 | event: dict, required 12 | API Gateway Lambda Proxy Input Format 13 | 14
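A minimal boto3 sketch of the crawler-and-job flow described in the AWS Glue readme above: create a Data Catalog database, crawl a partitioned S3 prefix, then start a job. The database, crawler, IAM role, bucket, and job names are hypothetical placeholders, not resources from this repo.

```python
import boto3

# Hypothetical names -- adjust to your own account; nothing here exists in this repo.
glue = boto3.client("glue", region_name="us-east-1")

# 1. Create a database in the Glue Data Catalog (a logical grouping of table definitions).
glue.create_database(DatabaseInput={"Name": "sales_db"})

# 2. Create and start a crawler that infers the schema from a partitioned S3 prefix
#    (e.g. s3://my-bucket/sales/2019/july/01/) and writes table definitions to the catalog.
glue.create_crawler(
    Name="sales_crawler",
    Role="arn:aws:iam::123456789012:role/GlueCrawlerRole",  # assumed IAM role
    DatabaseName="sales_db",
    Targets={"S3Targets": [{"Path": "s3://my-bucket/sales/"}]},
)
glue.start_crawler(Name="sales_crawler")

# 3. Kick off an ETL job run; the job itself would be defined in the Glue console or via create_job.
glue.start_job_run(JobName="sales_etl_job")
```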
| Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format 15 | 16 | context: object, required 17 | Lambda Context runtime methods and attributes 18 | 19 | Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html 20 | 21 | Returns 22 | ------ 23 | API Gateway Lambda Proxy Output Format: dict 24 | 25 | Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html 26 | """ 27 | # considering every event will have a list with only 2 values 28 | lat = event[0] 29 | long = event[1] 30 | log = f"Current Coordinates = LAT:{lat} , LONG:{long}" 31 | return log -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/hello_world/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "LatLongLog" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "LatLongLog" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | LatLongLog 5 | 6 | Sample SAM Template for LatLongLog 7 | 8 | Resources: 9 | LatLongLog: 10 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 11 | Properties: 12 | CodeUri: hello_world/ 13 | Handler: app.lambda_handler 14 | Runtime: python3.9 15 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test.
9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.' 40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | 
"X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.08eec71e-2945-4a9d-9e9e-90e2225e341d] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\getTicket\\hello_world" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["GetTicketFunction"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/events/event.json: -------------------------------------------------------------------------------- 1 | ["Cassio"] -------------------------------------------------------------------------------- /AWS Lambda/getTicket/hello_world/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/hello_world/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/hello_world/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def lambda_handler(event, context): 5 | #print(event) 6 | student_scores = {"Cassio" : 100, "Victoria" : 100, "Garfinho" : 90 } 7 | score_list = [] 8 | for name in event: 9 | score_list.append(student_scores[name]) 10 | return score_list 11 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/hello_world/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /AWS Lambda/getTicket/samconfig.toml: 
-------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "GetTicket" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "GetTicket" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | getTicket 5 | 6 | Sample SAM Template for getTicket 7 | 8 | Resources: 9 | GetTicketFunction: 10 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 11 | Properties: 12 | CodeUri: hello_world/ 13 | Handler: app.lambda_handler 14 | Runtime: python3.9 15 | 16 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test. 9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.' 
40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert 
ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.3ea77c0a-a539-45af-8a65-12dbfaa05af7] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\lambdatemplateSAM\\hello_world" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["HelloWorldFunction"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/events/event.json: -------------------------------------------------------------------------------- 1 | {"Cassio" : [100,10,20], "Victoria" : 100, "Garfinho" : 90 } -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/hello_world/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def first_lambda(event, context): 5 | return "hello "+event 6 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/datatypes.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os # to get env variables created from template.yml 3 | import random 4 | 5 | def simple_types(event,context): 6 | print(event) 7 | return event 8 | 9 | def list_types(event,context): 10 | print(event) 11 | student_scores = {"Cassio" : 100, "Victoria" : 100, "Garfinho" : 90 } 12 | score_list = [] 13 | for name in event: 14 | score_list.append(student_scores[name]) 15 | return score_list 16 | 17 | def dict_types(event,context): 18 | for score in event["Cassio"]: 19 | print(score) 20 | return event 21 | 22 | def context_example(event, context): 23 | print("Lambda function ARN:", context.invoked_function_arn) 24 | print("CloudWatch log stream name:", context.log_stream_name) 25 | print("CloudWatch log group name:", context.log_group_name) 26 | print("Lambda Request ID:", context.aws_request_id) 27 | print("Lambda function memory limits in MB:", context.memory_limit_in_mb) 28 | # We have added a 1 second delay so you can see the time remaining in get_remaining_time_in_millis. 
29 | time.sleep(1) 30 | print("Lambda time remaining in MS:", context.get_remaining_time_in_millis()) 31 | print(os.getenv('restapi')) 32 | return context.invoked_function_arn 33 | 34 | global_var = random.random() 35 | def cold_start(event,context): 36 | exec_time_var = random.random() 37 | return global_var,exec_time_var -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/outputfile.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/outputfile.txt -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "firstlambda" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "firstlambda" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' # mandatory 2 | Transform: AWS::Serverless-2016-10-31 # mandatory -> tells cloud formation which template this is about 3 | Description: > 4 | lambdatemplateSAM 5 | 6 | Sample SAM Template for lambdatemplateSAM 7 | 8 | Resources: # mandatory, every resource used should be declared here, like api gateway, dynamo db... 
9 | HelloWorldFunction: 10 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 11 | Properties: 12 | CodeUri: hello_world/ 13 | Handler: datatypes.cold_start 14 | Runtime: python3.9 15 | Timeout: 3 # maximum is 15min 16 | Environment: # pass, under Environment, variables your function can retrieve 17 | Variables: 18 | restapi: http://dummy.com 19 | dbname: mydb -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test. 9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.'
40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = 
json.loads(ret["body"]) 69 | 70 | assert ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.8eb773ee-21d0-4a34-956a-674265864e83] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\orders-api\\orders_api" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["CreateOrderFunction", "ReadOrderFunction"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "body": "{\"message\": \"hello world\"}", 3 | "resource": "/hello", 4 | "path": "/hello", 5 | "httpMethod": "GET", 6 | "isBase64Encoded": false, 7 | "queryStringParameters": { 8 | "foo": "bar" 9 | }, 10 | "pathParameters": { 11 | "proxy": "/path/to/resource" 12 | }, 13 | "stageVariables": { 14 | "baz": "qux" 15 | }, 16 | "headers": { 17 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 18 | "Accept-Encoding": "gzip, deflate, sdch", 19 | "Accept-Language": "en-US,en;q=0.8", 20 | "Cache-Control": "max-age=0", 21 | "CloudFront-Forwarded-Proto": "https", 22 | "CloudFront-Is-Desktop-Viewer": "true", 23 | "CloudFront-Is-Mobile-Viewer": "false", 24 | "CloudFront-Is-SmartTV-Viewer": "false", 25 | "CloudFront-Is-Tablet-Viewer": "false", 26 | "CloudFront-Viewer-Country": "US", 27 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 28 | "Upgrade-Insecure-Requests": "1", 29 | "User-Agent": "Custom User Agent String", 30 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 31 | "X-Amz-Cf-Id": "cDehVQoZnx43VYQb9j2-nvCh-9z396Uhbp027Y2JvkCPNLmGJHqlaA==", 32 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 33 | "X-Forwarded-Port": "443", 34 | "X-Forwarded-Proto": "https" 35 | }, 36 | "requestContext": { 37 | "accountId": "123456789012", 38 | "resourceId": "123456", 39 | "stage": "prod", 40 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 41 | "requestTime": "09/Apr/2015:12:34:56 +0000", 42 | "requestTimeEpoch": 1428582896000, 43 | "identity": { 44 | "cognitoIdentityPoolId": null, 45 | "accountId": null, 46 | "cognitoIdentityId": null, 47 | "caller": null, 48 | "accessKey": null, 49 | "sourceIp": "127.0.0.1", 50 | "cognitoAuthenticationType": null, 51 | "cognitoAuthenticationProvider": null, 52 | "userArn": null, 53 | "userAgent": "Custom User Agent String", 54 | "user": null 55 | }, 56 | "path": "/prod/hello", 57 | "resourcePath": "/hello", 58 | "httpMethod": "POST", 59 | "apiId": "1234567890", 60 | "protocol": "HTTP/1.1" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /AWS 
Lambda/orders-api/orders_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/orders_api/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # import requests 4 | 5 | 6 | def lambda_handler(event, context): 7 | """Sample pure Lambda function 8 | 9 | Parameters 10 | ---------- 11 | event: dict, required 12 | API Gateway Lambda Proxy Input Format 13 | 14 | Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format 15 | 16 | context: object, required 17 | Lambda Context runtime methods and attributes 18 | 19 | Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html 20 | 21 | Returns 22 | ------ 23 | API Gateway Lambda Proxy Output Format: dict 24 | 25 | Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html 26 | """ 27 | 28 | # try: 29 | # ip = requests.get("http://checkip.amazonaws.com/") 30 | # except requests.RequestException as e: 31 | # # Send some context about this error to Lambda Logs 32 | # print(e) 33 | 34 | # raise e 35 | 36 | return { 37 | "statusCode": 200, 38 | "body": json.dumps({ 39 | "message": "hello world", 40 | # "location": ip.text.replace("\n", "") 41 | }), 42 | } 43 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/create.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import os 4 | 5 | def lambda_handler(event, context): 6 | order = json.loads(event['body']) 7 | 8 | dynamodb = boto3.resource('dynamodb') # tell boto what resource you are using 9 | table_name = os.environ.get('ORDERS_TABLE') # get the value from global function defined on yml 10 | table = dynamodb.Table(table_name) # get table from dynamo db 11 | response = table.put_item(TableName = table_name, Item=order) #use put item method on table, to put the order coming from body 12 | print(response) # write response to the logs 13 | return { 14 | 'statusCode': 201, 15 | 'headers': {}, 16 | 'body': json.dumps({'messagem':'order created'}) 17 | } -------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/read.py: -------------------------------------------------------------------------------- 1 | import simplejson as json 2 | import boto3 3 | import os 4 | from boto3.dynamodb.conditions import Key 5 | 6 | def lambda_handler(event, context): 7 | order = {"id" : 123 , "itemName" : "McBook", "quantity" : 100} 8 | dynamodb = boto3.resource('dynamodb') # tell boto what resource you are using 9 | table_name = os.environ.get('ORDERS_TABLE') # get the value from global function defined on yml 10 | table = dynamodb.Table(table_name) # get table from dynamo db 11 | order_id = int(event['pathParameters']['id']) # read the id passed in the api call 12 | response = table.query(KeyConditionExpression=Key('id').eq(order_id)) # query the order id 13 | 14 | 15 | return { 16 | 'statusCode': 201, 17 | 'headers': {}, 18 | 'body': json.dumps(response['Items']) 19 | } 
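To see how create.py and read.py above fit together, here is a rough local-invocation sketch: it builds API Gateway-style events by hand and calls the two handlers directly. The ORDERS_TABLE value, the event shapes, and the working directory are assumptions based on template.yaml and the handler code, not code from this repo; a real run also needs AWS credentials and an existing DynamoDB table.

```python
import json
import os

# Assumes this script is run from the orders-api folder so the package imports resolve.
from orders_api import create, read

# Placeholder table name; the deployed SAM stack would generate its own table name
# and inject it via the ORDERS_TABLE environment variable defined in template.yaml.
os.environ["ORDERS_TABLE"] = "orders"

# create.lambda_handler expects the order JSON in the API Gateway "body" field.
create_event = {"body": json.dumps({"id": 123, "itemName": "MacBook", "quantity": 100})}
print(create.lambda_handler(create_event, None))  # -> 201 with {"messagem": "order created"}

# read.lambda_handler expects the order id as the {id} path parameter of the GET route.
read_event = {"pathParameters": {"id": "123"}}
print(read.lambda_handler(read_event, None))  # -> 201 with the matching items from DynamoDB
```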
-------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | boto3 3 | simplejson -------------------------------------------------------------------------------- /AWS Lambda/orders-api/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "orderapi" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "orderapi" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | orders-api 5 | 6 | Globals: # define settings to be reused globally in the resources 7 | Function: # declaring the resource type applies the following settings to it; in this case, all 'AWS::Serverless::Function' resources will use the settings below 8 | Runtime: python3.9 9 | Timeout: 30 10 | Environment: # block to define env VARIABLES globally for all function-type resources 11 | Variables: 12 | ORDERS_TABLE : !Ref OrdersTable # use the intrinsic function to reference the OrdersTable resource as the ORDERS_TABLE variable 13 | 14 | Resources: 15 | OrdersTable: # name in the CloudFormation stack 16 | Type: AWS::Serverless::SimpleTable # resource type 17 | Properties: # block to define the properties 18 | PrimaryKey: # only mandatory property 19 | Name: id 20 | Type: Number 21 | CreateOrderFunction: 22 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 23 | Properties: 24 | CodeUri: orders_api/ 25 | Handler: create.lambda_handler 26 | # Runtime: python3.9 # not needed here because it is set in Globals 27 | Events: 28 | CreateOrder: # name of the api trigger event 29 | Type: Api # type of event 30 | Properties: 31 | Path: /orders # path used in the api as parameter 32 | Method: POST # method used in the API 33 | Policies: # creating a security policy 34 | - DynamoDBCrudPolicy: # type of policy for this function 35 | TableName: !Ref OrdersTable # this policy is restricted to this table only 36 | ReadOrderFunction: 37 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 38 | Properties: 39 | CodeUri: orders_api/ 40 | Handler: read.lambda_handler 41 | # Runtime: python3.9 # not needed here because it is set in Globals 42 | Events: 43 | ReadOrder: # name of the api trigger event 44 | Type: Api # type of event 45 | Properties: 46 | Path: /orders/{id} # path used in the api as parameter 47 | Method: GET # method used in the API 48 | Policies: # creating a security policy 49 | - DynamoDBReadPolicy: # type of policy for this function 50 | TableName: !Ref OrdersTable # this policy is restricted to this table only 51 | 52 | # create outputs to show the URL to the end user 53 | Outputs: 54 | CreateOrdersAPI: 55 | Description: "API Gateway endpoint for creating orders" 56 | # !Sub substitutes the ${} values in the string 57 | Value: !Sub
"https://${ServerlessRestApi}.execute-api.${AWS::Region}.amazonaws.com/Prod/orders" 58 | CreateOrderFunction: # name the getatt, can be anything 59 | Description: " Get create order function ARN " 60 | Value: !GetAtt CreateOrderFunction.Arn # function name .Arn 61 | CreateOrderFunctionIamRole: # name the getatt, can be anything 62 | Description: "Get create order function role ARN" 63 | # when a function is created, it creates a role with function name + Role at the end 64 | Value: !GetAtt CreateOrderFunctionRole.Arn -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test. 9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.' 
40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert 
ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /Airflow/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/.DS_Store -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/DAG Authoring Certification/.DS_Store -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-dag-authoring-cert 3 | -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.dockerignore: -------------------------------------------------------------------------------- 1 | .astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | pod-config.yml 6 | logs/ -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | airflow_settings.yaml 4 | pod-config.yml -------------------------------------------------------------------------------- /Airflow/Fundamentals Certification/oper_sensors_sample_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python import PythonOperator 3 | from airflow.sensors.filesystem import FileSensor 4 | from airflow.operators.bash import BashOperator 5 | from datetime import datetime,timedelta 6 | from airflow.sensors.filesystem import FileSensor 7 | from airflow.models.baseoperator import chain, cross_downstream 8 | 9 | default_args = { 10 | 'retry' : 5 11 | ,'retry_delay' : timedelta(minutes=5) 12 | ,'email_on_failure': True 13 | ,'email_on_retry' : True 14 | ,'email' : 'cassio.bolba@gmail.com' 15 | } 16 | 17 | # let's check if the file myfile.txt is in the folder 18 | def _downloading_data (**kwargs): 19 | with open ('/tmp/myfile.txt','w'): 20 | f.write('my_data') 21 | return 42 22 | 23 | # call the ti to access the xcoms metadata 24 | def checking_data(ti): 25 | # call the method in ti and pass the xcoms key (can check in admin panel) and the task id where the xcoms is) 26 | my_xcoms = ti.xcom_pull(key='return_value', task_ids = ['downaloading_data']) 27 | print(my_xcoms) 28 | print('check data') 29 | 30 | def _failure(context): # context brings information about 31 | print(context) 32 | 33 | with DAG ( dag_id = 'simple_dag' 34 | ,schedule_interval = "*/10 * * * *" 35 | ,start_date = datetime(2021,1,1) 36 | ,catchup = False #disable backfilling 37 | ,default_args = default_args 38 | ) as dag: 39 | 40 | downloading_data = PythonOperator ( 41 | task_id = 'downloading_data' 42 | ,python_callable = _downloading_data 43 | ) 44 | 45 | checking_data = PythonOperator ( 46 | task_id = 'checking_data' 47 | ,python_callable = checking_data 48 | ) 49 | 50 | waiting_data = FileSensor ( 51 | task_id = 
'waiting_data' 52 | ,fs_conn_id= = 'con_id' 53 | ,filepath = 'my_file.txt' 54 | ,poke_interval = 15 55 | ) 56 | 57 | processing_data = BashOperator ( 58 | task_id = 'processing_data' 59 | ,bash_command = 'exit 0' 60 | ) 61 | 62 | downloading_data >> [ waiting_data, processing_data ] 63 | 64 | # another way to chain (not in same level) 65 | # chain( downloading_data , waiting_data, processing_data ) 66 | 67 | # creating cross dependencies 68 | # cross_downstream ( [ downloading_data, checking_data ] , [ waiting_data,processing_data ] ) -------------------------------------------------------------------------------- /Airflow/Fundamentals Certification/sensor_operator_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python import PythonOperator 3 | from airflow.sensors.filesystem import FileSensor 4 | from datetime import datetime,timedelta 5 | from airflow.sensors.filesystem import FileSensor 6 | 7 | default_args = { 8 | 'retry' : 5 9 | ,'retry_delay' : timedelta(minutes=5) 10 | } 11 | 12 | # let's check if the file myfile.txt is in the folder 13 | def _downloading_data (**kwargs): 14 | with open ('/tmp/myfile.txt','w'): 15 | f.write('my_data') 16 | 17 | 18 | with DAG ( dag_id = 'simple_dag' 19 | ,schedule_interval = "*/10 * * * *" 20 | ,start_date = datetime(2021,1,1) 21 | ,catchup = False #disable backfilling 22 | ,default_args = default_args 23 | ) as dag: 24 | 25 | downloading_data = PythonOperator ( 26 | task_id = 'downloading_data' 27 | ,python_callable = _downloading_data 28 | ) 29 | 30 | waiting_data = FileSensor ( 31 | task_id = 'waiting_data' 32 | ,fs_conn_id= = 'con_id' 33 | ,filepath = 'my_file.txt' 34 | ,poke_interval = 15 35 | ) -------------------------------------------------------------------------------- /Airflow/Fundamentals Certification/simple_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.dummy import DummyOperator 3 | from datetime import datetime,timedelta 4 | 5 | default_args = { 6 | 'retry' : 5 7 | ,'retry_delay' : timedelta(minutes=5) 8 | } 9 | 10 | with DAG ( dag_id = 'simple_dag' 11 | ,schedule_interval = "*/10 * * * *" 12 | #,schedule_interval = "@daily" 13 | #,schedule_interval = timedelta(hours=7) 14 | ,start_date = datetime(2021,1,1) 15 | ,catchup = False #disable backfilling 16 | ,default_args = default_args 17 | ) as dag: 18 | 19 | task_1 = DummyOperator ( 20 | task_id = 'task_1' 21 | ) 22 | 23 | task_2 = DummyOperator ( 24 | task_id = 'task_2' 25 | ) -------------------------------------------------------------------------------- /Airflow/GoogleCloudComposer/dag_bq_gcs.py: -------------------------------------------------------------------------------- 1 | import airflow 2 | import datetime 3 | from airflow import DAG 4 | from airflow import models 5 | from airflow.operators import BashOperator 6 | from airflow.contrib.operators import bigquery_operator 7 | from airflow.contrib.operators import bigquery_to_gcs 8 | from airflow.utils import trigger_rule 9 | 10 | default_dag_args = { 11 | 'start_date': airflow.utils.dates.days_ago(1), 12 | 'email_on_failure': False, 13 | 'email_on_retry': False, 14 | 'retries': 1, 15 | 'retry_delay' : datetime.timedelta(minutes=5), 16 | } 17 | 18 | output_file = 'gs://southamerica-east1-cassio-a-77e1beeb-bucket/data/address.csv' 19 | #Replace with your path details 20 | with DAG( 21 | dag_id='demo_bq_dag', 22 | schedule_interval = 
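    # a timedelta here means "run once every interval" measured from start_date,
    # as opposed to the cron-style strings used in the other sample DAGs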
datetime.timedelta(days = 1), 23 | default_args = default_dag_args) as dag: 24 | 25 | bq_airflow_commits_query = bigquery_operator.BigQueryOperator( 26 | task_id = 'bq_airflow_commits_query', 27 | bql = """ SELECT Address 28 | FROM [airflow-studies:Address.Add] 29 | """) 30 | 31 | 32 | export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator( 33 | task_id = 'export_airflow_commits_to_gcs', 34 | source_project_dataset_table = 'airflow-studies:Address.Add', 35 | destination_cloud_storage_uris = [output_file], 36 | export_format = 'CSV') 37 | 38 | bq_airflow_commits_query >> export_commits_to_gcs 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Airflow/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/.DS_Store -------------------------------------------------------------------------------- /Airflow/img/Chain Operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/Chain Operator.png -------------------------------------------------------------------------------- /Airflow/img/Python Branch Operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/Python Branch Operator.png -------------------------------------------------------------------------------- /Airflow/img/celery cluster.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/celery cluster.jpg -------------------------------------------------------------------------------- /Airflow/img/multi_node.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/multi_node.jpg -------------------------------------------------------------------------------- /Airflow/img/one_node.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/one_node.jpg -------------------------------------------------------------------------------- /Airflow/img/task_instance_view.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/task_instance_view.PNG -------------------------------------------------------------------------------- /Apache Beam/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Apache Beam/.DS_Store -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/4.4 - Batch DirectRunner + GCS.py: -------------------------------------------------------------------------------- 1 | import apache_beam as 
beam 2 | import os 3 | 4 | serviceAccount = r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\dataflow-course-319517-4f98a2ce48a7.json" 5 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 6 | 7 | p1 = beam.Pipeline() 8 | 9 | class Filter(beam.DoFn): 10 | def process(self,record): 11 | if int(record[8]) > 0: 12 | return [record] 13 | 14 | Delayed_time = ( 15 | p1 16 | | "Import Data time" >> beam.io.ReadFromText(r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 17 | | "Split by comma time" >> beam.Map(lambda record: record.split(',')) 18 | | "Filter Delays time" >> beam.ParDo(Filter()) 19 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 20 | | "Sum by key time" >> beam.CombinePerKey(sum) 21 | ) 22 | 23 | Delayed_num = ( 24 | p1 25 | | "Import Data" >> beam.io.ReadFromText(r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 26 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 27 | | "Filter Delays" >> beam.ParDo(Filter()) 28 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 29 | | "Count by key" >> beam.combiners.Count.PerKey() 30 | ) 31 | 32 | Delay_table = ( 33 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 34 | | "Group By" >> beam.CoGroupByKey() 35 | | "Save to GCS" >> beam.io.WriteToText(r"gs://dataflow-course/flights_output.csv") 36 | ) 37 | 38 | p1.run() 39 | -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/4.5 - Batch Dataflow + GCS.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | 5 | pipeline_options = { 6 | 'project': 'dataflow-course-319517' , 7 | 'runner': 'DataflowRunner', 8 | 'region': 'southamerica-east1', 9 | 'staging_location': 'gs://dataflow-course/temp', 10 | 'temp_location': 'gs://dataflow-course/temp', 11 | 'template_location': 'gs://dataflow-course/template/batch_job_df_gcs_flights' 12 | } 13 | 14 | pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 15 | p1 = beam.Pipeline(options=pipeline_options) 16 | 17 | serviceAccount = r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\dataflow-course-319517-4f98a2ce48a7.json" 18 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 19 | 20 | class Filter(beam.DoFn): 21 | def process(self,record): 22 | if int(record[8]) > 0: 23 | return [record] 24 | 25 | Delayed_time = ( 26 | p1 27 | | "Import Data time" >> beam.io.ReadFromText(r"gs://dataflow-course/input/flights_sample.csv", skip_header_lines = 1) 28 | | "Split by comma time" >> beam.Map(lambda record: record.split(',')) 29 | | "Filter Delays time" >> beam.ParDo(Filter()) 30 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 31 | | "Sum by key time" >> beam.CombinePerKey(sum) 32 | ) 33 | 34 | Delayed_num = ( 35 | p1 36 | |"Import Data" >> beam.io.ReadFromText(r"gs://dataflow-course/input/flights_sample.csv", skip_header_lines = 1) 37 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 38 | | "Filter Delays" >> beam.ParDo(Filter()) 39 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 40 | | "Count by key" >> beam.combiners.Count.PerKey() 41 | ) 42 | 43 | Delay_table = ( 44 | 
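    # CoGroupByKey below joins the two branches on the airport code; each output
    # element comes out shaped like ('LAX', {'Delayed_num': [4], 'Delayed_time': [92]})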
{'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 45 | | "Group By" >> beam.CoGroupByKey() 46 | | "Save to GCS" >> beam.io.WriteToText(r"gs://dataflow-course/output/flights_output.csv") 47 | ) 48 | 49 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/comando deploy.txt: -------------------------------------------------------------------------------- 1 | python "C:\Users\cassi\Desktop\4.py" \ 2 | --input gs://template_dataflow_curso/entrada/voos_sample.csv \ 3 | --output gs://template_dataflow_curso/saida/voos_sample.csv \ 4 | --runner DataflowRunner \ 5 | --project dataflowproject-299811 \ 6 | --region southamerica-east1 \ 7 | --temp_location gs://template_dataflow_curso/temp \ 8 | --staging_location gs://template_dataflow_curso/staging \ 9 | --template_location gs://template_dataflow_curso/template \ 10 | 11 | python "C:\Users\cassi\Desktop\4.py" 12 | 13 | service-@dataflow-service-producer-prod.iam.gserviceaccount.com 14 | -compute@developer.gserviceaccount.com 15 | 505169071290@cloudservices.gserviceaccount.com 16 | 17 | 18 | -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/unnest.py: -------------------------------------------------------------------------------- 1 | dataDict = ('LAX', {'Qtd_Atrasos': [4], 'Tempo_Atrasos': [92]}) 2 | 3 | class teste(beam.DoFn): 4 | def process(self,record): 5 | dict_ = {} 6 | dict_['airport'] = str(record[0]) 7 | dict_['lista'] = record[1] 8 | return(dict_) 9 | 10 | #print(criar_dict(dataDict)) 11 | 12 | def process(self,record): 13 | def expand(key, value): 14 | if isinstance(value, dict): 15 | return [ (key + '_' + k, v) for k, v in process(value).items() ] 16 | else: 17 | return [ (key, value) ] 18 | 19 | items = [ item for k, v in record.items() for item in expand(k, v) ] 20 | 21 | return dict(items) 22 | 23 | #teste = (desaninhar_dict(criar_dict(dataDict))) 24 | # teste['lista_Qtd_Atrasos'] = teste['lista_Qtd_Atrasos'][0] 25 | # teste['lista_Tempo_Atrasos'] = teste['lista_Tempo_Atrasos'][0] 26 | 27 | #print(teste) 28 | 29 | def process(self,record): 30 | dict_ = {} 31 | dict_['airport'] = record['airport'] 32 | dict_['lista_Qtd_Atrasos'] = record['lista_Qtd_Atrasos'][0] 33 | dict_['lista_Tempo_Atrasos'] = record['lista_Tempo_Atrasos'][0] 34 | return(dict_) 35 | 36 | print(teste(teste)) 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | # def criar_dict2(x): 49 | # dict_ = {} 50 | # dict_['airport'] = str(x[0]) 51 | # dict_['lista_Qtd_Atrasos'] = str(x[1]) 52 | # dict_['lista_Qtd_Atrasos'] = str(x[2]) 53 | # return(dict_) 54 | 55 | # print(criar_dict2(teste)) 56 | 57 | # def criar_lista(x): 58 | # lista = list(x.values()) 59 | # return lista 60 | # print(criar_lista(teste)) -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.1 - Setup Colab.py: -------------------------------------------------------------------------------- 1 | pip install apache-beam[interactive] 2 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.10 - ParDo.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | class Filter(beam.DoFn): 6 | def process(self,record): 7 | if int(record[8]) > 0: 8 | return [record] 9 | 10 | Delayed_time = ( 11 | p1 12 | | "Import Data time" >> beam.io.ReadFromText(r"C:\Users\cassi\Google 
Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 13 | | "Split by comma time" >> beam.Map(lambda record: record.split(',')) 14 | | "Filter Delays time" >> beam.ParDo(Filter()) 15 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 16 | | "Sum by key time" >> beam.CombinePerKey(sum) 17 | # | "Print Results" >> beam.Map(print) 18 | ) 19 | 20 | Delayed_num = ( 21 | p1 22 | | "Import Data" >> beam.io.ReadFromText(r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 23 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 24 | | "Filter Delays" >> beam.ParDo(Filter()) 25 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 26 | | "Count by key" >> beam.combiners.Count.PerKey() 27 | # | "Print Results" >> beam.Map(print) 28 | ) 29 | 30 | Delay_table = ( 31 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 32 | | beam.CoGroupByKey() 33 | | beam.Map(print) 34 | ) 35 | 36 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.2 - Create.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | p1 | "Tuple" >> beam.Create( [ ("Cassio",32) , ("Vics",21) ] ) | beam.Map(print) #tuple 6 | p1 | "List" >> beam.Create ( [ 1,2,3 ] ) | beam.Map(print) #list 7 | 8 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.2 - Read Transform.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | # Read files 8 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 9 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 10 | | "Print Results" >> beam.Map(print) 11 | ) 12 | 13 | p1.run() 14 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.3 - Write Transform.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Write Results" >> beam.io.WriteToText("Flights.txt") 10 | ) 11 | 12 | p1.run() 13 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.4 - FlatMap.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Collection = ( 6 | p1 7 | |beam.io.ReadFromText('poem.txt') 8 | |beam.FlatMap(lambda record: record.split(' ')) 9 | |beam.io.WriteToText('result.txt') 10 | ) 11 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.4 - Map.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Print Results" >> beam.Map(print) 10 | ) 11 | 12 | 
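# the steps above only build the pipeline graph; Map emits exactly one output per
# input element (unlike FlatMap), and nothing executes until run() is called below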
p1.run() 13 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.5 - Filter Lambda.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter By LA Flights" >> beam.Filter(lambda record: record[3] == "LAX") 10 | | "Print Results" >> beam.Map(print) 11 | ) 12 | 13 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.5 - Filter Lista.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | words=['quatro','um'] 4 | 5 | def FindWords( i ): 6 | if i in words: 7 | return True 8 | 9 | p1 = beam.Pipeline() 10 | 11 | Collection = ( 12 | p1 13 | |beam.io.ReadFromText('Poem.txt') 14 | |beam.FlatMap(lambda record: record.split(' ')) 15 | |beam.Filter(FindWords) 16 | |beam.io.WriteToText('results.txt') 17 | ) 18 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.6 - Flatten.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p = beam.Pipeline() 4 | 5 | black = ('Adão','Jesus','Mike') 6 | White = ('Tulio','Mary','Joca') 7 | first_nations = ('Vic','Marta','Tom') 8 | 9 | black_pc = p | "Creating Pcollection black" >> beam.Create(black) 10 | White_pc = p | "Creating Pcollection White" >> beam.Create(White) 11 | first_nations_pc = p | "Creating Pcollection first_nations" >> beam.Create(first_nations) 12 | 13 | people = ( 14 | (black_pc,White_pc,first_nations_pc) 15 | | beam.Flatten() 16 | | beam.Map(print)) 17 | p.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.7 - CombinePerKey.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Delayed_time = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter Delays" >> beam.Filter(lambda record: int(record[8]) > 0 ) 10 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 11 | | "Sum by key" >> beam.CombinePerKey(sum) 12 | | "Print Results" >> beam.Map(print) 13 | ) 14 | 15 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.8 - Combiners.Count.Perkey.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Delayed_num = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter Delays" >> beam.Filter(lambda record: int(record[8]) > 0 ) 10 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 11 | | "Count Per key" >> beam.combiners.Count.PerKey() 12 | | "Print Results" >> beam.Map(print) 13 | ) 14 | 15 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.9 - 
CoGroupByKey.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Delayed_time = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter Delays" >> beam.Filter(lambda record: int(record[8]) > 0 ) 10 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 11 | | "Sum by key" >> beam.CombinePerKey(sum) 12 | # | "Print Results" >> beam.Map(print) 13 | ) 14 | 15 | Delayed_num = ( 16 | p1 17 | | "Import Data Num" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 18 | | "Split by comma Num" >> beam.Map(lambda record: record.split(',')) 19 | | "Filter Delays Num" >> beam.Filter(lambda record: int(record[8]) > 0 ) 20 | | "Create a key-value Num" >> beam.Map(lambda record: (record[4],int(record[8]))) 21 | | "Combine by key" >> beam.combiners.Count.PerKey() 22 | # | "Print Results" >> beam.Map(print) 23 | ) 24 | 25 | Delay_table = ( 26 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 27 | | beam.CoGroupByKey() 28 | | beam.Map(print) 29 | ) 30 | 31 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/Poem.txt: -------------------------------------------------------------------------------- 1 | Há quatro quadros três e três quadros quatro. 2 | Sendo que quatro destes quadros são quadrados, 3 | um dos quadros quatro e três dos quadros três. 4 | Os três quadros que não são quadrados, 5 | são dois dos quadros quatro e um dos quadros três. -------------------------------------------------------------------------------- /Apache Beam/Main Functions/voos_sample.csv: -------------------------------------------------------------------------------- 1 | 2019-04-27,19805,1,JFK,LAX,854,-6,1217,2,355,2475,1 2 | 2019-04-27,19805,2,LAX,JFK,944,14,1736,-29,269,2475,2 3 | 2019-04-27,19805,3,JFK,LAX,1224,-6,1614,39,371,2475,3 4 | 2019-04-27,19805,4,LAX,JFK,1240,25,2028,-27,264,2475,4 5 | 2019-04-27,19805,5,DFW,HNL,1300,-5,1650,15,510,3784,5 6 | 2019-04-27,19805,6,OGG,DFW,1901,126,640,95,385,3711,6 7 | 2019-04-27,19805,7,DFW,OGG,1410,125,1743,138,497,3711,7 8 | 2019-04-27,19805,8,HNL,DFW,1659,4,458,-22,398,3784,8 9 | 2019-04-27,19805,9,JFK,LAX,648,-7,1029,19,365,2475,9 10 | 2019-04-27,19805,10,LAX,JFK,2156,21,556,1,265,2475,10 11 | 2019-04-27,19805,12,LAX,JFK,1113,-2,1910,-40,267,2475,11 12 | 2019-04-27,19805,14,OGG,LAX,2235,5,618,-17,270,2486,12 13 | 2019-04-27,19805,15,BOS,ORD,611,-9,756,-19,129,867,13 14 | 2019-04-27,19805,16,SFO,JFK,1312,17,2107,-33,268,2586,14 15 | 2019-04-27,19805,17,ATL,MIA,630,-5,813,-17,83,594,15 16 | 2019-04-27,19805,18,SFO,JFK,22,112,833,88,288,2586,16 17 | 2019-04-27,19805,19,JFK,LAX,1024,-6,1353,18,359,2475,17 18 | 2019-04-27,19805,20,SFO,JFK,1715,135,130,120,277,2586,18 19 | 2019-04-27,19805,21,JFK,LAX,1906,-4,2246,16,359,2475,19 20 | 2019-04-27,19805,22,LAX,JFK,1458,-2,2336,11,272,2475,20 21 | -------------------------------------------------------------------------------- /Apache Beam/README.md: -------------------------------------------------------------------------------- 1 | # Apache Beam 2 | Files regarding my course sold on udemy. Tey are available in English and Portuguese. 
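All of the scripts described below follow the same read -> transform -> write shape; a minimal sketch of that pattern, using the flights sample file from these lessons (the output name is just an example):

```python
import apache_beam as beam

# build and run a small batch pipeline on the local DirectRunner
with beam.Pipeline() as p:
    (
        p
        | "Read" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines=1)
        | "Split" >> beam.Map(lambda line: line.split(","))
        | "Keep delayed flights" >> beam.Filter(lambda rec: int(rec[8]) > 0)
        | "Key by airport" >> beam.Map(lambda rec: (rec[4], int(rec[8])))
        | "Total delay per airport" >> beam.CombinePerKey(sum)
        | "Write" >> beam.io.WriteToText("delays_output")
    )
```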
3 | s 4 | ## https://www.udemy.com/user/cassio-alessandro-de-bolba/ 5 | 6 | ### Folders Description 7 | Main Functions -> study case with most important functions 8 | Bach Processing -> processing data usgin functions learned in Main Functions, to process data on GCP 9 | Streaming Piocessing -> processing data usgin functions learned in Main Functions, to process data on GC 10 | -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.2.0 - Data Generator.py: -------------------------------------------------------------------------------- 1 | #pip install google-cloud-pubsub 2 | 3 | import csv 4 | import time 5 | from google.cloud import pubsub_v1 6 | import os 7 | 8 | service_account_key = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 9 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= service_account_key 10 | 11 | topic = 'projects/dataflowproject-299811/topics/MeuTopico' 12 | publisher = pubsub_v1.PublisherClient() 13 | 14 | input = r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso\Seção 3 - Principais Transfromações\voos_sample.csv" 15 | 16 | with open(input, 'rb') as file: 17 | for row in file: 18 | print('Publishing in Topic') 19 | publisher.publish(topic,row) 20 | time.sleep(1) -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.2.1 - Voos Streaming DF + Pubsub.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | 5 | pipeline_options = { 6 | 'project': 'dataflowproject-299811' , 7 | 'runner': 'DataflowRunner', 8 | 'region': 'southamerica-east1', 9 | 'job_name': 'cassio', 10 | 'output': 'gs://template_dataflow_curso/saida', 11 | 'staging_location': 'gs://template_dataflow_curso/staging', 12 | 'input': 'gs://template_dataflow_curso/entrada/voos_sample.csv', 13 | 'temp_location': 'gs://template_dataflow_curso/staging', 14 | 'template_location': 'gs://template_dataflow_curso/template/streaming_job_voos', 15 | 'streaming' : True } 16 | pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 17 | p1 = beam.Pipeline(options=pipeline_options) 18 | 19 | serviceAccount = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 20 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 21 | 22 | subscription = 'projects/dataflowproject-299811/subscriptions/MinhaSubs' 23 | topic = 'projects/dataflowproject-299811/topics/saida' 24 | 25 | class split_lines(beam.DoFn): 26 | def process(self,record): 27 | return [record.split(',')] 28 | 29 | class Filter(beam.DoFn): 30 | def process(self,record): 31 | if int(record[8]) > 0: 32 | return [record] 33 | 34 | 35 | pcollection_input = ( 36 | p1 | 'Read from pubsub topic' >> beam.io.ReadFromPubSub(subscription= subscription) 37 | ) 38 | 39 | Delayed_time = ( 40 | pcollection_input 41 | # p1 42 | # | "Import Data time" >> beam.io.ReadFromText(r"gs://template_dataflow_curso/entrada/voos_sample.csv", skip_header_lines = 1) 43 | | "Split by comma time" >> beam.ParDo(split_lines()) 44 | | "Filter Delays time" >> beam.ParDo(Filter()) 45 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 46 | | "Sum by key time" >> beam.CombinePerKey(sum) 47 | # | "Print Results" >> beam.Map(print) 48 | ) 49 | 50 | Delayed_num = ( 51 | pcollection_input 
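    # this branch reuses the same Pub/Sub PCollection read above, so every incoming
    # message fans out into both the delay-sum branch and this count branch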
52 | # p1 53 | # |"Import Data" >> beam.io.ReadFromText(r"gs://template_dataflow_curso/entrada/voos_sample.csv", skip_header_lines = 1) 54 | | "Split by comma" >> beam.ParDo(split_lines()) 55 | | "Filter Delays" >> beam.ParDo(Filter()) 56 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 57 | | "Count by key" >> beam.combiners.Count.PerKey() 58 | # | "Print Results" >> beam.Map(print) 59 | ) 60 | 61 | Delay_table = ( 62 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 63 | | "join" >> beam.CoGroupByKey() 64 | # | beam.Map(print) 65 | # | beam.io.WriteToText(r"gs://template_dataflow_curso/saida/Voos_atrados_qtd.csv") 66 | | "Converting to byte String" >> beam.Map(lambda row: (''.join(row).encode('utf-8')) ) 67 | | "Writting to Topic" >> beam.io.WriteToPubSub(topic) 68 | ) 69 | 70 | p.run() 71 | -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.4 - Janelas e Noções de Tempo para Streaming.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Apache Beam/Streaming Processing/5.4 - Janelas e Noções de Tempo para Streaming.pptx -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.5 - Tumbling Window DF + BQ.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions,StandardOptions 4 | from apache_beam import window 5 | import time 6 | 7 | pipeline_options = { 8 | 'project': 'dataflowproject-299811' , 9 | 'runner': 'DataflowRunner', 10 | 'region': 'southamerica-east1', 11 | 'job_name': 'cassio', 12 | 'output': 'gs://template_dataflow_curso/saida', 13 | 'staging_location': 'gs://template_dataflow_curso/staging', 14 | 'input': 'gs://template_dataflow_curso/entrada/voos_sample.csv', 15 | 'temp_location': 'gs://template_dataflow_curso/temp', 16 | 'template_location': 'gs://template_dataflow_curso/template/streaming_job_df_bq_voos', 17 | 'streaming' : True, 18 | 'enable_streaming_engine' : True, 19 | 'save_main_session': True } 20 | pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 21 | p1 = beam.Pipeline(options=pipeline_options) 22 | 23 | # OPTIONS PARA EXECUTAR STREAMING LOCAL 24 | # options= PipelineOptions() 25 | # options.view_as(StandardOptions).streaming= True 26 | # p1 = beam.Pipeline(options=options) 27 | 28 | serviceAccount = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 29 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 30 | 31 | subscription = 'projects/dataflowproject-299811/subscriptions/MinhaSubs' 32 | 33 | class separar_linhas(beam.DoFn): 34 | def process(self,record): 35 | return [record.decode("utf-8").split(',')] 36 | 37 | class filtro(beam.DoFn): 38 | def process(self,record): 39 | if int(record[8]) > 0: 40 | return [record] 41 | 42 | table_schema = 'airport:STRING, lista_Qtd_Atrasos:INTEGER' 43 | tabela = 'dataflowproject-299811:voos_dataflow.tabela_voos_tumbling' 44 | 45 | 46 | Qtd_Atrasos = ( 47 | p1 48 | | "Ler da subcription" >> beam.io.ReadFromPubSub(subscription= subscription) 49 | | "Separar por Vírgulas Qtd" >> beam.ParDo(separar_linhas()) 50 | | "Timestamp Customizada" >> beam.Map(lambda record: beam.window.TimestampedValue(record, time.time())) 
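    # time.time() stamps each element with processing time, so the 10-second
    # FixedWindows applied a few steps below group records by arrival time rather
    # than by anything inside the flight record itself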
51 | | "Pegar voos com Qtd" >> beam.ParDo(filtro()) 52 | | "Criar par Qtd" >> beam.Map(lambda record: (record[4],int(record[8]))) 53 | | "Window" >> beam.WindowInto(window.FixedWindows(10)) 54 | | "Contar por key" >> beam.combiners.Count.PerKey() 55 | | "Dicionário" >> beam.Map(lambda record:({'airport':record[0],'lista_Qtd_Atrasos':int(record[1])})) 56 | # | "Mostrar Resultados QTD" >> beam.Map(print) 57 | | beam.io.WriteToBigQuery( 58 | tabela, 59 | schema=table_schema, 60 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 61 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 62 | custom_gcs_temp_location = 'gs://template_dataflow_curso/staging' ) 63 | ) 64 | 65 | result = p1.run() 66 | result.wait_until_finish() -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.6 - Sliding Window DF + BQ.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions,StandardOptions 4 | from apache_beam import window 5 | import time 6 | 7 | # pipeline_options = { 8 | # 'project': 'dataflowproject-299811' , 9 | # 'runner': 'DataflowRunner', 10 | # 'region': 'southamerica-east1', 11 | # 'job_name': 'cassio', 12 | # 'output': 'gs://template_dataflow_curso/saida', 13 | # 'staging_location': 'gs://template_dataflow_curso/staging', 14 | # 'input': 'gs://template_dataflow_curso/entrada/voos_sample.csv', 15 | # 'temp_location': 'gs://template_dataflow_curso/temp', 16 | # 'template_location': 'gs://template_dataflow_curso/template/streaming_job_df_bq_voos', 17 | # 'streaming' : True, 18 | # 'enable_streaming_engine' : True, 19 | # 'save_main_session': True } 20 | # pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 21 | # p1 = beam.Pipeline(options=pipeline_options) 22 | 23 | ## OPTIONS PARA EXECUTAR STREAMING LOCAL 24 | options= PipelineOptions() 25 | options.view_as(StandardOptions).streaming= True 26 | p1 = beam.Pipeline(options=options) 27 | 28 | serviceAccount = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 29 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 30 | 31 | subscription = 'projects/dataflowproject-299811/subscriptions/MinhaSubs' 32 | 33 | class separar_linhas(beam.DoFn): 34 | def process(self,record): 35 | return [record.decode("utf-8").split(',')] 36 | 37 | class filtro(beam.DoFn): 38 | def process(self,record): 39 | if int(record[8]) > 0: 40 | return [record] 41 | 42 | table_schema = 'airport:STRING, lista_Qtd_Atrasos:INTEGER' 43 | tabela = 'dataflowproject-299811:voos_dataflow.tabela_voos_tumbling' 44 | 45 | 46 | Qtd_Atrasos = ( 47 | p1 48 | | "Ler da subcription" >> beam.io.ReadFromPubSub(subscription= subscription) 49 | | "Separar por Vírgulas Qtd" >> beam.ParDo(separar_linhas()) 50 | | "Timestamp Customizada" >> beam.Map(lambda record: beam.window.TimestampedValue(record, time.time())) 51 | | "Pegar voos com Qtd" >> beam.ParDo(filtro()) 52 | | "Criar par Qtd" >> beam.Map(lambda record: (record[4],int(record[8]))) 53 | | "Window" >> beam.WindowInto(window.SlidingWindows(10,5)) 54 | | "Contar por key" >> beam.combiners.Count.PerKey() 55 | | "Dicionário" >> beam.Map(lambda record:({'airport':record[0],'lista_Qtd_Atrasos':int(record[1])})) 56 | | "Mostrar Resultados QTD" >> beam.Map(print) 57 | # | beam.io.WriteToBigQuery( 58 | # tabela, 59 | # schema=table_schema, 60 | # 
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 61 | # create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 62 | # custom_gcs_temp_location = 'gs://template_dataflow_curso/staging' ) 63 | ) 64 | 65 | result = p1.run() 66 | result.wait_until_finish() -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/Sliding_processor.py: -------------------------------------------------------------------------------- 1 | pip install apache_beam 2 | 3 | pip install google-cloud-pubsub 4 | 5 | import apache_beam as beam 6 | from apache_beam.options.pipeline_options import PipelineOptions,StandardOptions 7 | import os 8 | from apache_beam import window 9 | from apache_beam.transforms.combiners import Count 10 | import time 11 | 12 | 13 | serviceAccount = '/content/vivid-now-271806-e22933a07e8a.json' 14 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 15 | 16 | input_subscription = 'projects/vivid-now-271806/subscriptions/movie_subscription' 17 | 18 | options= PipelineOptions() 19 | options.view_as(StandardOptions).streaming= True 20 | 21 | p = beam.Pipeline(options=options) 22 | 23 | comedy_movies = 'projects/vivid-now-271806/topics/comedy_movies' 24 | 25 | def format(element): 26 | (movie,rating)=element 27 | return "{r} rating for movieID {MID} in 10 seconds".format(f= rating, MID=movie).encode('utf-8') 28 | 29 | pubsub_pipeline = ( 30 | p 31 | | 'Read from pubsub topic' >> beam.io.ReadFromPubSub(subscription= input_subscription) 32 | # decodificar e split 33 | | 'Split the records by comma' >> beam.Map(lambda row: row.decode("utf-8").split(',')) 34 | # definir a coluna, ou criar uma coluna. Criando em cada linha uma timestamp 35 | | 'Timestamp Customizada ' >> beam.Map(lambda row: beam.window.TimestampedValue(row, time.time())) 36 | # Criar um key value pair, onde indico a coluna id do filme e ratings, para contar quantas ratings recebidas por id de filme 37 | | 'Form Key Value Pair' >> beam.Map(lambda row: (row[1],float(row[2]))) 38 | # defino minha janela, primeiro parametro é janela, segundo é intervalo de janelas 39 | | 'Window' >> beam.WindowInto(window.SlidingWindows(4,2)) 40 | # contar ratings por chave 41 | | 'Count the ratings' >> Count.PerKey() 42 | # codificar 43 | #| 'Converting to byte String' >> beam.Map(lambda row: (''.join(row).encode('utf-8')) ) 44 | | 'format' >> beam.Map(format) 45 | | 'Publish to output topic' >> beam.io.WriteToPubSub(comedy_movies) 46 | ) 47 | result = p.run() 48 | result.wait_until_finish() -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/streaming janelas.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Apache Beam/Streaming Processing/streaming janelas.xlsx -------------------------------------------------------------------------------- /Data-Eng-Bootcamp/19 - 21 game.scala: -------------------------------------------------------------------------------- 1 | // DISCLAIMER 2 | // BELOW ARE SNIPPETS OF THE CODE REGRADDING THE 21 GAME CARD DONEDURING THE COURSE 3 | // THE STILL NEEDS TO BE CREATED 4 | 5 | // Define immutable variables for clubs 2♣ through 4♣ 6 | val twoClubs: Int = 2 7 | val threeClubs: Int = 3 8 | val fourClubs: Int = 4 9 | val aceClubs = 1 10 | val aceDiamonds = 1 11 | val aceHearts = 1 12 | val aceSpades = 1 13 | 14 | // Define immutable 
variables for player names 15 | val playerA: String = "Alex" 16 | val playerB: String = "Chen" 17 | val playerC: String = "Marta" 18 | 19 | // Creating the players list 20 | val players1 = List("Alex","Chen") 21 | val players2 = List("Vic","Cassio") 22 | val allPlayers = players1 ::: players2 23 | 24 | // Choose 25 | val hand = 26 | 27 | // define the functio taking hand as int, then insinde {} id the function 28 | def bust(hand: int) = { 29 | hand > 21 30 | } 31 | 32 | // function to compare and show the biggest hand 33 | def maxHand(handA: Int, handB: Int): Int = { 34 | if (bust(handA) & bust(handB)) println(0) 35 | else if (bust(handA)) println(handB) 36 | else if (bust(handB)) println(handA) 37 | else if (handA > handB) println(handA) 38 | else handB 39 | } 40 | 41 | // Create, parameterize, and initialize an array for a round of Twenty-One 42 | // In this exercise it is done at same time, different from previous one 43 | val hands = Array(tenClubs + fourDiamonds, 44 | nineSpades + nineHearts, 45 | twoClubs + threeSpades) 46 | 47 | // Inform a player where their current hand stands 48 | val informPlayer: String = { 49 | if (hand > 21) 50 | "Bust! :(" 51 | else if (hand == 21) 52 | "Twenty-One! :)" 53 | else 54 | "Hit or stay?" 55 | } 56 | 57 | // Find the number of points that will cause a bust 58 | def pointsToBust(hand: Int) = { 59 | // If the hand is a bust, 0 points remain 60 | if (bust(hand)) 61 | println(0) 62 | // Otherwise, calculate the difference between 21 and the current hand 63 | else 64 | println(21 - hand) 65 | } 66 | 67 | // Create list with five hands of Twenty-One 68 | var hands = List(16, 21, 8, 25, 4) 69 | 70 | // Loop through hands, finding each hand's number of points to bust 71 | hands.foreach(pointsToBust) 72 | -------------------------------------------------------------------------------- /Databricks/Readme.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | This folder hold my studies in Databricks and some sample projects. 3 | The studies were mostly don from Databrick Academy notebooks and also some workshops. 
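The Scala exercises in this folder read multiline JSON files into DataFrames; roughly the same call in PySpark (a sketch only, assuming it runs inside a Databricks notebook where `spark` is already defined) looks like:

```python
# read one of the sample files used in the Scala exercises into a DataFrame;
# multiline is needed because each JSON document spans several lines
df_people = (
    spark.read
    .option("multiline", "true")
    .json("/FileStore/tables/people.json")
)
df_people.printSchema()
```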
4 | 5 | So far, I used the following languages within Databricks (as you check in the notebooks in this repository): 6 | * SQL 7 | * Spark / PySpark 8 | * Scala -------------------------------------------------------------------------------- /Databricks/Scala Exercises/People_vs_Execises.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // READING FROM FILE STORE 3 | //File location and type 4 | val path_people = "/FileStore/tables/people.json" 5 | val path_exercises = "/FileStore/tables/exercises.json" 6 | val file_type = "json" 7 | 8 | // COMMAND ---------- 9 | 10 | // using multiline due to json file format 11 | val dfPeople = spark.read.option("multiline","true").json(path_people) 12 | 13 | val dfExercisesRaw = spark.read.option("multiline","true").json(sc.parallelize(path_exercises)) 14 | 15 | // COMMAND ---------- 16 | 17 | //val explode = dfExercises.withColumn("exercises",explode($"exercises")) 18 | //display(explode) 19 | // 20 | //val dfExercises = spark.read.option("multiline","true").schema(exercisesSchema).json(path_exercises) 21 | 22 | // COMMAND ---------- 23 | 24 | new_df = old_df.withColumn("name",explode("user_info.name")) 25 | .withColumn("last_name",explode("user_info.last_name")) 26 | 27 | // COMMAND ---------- 28 | 29 | import org.apache.spark.sql.functions._ 30 | 31 | val dfExercisesExp = dfExercisesRaw 32 | .withColumn("date",explode($"exercises.date")) 33 | .withColumn("exercise_end_time",explode($"exercises.exercise_end_time")) 34 | .withColumn("exercise_start_time",explode($"exercises.exercise_start_time")) 35 | .withColumn("user",explode($"exercises.user")) 36 | .withColumn("exercise_rating", explode($"exercises.metadata.exercise_rating")) 37 | .withColumn("heart_rate_samples",explode($"exercises.heart_rate_samples")) 38 | .drop($"exercises") 39 | 40 | 41 | // COMMAND ---------- 42 | 43 | val test = dfExercisesExp.select($"date",$"user",$"heart_rate_samples.*") 44 | display(test) 45 | //display(test.select(explode('heart_rate_samples) as (Seq("x", "y")))) 46 | 47 | 48 | // COMMAND ---------- 49 | 50 | 51 | 52 | // COMMAND ---------- 53 | 54 | val test = dfExercisesExp.select($"date",$"user",explode_outer($"heart_rate_samples")) 55 | display(test) 56 | 57 | // COMMAND ---------- 58 | 59 | //unpivot 60 | //val unPivotDF = test.select($"user", 61 | //expr("stack(4, '11:15', '11:15','11:20','11:20') as (Country)")) 62 | //.where("Total is not null") 63 | //unPivotDF.show() 64 | 65 | // COMMAND ---------- 66 | 67 | display(test.select(explode('heart_rate_samples) as (Seq("x", "y")))) 68 | 69 | // COMMAND ---------- 70 | 71 | //val unpivotedDf = test 72 | // .selectExpr("date","user","stack(1,'11:15','11:15')") 73 | // .withColumnRenamed("col0","device") // default name of this column is col0 74 | 75 | // COMMAND ---------- 76 | 77 | //display(test.select($"heart_rate_samples.*")) 78 | -------------------------------------------------------------------------------- /Databricks/Scala Exercises/readme.md: -------------------------------------------------------------------------------- 1 | There are 2 JSON files. The first JSON file (people.json) contains information about individuals. The second JSON (exercises.json) contains exercise data for those individuals. An individual is identified by the “user” key which is connected to “id” key in people.json file. 
Write a Spark pipeline in Scala that derives the following information from this data: 2 | 3 | Person report that contains the following for each unique person 4 | * Total number of exercises for person 5 | (count of start time group by user id) 6 | * Average duration of exercise for person 7 | (avg (end - start time) group by user id) 8 | * Average exercise rating for person 9 | (avg exercise_rating group by user id) 10 | * Average heart rate during exercise for person 11 | 12 | Exercise report containing the following info on each exercise: 13 | * Starting/ending time of exercise and duration 14 | * Hourly average heart rate 15 | * Hourly minimum/maximum heart rate 16 | 17 | Process the JSON data and create at least the above reports (”PersonReport” and “Exercises”). -------------------------------------------------------------------------------- /Docker/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/.DS_Store -------------------------------------------------------------------------------- /Docker/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/.DS_Store -------------------------------------------------------------------------------- /Docker/img/Containerrization PID.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/Containerrization PID.png -------------------------------------------------------------------------------- /Docker/img/container-layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/container-layer.png -------------------------------------------------------------------------------- /Docker/img/docker-compose-versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/docker-compose-versions.png -------------------------------------------------------------------------------- /Docker/img/docker-networks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/docker-networks.png -------------------------------------------------------------------------------- /Docker/img/docker-volume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/docker-volume.png -------------------------------------------------------------------------------- /Docker/img/port-mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/port-mapping.png -------------------------------------------------------------------------------- /Docker/img/voting-app-diagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/voting-app-diagram.png -------------------------------------------------------------------------------- /Docker/python-sample-app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python 2 | 3 | COPY my-app.py my-app.py 4 | COPY requirements.txt requirements.txt 5 | 6 | RUN pip install -r requirements.txt 7 | 8 | CMD python my-app.py 9 | 10 | -------------------------------------------------------------------------------- /Docker/python-sample-app/my-app.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | def get_weather_info(): 4 | 5 | city = "Porto Alegre" 6 | 7 | api_key = "868a26a88dcad371f4205a319f26be8c" 8 | 9 | url = "http://api.openweathermap.org/data/2.5/weather?q="+ str(city) +"&appid="+ api_key 10 | 11 | json_data = requests.get(url).json() 12 | 13 | return print(f"Current Temperature for {str(city)} is {json_data['main']['temp']/10}") 14 | 15 | 16 | if __name__ == "__main__": 17 | 18 | get_weather_info() -------------------------------------------------------------------------------- /Docker/python-sample-app/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/.DS_Store -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/readme.md: -------------------------------------------------------------------------------- 1 | *THESE ARE NOTES FROM THE BOOK, FOR FURTHER CHECKS.* 2 | 3 | # FUNDAMENTALS OF DATA ENGINEERING 4 | 5 | "we unapologetically take a cloud-first approach. We view the cloud as a fundamentally transformative development that will endure for decades; most on-premises data systems and workloads will eventually move to cloud hosting. We assume that infrastructure and systems are ephemeral and scalable, and that data engineers will lean toward deploying managed services in the cloud". 
6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/src/DE_lifecycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/src/DE_lifecycle.png -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/src/DE_stakeholders.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/src/DE_stakeholders.png -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/src/monolith_vs_ms_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/src/monolith_vs_ms_arch.png -------------------------------------------------------------------------------- /Git/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Git/.DS_Store -------------------------------------------------------------------------------- /Git/gitlab-ci-my-first-pipeline.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - build 3 | - test 4 | 5 | build: 6 | stage: build 7 | script: 8 | - echo "Building" 9 | - mkdir build 10 | - touch build/info.txt 11 | artifacts: 12 | paths: 13 | - build/ 14 | 15 | test: 16 | stage: test 17 | script: 18 | - echo "Testing" 19 | - test -f "build/info.txt" 20 | -------------------------------------------------------------------------------- /Git/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Git/img/.DS_Store -------------------------------------------------------------------------------- /Git/img/CI CD Pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Git/img/CI CD Pipeline.png -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # Start from a Node.js 10 (LTS) image 3 | FROM node:10 4 | # Specify the directory inside the image in which all commands will run 5 | WORKDIR /usr/src/app 6 | # Copy package files and install dependencies 7 | COPY package*.json ./ 8 | RUN npm install 9 | # Copy all of the app files into the image 10 | COPY . .
11 | # The default command to run when starting the container 12 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simple-node", 3 | "version": "1.0.0", 4 | "description": "A sample simple application for Kubernetes Up & Running", 5 | "main": "server.js", 6 | "scripts": { 7 | "start": "node server.js" }, 8 | "author": "" } -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/server.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var app = express(); 3 | app.get('/', function (req, res) { 4 | res.send('Hello World!'); 5 | }); 6 | app.listen(3000, function () { console.log('Listening on port 3000!'); console.log(' http://localhost:3000'); 7 | }); -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/kubernetes_up_and_running.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Up and Running 2 | official [repo](https://github.com/kubernetes-up-and-running) 3 | 4 | ## 1. Introduction 5 | The first chapter emphasizes the main concepts of Kubernetes and why it is changing the way developers work, highlighting the most important improvements this technology brought to teams. 6 | * Velocity 7 | * The Value of Immutability 8 | * Declarative Configuration 9 | * Self-Healing Systems 10 | * Scaling Your Service and Your Teams 11 | * Decoupling 12 | * Easy Scaling for Applications and Clusters 13 | * Scaling Development Teams with Microservices 14 | * Separation of Concerns for Consistency and Scaling 15 | * Abstracting Your Infrastructure 16 | * Efficiency 17 | 18 | ## 2. Creating and Running Containers 19 | * Kubernetes is meant for creating, deploying and managing distributed applications in containers 20 | * Applications are generally composed of a language runtime, libraries and the code 21 | * The traditional method of running multiple programs on the same server OS can run into trouble with conflicting dependencies 22 | * As the previous chapter noted, immutability is a big advantage in solving this problem 23 | * Docker helps with building, packaging and sharing images 24 | * Docker is the most common image format; another is OCI 25 | 26 | ### 2.1 Container Image 27 | * It is a binary package of a container technology (like Docker) that encapsulates all files necessary to run a program in an OS 28 | * This image can be built locally or pulled from a container registry (like Docker Hub) 29 | * Container images are constructed with a series of filesystem layers, where each layer inherits and modifies the layers that came before it 30 | * One container image can be based on another image, and so on 31 | * 2 types of containers: 32 | * System Containers -> try to mimic a full system, as a VM does (no longer used that much) 33 | * Application Containers -> run a single program, offering the right granularity of isolation and easy scalability 34 | 35 | ### 2.2 Building Images with Docker 36 | We will use the application container approach to build an image. Install Docker first. 37 | 38 | #### 2.2.1 Dockerfiles 39 | * Build the directory structure as in folder 2 40 | * navigate via the command line to folder 2 41 | * run 42 | * docker build -t simple-node .
43 | * docker run --rm -p 3000:3000 simple-node -------------------------------------------------------------------------------- /Kubernetes/img/kubernetes-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Kubernetes/img/kubernetes-architecture.png -------------------------------------------------------------------------------- /Kubernetes/k8s_minikube.md: -------------------------------------------------------------------------------- 1 | # K8s on Minikube - fast lab setup 2 | video: https://www.youtube.com/watch?v=X48VuDVv0do 3 | repo from video: https://gitlab.com/nanuchi/youtube-tutorial-series/-/blob/master/basic-kubectl-commands/cli-commands.md 4 | 5 | ## 1. Set up minikube + virtualbox 6 | This is an alternative way of running Docker without using Docker Desktop, which is no longer free for business use. 7 | For study purposes, you can still use Docker Desktop. 8 | Thanks to my workmate Sergei, who introduced me to the approach below. 9 | * Install Docker 10 | ``` 11 | brew install docker 12 | brew install docker-compose 13 | ``` 14 | * Install [VirtualBox](https://minikube.sigs.k8s.io/docs/drivers/virtualbox/) 15 | 16 | For Mac users, you might need to allow the Oracle virtual machine to run in: System Preferences -> Security 17 | * Install [Minikube](https://minikube.sigs.k8s.io/docs/start/). 18 | * Start the cluster using: 19 | ``` 20 | minikube start --container-runtime=docker --vm=true --driver=virtualbox --memory=2g 21 | ``` 22 | *Hint: when starting the cluster, you can adjust its settings according to your needs. See the options [here](https://minikube.sigs.k8s.io/docs/commands/start/).* 23 | 24 | * Run the following command in your shell to point it to minikube's Docker environment: 25 | ``` 26 | eval $(minikube docker-env) 27 | ``` 28 | * Now you can run Docker images: 29 | ``` 30 | docker run ... 31 | ``` 32 | * If at some moment you want to stop the cluster (e.g. to free unused resources): 33 | ``` 34 | minikube stop 35 | ``` 36 | * Check if minikube is working 37 | ``` 38 | minikube status 39 | ``` 40 | If it is not working, delete it and start again 41 | 42 | ## 2. Kubectl 43 | Now we have a cluster running on minikube, with Kubernetes and kubectl installed.
44 | Use the kubectl CLI to interact with Kubernetes: 45 | ``` 46 | # check nodes 47 | kubectl get nodes 48 | 49 | # check pods 50 | kubectl get pod 51 | 52 | # check services 53 | kubectl get services 54 | 55 | # create a deployment - deployment manages pods 56 | kubectl create deployment nginx-depl --image=nginx 57 | 58 | # See the deployments 59 | kubectl get deployment 60 | 61 | # managed by the deployment; you should not need to manage it directly 62 | kubectl get replicaset 63 | 64 | # edit the deployment file 65 | kubectl edit deployment nginx-depl 66 | ``` 67 | Layers of abstraction: 68 | Deployment manages a Replica Set > Replica Set manages Pods > Pod is an abstraction over containers 69 | 70 | ### 2.1 Debugging an Application 71 | First, let's create a MongoDB deployment to check some logs: 72 | ``` 73 | kubectl create deployment mongo-depl --image=mongo 74 | ``` 75 | 2 useful commands for debugging: 76 | ``` 77 | # get the logs of the pod to check what is happening inside it 78 | kubectl logs {pod-name} 79 | 80 | # exec into the pod to get a shell inside it 81 | kubectl exec -it {pod-name} -- /bin/bash 82 | ``` 83 | 84 | ### 2.2 Delete Deployment 85 | ``` 86 | kubectl delete deployment mongo-depl 87 | ``` 88 | 89 | ### 2.3 Configuration File 90 | So far we deployed using only a few deployment options, but you can specify many configurations in the deployment command, which quickly becomes unwieldy. Instead, k8s lets you configure deployments via YAML config files applied with the apply command. -------------------------------------------------------------------------------- /Kubernetes/readme.md: -------------------------------------------------------------------------------- 1 | ## Kubernetes 2 | Repo containing studies and practice code with Kubernetes 3 | 4 | ### Description 5 | k8s_and_data -> Notes on Big Data deployed on K8S 6 | Kubernetes Up and Running -> Notes on the book with the same name 7 | k8s_on_minikube -> fast test of K8s on minikube -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Engineering (Projects and Notes) 2 | Folder to showcase my study notes and projects in the Data Engineering area. 3 | 4 | ## Databricks 5 | Contains notes and some scripts created. 6 | ### [LAB DATABRICKS FROM A TO Z](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/LAB%20DATABRICKS%20FROM%20A%20TO%20Z%20.md) 7 | ### [SCALA ETL Part 1 - Data Extraction](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/SCALA%20ETL%20Part%201%20-%20Data%20Extraction.md) 8 | ### [SPARK ETL Part 1 - Data Extraction](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/SPARK%20ETL%20Part%201%20-%20Data%20Extraction.md) 9 | ### [SPARK SQL](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/SPARK%20SQL.md) 10 | 11 | ## Data-Eng-Track-Bootcamp 12 | This folder holds my studies on the Data Engineering with Python track on DataCamp. 13 | ### [2 - Introduction to Data Egineering](https://github.com/cassiobolba/Data-Engineering/blob/master/Data-Eng-Track-Bootcamp/2%20-%20Introduction%20to%20Data%20Egineering.md) 14 | ### [3 - Data Ingestion with Pandas](https://github.com/cassiobolba/Python/blob/master/Python-Datacamp/3%20-%20Data%20Ingestion%20with%20Pandas.md) 15 | ### [3.5 - Software Eng.
in Python - Clean Code](https://github.com/cassiobolba/Python/blob/master/Python-Datacamp/3.5%20-%20Software%20Eng.%20in%20Python%20-%20Clean%20Code.md) 16 | ### [6 - Introduction to Shell Script.md](https://github.com/cassiobolba/Data-Engineering/blob/master/Data-Eng-Track-Bootcamp/6%20-%20Introduction%20to%20Shell%20Script.md) 17 | ### [17 - Introduction Scala.md](https://github.com/cassiobolba/Data-Engineering/blob/master/Data-Eng-Track-Bootcamp/17%20-%20Introduction%20to%20Scala.md) 18 | 19 | 20 | -------------------------------------------------------------------------------- /Snowflake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Snowflake/.DS_Store -------------------------------------------------------------------------------- /Snowflake/curated-links-list.md: -------------------------------------------------------------------------------- 1 | # Curated List of Smnowflake Links to Study 2 | 3 | ## Youtube Channels 4 | | link | Status| Notes on | 5 | |----------|:-------------:|------:| 6 | | https://www.youtube.com/@DataEngineering/playlists | | | 7 | 8 | 9 | ## General Articles 10 | | link | Status| Notes on | 11 | |----------|:-------------:|------:| 12 | | https://airbyte.com/blog/snowflake-data-cloud | | | 13 | 14 | 15 | ## Snowpark 16 | | link | Status| Notes on | 17 | |----------|:-------------:|------:| 18 | | https://medium.com/snowflake/your-cheatsheet-to-snowflake-snowpark-dataframes-using-python-e5ec8709d5d7| | | 19 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/COPY.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | $1 3 | ,$2 4 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20190916.export.csv ; 5 | 6 | CREATE DATABASE GDELT; 7 | 8 | CREATE SCHEMA EVENTS; 9 | 10 | CREATE TABLE IF NOT EXISTS GDELT.EVENTS.EVENTS_FULL ( 11 | 12 | GLOBALEVENTID INT 13 | 14 | ,SQLDATE varchar 15 | 16 | ,MonthYear varchar 17 | 18 | ,Year varchar 19 | 20 | ,FractionDate varchar 21 | 22 | ,Actor1Code varchar 23 | 24 | ,Actor1Name varchar 25 | 26 | ,Actor1CountryCode varchar 27 | 28 | ,Actor1KnownGroupCode varchar 29 | 30 | ,Actor1EthnicCode varchar 31 | 32 | ,Actor1Religion1Code varchar 33 | 34 | ,Actor1Religion2Code varchar 35 | 36 | ,Actor1Type1Code varchar 37 | 38 | ,Actor1Type2Code varchar 39 | 40 | ,Actor1Type3Code varchar 41 | 42 | ,Actor2Code varchar 43 | 44 | ,Actor2Name varchar 45 | 46 | ,Actor2CountryCode varchar 47 | 48 | ,Actor2KnownGroupCode varchar 49 | 50 | ,Actor2EthnicCode varchar 51 | 52 | ,Actor2Religion1Code varchar 53 | 54 | ,Actor2Religion2Code varchar 55 | 56 | ,Actor2Type1Code varchar 57 | 58 | ,Actor2Type2Code varchar 59 | 60 | ,Actor2Type3Code varchar 61 | 62 | ,IsRootEvent varchar 63 | 64 | ,EventCode varchar 65 | 66 | ,EventBaseCode varchar 67 | 68 | ,EventRootCode varchar 69 | 70 | ,QuadClass varchar 71 | 72 | ,GoldsteinScale varchar 73 | 74 | ,NumMentions varchar 75 | 76 | ,NumSources varchar 77 | 78 | ,NumArticles varchar 79 | 80 | ,AvgTone varchar 81 | 82 | ,Actor1Geo_Type varchar 83 | 84 | ,Actor1Geo_FullName varchar 85 | 86 | ,Actor1Geo_CountryCode varchar 87 | 88 | ,Actor1Geo_ADM1Code varchar 89 | 90 | ,Actor1Geo_Lat varchar 91 | 92 | ,Actor1Geo_Long varchar 93 | 94 | ,Actor1Geo_FeatureID varchar 95 | 96 | ,Actor2Geo_Type varchar 97 | 98 | ,Actor2Geo_FullName varchar 99 | 100 | 
,Actor2Geo_CountryCode varchar 101 | 102 | ,Actor2Geo_ADM1Code varchar 103 | 104 | ,Actor2Geo_Lat varchar 105 | 106 | ,Actor2Geo_Long varchar 107 | 108 | ,Actor2Geo_FeatureID varchar 109 | 110 | ,ActionGeo_Type varchar 111 | 112 | ,ActionGeo_FullName varchar 113 | 114 | ,ActionGeo_CountryCode varchar 115 | 116 | ,ActionGeo_ADM1Code varchar 117 | 118 | ,ActionGeo_Lat varchar 119 | 120 | ,ActionGeo_Long varchar 121 | 122 | ,ActionGeo_FeatureID varchar 123 | 124 | ,DATEADDED varchar 125 | 126 | ,SOURCEURL varchar 127 | 128 | ); 129 | 130 | 131 | COPY INTO GDELT.EVENTS.EVENTS_FULL 132 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events 133 | file_format = ( type = 'csv' field_delimiter = '\t') 134 | pattern = '.*2019091.*' 135 | ; 136 | 137 | select * from GDELT.EVENTS.EVENTS_FULL; 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/FILE FORMAT.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE SCHEMA FILE_FORMATS; 2 | 3 | CREATE OR REPLACE FILE FORMAT MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT 4 | FIELD_DELIMITER = '\t' 5 | TYPE = CSV 6 | --SKIP_HEADER=1 7 | ; 8 | 9 | DESC FILE FORMAT FILE_FORMATS.CSV_TAB_FMT; 10 | 11 | LIST @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/ ; 12 | 13 | COPY INTO GDELT.EVENTS.EVENTS_FULL 14 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20130922.export.csv 15 | file_format = MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT 16 | ; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/FLATTEN.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT 2 | RAW_FILE:id id 3 | ,array_size(RAW_FILE:multiMedia) size 4 | from medias.youtube.statistics_raw ; 5 | 6 | SELECT distinct 7 | RAW_FILE:id::int id 8 | ,RAW_FILE:createdAt createdAt 9 | ,RAW_FILE:description::string description 10 | ,RAW_FILE:likeDislike.dislikes dislikes 11 | ,RAW_FILE:likeDislike.likes likes 12 | ,RAW_FILE:likeDislike.userAction user_action 13 | ,f.value:id multimedia_id 14 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 15 | table(flatten(RAW_FILE:multiMedia)) f 16 | --where RAW_FILE:id::int = 2114 17 | 18 | 19 | ; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/INSERT.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE OR REPLACE TABLE MEDIAS.YOUTUBE.STATISTICS AS 3 | SELECT distinct 4 | RAW_FILE:id::int id 5 | ,TO_TIMESTAMP(RAW_FILE:createdAt) createdAt 6 | ,RAW_FILE:description::string description 7 | ,RAW_FILE:likeDislike.dislikes::INT dislikes 8 | ,RAW_FILE:likeDislike.likes::INT likes 9 | ,RAW_FILE:likeDislike.userAction::INT user_action 10 | ,f.value:id::INT multimedia_id 11 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 12 | table(flatten(RAW_FILE:multiMedia)) f 13 | ; 14 | 15 | SELECT COUNT(*) FROM MEDIAS.YOUTUBE.STATISTICS; 16 | 17 | INSERT INTO MEDIAS.YOUTUBE.STATISTICS 18 | SELECT distinct 19 | RAW_FILE:id::int id 20 | ,TO_TIMESTAMP(RAW_FILE:createdAt) createdAt 21 | ,RAW_FILE:description::string description 22 | ,RAW_FILE:likeDislike.dislikes::INT dislikes 23 | ,RAW_FILE:likeDislike.likes::INT likes 24 | ,RAW_FILE:likeDislike.userAction::INT user_action 25 | ,f.value:id::INT multimedia_id 26 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 27 | table(flatten(RAW_FILE:multiMedia)) f ; 
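Editor's note: rerunning the INSERT above reloads the same parsed rows, so MEDIAS.YOUTUBE.STATISTICS can accumulate duplicates. A minimal sketch of one way to deduplicate after the fact, assuming (id, multimedia_id) is the logical key of the table (an assumption, not something the course scripts state):

```sql
-- Hedged sketch: keep one row per (id, multimedia_id) pair.
-- Assumes these two columns form the logical key of STATISTICS.
CREATE OR REPLACE TABLE MEDIAS.YOUTUBE.STATISTICS AS
SELECT *
FROM MEDIAS.YOUTUBE.STATISTICS
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY id, multimedia_id
    ORDER BY createdAt DESC
) = 1;
```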
-------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/LOAD JSON.sql: -------------------------------------------------------------------------------- 1 | -- CRIAR STAGE > CARREGAR DADOS BRUTOS > ANALISE E PARSE > CRIAR O COMANDO COPY 2 | 3 | CREATE OR REPLACE STAGE MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 4 | URL= 's3://snowflake-series/' 5 | STORAGE_INTEGRATION = S3_INT; 6 | 7 | LIST @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES; 8 | 9 | CREATE OR REPLACE FILE FORMAT MANAGE_DB.FILE_FORMATS.JSON_FMT 10 | TYPE = JSON; 11 | 12 | CREATE DATABASE MEDIAS; 13 | 14 | CREATE SCHEMA MEDIAS.YOUTUBE; 15 | 16 | CREATE OR REPLACE table MEDIAS.YOUTUBE.STATISTICS_RAW ( 17 | raw_file variant); 18 | 19 | COPY INTO MEDIAS.YOUTUBE.STATISTICS_RAW 20 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 21 | file_format= MANAGE_DB.FILE_FORMATS.JSON_FMT 22 | files = ('youtube_data.json'); 23 | 24 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW ; 25 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/PARSE JSON.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 2 | 3 | SELECT raw_file:createdAt 4 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 5 | 6 | SELECT distinct 7 | $1:id::int id 8 | ,$1:createdAt createdAt 9 | ,$1:description::string description 10 | ,$1:likeDislike.dislikes dislikes 11 | ,$1:likeDislike.likes likes 12 | ,$1:likeDislike.userAction user_action 13 | ,RAW_FILE:multiMedia[0].name name 14 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW 15 | where RAW_FILE:id::int = 2134 16 | union all 17 | SELECT distinct 18 | $1:id::int id 19 | ,$1:createdAt createdAt 20 | ,$1:description::string description 21 | ,$1:likeDislike.dislikes dislikes 22 | ,$1:likeDislike.likes likes 23 | ,$1:likeDislike.userAction user_action 24 | ,RAW_FILE:multiMedia[1].name name 25 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW 26 | where RAW_FILE:id::int = 2134 27 | 28 | 29 | ; 30 | 31 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/STAGES.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE DATABASE MANAGE_DB; 2 | 3 | USE DATABASE MANAGE_DB; 4 | CREATE SCHEMA EXTERNAL_STAGES; 5 | 6 | USE SCHEMA EXTERNAL_STAGES; 7 | 8 | CREATE OR REPLACE STAGE GDELT_EVENTS 9 | URL = 's3://gdelt-open-data/' ; 10 | 11 | LIST @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS ; 12 | 13 | DESC STAGE MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS ; 14 | 15 | ALTER STAGE MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS 16 | SET CREDENTIALS=(aws_key_id='cab' aws_secret_key='123') -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/STORAGE INTEGRATION.sql: -------------------------------------------------------------------------------- 1 | --arn:aws:iam::749634257170:role/series-snowflake 2 | 3 | CREATE SCHEMA MANAGE_DB.STORAGE_INTERGRATION; 4 | 5 | USE DATABASE MANAGE_DB; 6 | USE SCHEMA STORAGE_INTERGRATION; 7 | 8 | 9 | ; 10 | create or replace storage integration S3_INT 11 | TYPE = EXTERNAL_STAGE 12 | STORAGE_PROVIDER = S3 13 | ENABLED = TRUE 14 | STORAGE_AWS_ROLE_ARN = 'arn:aws:iam::749634257170:role/series-snowflake' 15 | STORAGE_ALLOWED_LOCATIONS = ('s3://snowflake-series/') 16 | COMMENT = 'My first integration' 17 | ; 18 | 19 | DESC INTEGRATION S3_INT; 20 | 21 | 22 | 23 | 
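Editor's note on the DESC INTEGRATION step above: for an S3 storage integration, the output includes STORAGE_AWS_IAM_USER_ARN and STORAGE_AWS_EXTERNAL_ID, which have to be copied into the trust policy of the AWS IAM role before any stage built on the integration can read the bucket. A small hedged sketch of the follow-up steps; the role name LOADER is hypothetical and not part of the course scripts:

```sql
-- Values needed on the AWS side (trust policy of the IAM role):
DESC INTEGRATION S3_INT;  -- note STORAGE_AWS_IAM_USER_ARN and STORAGE_AWS_EXTERNAL_ID

-- Hedged sketch: let a non-ACCOUNTADMIN role build stages on top of the integration.
-- LOADER is a hypothetical role name.
GRANT USAGE ON INTEGRATION S3_INT TO ROLE LOADER;
```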
-------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/TRANSFORMATION WITH COPY.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE GDELT.EVENTS.EVENTS_URL AS 2 | SELECT 3 | $1::INT GLOBALEVENTID 4 | ,TO_DATE($2,'YYYYMMDD') AS SQLDATE 5 | ,$3 MONTHYEAR 6 | ,$58 SOURCEURL 7 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20190916.export.csv 8 | (file_format => MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT) 9 | ; 10 | 11 | COPY INTO GDELT.EVENTS.EVENTS_URL FROM ( 12 | SELECT 13 | $1::INT GLOBALEVENTID 14 | ,TO_DATE($2,'YYYYMMDD') AS SQLDATE 15 | ,$3 MONTHYEAR 16 | ,$58 SOURCEURL 17 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20190917.export.csv 18 | (file_format => MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT) 19 | ); 20 | 21 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/automacao/SNOWPIPE.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA MANAGE_DB.PIPES; 2 | 3 | CREATE PIPE MANAGE_DB.PIPES.JSON_PIPE 4 | AUTO_INGEST = TRUE 5 | AS 6 | COPY INTO MEDIAS.YOUTUBE.STATISTICS_RAW 7 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 8 | file_format= MANAGE_DB.FILE_FORMATS.JSON_FMT; 9 | 10 | DESC PIPE JSON_PIPE; 11 | 12 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 13 | 14 | TRUNCATE TABLE MEDIAS.YOUTUBE.STATISTICS_RAW; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/automacao/TASK TREE.sql: -------------------------------------------------------------------------------- 1 | SHOW TASKS; 2 | 3 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 4 | TRUNCATE TABLE MEDIAS.YOUTUBE.STATISTICS_RAW; 5 | 6 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS; 7 | 8 | CREATE OR REPLACE TASK LOAD_STATISTICS 9 | WAREHOUSE = COMPUTE_WH 10 | COMMENT = 'SEGUNDA TAREFA' 11 | AFTER LOAD_RAW 12 | AS 13 | INSERT INTO MEDIAS.YOUTUBE.STATISTICS 14 | SELECT distinct 15 | RAW_FILE:id::int id 16 | ,TO_TIMESTAMP(RAW_FILE:createdAt) createdAt 17 | ,RAW_FILE:description::string description 18 | ,RAW_FILE:likeDislike.dislikes::INT dislikes 19 | ,RAW_FILE:likeDislike.likes::INT likes 20 | ,RAW_FILE:likeDislike.userAction::INT user_action 21 | ,f.value:id::INT multimedia_id 22 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 23 | table(flatten(RAW_FILE:multiMedia)) f ; 24 | 25 | SHOW TASKS; 26 | 27 | ALTER TASK LOAD_RAW RESUME; 28 | ALTER TASK LOAD_STATISTICS RESUME; 29 | 30 | 31 | 32 | select * 33 | from table(information_schema.task_history()) 34 | order by scheduled_time desc ; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/automacao/TASK.SQL: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE SCHEMA MANAGE_DB.TASKS; 2 | 3 | CREATE OR REPLACE TASK LOAD_RAW 4 | WAREHOUSE = COMPUTE_WH 5 | SCHEDULE = '1 MINUTE' 6 | COMMENT = 'HAHA' 7 | AS 8 | COPY INTO MEDIAS.YOUTUBE.STATISTICS_RAW 9 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 10 | file_format= MANAGE_DB.FILE_FORMATS.JSON_FMT 11 | ; 12 | 13 | SHOW TASKS; 14 | 15 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 16 | 17 | TRUNCATE TABLE MEDIAS.YOUTUBE.STATISTICS_RAW; 18 | 19 | ALTER TASK LOAD_RAW RESUME; 20 | 21 | ALTER TASK LOAD_RAW SUSPEND; 22 | -------------------------------------------------------------------------------- 
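Editor's note complementing the interval schedule used in TASK.SQL above: Snowflake tasks also accept a CRON expression, and a task can be triggered manually on demand. A brief sketch; the 07:00 UTC schedule is only an illustrative choice:

```sql
-- Same load task, but on a CRON schedule instead of a fixed interval (example time).
-- A task must be suspended before its definition can be altered.
ALTER TASK LOAD_RAW SUSPEND;
ALTER TASK LOAD_RAW SET SCHEDULE = 'USING CRON 0 7 * * * UTC';
ALTER TASK LOAD_RAW RESUME;

-- Run it once on demand without waiting for the schedule.
EXECUTE TASK LOAD_RAW;
```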
/Snowflake/the-snowflake-series-course/test.sql: -------------------------------------------------------------------------------- 1 | select 2 | * 3 | from 4 | customer 5 | limit 6 | 100 7 | ; -------------------------------------------------------------------------------- /Snowflake/ws/extras/extras.sql: -------------------------------------------------------------------------------- 1 | -- Time Travel 2 | --Use-case: Update data (by mistake) 3 | UPDATE OUR_FIRST_DB.public.test 4 | SET FIRST_NAME = 'Joyen' ; 5 | 6 | -- see all is wrong 7 | SELECT * FROM OUR_FIRST_DB.public.test; 8 | 9 | --Using time travel: Method 1 - 1.5 minutes back (offset in seconds) 10 | SELECT * FROM OUR_FIRST_DB.public.test at (OFFSET => -60*1.5); 11 | 12 | -- see all is back to normal 13 | SELECT * FROM OUR_FIRST_DB.public.test; 14 | 15 | 16 | DROP DATABASE ; 17 | 18 | UNDROP DATABASE ; 19 | 20 | 21 | -- Zero-Copy Clone 22 | CREATE DATABASE MEDIAS_DEV CLONE MEDIAS; 23 | -- BEFORE (TIMESTAMP => 1231516) 24 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/1-storage-integration.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE DATABASE MANAGE_DB ; 2 | 3 | CREATE SCHEMA IF NOT EXISTS MANAGE_DB.STORAGE_INTERGRATION; 4 | 5 | USE DATABASE MANAGE_DB; 6 | USE SCHEMA STORAGE_INTERGRATION; 7 | 8 | 9 | CREATE STORAGE INTEGRATION AWS_S3_INT 10 | TYPE = EXTERNAL_STAGE 11 | STORAGE_PROVIDER = S3 12 | ENABLED = TRUE 13 | STORAGE_AWS_ROLE_ARN = 'arn:aws:iam::749634257170:role/snowflake-ws' 14 | STORAGE_ALLOWED_LOCATIONS = ('s3://snowflake-ws/') 15 | --STORAGE_BLOCKED_LOCATIONS = ('s3://mybucket3/path3/', 's3://mybucket4/path4/'); 16 | COMMENT = 'Integration to AWS s3' ; 17 | 18 | -- AZURE EXAMPLE 19 | -- CREATE STORAGE INTEGRATION AZURE_BLOB_INT 20 | -- TYPE = EXTERNAL_STAGE 21 | -- STORAGE_PROVIDER = 'AZURE' 22 | -- ENABLED = TRUE 23 | -- AZURE_TENANT_ID = '' 24 | -- STORAGE_ALLOWED_LOCATIONS = ('*') 25 | -- STORAGE_BLOCKED_LOCATIONS = ('azure://myaccount.blob.core.windows.net/mycontainer/path3/'); 26 | 27 | SHOW INTEGRATIONS ; 28 | 29 | DESC INTEGRATION AWS_S3_INT; 30 | 31 | 32 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/2-stage-and-file-format.sql: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------- 2 | STAGE 3 | CREATE STAGE > LOAD RAW DATA > ANALYZE AND PARSE > CREATE THE COPY COMMAND 4 | -------------------------------------------------------------*/ 5 | 6 | USE DATABASE MANAGE_DB; 7 | 8 | CREATE SCHEMA IF NOT EXISTS EXTERNAL_STAGES; 9 | 10 | USE SCHEMA EXTERNAL_STAGES; 11 | 12 | CREATE OR REPLACE STAGE MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 13 | URL= 's3://snowflake-ws/' 14 | --FILE_FORMAT = ( TYPE = CSV | JSON | AVRO | ORC | PARQUET | XML ) 15 | --CREDENTIALS = ( AWS_KEY_ID = '' AWS_SECRET_KEY = '' ) 16 | --ENCRYPTION = ( TYPE = 'AWS_CSE' MASTER_KEY = '' ) 17 | --COPY_OPTIONS = ( ) 18 | --TAG = '' 19 | STORAGE_INTEGRATION = AWS_S3_INT; 20 | 21 | LIST @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP; 22 | 23 | DESC STAGE SNOWFLAKE_WORKSHOP; 24 | 25 | SHOW STAGES LIKE '%WORKSHOP%'; 26 | 27 | 28 | /*------------------------------------------------------------- 29 | FILE FORMAT 30 | -------------------------------------------------------------*/ 31 | 32 | CREATE SCHEMA FILE_FORMATS; 33 | USE SCHEMA FILE_FORMATS; 34 | 35 | CREATE OR REPLACE FILE FORMAT
MANAGE_DB.FILE_FORMATS.JSON_FMT 36 | TYPE = JSON 37 | -- COMPRESSION = AUTO | GZIP | BZ2 | BROTLI | ZSTD | DEFLATE | RAW_DEFLATE | NONE 38 | -- DATE_FORMAT = '' | AUTO 39 | -- TIME_FORMAT = '' | AUTO 40 | -- TIMESTAMP_FORMAT = '' | AUTO 41 | -- BINARY_FORMAT = HEX | BASE64 | UTF8 42 | TRIM_SPACE = TRUE 43 | -- NULL_IF = ( '' ) 44 | -- FILE_EXTENSION = '' 45 | -- ENABLE_OCTAL = TRUE | FALSE 46 | ALLOW_DUPLICATE = TRUE 47 | -- STRIP_OUTER_ARRAY = TRUE | FALSE 48 | -- STRIP_NULL_VALUES = TRUE | FALSE 49 | REPLACE_INVALID_CHARACTERS = TRUE 50 | -- IGNORE_UTF8_ERRORS = TRUE | FALSE 51 | -- SKIP_BYTE_ORDER_MARK = TRUE | FALSE 52 | 53 | -- TYPE = PARQUET 54 | -- COMPRESSION = AUTO | LZO | SNAPPY | NONE 55 | -- SNAPPY_COMPRESSION = TRUE | FALSE 56 | -- BINARY_AS_TEXT = TRUE | FALSE 57 | -- TRIM_SPACE = TRUE | FALSE 58 | -- NULL_IF = ( '' ) 59 | ; 60 | 61 | SHOW FILE FORMATS; 62 | 63 | DESC FILE FORMAT JSON_FMT; 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/3-copy-command.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 2 | (file_format => MANAGE_DB.FILE_FORMATS.JSON_FMT); 3 | 4 | CREATE DATABASE MEDIAS; 5 | 6 | CREATE SCHEMA MEDIAS.YOUTUBE; 7 | 8 | CREATE OR REPLACE TABLE MEDIAS.YOUTUBE.RAW ( 9 | RAW_FILE VARIANT); 10 | 11 | COPY INTO MEDIAS.YOUTUBE.RAW -- ( RAW_FILE ) 12 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 13 | FILE_FORMAT= MANAGE_DB.FILE_FORMATS.JSON_FMT 14 | -- PATTERN = '.*error.json' 15 | -- FILES = ('youtube_data.json') 16 | ON_ERROR = SKIP_FILE --CONTINUE | SKIP_FILE | SKIP_FILE_ | 'SKIP_FILE_%' | ABORT_STATEMENT 17 | -- SIZE_LIMIT = 18 | -- PURGE = TRUE 19 | -- RETURN_FAILED_ONLY = TRUE | FALSE 20 | -- MATCH_BY_COLUMN_NAME = CASE_SENSITIVE | CASE_INSENSITIVE | NONE 21 | -- ENFORCE_LENGTH = TRUE | FALSE 22 | -- TRUNCATECOLUMNS = TRUE | FALSE 23 | -- FORCE = TRUE 24 | ; 25 | 26 | SELECT * FROM MEDIAS.YOUTUBE.RAW ; 27 | 28 | TRUNCATE TABLE MEDIAS.YOUTUBE.RAW ; 29 | 30 | 31 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/4-snowpipe.sql: -------------------------------------------------------------------------------- 1 | USE DATABASE MANAGE_DB; 2 | 3 | CREATE OR REPLACE SCHEMA MANAGE_DB.PIPES; 4 | 5 | USE SCHEMA PIPES; 6 | 7 | CREATE OR REPLACE PIPE MANAGE_DB.PIPES.YOUTUBE_RAW 8 | AUTO_INGEST = TRUE 9 | -- ERROR_INTEGRATION = -- Required only when configuring Snowpipe to send error notifications to a cloud messaging service. 10 | -- AWS_SNS_TOPIC = '' 11 | -- INTEGRATION = '' -- Required only when configuring AUTO_INGEST for Google Cloud Storage or Microsoft Azure stages. 
12 | COMMENT = 'Pipe to autoingest youtube data from S3' 13 | AS 14 | COPY INTO MEDIAS.YOUTUBE.RAW 15 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 16 | FILE_FORMAT= MANAGE_DB.FILE_FORMATS.JSON_FMT 17 | ON_ERROR = SKIP_FILE 18 | --FORCE = TRUE 19 | ; 20 | 21 | DESC PIPE YOUTUBE_RAW; 22 | 23 | SELECT * FROM MEDIAS.YOUTUBE.RAW ; 24 | 25 | 26 | /*------------------------------------------------------------- 27 | ERROR HANDLING 28 | -------------------------------------------------------------*/ 29 | 30 | -- Validate pipe is actually working 31 | SELECT SYSTEM$PIPE_STATUS('YOUTUBE_RAW'); 32 | 33 | -- Snowpipe error message 34 | -- sometime can give some general error message 35 | SELECT * FROM TABLE(VALIDATE_PIPE_LOAD( 36 | PIPE_NAME => 'MANAGE_DB.PIPES.YOUTUBE_RAW', 37 | START_TIME => DATEADD(YEAR,-2,CURRENT_TIMESTAMP()) 38 | )); 39 | 40 | -- COPY command history from table to see error massage 41 | -- here we have more details to understand the error 42 | SELECT * FROM TABLE (INFORMATION_SCHEMA.COPY_HISTORY( 43 | TABLE_NAME => 'MEDIAS.YOUTUBE.RAW', 44 | START_TIME => DATEADD(YEAR,-2,CURRENT_TIMESTAMP()))) 45 | -- END_TIME => 46 | ; 47 | 48 | select * 49 | from table(information_schema.pipe_usage_history( 50 | date_range_start=> DATEADD(YEAR,-2,CURRENT_TIMESTAMP()), 51 | pipe_name=>'MANAGE_DB.PIPES.YOUTUBE_RAW')); --14 dias 52 | 53 | -- Pause pipes 54 | ALTER PIPE MANAGE_DB.PIPES.YOUTUBE_RAW SET PIPE_EXECUTION_PAUSED = false; 55 | 56 | ALTER PIPE MANAGE_DB.PIPES.YOUTUBE_RAW REFRESH; --if pipe was created after file ingestion 57 | 58 | select * from MEDIAS.YOUTUBE.RAW; 59 | /*------------------------------------------------------------- 60 | MANAGING PIPES 61 | -------------------------------------------------------------*/ 62 | 63 | -- Manage pipes -- 64 | DESC PIPE MANAGE_DB.PIPES.YOUTUBE_RAW; 65 | 66 | SHOW PIPES; 67 | 68 | SHOW PIPES LIKE '%YOUTUBE%'; 69 | 70 | SHOW PIPES IN DATABASE MANAGE_DB; 71 | 72 | SHOW PIPES IN SCHEMA MANAGE_DB.PIPES; 73 | 74 | SHOW PIPES LIKE '%YOUTUBE%' IN DATABASE MANAGE_DB; -------------------------------------------------------------------------------- /Snowflake/ws/transformation/3-duplicates.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM MEDIAS.YOUTUBE.EVENTS; 2 | 3 | SELECT * FROM MEDIAS.YOUTUBE.RAW; 4 | 5 | MERGE INTO MEDIAS.YOUTUBE.EVENTS tgt 6 | USING ( 7 | SELECT distinct 8 | RAW_FILE:id::int id 9 | ,TO_TIMESTAMP(RAW_FILE:createdAt) created_at 10 | ,RAW_FILE:description::string description 11 | ,RAW_FILE:likeDislike.dislikes::int dislikes 12 | ,RAW_FILE:likeDislike.likes::int likes 13 | ,RAW_FILE:likeDislike.userAction::int user_action 14 | ,RAW_FILE:commentCount::int comment_count 15 | ,RAW_FILE:feedsComment::string feeds_comment 16 | ,RAW_FILE:location::string location 17 | ,RAW_FILE:mediatype::int media_type 18 | ,RAW_FILE:name::string name 19 | ,RAW_FILE:profilePicture::string profile_picture 20 | ,RAW_FILE:title::string title 21 | ,RAW_FILE:userId::int user_id 22 | FROM MEDIAS.YOUTUBE.RAW 23 | ) src 24 | ON src.id = tgt.id 25 | WHEN NOT MATCHED THEN 26 | INSERT 27 | ( 28 | id 29 | ,created_at 30 | ,description 31 | ,dislikes 32 | ,likes 33 | ,user_action 34 | ,comment_count 35 | ,feeds_comment 36 | ,location 37 | ,media_type 38 | ,name 39 | ,profile_picture 40 | ,title 41 | ,user_id 42 | ) 43 | VALUES 44 | ( 45 | src.id 46 | ,src.created_at 47 | ,src.description 48 | ,src.dislikes 49 | ,src.likes 50 | ,src.user_action 51 | ,src.comment_count 52 | ,src.feeds_comment 53 | ,src.location 54 
| ,src.media_type 55 | ,src.name 56 | ,src.profile_picture 57 | ,src.title 58 | ,src.user_id 59 | ); 60 | -------------------------------------------------------------------------------- /Snowflake/ws/transformation/5-streams+tasks.sql: -------------------------------------------------------------------------------- 1 | ------- Automatate the updates using tasks -- 2 | CREATE OR REPLACE TASK all_data_changes 3 | WAREHOUSE = COMPUTE_WH 4 | SCHEDULE = '1 MINUTE' 5 | WHEN SYSTEM$STREAM_HAS_DATA('CARS_STREAM') -- condition to only run when stream has data 6 | AS 7 | MERGE INTO CARS_STOCK_CURATED F -- Target table to merge changes from source table 8 | USING ( SELECT STRE.* 9 | ,ST.location 10 | ,ST.employees 11 | FROM CARS_STREAM STRE 12 | JOIN VENDORS ST 13 | ON STRE.vendor_id = ST.vendor_id 14 | ) S 15 | ON F.id=S.id 16 | WHEN MATCHED -- DELETE condition 17 | AND S.METADATA$ACTION ='DELETE' 18 | AND S.METADATA$ISUPDATE = 'FALSE' 19 | THEN DELETE 20 | WHEN MATCHED -- UPDATE condition 21 | AND S.METADATA$ACTION ='INSERT' 22 | AND S.METADATA$ISUPDATE = 'TRUE' 23 | THEN UPDATE 24 | SET f.car_model = s.car_model, 25 | f.price = s.price, 26 | f.in_stock= s.in_stock, 27 | f.vendor_id=s.vendor_id 28 | WHEN NOT MATCHED 29 | AND S.METADATA$ACTION ='INSERT' 30 | THEN INSERT 31 | ( 32 | id 33 | ,car_model 34 | ,price 35 | ,vendor_id 36 | ,in_stock 37 | ,employees 38 | ,location 39 | ) 40 | values 41 | ( 42 | s.id 43 | ,s.car_model 44 | ,s.price 45 | ,s.vendor_id 46 | ,s.in_stock 47 | ,s.employees 48 | ,s.location 49 | ); 50 | 51 | -- resume the task because they are by default not started 52 | ALTER TASK all_data_changes RESUME; 53 | ALTER TASK all_data_changes SUSPEND; 54 | 55 | -- check if task is created 56 | SHOW TASKS; 57 | 58 | 59 | -- Change data to test 60 | INSERT INTO CARS_STG VALUES (11,'rural',50,1,2); 61 | 62 | DELETE FROM CARS_STG 63 | WHERE car_model = 'mercedes'; 64 | 65 | 66 | -- Verify results 67 | -- stage should be changed 68 | SELECT * FROM CARS_STG; 69 | -- stream shoudl have the data changed (if the task did not run yet) 70 | SELECT * FROM CARS_STREAM; 71 | -- after task run (1min) final table should have the new updates 72 | SELECT * FROM CARS_STOCK_CURATED; 73 | 74 | 75 | -- Verify the history 76 | select * 77 | from table(information_schema.task_history()) 78 | order by name asc,scheduled_time desc; 79 | 80 | show tasks; -------------------------------------------------------------------------------- /Snowflake/ws/transformation/6-refactoring.sql: -------------------------------------------------------------------------------- 1 | COPY INTO MEDIAS.YOUTUBE.RAW 2 | FROM 3 | ( 4 | 5 | 6 | 7 | ) 8 | 9 | @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 10 | FILE_FORMAT= MANAGE_DB.FILE_FORMATS.JSON_FMT 11 | -- PATTERN = '.*error.json' 12 | -- FILES = ('youtube_data.json') 13 | ON_ERROR = SKIP_FILE --CONTINUE | SKIP_FILE | SKIP_FILE_ | 'SKIP_FILE_%' | ABORT_STATEMENT 14 | -- SIZE_LIMIT = 15 | -- PURGE = TRUE 16 | -- RETURN_FAILED_ONLY = TRUE | FALSE 17 | -- MATCH_BY_COLUMN_NAME = CASE_SENSITIVE | CASE_INSENSITIVE | NONE 18 | -- ENFORCE_LENGTH = TRUE | FALSE 19 | -- TRUNCATECOLUMNS = TRUE | FALSE 20 | -- FORCE = TRUE 21 | 22 | 23 | COPY INTO OUR_FIRST_DB.PUBLIC.ORDERS_EX 24 | FROM (select 25 | s.$1, 26 | s.$2, 27 | s.$3, 28 | CASE WHEN CAST(s.$3 as int) < 0 THEN 'not profitable' ELSE 'profitable' END 29 | from @MANAGE_DB.external_stages.aws_stage s) 30 | file_format= (type = csv field_delimiter=',' skip_header=1) 31 | files=('OrderDetails.csv'); 
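Editor's note: the first COPY INTO in 6-refactoring.sql above is left with an empty subquery as a refactoring placeholder. A minimal hedged completion, assuming the intent is the same transformation-style COPY shown in the CSV example that follows it, applied to the raw JSON stage (the column choice of the whole document into the single VARIANT column is an assumption):

```sql
-- Hedged sketch: transformation-style COPY for the JSON stage,
-- loading each document as one VARIANT value into MEDIAS.YOUTUBE.RAW.
COPY INTO MEDIAS.YOUTUBE.RAW
FROM (
    SELECT s.$1
    FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP s
)
FILE_FORMAT = MANAGE_DB.FILE_FORMATS.JSON_FMT
ON_ERROR = SKIP_FILE;
```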
-------------------------------------------------------------------------------- /Terraform/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/.DS_Store -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/06-organization-and-modules/.DS_Store -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/README.md: -------------------------------------------------------------------------------- 1 | ## Modifications 2 | - remove backend definition 3 | - remove provider definition 4 | 5 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/consul/README.md: -------------------------------------------------------------------------------- 1 | Uses a module from the terraform registry: 2 | 3 | https://github.com/hashicorp/terraform-aws-consul -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/consul/main.tf: -------------------------------------------------------------------------------- 1 | ## THIS SHOWS HOW TO IMPORT AND EXTENAL MODULE FROM TERRAFORM REGISTRY 2 | 3 | terraform { 4 | # Assumes s3 bucket and dynamo DB table already set up 5 | # See /code/03-basics/aws-backend 6 | backend "s3" { 7 | bucket = "devops-directive-tf-state" 8 | key = "06-organization-and-modules/consul/terraform.tfstate" 9 | region = "us-east-1" 10 | dynamodb_table = "terraform-state-locking" 11 | encrypt = true 12 | } 13 | 14 | required_providers { 15 | aws = { 16 | source = "hashicorp/aws" 17 | version = "~> 3.0" 18 | } 19 | } 20 | } 21 | 22 | provider "aws" { 23 | region = "us-east-1" 24 | } 25 | 26 | ############################################################ 27 | ## 28 | ## NOTE: if you are deploying this in your production setup 29 | ## follow the instructions in the github repo on how to modify 30 | ## deploying with the defaults here as an example of the power 31 | ## of modules. 
32 | ## 33 | ## REPO: https://github.com/hashicorp/terraform-aws-consul 34 | ## 35 | ############################################################ 36 | module "consul" { 37 | source = "git@github.com:hashicorp/terraform-aws-consul.git" 38 | } 39 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/compute.tf: -------------------------------------------------------------------------------- 1 | resource "aws_instance" "instance_1" { 2 | ami = var.ami 3 | instance_type = var.instance_type 4 | security_groups = [aws_security_group.instances.name] 5 | user_data = <<-EOF 6 | #!/bin/bash 7 | echo "Hello, World 1" > index.html 8 | python3 -m http.server 8080 & 9 | EOF 10 | } 11 | 12 | resource "aws_instance" "instance_2" { 13 | ami = var.ami 14 | instance_type = var.instance_type 15 | security_groups = [aws_security_group.instances.name] 16 | user_data = <<-EOF 17 | #!/bin/bash 18 | echo "Hello, World 2" > index.html 19 | python3 -m http.server 8080 & 20 | EOF 21 | } 22 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/database.tf: -------------------------------------------------------------------------------- 1 | resource "aws_db_instance" "db_instance" { 2 | allocated_storage = 20 3 | storage_type = "standard" 4 | engine = "postgres" 5 | engine_version = "12.5" 6 | instance_class = "db.t2.micro" 7 | name = var.db_name 8 | username = var.db_user 9 | password = var.db_pass 10 | skip_final_snapshot = true 11 | } 12 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/dns.tf: -------------------------------------------------------------------------------- 1 | resource "aws_route53_zone" "primary" { 2 | count = var.create_dns_zone ? 1 : 0 3 | name = var.domain 4 | } 5 | 6 | data "aws_route53_zone" "primary" { 7 | count = var.create_dns_zone ? 0 : 1 8 | name = var.domain 9 | } 10 | 11 | locals { 12 | dns_zone_id = var.create_dns_zone ? aws_route53_zone.primary[0].zone_id : data.aws_route53_zone.primary[0].zone_id 13 | subdomain = var.environment_name == "production" ? "" : "${var.environment_name}." 
14 | } 15 | 16 | resource "aws_route53_record" "root" { 17 | zone_id = local.dns_zone_id 18 | name = "${local.subdomain}${var.domain}" 19 | type = "A" 20 | 21 | alias { 22 | name = aws_lb.load_balancer.dns_name 23 | zone_id = aws_lb.load_balancer.zone_id 24 | evaluate_target_health = true 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.0" 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/outputs.tf: -------------------------------------------------------------------------------- 1 | output "instance_1_ip_addr" { 2 | value = aws_instance.instance_1.public_ip 3 | } 4 | 5 | output "instance_2_ip_addr" { 6 | value = aws_instance.instance_2.public_ip 7 | } 8 | 9 | output "db_instance_addr" { 10 | value = aws_db_instance.db_instance.address 11 | } 12 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/storage.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "bucket" { 2 | bucket = var.bucket_name 3 | force_destroy = true 4 | versioning { 5 | enabled = true 6 | } 7 | 8 | server_side_encryption_configuration { 9 | rule { 10 | apply_server_side_encryption_by_default { 11 | sse_algorithm = "AES256" 12 | } 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/variables.tf: -------------------------------------------------------------------------------- 1 | # General Variables 2 | 3 | variable "region" { 4 | description = "Default region for provider" 5 | type = string 6 | default = "us-east-1" 7 | } 8 | 9 | variable "app_name" { 10 | description = "Name of the web application" 11 | type = string 12 | default = "web-app" 13 | } 14 | 15 | variable "environment_name" { 16 | description = "Deployment environment (dev/staging/production)" 17 | type = string 18 | default = "dev" 19 | } 20 | 21 | # EC2 Variables 22 | 23 | variable "ami" { 24 | description = "Amazon machine image to use for ec2 instance" 25 | type = string 26 | default = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 27 | } 28 | 29 | variable "instance_type" { 30 | description = "ec2 instance type" 31 | type = string 32 | default = "t2.micro" 33 | } 34 | 35 | # S3 Variables 36 | 37 | variable "bucket_name" { 38 | description = "name of s3 bucket for app data" 39 | type = string 40 | } 41 | 42 | # Route 53 Variables 43 | 44 | variable "create_dns_zone" { 45 | description = "If true, create new route53 zone, if false read existing route53 zone" 46 | type = bool 47 | default = false 48 | } 49 | 50 | variable "domain" { 51 | description = "Domain for website" 52 | type = string 53 | } 54 | 55 | # RDS Variables 56 | 57 | variable "db_name" { 58 | description = "Name of DB" 59 | type = string 60 | } 61 | 62 | variable "db_user" { 63 | description = "Username for DB" 64 | type = string 65 | } 66 | 67 | variable "db_pass" { 68 | description = "Password for DB" 69 | type = string 70 | sensitive = true 71 | } 72 | 73 | 74 | 
-------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | # Assumes s3 bucket and dynamo DB table already set up 3 | # See /code/03-basics/aws-backend 4 | backend "s3" { 5 | bucket = "devops-directive-tf-state" 6 | key = "06-organization-and-modules/web-app/terraform.tfstate" 7 | region = "us-east-1" 8 | dynamodb_table = "terraform-state-locking" 9 | encrypt = true 10 | } 11 | 12 | required_providers { 13 | aws = { 14 | source = "hashicorp/aws" 15 | version = "~> 3.0" 16 | } 17 | } 18 | } 19 | 20 | provider "aws" { 21 | region = "us-east-1" 22 | } 23 | 24 | variable "db_pass_1" { 25 | description = "password for database #1" 26 | type = string 27 | sensitive = true 28 | } 29 | 30 | variable "db_pass_2" { 31 | description = "password for database #2" 32 | type = string 33 | sensitive = true 34 | } 35 | 36 | ## import the module on the root directory 37 | module "web_app_1" { 38 | source = "../web-app-module" 39 | 40 | # Input Variables 41 | bucket_name = "web-app-1-devops-directive-web-app-data" 42 | domain = "devopsdeployed.com" 43 | app_name = "web-app-1" 44 | environment_name = "production" 45 | instance_type = "t2.small" 46 | create_dns_zone = true 47 | db_name = "webapp1db" 48 | db_user = "foo" 49 | db_pass = var.db_pass_1 50 | } 51 | 52 | module "web_app_2" { 53 | source = "../web-app-module" 54 | 55 | # Input Variables 56 | bucket_name = "web-app-2-devops-directive-web-app-data" 57 | domain = "anotherdevopsdeployed.com" 58 | app_name = "web-app-2" 59 | environment_name = "production" 60 | instance_type = "t2.small" 61 | create_dns_zone = true 62 | db_name = "webapp2db" 63 | db_user = "bar" 64 | db_pass = var.db_pass_2 65 | } 66 | -------------------------------------------------------------------------------- /Terraform/2-first-tf-deployment/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.0" 6 | } 7 | } 8 | } 9 | 10 | provider "aws" { 11 | region = "us-east-1" 12 | } 13 | 14 | resource "aws_instance" "example" { 15 | ami = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 16 | instance_type = "t2.micro" 17 | } -------------------------------------------------------------------------------- /Terraform/2-first-tf-deployment/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.1.9", 4 | "serial": 3, 5 | "lineage": "5f6cc133-349c-4042-a6c7-b0d657a26a31", 6 | "outputs": {}, 7 | "resources": [] 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/3-remote-backend/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | ############################################################# 3 | ## AFTER RUNNING TERRAFORM APPLY (WITH LOCAL BACKEND) 4 | ## YOU WILL UNCOMMENT THIS CODE THEN RERUN TERRAFORM INIT 5 | ## TO SWITCH FROM LOCAL BACKEND TO REMOTE AWS BACKEND 6 | ############################################################# 7 | # backend "s3" { 8 | # bucket = "cabolba88-tf-state-dev" # REPLACE WITH YOUR BUCKET NAME 9 | # key = "03-basics/import-bootstrap/terraform.tfstate" # WHERE IN THE BUCKET THE STATE FILE WILL BE 10 | # region = "us-east-1" 11 | # dynamodb_table = 
"terraform-state-locking" # TABLE NAME 12 | # encrypt = true 13 | # } 14 | 15 | required_providers { # CONFIG THE PROVIDER VERSION, CAN CHECK ON TERRAFORM SITE 16 | aws = { 17 | source = "hashicorp/aws" 18 | version = "~> 3.0" 19 | } 20 | } 21 | } 22 | 23 | provider "aws" { # PASSING SOME OPTIONS TO THE PROVIDER 24 | region = "us-east-1" 25 | } 26 | 27 | resource "aws_s3_bucket" "terraform_state" { 28 | bucket = "cabolba88-tf-state-dev" # REPLACE WITH YOUR BUCKET NAME 29 | force_destroy = true 30 | versioning { 31 | enabled = true 32 | } 33 | 34 | server_side_encryption_configuration { 35 | rule { 36 | apply_server_side_encryption_by_default { 37 | sse_algorithm = "AES256" 38 | } 39 | } 40 | } 41 | } 42 | 43 | resource "aws_dynamodb_table" "terraform_locks" { 44 | name = "terraform-state-locking" 45 | billing_mode = "PAY_PER_REQUEST" 46 | hash_key = "LockID" 47 | attribute { 48 | name = "LockID" 49 | type = "S" 50 | } 51 | } -------------------------------------------------------------------------------- /Terraform/3-remote-backend/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.1.9", 4 | "serial": 6, 5 | "lineage": "69d41dc4-b373-9e1f-2967-04141b114b62", 6 | "outputs": {}, 7 | "resources": [] 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/3-web-app/.terraform/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "serial": 2, 4 | "lineage": "ea8bd856-bf98-311f-6a59-28d5805ff650", 5 | "backend": { 6 | "type": "s3", 7 | "config": { 8 | "access_key": null, 9 | "acl": null, 10 | "assume_role_duration_seconds": null, 11 | "assume_role_policy": null, 12 | "assume_role_policy_arns": null, 13 | "assume_role_tags": null, 14 | "assume_role_transitive_tag_keys": null, 15 | "bucket": "cabolba88-tf-state-dev", 16 | "dynamodb_endpoint": null, 17 | "dynamodb_table": "terraform-state-locking", 18 | "encrypt": true, 19 | "endpoint": null, 20 | "external_id": null, 21 | "force_path_style": null, 22 | "iam_endpoint": null, 23 | "key": "03-basics/import-bootstrap/terraform.tfstate", 24 | "kms_key_id": null, 25 | "max_retries": null, 26 | "profile": null, 27 | "region": "us-east-1", 28 | "role_arn": null, 29 | "secret_key": null, 30 | "session_name": null, 31 | "shared_credentials_file": null, 32 | "skip_credentials_validation": null, 33 | "skip_metadata_api_check": null, 34 | "skip_region_validation": null, 35 | "sse_customer_key": null, 36 | "sts_endpoint": null, 37 | "token": null, 38 | "workspace_key_prefix": null 39 | }, 40 | "hash": 1591429616 41 | }, 42 | "modules": [ 43 | { 44 | "path": [ 45 | "root" 46 | ], 47 | "outputs": {}, 48 | "resources": {}, 49 | "depends_on": [] 50 | } 51 | ] 52 | } 53 | -------------------------------------------------------------------------------- /Terraform/3-web-app/web-app-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/3-web-app/web-app-architecture.png -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/another-variable-file.tfvars: -------------------------------------------------------------------------------- 1 | instance_name = "hello-world-2" 2 | 
-------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | # backend "s3" { 3 | # bucket = "devops-directive-tf-state" 4 | # key = "04-variables-and-outputs/examples/terraform.tfstate" 5 | # region = "us-east-1" 6 | # dynamodb_table = "terraform-state-locking" 7 | # encrypt = true 8 | # } 9 | 10 | required_providers { 11 | aws = { 12 | source = "hashicorp/aws" 13 | version = "~> 3.0" 14 | } 15 | } 16 | } 17 | 18 | provider "aws" { 19 | region = "us-east-1" 20 | } 21 | 22 | locals { 23 | extra_tag = "extra-tag" 24 | } 25 | 26 | resource "aws_instance" "instance" { 27 | ami = var.ami #comming from variables.tf 28 | instance_type = var.instance_type 29 | 30 | tags = { 31 | Name = var.instance_name 32 | ExtraTag = local.extra_tag 33 | } 34 | } 35 | 36 | resource "aws_db_instance" "db_instance" { 37 | allocated_storage = 20 38 | storage_type = "gp2" 39 | engine = "postgres" 40 | engine_version = "12.4" 41 | instance_class = "db.t2.micro" 42 | name = "mydb" 43 | username = var.db_user 44 | password = var.db_pass 45 | skip_final_snapshot = true 46 | } 47 | 48 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/outputs.tf: -------------------------------------------------------------------------------- 1 | output "instance_ip_addr" { 2 | value = aws_instance.instance.private_ip 3 | } 4 | 5 | output "db_instance_addr" { 6 | value = aws_db_instance.db_instance.address 7 | } 8 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/terraform.tfvars: -------------------------------------------------------------------------------- 1 | instance_name = "hello-world" 2 | ami = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 3 | instance_type = "t2.micro" -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/variables.tf: -------------------------------------------------------------------------------- 1 | # should specify optional vs required 2 | 3 | variable "instance_name" { 4 | description = "Name of ec2 instance" 5 | type = string 6 | } 7 | 8 | variable "ami" { 9 | description = "Amazon machine image to use for ec2 instance" 10 | type = string 11 | default = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 12 | } 13 | 14 | variable "instance_type" { 15 | description = "ec2 instance type" 16 | type = string 17 | default = "t2.micro" 18 | } 19 | 20 | variable "db_user" { 21 | description = "username for database" 22 | type = string 23 | default = "foo" 24 | } 25 | 26 | variable "db_pass" { 27 | description = "password for database" 28 | type = string 29 | sensitive = true 30 | } 31 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/web-app/outputs.tf: -------------------------------------------------------------------------------- 1 | output "instance_1_ip_addr" { 2 | value = aws_instance.instance_1.public_ip 3 | } 4 | 5 | output "instance_2_ip_addr" { 6 | value = aws_instance.instance_2.public_ip 7 | } 8 | 9 | # output "db_instance_addr" { 10 | # value = aws_db_instance.db_instance.address 11 | # } 12 | -------------------------------------------------------------------------------- 
/Terraform/4-variables-and-outputs/web-app/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.1.9", 4 | "serial": 35, 5 | "lineage": "f60b97d6-2ff5-9c53-a198-fa1e6e8eae80", 6 | "outputs": {}, 7 | "resources": [] 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/web-app/terraform.tfvars: -------------------------------------------------------------------------------- 1 | bucket_name = "devops-directive-web-app-data" 2 | domain = "devopsdeployed.com" 3 | db_name = "mydb" 4 | db_user = "foo" 5 | # db_pass = "foobarbaz" 6 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/web-app/variables.tf: -------------------------------------------------------------------------------- 1 | # General Variables 2 | 3 | variable "region" { 4 | description = "Default region for provider" 5 | type = string 6 | default = "us-east-1" 7 | } 8 | 9 | # EC2 Variables 10 | 11 | variable "ami" { 12 | description = "Amazon machine image to use for ec2 instance" 13 | type = string 14 | default = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 15 | } 16 | 17 | variable "instance_type" { 18 | description = "ec2 instance type" 19 | type = string 20 | default = "t2.micro" 21 | } 22 | 23 | # S3 Variables 24 | 25 | variable "bucket_name" { 26 | description = "name of s3 bucket for app data" 27 | type = string 28 | } 29 | 30 | # Route 53 Variables 31 | 32 | variable "domain" { 33 | description = "Domain for website" 34 | type = string 35 | } 36 | 37 | # RDS Variables 38 | 39 | variable "db_name" { 40 | description = "Name of DB" 41 | type = string 42 | } 43 | 44 | variable "db_user" { 45 | description = "Username for DB" 46 | type = string 47 | } 48 | 49 | variable "db_pass" { 50 | description = "Password for DB" 51 | type = string 52 | sensitive = true 53 | } 54 | 55 | 56 | -------------------------------------------------------------------------------- /Terraform/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/img/.DS_Store -------------------------------------------------------------------------------- /Terraform/img/2_2_terraform_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/img/2_2_terraform_architecture.png -------------------------------------------------------------------------------- /airbyte-dbt-airflow-snowflake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/airbyte-dbt-airflow-snowflake/.DS_Store -------------------------------------------------------------------------------- /airbyte-dbt-airflow-snowflake/flags.yml: -------------------------------------------------------------------------------- 1 | flags: 2 | - name: performance.backgroundJsonSchemaValidation 3 | serve: false 4 | - name: heartbeat.failSync 5 | serve: true 6 | - name: connectors.versionOverridesEnabled 7 | serve: true 8 | - name: billing.newTrialPolicy 9 | serve: false 10 | - name: heartbeat-max-seconds-between-messages 11 | serve: 
"10800" 12 | - name: use-new-notification-workflow 13 | serve: false 14 | - name: validation.removeValidationLimit 15 | serve: false 16 | - name: connection.columnSelection 17 | serve: true 18 | - name: refreshSchema.period.hours 19 | serve: 24 20 | - name: concurrent.source.stream.read 21 | serve: false 22 | - name: platform.add-scheduling-jitter 23 | serve: false 24 | - name: connectors.apm-enabled 25 | serve: false 26 | - name: connectors.useIconUrlInApiResponse 27 | serve: true 28 | - name: platform.use-new-schema-update-notification 29 | serve: false 30 | # whether the platform will track destination timeouts. If set to 'true', then the platform will monitor if there has been 31 | # a call to a destination that has taken more than the threshold defined in 'destination-timeout.maxSeconds', and if so, 32 | # the platform will consider the destination to have timed out. 33 | - name: destination-timeout-enabled 34 | serve: true 35 | # If set to 'true' and the platform detects a destination timeout, it will fail the sync. Otherwise, it will 36 | # log a message and the sync will proceed. 37 | - name: destination-timeout.failSync 38 | serve: true 39 | # the time the platform waits before it decides that a destination has timed out. 40 | - name: destination-timeout.seconds 41 | serve: 86400 42 | - name: platform.inject-aws-secrets-to-connector-pods 43 | serve: false 44 | - name: platform.use-workload-api 45 | serve: false 46 | -------------------------------------------------------------------------------- /airbyte-dbt-airflow-snowflake/temporal/dynamicconfig/development.yaml: -------------------------------------------------------------------------------- 1 | # when modifying, remember to update the kube version of this file kube/resources/temporal.yaml 2 | frontend.enableClientVersionCheck: 3 | - value: true 4 | constraints: {} 5 | history.persistenceMaxQPS: 6 | - value: 3000 7 | constraints: {} 8 | frontend.persistenceMaxQPS: 9 | - value: 3000 10 | constraints: {} 11 | frontend.historyMgrNumConns: 12 | - value: 30 13 | constraints: {} 14 | frontend.throttledLogRPS: 15 | - value: 20 16 | constraints: {} 17 | history.historyMgrNumConns: 18 | - value: 50 19 | constraints: {} 20 | system.advancedVisibilityWritingMode: 21 | - value: "off" 22 | constraints: {} 23 | history.defaultActivityRetryPolicy: 24 | - value: 25 | InitialIntervalInSeconds: 1 26 | MaximumIntervalCoefficient: 100.0 27 | BackoffCoefficient: 2.0 28 | MaximumAttempts: 0 29 | history.defaultWorkflowRetryPolicy: 30 | - value: 31 | InitialIntervalInSeconds: 1 32 | MaximumIntervalCoefficient: 100.0 33 | BackoffCoefficient: 2.0 34 | MaximumAttempts: 0 35 | # Limit for responses. This mostly impacts discovery jobs since they have the largest responses. 
36 | limit.blobSize.error: 37 | - value: 15728640 # 15MB 38 | constraints: {} 39 | limit.blobSize.warn: 40 | - value: 10485760 # 10MB 41 | constraints: {} 42 | -------------------------------------------------------------------------------- /dbt/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/dim_customer-3.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | select * from {{ ref('stg_customers')}} 3 | ), 4 | orders as ( 5 | select * from {{ ref('fct_orders')}} 6 | ), 7 | customer_orders as ( 8 | select 9 | customer_id, 10 | min(order_date) as first_order_date, 11 | max(order_date) as most_recent_order_date, 12 | count(order_id) as number_of_orders, 13 | sum(amount) as lifetime_value 14 | from orders 15 | group by 1 16 | ), 17 | final as ( 18 | select 19 | customers.customer_id, 20 | customers.first_name, 21 | customers.last_name, 22 | customer_orders.first_order_date, 23 | customer_orders.most_recent_order_date, 24 | coalesce(customer_orders.number_of_orders, 0) as number_of_orders, 25 | customer_orders.lifetime_value 26 | from customers 27 | left join customer_orders using (customer_id) 28 | ) 29 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/dim_customers-2.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | 3 | select * from {{ ref('stg_customers')}} 4 | 5 | ), 6 | 7 | orders as ( 8 | 9 | select * from {{ ref('stg_orders') }} 10 | 11 | ), 12 | 13 | customer_orders as ( 14 | 15 | select 16 | customer_id, 17 | 18 | min(order_date) as first_order_date, 19 | max(order_date) as most_recent_order_date, 20 | count(order_id) as number_of_orders 21 | 22 | from orders 23 | 24 | group by 1 25 | 26 | ), 27 | 28 | 29 | final as ( 30 | 31 | select 32 | customers.customer_id, 33 | customers.first_name, 34 | customers.last_name, 35 | customer_orders.first_order_date, 36 | customer_orders.most_recent_order_date, 37 | coalesce(customer_orders.number_of_orders, 0) as number_of_orders 38 | 39 | from customers 40 | 41 | left join customer_orders using (customer_id) 42 | 43 | ) 44 | 45 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/dim_customers.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | 3 | select 4 | id as customer_id, 5 | first_name, 6 | last_name 7 | 8 | from raw.jaffle_shop.customers 9 | 10 | ), 11 | 12 | orders as ( 13 | 14 | select 15 | id as order_id, 16 | user_id as customer_id, 17 | order_date, 18 | status 19 | 20 | from raw.jaffle_shop.orders 21 | 22 | ), 23 | 24 | customer_orders as ( 25 | 26 | select 27 | customer_id, 28 | 29 | min(order_date) as first_order_date, 30 | max(order_date) as most_recent_order_date, 31 | count(order_id) as number_of_orders 32 | 33 | from orders 34 | 35 | group by 1 36 | 37 | ), 38 | 39 | 40 | final as ( 41 | 42 | select 43 | customers.customer_id, 44 | customers.first_name, 45 | customers.last_name, 46 | customer_orders.first_order_date, 47 | customer_orders.most_recent_order_date, 48 | coalesce(customer_orders.number_of_orders, 0) as 
number_of_orders 49 | 50 | from customers 51 | 52 | left join customer_orders using (customer_id) 53 | 54 | ) 55 | 56 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/fct_orders.sql: -------------------------------------------------------------------------------- 1 | with orders as ( 2 | select * from {{ ref('stg_orders' )}} 3 | ), 4 | 5 | payments as ( 6 | select * from {{ ref('stg_payments') }} 7 | ), 8 | 9 | order_payments as ( 10 | select 11 | order_id, 12 | sum(case when status = 'success' then amount end) as amount 13 | 14 | from payments 15 | group by 1 16 | ), 17 | 18 | final as ( 19 | 20 | select 21 | orders.order_id, 22 | orders.customer_id, 23 | orders.order_date, 24 | coalesce(order_payments.amount, 0) as amount 25 | 26 | from orders 27 | left join order_payments using (order_id) 28 | ) 29 | 30 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/dbt_and_alatycs_engineer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/dbt_and_alatycs_engineer.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/dbt_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/dbt_workflow.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/ineage_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/ineage_graph.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/modern_data_platform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/modern_data_platform.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/insert_sf_dbt_training_data.sql: -------------------------------------------------------------------------------- 1 | create warehouse transforming; 2 | 3 | create database raw; 4 | 5 | create database analytics; 6 | 7 | create schema raw.jaffle_shop; 8 | 9 | -- create this one directly in the schema 10 | create table raw.jaffle_shop.customers 11 | ( 12 | id integer, 13 | first_name varchar, 14 | last_name varchar 15 | ); 16 | 17 | copy into raw.jaffle_shop.customers (id, first_name, last_name) 18 | from 's3://dbt-tutorial-public/jaffle_shop_customers.csv' 19 | file_format = ( 20 | type = 'CSV' 21 | field_delimiter = ',' 22 | skip_header = 1 23 | ) 24 | ; 25 | 26 | create table raw.jaffle_shop.orders 27 | ( 28 | id integer, 29 | user_id integer, 30 | order_date date, 31 | status varchar, 32 | _etl_loaded_at timestamp default current_timestamp 33 | ); 34 | 35 | copy into raw.jaffle_shop.orders (id, user_id, order_date, status) 36 | from 's3://dbt-tutorial-public/jaffle_shop_orders.csv' 37 | file_format 
= ( 38 | type = 'CSV' 39 | field_delimiter = ',' 40 | skip_header = 1 41 | ) 42 | ; 43 | 44 | create schema raw.stripe; 45 | 46 | create table raw.stripe.payment ( 47 | id integer, 48 | orderid integer, 49 | paymentmethod varchar, 50 | status varchar, 51 | amount integer, 52 | created date, 53 | _batched_at timestamp default current_timestamp 54 | ); 55 | 56 | copy into raw.stripe.payment (id, orderid, paymentmethod, status, amount, created) 57 | from 's3://dbt-tutorial-public/stripe_payments.csv' 58 | file_format = ( 59 | type = 'CSV' 60 | field_delimiter = ',' 61 | skip_header = 1 62 | ) 63 | ; -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/jaffle_shop.md: -------------------------------------------------------------------------------- 1 | {% docs order_status %} 2 | 3 | One of the following values: 4 | 5 | | status | definition | 6 | |----------------|--------------------------------------------------| 7 | | placed | Order placed, not yet shipped | 8 | | shipped | Order has been shipped, not yet been delivered | 9 | | completed | Order has been received by customers | 10 | | return pending | Customer indicated they want to return this item | 11 | | returned | Item has been returned | 12 | 13 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop-2.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/src_jaffle_shop-2.yml -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop-3.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: raw 6 | schema: jaffle_shop 7 | tables: 8 | - name: customers 9 | columns: 10 | - name: id 11 | tests: 12 | - unique 13 | - not_null 14 | 15 | - name: orders 16 | columns: 17 | - name: id 18 | tests: 19 | - unique 20 | - not_null 21 | loaded_at_field: _etl_loaded_at 22 | freshness: 23 | warn_after: {count: 12, period: hour} 24 | error_after: {count: 24, period: hour} -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop-4.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | description: A clone of a Postgres application database. 6 | database: raw 7 | schema: jaffle_shop 8 | tables: 9 | - name: customers 10 | description: Raw customers data. 11 | columns: 12 | - name: id 13 | description: Primary key for customers. 14 | tests: 15 | - unique 16 | - not_null 17 | 18 | - name: orders 19 | description: Raw orders data. 20 | columns: 21 | - name: id 22 | description: Primary key for orders. 
23 | tests: 24 | - unique 25 | - not_null 26 | loaded_at_field: _etl_loaded_at 27 | freshness: 28 | warn_after: {count: 12, period: hour} 29 | error_after: {count: 24, period: hour} -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: raw 6 | 7 | schema: jaffle_shop 8 | tables: 9 | - name: customers 10 | - name: orders -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_stripe.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: stripe 5 | database: raw 6 | schema: stripe 7 | tables: 8 | - name: payment 9 | loaded_at_field: _batched_at 10 | freshness: 11 | warn_after: {count: 12,period: hour } -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_customers.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | 3 | select 4 | id as customer_id, 5 | first_name, 6 | last_name 7 | 8 | --from raw.jaffle_shop.customers -- to begin the course 9 | from {{ source('jaffle_shop','customers') }} -- used in the source chapter 10 | ) 11 | 12 | select * from customers 13 | -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_jaffle_shop-2.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_customers 5 | description: my description table 6 | columns: 7 | - name: customer_id 8 | description: my description column 9 | tests: 10 | - unique 11 | - not_null 12 | 13 | - name: stg_orders 14 | description: my description table 15 | columns: 16 | - name: order_id 17 | description: my description column 18 | tests: 19 | - unique 20 | - not_null 21 | - name: status 22 | description: "{{ doc('order_status') }}" 23 | tests: 24 | - accepted_values: 25 | values: 26 | - completed 27 | - shipped 28 | - returned 29 | - return_pending 30 | - placed 31 | - name: customer_id 32 | description: my description column 33 | tests: 34 | - relationships: 35 | to: ref('stg_customers') 36 | field: customer_id -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_jaffle_shop.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_customers 5 | columns: 6 | - name: customer_id 7 | tests: 8 | - unique 9 | - not_null 10 | 11 | - name: stg_orders 12 | columns: 13 | - name: order_id 14 | tests: 15 | - unique 16 | - not_null 17 | - name: status 18 | tests: 19 | - accepted_values: 20 | values: 21 | - completed 22 | - shipped 23 | - returned 24 | - return_pending 25 | - placed 26 | - name: customer_id 27 | tests: 28 | - relationships: 29 | to: ref('stg_customers') 30 | field: customer_id -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_orders.sql: -------------------------------------------------------------------------------- 1 | with orders as ( 2 | 3 | select 4 | id as order_id, 5 | user_id as customer_id, 6 | order_date, 7 | status 8 | 9 | --from raw.jaffle_shop.orders -- to begin the course 10 | from {{ 
source('jaffle_shop','orders') }} -- used in the source chapter 11 | ) 12 | 13 | select * from orders -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_payments.sql: -------------------------------------------------------------------------------- 1 | select 2 | id as payment_id, 3 | orderid as order_id, 4 | paymentmethod as payment_method, 5 | status, 6 | 7 | -- amount is stored in cents, convert it to dollars 8 | amount / 100 as amount, 9 | created as created_at 10 | 11 | from raw.stripe.payment -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/.dbt/.user.yml: -------------------------------------------------------------------------------- 1 | id: 02e7707b-ea50-432c-b85b-33224e71baf1 2 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/.dbt/profiles.yml: -------------------------------------------------------------------------------- 1 | dbt_project: 2 | outputs: 3 | dev: 4 | account: SUPYPQL.OM48075 5 | database: AIRBNB 6 | password: 7 | role: transform 8 | schema: DEV 9 | threads: 1 10 | type: snowflake 11 | user: dbt 12 | warehouse: COMPUTE_WH 13 | target: dev 14 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/analyses/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/analyses/full_moon_no_sleep.sql: -------------------------------------------------------------------------------- 1 | WITH fullmoon_reviews AS ( 2 | SELECT * FROM {{ ref('mart_fullmoon_reviews') }} 3 | ) 4 | SELECT 5 | is_full_moon, 6 | review_sentiment, 7 | COUNT(*) as reviews 8 | FROM 9 | fullmoon_reviews 10 | GROUP BY 11 | is_full_moon, 12 | review_sentiment 13 | ORDER BY 14 | is_full_moon, 15 | review_sentiment -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/assets/input_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/assets/input_schema.png -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: 'dbt_project' 5 | version: '1.0.0' 6 | config-version: 2 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_project' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `model-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | asset-paths: ["assets"] 21 | 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_packages" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ 31 | # directory as views. These settings can be overridden in the individual model 32 | # files using the `{{ config(...) }}` macro. 
33 | models: 34 | dbt_project: 35 | # Config indicated by + and applies to all files under models/example/ 36 | +post-hook: 37 | - "GRANT SELECT ON {{ this }} TO ROLE REPORTER" #this will be executed after every model run 38 | +materialized: view 39 | dim: 40 | +materialized: table 41 | src: 42 | +materialized: ephemeral 43 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/no_nulls_in_columns.sql: -------------------------------------------------------------------------------- 1 | {% macro no_nulls_in_columns(model) %} 2 | SELECT * FROM {{ model }} WHERE 3 | {% for col in adapter.get_columns_in_relation(model) -%} 4 | {{ col.column }} IS NULL OR 5 | {% endfor %} 6 | FALSE 7 | {% endmacro %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/positive_values.sql: -------------------------------------------------------------------------------- 1 | {% test positive_value(model, column_name) %} 2 | SELECT 3 | * 4 | FROM 5 | {{ model }} 6 | WHERE 7 | {{ column_name}} < 1 8 | {% endtest %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/dim/dim_hosts_cleansed.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view' 4 | ) 5 | }} 6 | 7 | WITH src_hosts AS ( 8 | SELECT 9 | * 10 | FROM 11 | {{ ref('src_hosts') }} 12 | ) 13 | SELECT 14 | host_id, 15 | NVL( 16 | host_name, 17 | 'Anonymous' 18 | ) AS host_name, 19 | is_superhost, 20 | created_at, 21 | updated_at 22 | FROM 23 | src_hosts -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/dim/dim_listings_cleansed.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view' 4 | ) 5 | }} 6 | 7 | WITH src_listings AS ( 8 | SELECT 9 | * 10 | FROM 11 | {{ ref('src_listings') }} 12 | ) 13 | SELECT 14 | listing_id, 15 | listing_name, 16 | room_type, 17 | CASE 18 | WHEN minimum_nights = 0 THEN 1 19 | ELSE minimum_nights 20 | END AS minimum_nights, 21 | host_id, 22 | REPLACE( 23 | price_str, 24 | '$' 25 | ) :: NUMBER( 26 | 10, 27 | 2 28 | ) AS price, 29 | created_at, 30 | updated_at 31 | FROM 32 | src_listings -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/dim/dim_listings_w_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | l AS ( 3 | SELECT 4 | * 5 | FROM 6 | {{ ref('dim_listings_cleansed') }} 7 | ), 8 | h AS ( 9 | SELECT * 10 | FROM {{ ref('dim_hosts_cleansed') }} 11 | ) 12 | 13 | SELECT 14 | l.listing_id, 15 | l.listing_name, 16 | l.room_type, 17 | l.minimum_nights, 18 | l.price, 19 | l.host_id, 20 | h.host_name, 21 | h.is_superhost as host_is_superhost, 22 | l.created_at, 23 | GREATEST(l.updated_at, h.updated_at) as updated_at 24 
| FROM l 25 | LEFT JOIN h ON (h.host_id = l.host_id) -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/docs.md: -------------------------------------------------------------------------------- 1 | {% docs dim_listing_cleansed_minimum_nights %} 2 | Minimum number of nights required to rent this property. 3 | 4 | Keep in mind that old listings might have `minimum_nights` set 5 | to 0 in the source tables. Our cleansing algorithm updates this to `1`. 6 | 7 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/fct/fct_reviews.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | on_schema_change='fail' 5 | ) 6 | }} 7 | -- set so the run fails if the schema changes 8 | 9 | 10 | WITH src_reviews AS ( 11 | SELECT * FROM {{ ref('src_reviews') }} 12 | ) 13 | SELECT 14 | {{ dbt_utils.generate_surrogate_key(['listing_id', 'review_date', 'reviewer_name', 'review_text']) }} 15 | AS review_id, 16 | * 17 | FROM src_reviews 18 | WHERE review_text is not null 19 | 20 | -- condition for the append: only rows whose review_date is greater than the maximum review_date already in this model 21 | {% if is_incremental() %} 22 | AND review_date > (select max(review_date) from {{ this }}) 23 | {% endif %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/mart/mart_fullmoon_reviews.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized = 'table', 3 | ) }} 4 | 5 | WITH fct_reviews AS ( 6 | SELECT * FROM {{ ref('fct_reviews') }} 7 | ), 8 | full_moon_dates AS ( 9 | SELECT * FROM {{ ref('seed_full_moon_dates') }} 10 | ) 11 | 12 | SELECT 13 | r.*, 14 | CASE 15 | WHEN fm.full_moon_date IS NULL THEN 'not full moon' 16 | ELSE 'full moon' 17 | END AS is_full_moon 18 | FROM 19 | fct_reviews 20 | r 21 | LEFT JOIN full_moon_dates 22 | fm 23 | ON (TO_DATE(r.review_date) = DATEADD(DAY, 1, fm.full_moon_date)) -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/overview.md: -------------------------------------------------------------------------------- 1 | {% docs __overview__ %} 2 | # Airbnb pipeline 3 | 4 | Hey, welcome to our Airbnb pipeline documentation! 5 | 6 | Here is the schema of our input data: 7 | ![input schema](assets/input_schema.png) 8 | 9 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_listings_cleansed 5 | description: Cleansed table which contains Airbnb listings. 6 | columns: 7 | 8 | - name: listing_id 9 | description: Primary key for the listing 10 | tests: 11 | - unique 12 | - not_null 13 | 14 | - name: host_id 15 | description: The host's id. References the host table.
16 | tests: 17 | - not_null 18 | - relationships: 19 | to: ref('dim_hosts_cleansed') 20 | field: host_id 21 | 22 | - name: room_type 23 | description: Type of the apartment / room 24 | tests: 25 | - accepted_values: 26 | values: ['Entire home/apt', 'Private room', 'Shared room', 'Hotel room'] 27 | 28 | - name: minimum_nights 29 | description: '{{ doc("dim_listing_cleansed_minimum_nights") }}' # this pulls the description from an md doc 30 | tests: 31 | - positive_value 32 | 33 | - name: dim_hosts_cleansed 34 | description: Cleansed table which contains Airbnb hosts. 35 | columns: 36 | 37 | - name: host_id 38 | description: The host's id. References the host table. 39 | tests: 40 | - not_null 41 | - unique 42 | 43 | - name: host_name 44 | description: Host full name. 45 | tests: 46 | - not_null 47 | 48 | - name: is_superhost 49 | description: Flag to identify host's category. 50 | tests: 51 | - accepted_values: 52 | values: ['t', 'f'] 53 | 54 | - name: created_at 55 | description: Date the data was entered in the table. 56 | 57 | - name: updated_at 58 | description: Date the data was updated in the table. -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: airbnb 5 | schema: raw 6 | tables: 7 | - name: listings 8 | identifier: raw_listings 9 | 10 | - name: hosts 11 | identifier: raw_hosts 12 | 13 | - name: reviews 14 | identifier: raw_reviews 15 | loaded_at_field: date # refers to a column in the table that records when the data was loaded 16 | freshness: 17 | warn_after: {count: 1, period: hour} 18 | error_after: {count: 24, period: hour} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/src/src_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH raw_hosts AS ( 2 | SELECT 3 | * 4 | FROM 5 | --AIRBNB.RAW.RAW_HOSTS --reading directly from the table, not included in lineage 6 | {{ source ('airbnb','hosts') }} 7 | 8 | ) 9 | SELECT 10 | id AS host_id, 11 | NAME AS host_name, 12 | is_superhost, 13 | created_at, 14 | updated_at 15 | FROM 16 | raw_hosts -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/src/src_listings.sql: -------------------------------------------------------------------------------- 1 | WITH raw_listings AS ( 2 | SELECT 3 | * 4 | FROM 5 | --AIRBNB.RAW.RAW_LISTINGS 6 | {{ source ('airbnb','listings') }} 7 | ) 8 | SELECT 9 | id AS listing_id, 10 | name AS listing_name, 11 | listing_url, 12 | room_type, 13 | minimum_nights, 14 | host_id, 15 | price AS price_str, 16 | created_at, 17 | updated_at 18 | FROM 19 | raw_listings -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/src/src_reviews.sql: -------------------------------------------------------------------------------- 1 | WITH raw_reviews AS ( 2 | SELECT 3 | * 4 | FROM 5 | --AIRBNB.RAW.RAW_REVIEWS 6 | {{ source ('airbnb','reviews') }} 7 | ) 8 | SELECT 9 | listing_id, 10 | date AS review_date, 11 | reviewer_name, 12 | comments AS review_text, 13 | sentiment AS review_sentiment 14 | FROM 15 | raw_reviews --------------------------------------------------------------------------------
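The `schema.yml` and `sources.yml` files above only declare tests, descriptions, and freshness rules; they are exercised through the dbt CLI. Below is a minimal sketch of a wrapper that runs those checks for this project, assuming the dbt CLI is installed and a working profile (for example the Snowflake profile shown earlier) is configured.

```python
# Hedged sketch: run the dbt checks declared in schema.yml / sources.yml via the CLI.
# Assumes dbt is installed and a profile for this project is configured.
import subprocess

def dbt(*args: str) -> None:
    """Run a dbt command and raise if it exits with a non-zero status."""
    subprocess.run(["dbt", *args], check=True)

dbt("source", "freshness")                         # evaluates the loaded_at_field warn/error windows
dbt("test", "--select", "dim_listings_cleansed")   # unique, not_null, relationships, accepted_values, positive_value
dbt("docs", "generate")                            # picks up the description and {{ doc(...) }} blocks
```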
/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/seeds/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/snapshots/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/snapshots/scd_raw_listings.sql: -------------------------------------------------------------------------------- 1 | {% snapshot scd_raw_listings %} 2 | 3 | {{ 4 | config( 5 | target_schema='DEV', 6 | unique_key='id', 7 | strategy='timestamp', 8 | updated_at='updated_at', 9 | invalidate_hard_deletes=True 10 | ) 11 | }} 12 | 13 | select * FROM {{ source('airbnb', 'listings') }} 14 | 15 | {% endsnapshot %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/consistent_created_at.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM {{ ref('dim_listings_cleansed') }} l 2 | INNER JOIN {{ ref('fct_reviews') }} r 3 | USING (listing_id) 4 | WHERE l.created_at >= r.review_date -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/dim_listings_minimum_nights.sql: -------------------------------------------------------------------------------- 1 | 2 | SELECT 3 | * 4 | FROM 5 | {{ ref('dim_listings_cleansed') }} 6 | WHERE minimum_nights < 1 7 | LIMIT 10 -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/no_nulls_in_dim_linstings.sql: -------------------------------------------------------------------------------- 1 | {{ no_nulls_in_columns(ref('dim_listings_cleansed')) }} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/readme.md: 
-------------------------------------------------------------------------------- 1 | ## 2024 DBT course - udemy 2 | [Course Link](https://github.com/nordquant/complete-dbt-bootcamp-zero-to-hero/blob/main/_course_resources/course-resources.md) 3 | [Snowflake Instance](https://supypql-om48075.snowflakecomputing.com/console/login) 4 | 5 | ### [Setup the course](./setup.md) 6 | ### [Models](./models.md) -------------------------------------------------------------------------------- /pytest/first_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | def my_func(x): 4 | return x + 5 5 | 6 | def test_func(): 7 | assert my_func(3) == 8 8 | 9 | # to run the test go to console -> pytest first_test.py 10 | -------------------------------------------------------------------------------- /pytest/gtfs_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def service_date(): 7 | return "20220901" 8 | 9 | 10 | @pytest.fixture 11 | def boarding_time(): 12 | return "2022-09-01T10:00:00+00:00" 13 | 14 | 15 | @pytest.fixture 16 | def ticketing_trip_id(): 17 | return "direct:169396293:1:10" 18 | 19 | 20 | @pytest.fixture 21 | def from_ticketing_stop_time_id(): 22 | return "88-1" 23 | 24 | 25 | @pytest.fixture 26 | def to_ticketing_stop_time_id(): 27 | return "94-10" 28 | 29 | 30 | @pytest.fixture 31 | def generate_url_to_test( 32 | service_date 33 | ,boarding_time 34 | ,ticketing_trip_id 35 | ,from_ticketing_stop_time_id 36 | ,to_ticketing_stop_time_id 37 | ): 38 | # hit this endpoint 39 | url = 'gtfs-dev.flix.tech/ticketing/web-url?' \ 40 | f'service_date=["{service_date}"]&' \ 41 | f'boarding_time=["{boarding_time}"]&' \ 42 | f'ticketing_trip_id=["{ticketing_trip_id}"]&' \ 43 | f'from_ticketing_stop_time_id=["{from_ticketing_stop_time_id}"]&' \ 44 | f'to_ticketing_stop_time_id=["{to_ticketing_stop_time_id}"]' 45 | return url 46 | 47 | 48 | @pytest.fixture 49 | def generate_expected_url( service_date 50 | #,boarding_time 51 | ,ticketing_trip_id 52 | ,from_ticketing_stop_time_id 53 | ,to_ticketing_stop_time_id 54 | ): 55 | 56 | departureCity = from_ticketing_stop_time_id.split("-")[0] 57 | departureStation = from_ticketing_stop_time_id.split("-")[1] 58 | arrivalCity = to_ticketing_stop_time_id.split("-")[0] 59 | arrivalStation = to_ticketing_stop_time_id.split("-")[1] 60 | rideDate = '.'.join([service_date[6:],service_date[4:6],service_date[:4]]) 61 | uid = ticketing_trip_id.replace(":","%3A") 62 | 63 | expected_url = 'https://shop.global.flixbus.com/search?'
\ 64 | f'departureCity={departureCity}&' \ 65 | f'departureStation={departureStation}&' \ 66 | f'arrivalCity={arrivalCity}&' \ 67 | f'arrivalStation={arrivalStation}&' \ 68 | f'rideDate={rideDate}&' \ 69 | 'adult=1&' \ 70 | 'children=0&' \ 71 | 'bike_slot=0&' \ 72 | 'currency=EUR&' \ 73 | f'uid={uid}&' \ 74 | 'reserve=1' 75 | 76 | return expected_url 77 | 78 | 79 | @pytest.mark.gtfs 80 | def test_func(generate_expected_url,generate_url_to_test): 81 | 82 | # trigger lambda with generate_url_to_test and fetch response 83 | # response = requests.get(generate_url_to_test).json 84 | response = { 85 | "statusCode": 302, 86 | "headers": { 87 | "Location": f"https://shop.global.flixbus.com/search?departureCity=88&departureStation=1&arrivalCity=94&arrivalStation=10&rideDate=01.09.2022&adult=1&children=0&bike_slot=0&currency=EUR&uid=direct%3A169396293%3A1%3A10&reserve=1" 88 | } 89 | } 90 | 91 | assert response["statusCode"] == 302 92 | assert response["headers"]["Location"] == generate_expected_url 93 | -------------------------------------------------------------------------------- /pytest/multiple_tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.one 5 | def test_method1(): 6 | x = 5 7 | y = 10 8 | assert x == y 9 | 10 | 11 | @pytest.mark.two 12 | def test_method2(): 13 | x = 5 14 | y = 10 15 | assert x+5 == y 16 | 17 | # to run each test go to console -> py.test multiple_tests.py -m one -v 18 | -------------------------------------------------------------------------------- /pytest/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | gtfs: mark a test as a webtest. -------------------------------------------------------------------------------- /pytest/readme.md: -------------------------------------------------------------------------------- 1 | # Pytest Study 2 | -------------------------------------------------------------------------------- /pytest/test_api.py: -------------------------------------------------------------------------------- 1 | from urllib import response 2 | import pytest 3 | import requests 4 | import json 5 | 6 | def test_valid_login(): 7 | url = "https://reqres.in/api/login" 8 | data = {'email':'eve.holt@reqres.in','password':'cityslicka'} 9 | response = requests.get(url,data=data) 10 | token = json.loads(response.text) 11 | print(token) 12 | assert response.status_code == 200 13 | assert token["page"] == 1 14 | -------------------------------------------------------------------------------- /pytest/test_fixtures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # fixtures are code or mocks you run before your tests 4 | @pytest.fixture 5 | def numbers(): 6 | return [10,15,20] 7 | 8 | @pytest.mark.one 9 | def test_one(numbers): 10 | assert numbers[0] == 10 11 | 12 | @pytest.mark.two 13 | def test_two(numbers): 14 | assert numbers[1] == 10 15 | -------------------------------------------------------------------------------- /pytest/test_parametrized.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # pass multiple parameters to a test 4 | @pytest.mark.parametrize("x,y,z",[(10,20,200),(10,10,200)]) 5 | def test_one(x,y,z): 6 | assert x * y == z -------------------------------------------------------------------------------- /spark_on_google_colab.py:
-------------------------------------------------------------------------------- 1 | # Download JRE Headless version to Notebook. 2 | !apt-get install openjdk-8-jdk-headless -qq > /dev/null 3 | 4 | # Download Spark with Hadoop installation zip file and unzip it for further use. 5 | !wget -q https://downloads.apache.org/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz 6 | !tar xf spark-3.0.2-bin-hadoop2.7.tgz 7 | 8 | # Set the Javahome and Sparkhome variables. 9 | import os 10 | os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" 11 | os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7" 12 | 13 | # Install and Initialize findspark library. 14 | !pip install -q findspark 15 | import findspark 16 | findspark.find() 17 | findspark.init() 18 | 19 | # Create Spark and SQLContext Sessions. 20 | from pyspark.sql import SparkSession 21 | spark = SparkSession.builder\ 22 | .master("local")\ 23 | .appName("Colab")\ 24 | .config('spark.ui.port', '4050')\ 25 | .getOrCreate() 26 | 27 | from pyspark.sql import SQLContext 28 | sqlContext = SQLContext(spark) 29 | spark -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 1 - Pipeline.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 1 - Pipeline.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 2 - Star Schema.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 2 - Star Schema.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 3 - Parallel Computing.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 3 - Parallel Computing.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 4 - DAG Example Air Flow.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 4 - DAG Example Air Flow.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 5 - MPP.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 5 - MPP.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 6 - AirFlow UI.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 6 - AirFlow UI.JPG -------------------------------------------------------------------------------- /src/img/17 - 
Introduction to Relational DB/1_n_relationship.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/1_n_relationship.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Relational DB/ENTITY_MODELS.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/ENTITY_MODELS.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Relational DB/n_n_relationships.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/n_n_relationships.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Relational DB/postgree_datatypes.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/postgree_datatypes.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Scala/21_game_points.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Scala/21_game_points.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Scala/Scala_Interpreter.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Scala/Scala_Interpreter.jpg
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 1 - Dataframe.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 1 - Dataframe.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 2 - Loading Excel.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 2 - Loading Excel.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 3 - Datetime Table.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 3 - Datetime Table.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 4 - Datetime Formatting.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 4 - Datetime Formatting.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 5 - JSON Object Oriented.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 5 - JSON Object Oriented.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 6 - JSON Column Oriented.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 6 - JSON Column Oriented.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 7 - Yelp Documentation.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 7 - Yelp Documentation.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 1 - Local package structure.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 1 - Local package structure.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 1 - PEP and non PEP codes.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 1 - PEP and non PEP codes.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 3 - Portable package structure.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 3 - Portable package structure.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 4 - Anatomy of classes.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 4 - Anatomy of classes.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 5 - Inheritance.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 5 - Inheritance.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 1 - paste command.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 1 - paste command.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 2 - wrap up manipulating data.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 2 - wrap up manipulating data.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 3 - multiple actions loop.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 3 - multiple actions loop.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 4 - nano interface.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 4 - nano interface.JPG
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 1/imperative_vs_functional_Programing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 1/imperative_vs_functional_Programing.jpg
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 1/spark drop malformed mode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 1/spark drop malformed mode.jpg
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 1/spark permissive mode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 1/spark permissive mode.jpg
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 2/image norrmalization.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 2/image norrmalization.jpg
--------------------------------------------------------------------------------

/src/img/Weather_Data_Pipeline/Weather API v1.0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/Weather_Data_Pipeline/Weather API v1.0.jpg
--------------------------------------------------------------------------------

/src/img/kafka_fundamentals/broker.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/kafka_fundamentals/broker.jpg
--------------------------------------------------------------------------------

/src/img/kafka_fundamentals/kafka architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/kafka_fundamentals/kafka architecture.jpg
--------------------------------------------------------------------------------

/src/img/kafka_fundamentals/topics.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/kafka_fundamentals/topics.jpg
--------------------------------------------------------------------------------