├── .DS_Store ├── .gitignore ├── .vscode └── settings.json ├── AWS Glue ├── .DS_Store ├── img │ ├── .DS_Store │ ├── aws s3 partitions.png │ └── glue architecture.png └── readme ├── AWS Lambda ├── .DS_Store ├── LatLongLog │ ├── .aws-sam │ │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ │ └── event.json │ ├── hello_world │ │ ├── __init__.py │ │ ├── app.py │ │ └── requirements.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ │ ├── __init__.py │ │ ├── integration │ │ ├── __init__.py │ │ └── test_api_gateway.py │ │ ├── requirements.txt │ │ └── unit │ │ ├── __init__.py │ │ └── test_handler.py ├── Readme.md ├── getTicket │ ├── .aws-sam │ │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ │ └── event.json │ ├── hello_world │ │ ├── __init__.py │ │ ├── app.py │ │ └── requirements.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ │ ├── __init__.py │ │ ├── integration │ │ ├── __init__.py │ │ └── test_api_gateway.py │ │ ├── requirements.txt │ │ └── unit │ │ ├── __init__.py │ │ └── test_handler.py ├── lambdatemplateSAM │ ├── .aws-sam │ │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ │ └── event.json │ ├── hello_world │ │ ├── __init__.py │ │ ├── app.py │ │ ├── datatypes.py │ │ └── requirements.txt │ ├── outputfile.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ │ ├── __init__.py │ │ ├── integration │ │ ├── __init__.py │ │ └── test_api_gateway.py │ │ ├── requirements.txt │ │ └── unit │ │ ├── __init__.py │ │ └── test_handler.py └── orders-api │ ├── .aws-sam │ └── build.toml │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── events │ └── event.json │ ├── orders_api │ ├── __init__.py │ ├── app.py │ ├── create.py │ ├── read.py │ └── requirements.txt │ ├── samconfig.toml │ ├── template.yaml │ └── tests │ ├── __init__.py │ ├── integration │ ├── __init__.py │ └── test_api_gateway.py │ ├── requirements.txt │ └── unit │ ├── __init__.py │ └── test_handler.py ├── Airflow ├── .DS_Store ├── DAG Authoring Certification │ ├── .DS_Store │ ├── .astro │ │ └── config.yaml │ ├── .dockerignore │ ├── .gitignore │ └── readme.md ├── Fundamentals Certification │ ├── oper_sensors_sample_dag.py │ ├── readme.md │ ├── sensor_operator_dag.py │ └── simple_dag.py ├── GoogleCloudComposer │ └── dag_bq_gcs.py └── img │ ├── .DS_Store │ ├── Chain Operator.png │ ├── Python Branch Operator.png │ ├── celery cluster.jpg │ ├── multi_node.jpg │ ├── one_node.jpg │ └── task_instance_view.PNG ├── Apache Beam ├── .DS_Store ├── Bach Processing │ ├── 4.4 - Batch DirectRunner + GCS.py │ ├── 4.5 - Batch Dataflow + GCS.py │ ├── 4.7 - Batch Dataflow + BigQuery.py │ ├── comando deploy.txt │ └── unnest.py ├── Main Functions │ ├── 3.1 - Setup Colab.py │ ├── 3.10 - ParDo.py │ ├── 3.2 - Create.py │ ├── 3.2 - Read Transform.py │ ├── 3.3 - Write Transform.py │ ├── 3.4 - FlatMap.py │ ├── 3.4 - Map.py │ ├── 3.5 - Filter Lambda.py │ ├── 3.5 - Filter Lista.py │ ├── 3.6 - Flatten.py │ ├── 3.7 - CombinePerKey.py │ ├── 3.8 - Combiners.Count.Perkey.py │ ├── 3.9 - CoGroupByKey.py │ ├── Poem.txt │ └── voos_sample.csv ├── README.md └── Streaming Processing │ ├── 5.2.0 - Data Generator.py │ ├── 5.2.1 - Voos Streaming DF + Pubsub.py │ ├── 5.3 - Voos Streaming DF + BQ.py │ ├── 5.4 - Janelas e Noções de Tempo para Streaming.pptx │ ├── 5.5 - Tumbling Window DF + BQ.py │ ├── 5.6 - Sliding Window DF + BQ.py │ ├── Sliding_processor.py │ └── streaming janelas.xlsx ├── Data-Eng-Bootcamp ├── 17 - Introduction to Relational DB.md ├── 19 - 21 
game.scala ├── 19 - Introduction to Scala.md ├── 2 - Introduction to Data Egineering.md └── 6 - Introduction to Shell Script.md ├── Databricks ├── LAB DATABRICKS FROM A TO Z .md ├── Readme.md ├── SCALA ETL Part 1 - Data Extraction.md ├── SCALA ETL Part 2 - Data Transformation.md ├── SPARK ETL Part 1 - Data Extraction.md ├── SPARK SQL.md └── Scala Exercises │ ├── People_vs_Execises.scala │ ├── exercises.json │ ├── notebook.html │ ├── people.json │ └── readme.md ├── Docker ├── .DS_Store ├── img │ ├── .DS_Store │ ├── Containerrization PID.png │ ├── container-layer.png │ ├── docker-compose-versions.png │ ├── docker-networks.png │ ├── docker-volume.png │ ├── port-mapping.png │ └── voting-app-diagram.png ├── python-sample-app │ ├── Dockerfile │ ├── my-app.py │ └── requirements.txt └── readme.md ├── Fundamentals of Data Engineering ├── .DS_Store ├── 1-Foundations.md ├── 2-DE Lifecycle.md ├── 3-Data Architecture.md ├── 4-Chooing Tech Across DE Lifecycle.md ├── readme.md └── src │ ├── DE_lifecycle.png │ ├── DE_stakeholders.png │ └── monolith_vs_ms_arch.png ├── Git ├── .DS_Store ├── Git CI Fundamentals.md ├── gitlab-ci-chapter-3.yml ├── gitlab-ci-chapter-4.yml ├── gitlab-ci-my-first-pipeline.yml ├── img │ ├── .DS_Store │ └── CI CD Pipeline.png └── readme.md ├── Installing-components.md ├── Kafka └── Apache Kafka Fundamentals.md ├── Kubernetes ├── Kubernetes Up and Running │ ├── 2 │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── package.json │ │ └── server.js │ └── kubernetes_up_and_running.md ├── img │ └── kubernetes-architecture.png ├── k8s_and_data.md ├── k8s_minikube.md └── readme.md ├── README.md ├── Snowflake ├── .DS_Store ├── curated-links-list.md ├── live-sf-getting-started.sql ├── readme.md ├── the-snowflake-series-course │ ├── COPY.sql │ ├── FILE FORMAT.sql │ ├── FLATTEN.sql │ ├── INSERT.sql │ ├── LOAD JSON.sql │ ├── PARSE JSON.sql │ ├── STAGES.sql │ ├── STORAGE INTEGRATION.sql │ ├── TRANSFORMATION WITH COPY.sql │ ├── automacao │ │ ├── SNOWPIPE.sql │ │ ├── TASK TREE.sql │ │ └── TASK.SQL │ └── test.sql └── ws │ ├── extras │ └── extras.sql │ ├── ingestion │ ├── 1-storage-integration.sql │ ├── 2-stage-and-file-format.sql │ ├── 3-copy-command.sql │ └── 4-snowpipe.sql │ └── transformation │ ├── 1-semi-structured.sql │ ├── 2-tasks.sql │ ├── 3-duplicates.sql │ ├── 4-streams.sql │ ├── 5-streams+tasks.sql │ └── 6-refactoring.sql ├── Sql ├── SQL HACKER RANK TRAINING.sql └── WINDOW FUNCTIONS.sql ├── Terraform ├── .DS_Store ├── 05-language-features │ └── README.md ├── 06-organization-and-modules │ ├── .DS_Store │ ├── README.md │ ├── consul │ │ ├── README.md │ │ └── main.tf │ ├── web-app-module │ │ ├── compute.tf │ │ ├── database.tf │ │ ├── dns.tf │ │ ├── main.tf │ │ ├── networking.tf │ │ ├── outputs.tf │ │ ├── storage.tf │ │ └── variables.tf │ └── web-app │ │ └── main.tf ├── 2-first-tf-deployment │ ├── main.tf │ └── terraform.tfstate ├── 3-remote-backend │ ├── main.tf │ └── terraform.tfstate ├── 3-web-app │ ├── .terraform │ │ └── terraform.tfstate │ ├── errored.tfstate │ ├── main.tf │ └── web-app-architecture.png ├── 4-variables-and-outputs │ ├── examples │ │ ├── another-variable-file.tfvars │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── terraform.tfvars │ │ └── variables.tf │ └── web-app │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── terraform.tfstate │ │ ├── terraform.tfvars │ │ └── variables.tf ├── img │ ├── .DS_Store │ └── 2_2_terraform_architecture.png └── readme.md ├── airbyte-dbt-airflow-snowflake ├── .DS_Store ├── .env ├── airbyte-docker-compose.yml ├── flags.yml ├── readme.md └── temporal │ └── 
dynamicconfig │ └── development.yaml ├── dbt ├── .DS_Store ├── dbt-fundamentals-course │ ├── dim_customer-3.sql │ ├── dim_customers-2.sql │ ├── dim_customers.sql │ ├── fct_orders.sql │ ├── img │ │ ├── dbt_and_alatycs_engineer.png │ │ ├── dbt_workflow.png │ │ ├── ineage_graph.png │ │ └── modern_data_platform.png │ ├── insert_sf_dbt_training_data.sql │ ├── jaffle_shop.md │ ├── readme.md │ ├── src_jaffle_shop-2.yml │ ├── src_jaffle_shop-3.yml │ ├── src_jaffle_shop-4.yml │ ├── src_jaffle_shop.yml │ ├── src_stripe.yml │ ├── stg_customers.sql │ ├── stg_jaffle_shop-2.yml │ ├── stg_jaffle_shop.yml │ ├── stg_orders.sql │ └── stg_payments.sql └── dbt-udemy-course │ ├── .DS_Store │ ├── Notes.md │ ├── dbt-course-udemy │ ├── .DS_Store │ ├── .dbt │ │ ├── .user.yml │ │ └── profiles.yml │ ├── dbt_project │ │ ├── .DS_Store │ │ ├── .gitignore │ │ ├── README.md │ │ ├── analyses │ │ │ ├── .gitkeep │ │ │ └── full_moon_no_sleep.sql │ │ ├── assets │ │ │ └── input_schema.png │ │ ├── dbt_project.yml │ │ ├── macros │ │ │ ├── .gitkeep │ │ │ ├── no_nulls_in_columns.sql │ │ │ └── positive_values.sql │ │ ├── models │ │ │ ├── dim │ │ │ │ ├── dim_hosts_cleansed.sql │ │ │ │ ├── dim_listings_cleansed.sql │ │ │ │ └── dim_listings_w_hosts.sql │ │ │ ├── docs.md │ │ │ ├── fct │ │ │ │ └── fct_reviews.sql │ │ │ ├── mart │ │ │ │ └── mart_fullmoon_reviews.sql │ │ │ ├── overview.md │ │ │ ├── schema.yml │ │ │ ├── sources.yml │ │ │ └── src │ │ │ │ ├── src_hosts.sql │ │ │ │ ├── src_listings.sql │ │ │ │ └── src_reviews.sql │ │ ├── package-lock.yml │ │ ├── packages.yml │ │ ├── seeds │ │ │ ├── .gitkeep │ │ │ └── seed_full_moon_dates.csv │ │ ├── snapshots │ │ │ ├── .gitkeep │ │ │ └── scd_raw_listings.sql │ │ └── tests │ │ │ ├── .gitkeep │ │ │ ├── consistent_created_at.sql │ │ │ ├── dim_listings_minimum_nights.sql │ │ │ └── no_nulls_in_dim_linstings.sql │ └── logs │ │ └── dbt.log │ ├── readme.md │ └── setup.md ├── pytest ├── first_test.py ├── gtfs_test.py ├── multiple_tests.py ├── pytest.ini ├── readme.md ├── test_api.py ├── test_fixtures.py └── test_parametrized.py ├── spark_on_google_colab.py └── src └── img ├── 1 - Intro to data Engineering ├── fig 1 - Pipeline.JPG ├── fig 2 - Star Schema.JPG ├── fig 3 - Parallel Computing.JPG ├── fig 4 - DAG Example Air Flow.JPG ├── fig 5 - MPP.JPG └── fig 6 - AirFlow UI.JPG ├── 17 - Introduction to Relational DB ├── 1_n_relationship.jpg ├── ENTITY_MODELS.jpg ├── n_n_relationships.jpg └── postgree_datatypes.jpg ├── 17 - Introduction to Scala ├── 21_game_points.jpg └── Scala_Interpreter.jpg ├── 2 - Streamlined data with pandas ├── fig 1 - Dataframe.JPG ├── fig 2 - Loading Excel.JPG ├── fig 3 - Datetime Table.JPG ├── fig 4 - Datetime Formatting.JPG ├── fig 5 - JSON Object Oriented.JPG ├── fig 6 - JSON Column Oriented.JPG └── fig 7 - Yelp Documentation.JPG ├── 3 - Software Engineering in Python ├── fig 1 - Local package structure.JPG ├── fig 1 - PEP and non PEP codes.JPG ├── fig 3 - Portable package structure.JPG ├── fig 4 - Anatomy of classes.JPG └── fig 5 - Inheritance.JPG ├── 5 - Introduction to Shell Script ├── fig 1 - paste command.JPG ├── fig 2 - wrap up manipulating data.JPG ├── fig 3 - multiple actions loop.JPG └── fig 4 - nano interface.JPG ├── SCALA ETL Part 1 ├── imperative_vs_functional_Programing.jpg ├── spark drop malformed mode.jpg └── spark permissive mode.jpg ├── SCALA ETL Part 2 └── image norrmalization.jpg ├── Weather_Data_Pipeline └── Weather API v1.0.jpg └── kafka_fundamentals ├── broker.jpg ├── kafka architecture.jpg └── topics.jpg /.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hcl 2 | *.backup 3 | *x5 4 | Snowflake/.DS_Store 5 | Snowflake/.DS_Store 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true, 3 | "dbt.queryLimit": 500 4 | } -------------------------------------------------------------------------------- /AWS Glue/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/.DS_Store -------------------------------------------------------------------------------- /AWS Glue/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/img/.DS_Store -------------------------------------------------------------------------------- /AWS Glue/img/aws s3 partitions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/img/aws s3 partitions.png -------------------------------------------------------------------------------- /AWS Glue/img/glue architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Glue/img/glue architecture.png -------------------------------------------------------------------------------- /AWS Glue/readme: -------------------------------------------------------------------------------- 1 | ## What is Glue 2 | * Fully managed ETL service 3 | * Consists of a central metadata catalog repository (the Glue Data Catalog) 4 | * Spark ETL engine 5 | * Flexible scheduler 6 | 7 | # IMAGE glue architecture 8 | 9 | ## AWS Glue Data Catalog 10 | * Persistent metadata store 11 | * Managed service to store, annotate and share metadata which can be used to query and transform data 12 | * One Glue catalog per AWS Region 13 | * IAM policies control access 14 | * Can be used for data governance 15 | * Stores: data location, schema, data types, data classification... 16 | * Database in Glue 17 | * A set of associated table definitions in the Data Catalog, organized into logical groups 18 | * It just organizes the data logically; it doesn't actually move any data 19 | * The metadata definitions that represent your data. Data resides in its original store; this is just a representation of the schema 20 | 21 | ### Create Database Practices 22 | * Organize Glue databases in respective folders in S3 23 | * All files for a database should be under that folder, for organization 24 | 25 | ## AWS Partitions 26 | * Folders where data is stored on S3, which are physical entities, are mapped to partitions, which are logical entities,
i.e., they appear as columns in the Glue table 27 | * Helps to query data more efficiently because we only scan the necessary data 28 | 29 | # IMAGE s3 partitions 30 | Sample folder structure: 31 | * sales 32 | * 2019 33 | * july 34 | * august 35 | * 01 36 | * 02 37 | * 03 38 | 39 | ## AWS Glue Crawler 40 | * A program that connects to a data store (source or target), progresses through a prioritized list of classifiers to determine the schema for your data, and then creates metadata tables in the AWS Glue Data Catalog 41 | * Tables can be added manually to the Data Catalog (column by column, along with their formats) 42 | * Tables can also be added via a crawler 43 | 44 | ## AWS Glue Jobs 45 | * Store the business logic required for ETL 46 | * A job is composed of a transformation script, data sources, and data targets 47 | * Job runs are initiated by triggers that can be scheduled or triggered by events 48 | * Found under the Jobs section 49 | * You can select the engine, the temp folder, and the scripts folder 50 | 51 | ## AWS Triggers 52 | * Create triggers for Glue Jobs 53 | * Triggers can be event based or schedule based 54 | 55 | ## AWS DEV Endpoint 56 | * Creates a cluster to develop and test scripts locally, sending the code to run on this cluster on AWS 57 | * Costs can scale very easily -------------------------------------------------------------------------------- /AWS Lambda/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/.DS_Store -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.2ed2c419-0f5d-4864-8032-6061cf637252] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\LatLongLog\\hello_world" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["LatLongLog"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/events/event.json: -------------------------------------------------------------------------------- 1 | [1.156123,-54.55952] -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/hello_world/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/hello_world/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/hello_world/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # import requests 4 | 5 | 6 | def lambda_handler(event, context): 7 | """Sample pure Lambda function 8 | 9 | Parameters 10 | ---------- 11 | event: dict, required 12 | API Gateway Lambda Proxy Input Format 13 | 14
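A minimal boto3 sketch of the crawler-and-job flow described in the AWS Glue readme above: create a Data Catalog database, crawl a partitioned S3 prefix, then start a job. The database, crawler, IAM role, bucket, and job names are hypothetical placeholders, not resources from this repo.

```python
import boto3

# Hypothetical names -- adjust to your own account; nothing here exists in this repo.
glue = boto3.client("glue", region_name="us-east-1")

# 1. Create a database in the Glue Data Catalog (a logical grouping of table definitions).
glue.create_database(DatabaseInput={"Name": "sales_db"})

# 2. Create and start a crawler that infers the schema from a partitioned S3 prefix
#    (e.g. s3://my-bucket/sales/2019/july/01/) and writes table definitions to the catalog.
glue.create_crawler(
    Name="sales_crawler",
    Role="arn:aws:iam::123456789012:role/GlueCrawlerRole",  # assumed IAM role
    DatabaseName="sales_db",
    Targets={"S3Targets": [{"Path": "s3://my-bucket/sales/"}]},
)
glue.start_crawler(Name="sales_crawler")

# 3. Kick off an ETL job run; the job itself would be defined in the Glue console or via create_job.
glue.start_job_run(JobName="sales_etl_job")
```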
| Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format 15 | 16 | context: object, required 17 | Lambda Context runtime methods and attributes 18 | 19 | Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html 20 | 21 | Returns 22 | ------ 23 | API Gateway Lambda Proxy Output Format: dict 24 | 25 | Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html 26 | """ 27 | # considering every event will have a list with only 2 values 28 | lat = event[0] 29 | long = event[1] 30 | log = f"Current Coordinates = LAT:{lat} , LONG:{long}" 31 | return log -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/hello_world/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "LatLongLog" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "LatLongLog" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | LatLongLog 5 | 6 | Sample SAM Template for LatLongLog 7 | 8 | Resources: 9 | LatLongLog: 10 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 11 | Properties: 12 | CodeUri: hello_world/ 13 | Handler: app.lambda_handler 14 | Runtime: python3.9 15 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test.
9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.' 40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/LatLongLog/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/LatLongLog/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | 
"X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.08eec71e-2945-4a9d-9e9e-90e2225e341d] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\getTicket\\hello_world" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["GetTicketFunction"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/events/event.json: -------------------------------------------------------------------------------- 1 | ["Cassio"] -------------------------------------------------------------------------------- /AWS Lambda/getTicket/hello_world/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/hello_world/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/hello_world/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def lambda_handler(event, context): 5 | #print(event) 6 | student_scores = {"Cassio" : 100, "Victoria" : 100, "Garfinho" : 90 } 7 | score_list = [] 8 | for name in event: 9 | score_list.append(student_scores[name]) 10 | return score_list 11 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/hello_world/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /AWS Lambda/getTicket/samconfig.toml: 
-------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "GetTicket" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "GetTicket" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | getTicket 5 | 6 | Sample SAM Template for getTicket 7 | 8 | Resources: 9 | GetTicketFunction: 10 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 11 | Properties: 12 | CodeUri: hello_world/ 13 | Handler: app.lambda_handler 14 | Runtime: python3.9 15 | 16 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test. 9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.' 
40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/getTicket/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/getTicket/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert 
ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.3ea77c0a-a539-45af-8a65-12dbfaa05af7] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\lambdatemplateSAM\\hello_world" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["HelloWorldFunction"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/events/event.json: -------------------------------------------------------------------------------- 1 | {"Cassio" : [100,10,20], "Victoria" : 100, "Garfinho" : 90 } -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/hello_world/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def first_lambda(event, context): 5 | return "hello "+event 6 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/datatypes.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os # to get env variables created from template.yml 3 | import random 4 | 5 | def simple_types(event,context): 6 | print(event) 7 | return event 8 | 9 | def list_types(event,context): 10 | print(event) 11 | student_scores = {"Cassio" : 100, "Victoria" : 100, "Garfinho" : 90 } 12 | score_list = [] 13 | for name in event: 14 | score_list.append(student_scores[name]) 15 | return score_list 16 | 17 | def dict_types(event,context): 18 | for score in event["Cassio"]: 19 | print(score) 20 | return event 21 | 22 | def context_example(event, context): 23 | print("Lambda function ARN:", context.invoked_function_arn) 24 | print("CloudWatch log stream name:", context.log_stream_name) 25 | print("CloudWatch log group name:", context.log_group_name) 26 | print("Lambda Request ID:", context.aws_request_id) 27 | print("Lambda function memory limits in MB:", context.memory_limit_in_mb) 28 | # We have added a 1 second delay so you can see the time remaining in get_remaining_time_in_millis. 
29 | time.sleep(1) 30 | print("Lambda time remaining in MS:", context.get_remaining_time_in_millis()) 31 | print(os.getenv('restapi')) 32 | return context.invoked_function_arn 33 | 34 | global_var = random.random() 35 | def cold_start(event,context): 36 | exec_time_var = random.random() 37 | return global_var,exec_time_var -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/hello_world/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/outputfile.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/outputfile.txt -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "firstlambda" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "firstlambda" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' # mandatory 2 | Transform: AWS::Serverless-2016-10-31 # mandatory -> tells cloud formation which template this is about 3 | Description: > 4 | lambdatemplateSAM 5 | 6 | Sample SAM Template for lambdatemplateSAM 7 | 8 | Resources: # mandatory, every resource used should be declared here, like api gateway, dynamo db... 
9 | HelloWorldFunction: 10 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 11 | Properties: 12 | CodeUri: hello_world/ 13 | Handler: datatypes.cold_start 14 | Runtime: python3.9 15 | Timeout: 3 # maximum is 15min 16 | Environment: # pass, under Environment, variables your function can retrieve 17 | Variables: 18 | restapi: http://dummy.com 19 | dbname: mydb -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test. 9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.'
40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/lambdatemplateSAM/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/lambdatemplateSAM/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = 
json.loads(ret["body"]) 69 | 70 | assert ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [function_build_definitions] 4 | [function_build_definitions.8eb773ee-21d0-4a34-956a-674265864e83] 5 | codeuri = "C:\\Users\\cassi\\Google Drive\\AWS\\Lambda for Python Dev - Udemy\\orders-api\\orders_api" 6 | runtime = "python3.9" 7 | source_md5 = "" 8 | packagetype = "Zip" 9 | functions = ["CreateOrderFunction", "ReadOrderFunction"] 10 | 11 | [layer_build_definitions] 12 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "body": "{\"message\": \"hello world\"}", 3 | "resource": "/hello", 4 | "path": "/hello", 5 | "httpMethod": "GET", 6 | "isBase64Encoded": false, 7 | "queryStringParameters": { 8 | "foo": "bar" 9 | }, 10 | "pathParameters": { 11 | "proxy": "/path/to/resource" 12 | }, 13 | "stageVariables": { 14 | "baz": "qux" 15 | }, 16 | "headers": { 17 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 18 | "Accept-Encoding": "gzip, deflate, sdch", 19 | "Accept-Language": "en-US,en;q=0.8", 20 | "Cache-Control": "max-age=0", 21 | "CloudFront-Forwarded-Proto": "https", 22 | "CloudFront-Is-Desktop-Viewer": "true", 23 | "CloudFront-Is-Mobile-Viewer": "false", 24 | "CloudFront-Is-SmartTV-Viewer": "false", 25 | "CloudFront-Is-Tablet-Viewer": "false", 26 | "CloudFront-Viewer-Country": "US", 27 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 28 | "Upgrade-Insecure-Requests": "1", 29 | "User-Agent": "Custom User Agent String", 30 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 31 | "X-Amz-Cf-Id": "cDehVQoZnx43VYQb9j2-nvCh-9z396Uhbp027Y2JvkCPNLmGJHqlaA==", 32 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 33 | "X-Forwarded-Port": "443", 34 | "X-Forwarded-Proto": "https" 35 | }, 36 | "requestContext": { 37 | "accountId": "123456789012", 38 | "resourceId": "123456", 39 | "stage": "prod", 40 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 41 | "requestTime": "09/Apr/2015:12:34:56 +0000", 42 | "requestTimeEpoch": 1428582896000, 43 | "identity": { 44 | "cognitoIdentityPoolId": null, 45 | "accountId": null, 46 | "cognitoIdentityId": null, 47 | "caller": null, 48 | "accessKey": null, 49 | "sourceIp": "127.0.0.1", 50 | "cognitoAuthenticationType": null, 51 | "cognitoAuthenticationProvider": null, 52 | "userArn": null, 53 | "userAgent": "Custom User Agent String", 54 | "user": null 55 | }, 56 | "path": "/prod/hello", 57 | "resourcePath": "/hello", 58 | "httpMethod": "POST", 59 | "apiId": "1234567890", 60 | "protocol": "HTTP/1.1" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /AWS 
Lambda/orders-api/orders_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/orders_api/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # import requests 4 | 5 | 6 | def lambda_handler(event, context): 7 | """Sample pure Lambda function 8 | 9 | Parameters 10 | ---------- 11 | event: dict, required 12 | API Gateway Lambda Proxy Input Format 13 | 14 | Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format 15 | 16 | context: object, required 17 | Lambda Context runtime methods and attributes 18 | 19 | Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html 20 | 21 | Returns 22 | ------ 23 | API Gateway Lambda Proxy Output Format: dict 24 | 25 | Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html 26 | """ 27 | 28 | # try: 29 | # ip = requests.get("http://checkip.amazonaws.com/") 30 | # except requests.RequestException as e: 31 | # # Send some context about this error to Lambda Logs 32 | # print(e) 33 | 34 | # raise e 35 | 36 | return { 37 | "statusCode": 200, 38 | "body": json.dumps({ 39 | "message": "hello world", 40 | # "location": ip.text.replace("\n", "") 41 | }), 42 | } 43 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/create.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import os 4 | 5 | def lambda_handler(event, context): 6 | order = json.loads(event['body']) 7 | 8 | dynamodb = boto3.resource('dynamodb') # tell boto what resource you are using 9 | table_name = os.environ.get('ORDERS_TABLE') # get the value from global function defined on yml 10 | table = dynamodb.Table(table_name) # get table from dynamo db 11 | response = table.put_item(TableName = table_name, Item=order) #use put item method on table, to put the order coming from body 12 | print(response) # write response to the logs 13 | return { 14 | 'statusCode': 201, 15 | 'headers': {}, 16 | 'body': json.dumps({'messagem':'order created'}) 17 | } -------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/read.py: -------------------------------------------------------------------------------- 1 | import simplejson as json 2 | import boto3 3 | import os 4 | from boto3.dynamodb.conditions import Key 5 | 6 | def lambda_handler(event, context): 7 | order = {"id" : 123 , "itemName" : "McBook", "quantity" : 100} 8 | dynamodb = boto3.resource('dynamodb') # tell boto what resource you are using 9 | table_name = os.environ.get('ORDERS_TABLE') # get the value from global function defined on yml 10 | table = dynamodb.Table(table_name) # get table from dynamo db 11 | order_id = int(event['pathParameters']['id']) # read the id passed in the api call 12 | response = table.query(KeyConditionExpression=Key('id').eq(order_id)) # query the order id 13 | 14 | 15 | return { 16 | 'statusCode': 201, 17 | 'headers': {}, 18 | 'body': json.dumps(response['Items']) 19 | } 
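To see how create.py and read.py above fit together, here is a rough local-invocation sketch: it builds API Gateway-style events by hand and calls the two handlers directly. The ORDERS_TABLE value, the event shapes, and the working directory are assumptions based on template.yaml and the handler code, not code from this repo; a real run also needs AWS credentials and an existing DynamoDB table.

```python
import json
import os

# Assumes this script is run from the orders-api folder so the package imports resolve.
from orders_api import create, read

# Placeholder table name; the deployed SAM stack would generate its own table name
# and inject it via the ORDERS_TABLE environment variable defined in template.yaml.
os.environ["ORDERS_TABLE"] = "orders"

# create.lambda_handler expects the order JSON in the API Gateway "body" field.
create_event = {"body": json.dumps({"id": 123, "itemName": "MacBook", "quantity": 100})}
print(create.lambda_handler(create_event, None))  # -> 201 with {"messagem": "order created"}

# read.lambda_handler expects the order id as the {id} path parameter of the GET route.
read_event = {"pathParameters": {"id": "123"}}
print(read.lambda_handler(read_event, None))  # -> 201 with the matching items from DynamoDB
```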
-------------------------------------------------------------------------------- /AWS Lambda/orders-api/orders_api/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | boto3 3 | simplejson -------------------------------------------------------------------------------- /AWS Lambda/orders-api/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "orderapi" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-ui27hk7cis4s" 7 | s3_prefix = "orderapi" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | orders-api 5 | 6 | Globals: # define settings to be reused globally in the resources 7 | Function: # declaring the resource type applies the following settings to it; in this case, all 'AWS::Serverless::Function' resources will use the settings below 8 | Runtime: python3.9 9 | Timeout: 30 10 | Environment: # block to define env VARIABLES globally for all function-type resources 11 | Variables: 12 | ORDERS_TABLE : !Ref OrdersTable # use the intrinsic function to reference the OrdersTable resource as the ORDERS_TABLE variable 13 | 14 | Resources: 15 | OrdersTable: # name in the CloudFormation stack 16 | Type: AWS::Serverless::SimpleTable # resource type 17 | Properties: # block to define the properties 18 | PrimaryKey: # only mandatory property 19 | Name: id 20 | Type: Number 21 | CreateOrderFunction: 22 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 23 | Properties: 24 | CodeUri: orders_api/ 25 | Handler: create.lambda_handler 26 | # Runtime: python3.9 # not needed here because it is set in Globals 27 | Events: 28 | CreateOrder: # name of the api trigger event 29 | Type: Api # type of event 30 | Properties: 31 | Path: /orders # path used in the api as parameter 32 | Method: POST # method used in the API 33 | Policies: # creating a security policy 34 | - DynamoDBCrudPolicy: # type of policy for this function 35 | TableName: !Ref OrdersTable # this policy is restricted to this table only 36 | ReadOrderFunction: 37 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 38 | Properties: 39 | CodeUri: orders_api/ 40 | Handler: read.lambda_handler 41 | # Runtime: python3.9 # not needed here because it is set in Globals 42 | Events: 43 | ReadOrder: # name of the api trigger event 44 | Type: Api # type of event 45 | Properties: 46 | Path: /orders/{id} # path used in the api as parameter 47 | Method: GET # method used in the API 48 | Policies: # creating a security policy 49 | - DynamoDBReadPolicy: # type of policy for this function 50 | TableName: !Ref OrdersTable # this policy is restricted to this table only 51 | 52 | # create outputs to show the URL to the end user 53 | Outputs: 54 | CreateOrdersAPI: 55 | Description: "API Gateway endpoint for creating orders" 56 | # !Sub substitutes the ${} values in the string 57 | Value: !Sub
"https://${ServerlessRestApi}.execute-api.${AWS::Region}.amazonaws.com/Prod/orders" 58 | CreateOrderFunction: # name the getatt, can be anything 59 | Description: " Get create order function ARN " 60 | Value: !GetAtt CreateOrderFunction.Arn # function name .Arn 61 | CreateOrderFunctionIamRole: # name the getatt, can be anything 62 | Description: "Get create order function role ARN" 63 | # when a function is created, it creates a role with function name + Role at the end 64 | Value: !GetAtt CreateOrderFunctionRole.Arn -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/tests/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/tests/integration/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import boto3 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test. 9 | """ 10 | 11 | 12 | class TestApiGateway(TestCase): 13 | api_endpoint: str 14 | 15 | @classmethod 16 | def get_stack_name(cls) -> str: 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | if not stack_name: 19 | raise Exception( 20 | "Cannot find env var AWS_SAM_STACK_NAME. \n" 21 | "Please setup this environment variable with the stack name where we are running integration tests." 22 | ) 23 | 24 | return stack_name 25 | 26 | def setUp(self) -> None: 27 | """ 28 | Based on the provided env variable AWS_SAM_STACK_NAME, 29 | here we use cloudformation API to find out what the HelloWorldApi URL is 30 | """ 31 | stack_name = TestApiGateway.get_stack_name() 32 | 33 | client = boto3.client("cloudformation") 34 | 35 | try: 36 | response = client.describe_stacks(StackName=stack_name) 37 | except Exception as e: 38 | raise Exception( 39 | f"Cannot find stack {stack_name}. \n" f'Please make sure stack with the name "{stack_name}" exists.' 
40 | ) from e 41 | 42 | stacks = response["Stacks"] 43 | 44 | stack_outputs = stacks[0]["Outputs"] 45 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 46 | self.assertTrue(api_outputs, f"Cannot find output HelloWorldApi in stack {stack_name}") 47 | 48 | self.api_endpoint = api_outputs[0]["OutputValue"] 49 | 50 | def test_api_gateway(self): 51 | """ 52 | Call the API Gateway endpoint and check the response 53 | """ 54 | response = requests.get(self.api_endpoint) 55 | self.assertDictEqual(response.json(), {"message": "hello world"}) 56 | -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-mock 3 | boto3 -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/AWS Lambda/orders-api/tests/unit/__init__.py -------------------------------------------------------------------------------- /AWS Lambda/orders-api/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert 
ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | # assert "location" in data.dict_keys() 74 | -------------------------------------------------------------------------------- /Airflow/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/.DS_Store -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/DAG Authoring Certification/.DS_Store -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-dag-authoring-cert 3 | -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.dockerignore: -------------------------------------------------------------------------------- 1 | .astro 2 | .git 3 | .env 4 | airflow_settings.yaml 5 | pod-config.yml 6 | logs/ -------------------------------------------------------------------------------- /Airflow/DAG Authoring Certification/.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | airflow_settings.yaml 4 | pod-config.yml -------------------------------------------------------------------------------- /Airflow/Fundamentals Certification/oper_sensors_sample_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python import PythonOperator 3 | from airflow.sensors.filesystem import FileSensor 4 | from airflow.operators.bash import BashOperator 5 | from datetime import datetime,timedelta 6 | from airflow.sensors.filesystem import FileSensor 7 | from airflow.models.baseoperator import chain, cross_downstream 8 | 9 | default_args = { 10 | 'retry' : 5 11 | ,'retry_delay' : timedelta(minutes=5) 12 | ,'email_on_failure': True 13 | ,'email_on_retry' : True 14 | ,'email' : 'cassio.bolba@gmail.com' 15 | } 16 | 17 | # let's check if the file myfile.txt is in the folder 18 | def _downloading_data (**kwargs): 19 | with open ('/tmp/myfile.txt','w'): 20 | f.write('my_data') 21 | return 42 22 | 23 | # call the ti to access the xcoms metadata 24 | def checking_data(ti): 25 | # call the method in ti and pass the xcoms key (can check in admin panel) and the task id where the xcoms is) 26 | my_xcoms = ti.xcom_pull(key='return_value', task_ids = ['downaloading_data']) 27 | print(my_xcoms) 28 | print('check data') 29 | 30 | def _failure(context): # context brings information about 31 | print(context) 32 | 33 | with DAG ( dag_id = 'simple_dag' 34 | ,schedule_interval = "*/10 * * * *" 35 | ,start_date = datetime(2021,1,1) 36 | ,catchup = False #disable backfilling 37 | ,default_args = default_args 38 | ) as dag: 39 | 40 | downloading_data = PythonOperator ( 41 | task_id = 'downloading_data' 42 | ,python_callable = _downloading_data 43 | ) 44 | 45 | checking_data = PythonOperator ( 46 | task_id = 'checking_data' 47 | ,python_callable = checking_data 48 | ) 49 | 50 | waiting_data = FileSensor ( 51 | task_id = 
'waiting_data' 52 | ,fs_conn_id= = 'con_id' 53 | ,filepath = 'my_file.txt' 54 | ,poke_interval = 15 55 | ) 56 | 57 | processing_data = BashOperator ( 58 | task_id = 'processing_data' 59 | ,bash_command = 'exit 0' 60 | ) 61 | 62 | downloading_data >> [ waiting_data, processing_data ] 63 | 64 | # another way to chain (not in same level) 65 | # chain( downloading_data , waiting_data, processing_data ) 66 | 67 | # creating cross dependencies 68 | # cross_downstream ( [ downloading_data, checking_data ] , [ waiting_data,processing_data ] ) -------------------------------------------------------------------------------- /Airflow/Fundamentals Certification/sensor_operator_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python import PythonOperator 3 | from airflow.sensors.filesystem import FileSensor 4 | from datetime import datetime,timedelta 5 | from airflow.sensors.filesystem import FileSensor 6 | 7 | default_args = { 8 | 'retry' : 5 9 | ,'retry_delay' : timedelta(minutes=5) 10 | } 11 | 12 | # let's check if the file myfile.txt is in the folder 13 | def _downloading_data (**kwargs): 14 | with open ('/tmp/myfile.txt','w'): 15 | f.write('my_data') 16 | 17 | 18 | with DAG ( dag_id = 'simple_dag' 19 | ,schedule_interval = "*/10 * * * *" 20 | ,start_date = datetime(2021,1,1) 21 | ,catchup = False #disable backfilling 22 | ,default_args = default_args 23 | ) as dag: 24 | 25 | downloading_data = PythonOperator ( 26 | task_id = 'downloading_data' 27 | ,python_callable = _downloading_data 28 | ) 29 | 30 | waiting_data = FileSensor ( 31 | task_id = 'waiting_data' 32 | ,fs_conn_id= = 'con_id' 33 | ,filepath = 'my_file.txt' 34 | ,poke_interval = 15 35 | ) -------------------------------------------------------------------------------- /Airflow/Fundamentals Certification/simple_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.dummy import DummyOperator 3 | from datetime import datetime,timedelta 4 | 5 | default_args = { 6 | 'retry' : 5 7 | ,'retry_delay' : timedelta(minutes=5) 8 | } 9 | 10 | with DAG ( dag_id = 'simple_dag' 11 | ,schedule_interval = "*/10 * * * *" 12 | #,schedule_interval = "@daily" 13 | #,schedule_interval = timedelta(hours=7) 14 | ,start_date = datetime(2021,1,1) 15 | ,catchup = False #disable backfilling 16 | ,default_args = default_args 17 | ) as dag: 18 | 19 | task_1 = DummyOperator ( 20 | task_id = 'task_1' 21 | ) 22 | 23 | task_2 = DummyOperator ( 24 | task_id = 'task_2' 25 | ) -------------------------------------------------------------------------------- /Airflow/GoogleCloudComposer/dag_bq_gcs.py: -------------------------------------------------------------------------------- 1 | import airflow 2 | import datetime 3 | from airflow import DAG 4 | from airflow import models 5 | from airflow.operators import BashOperator 6 | from airflow.contrib.operators import bigquery_operator 7 | from airflow.contrib.operators import bigquery_to_gcs 8 | from airflow.utils import trigger_rule 9 | 10 | default_dag_args = { 11 | 'start_date': airflow.utils.dates.days_ago(1), 12 | 'email_on_failure': False, 13 | 'email_on_retry': False, 14 | 'retries': 1, 15 | 'retry_delay' : datetime.timedelta(minutes=5), 16 | } 17 | 18 | output_file = 'gs://southamerica-east1-cassio-a-77e1beeb-bucket/data/address.csv' 19 | #Replace with your path details 20 | with DAG( 21 | dag_id='demo_bq_dag', 22 | schedule_interval = 
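    # a timedelta here means "run once every interval" measured from start_date,
    # as opposed to the cron-style strings used in the other sample DAGs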
datetime.timedelta(days = 1), 23 | default_args = default_dag_args) as dag: 24 | 25 | bq_airflow_commits_query = bigquery_operator.BigQueryOperator( 26 | task_id = 'bq_airflow_commits_query', 27 | bql = """ SELECT Address 28 | FROM [airflow-studies:Address.Add] 29 | """) 30 | 31 | 32 | export_commits_to_gcs = bigquery_to_gcs.BigQueryToCloudStorageOperator( 33 | task_id = 'export_airflow_commits_to_gcs', 34 | source_project_dataset_table = 'airflow-studies:Address.Add', 35 | destination_cloud_storage_uris = [output_file], 36 | export_format = 'CSV') 37 | 38 | bq_airflow_commits_query >> export_commits_to_gcs 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Airflow/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/.DS_Store -------------------------------------------------------------------------------- /Airflow/img/Chain Operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/Chain Operator.png -------------------------------------------------------------------------------- /Airflow/img/Python Branch Operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/Python Branch Operator.png -------------------------------------------------------------------------------- /Airflow/img/celery cluster.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/celery cluster.jpg -------------------------------------------------------------------------------- /Airflow/img/multi_node.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/multi_node.jpg -------------------------------------------------------------------------------- /Airflow/img/one_node.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/one_node.jpg -------------------------------------------------------------------------------- /Airflow/img/task_instance_view.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Airflow/img/task_instance_view.PNG -------------------------------------------------------------------------------- /Apache Beam/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Apache Beam/.DS_Store -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/4.4 - Batch DirectRunner + GCS.py: -------------------------------------------------------------------------------- 1 | import apache_beam as 
beam 2 | import os 3 | 4 | serviceAccount = r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\dataflow-course-319517-4f98a2ce48a7.json" 5 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 6 | 7 | p1 = beam.Pipeline() 8 | 9 | class Filter(beam.DoFn): 10 | def process(self,record): 11 | if int(record[8]) > 0: 12 | return [record] 13 | 14 | Delayed_time = ( 15 | p1 16 | | "Import Data time" >> beam.io.ReadFromText(r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 17 | | "Split by comma time" >> beam.Map(lambda record: record.split(',')) 18 | | "Filter Delays time" >> beam.ParDo(Filter()) 19 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 20 | | "Sum by key time" >> beam.CombinePerKey(sum) 21 | ) 22 | 23 | Delayed_num = ( 24 | p1 25 | | "Import Data" >> beam.io.ReadFromText(r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 26 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 27 | | "Filter Delays" >> beam.ParDo(Filter()) 28 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 29 | | "Count by key" >> beam.combiners.Count.PerKey() 30 | ) 31 | 32 | Delay_table = ( 33 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 34 | | "Group By" >> beam.CoGroupByKey() 35 | | "Save to GCS" >> beam.io.WriteToText(r"gs://dataflow-course/flights_output.csv") 36 | ) 37 | 38 | p1.run() 39 | -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/4.5 - Batch Dataflow + GCS.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | 5 | pipeline_options = { 6 | 'project': 'dataflow-course-319517' , 7 | 'runner': 'DataflowRunner', 8 | 'region': 'southamerica-east1', 9 | 'staging_location': 'gs://dataflow-course/temp', 10 | 'temp_location': 'gs://dataflow-course/temp', 11 | 'template_location': 'gs://dataflow-course/template/batch_job_df_gcs_flights' 12 | } 13 | 14 | pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 15 | p1 = beam.Pipeline(options=pipeline_options) 16 | 17 | serviceAccount = r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\dataflow-course-319517-4f98a2ce48a7.json" 18 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 19 | 20 | class Filter(beam.DoFn): 21 | def process(self,record): 22 | if int(record[8]) > 0: 23 | return [record] 24 | 25 | Delayed_time = ( 26 | p1 27 | | "Import Data time" >> beam.io.ReadFromText(r"gs://dataflow-course/input/flights_sample.csv", skip_header_lines = 1) 28 | | "Split by comma time" >> beam.Map(lambda record: record.split(',')) 29 | | "Filter Delays time" >> beam.ParDo(Filter()) 30 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 31 | | "Sum by key time" >> beam.CombinePerKey(sum) 32 | ) 33 | 34 | Delayed_num = ( 35 | p1 36 | |"Import Data" >> beam.io.ReadFromText(r"gs://dataflow-course/input/flights_sample.csv", skip_header_lines = 1) 37 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 38 | | "Filter Delays" >> beam.ParDo(Filter()) 39 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 40 | | "Count by key" >> beam.combiners.Count.PerKey() 41 | ) 42 | 43 | Delay_table = ( 44 | 
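    # CoGroupByKey below joins the two branches on the airport code; each output
    # element comes out shaped like ('LAX', {'Delayed_num': [4], 'Delayed_time': [92]})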
{'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 45 | | "Group By" >> beam.CoGroupByKey() 46 | | "Save to GCS" >> beam.io.WriteToText(r"gs://dataflow-course/output/flights_output.csv") 47 | ) 48 | 49 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/comando deploy.txt: -------------------------------------------------------------------------------- 1 | python "C:\Users\cassi\Desktop\4.py" \ 2 | --input gs://template_dataflow_curso/entrada/voos_sample.csv \ 3 | --output gs://template_dataflow_curso/saida/voos_sample.csv \ 4 | --runner DataflowRunner \ 5 | --project dataflowproject-299811 \ 6 | --region southamerica-east1 \ 7 | --temp_location gs://template_dataflow_curso/temp \ 8 | --staging_location gs://template_dataflow_curso/staging \ 9 | --template_location gs://template_dataflow_curso/template \ 10 | 11 | python "C:\Users\cassi\Desktop\4.py" 12 | 13 | service-@dataflow-service-producer-prod.iam.gserviceaccount.com 14 | -compute@developer.gserviceaccount.com 15 | 505169071290@cloudservices.gserviceaccount.com 16 | 17 | 18 | -------------------------------------------------------------------------------- /Apache Beam/Bach Processing/unnest.py: -------------------------------------------------------------------------------- 1 | dataDict = ('LAX', {'Qtd_Atrasos': [4], 'Tempo_Atrasos': [92]}) 2 | 3 | class teste(beam.DoFn): 4 | def process(self,record): 5 | dict_ = {} 6 | dict_['airport'] = str(record[0]) 7 | dict_['lista'] = record[1] 8 | return(dict_) 9 | 10 | #print(criar_dict(dataDict)) 11 | 12 | def process(self,record): 13 | def expand(key, value): 14 | if isinstance(value, dict): 15 | return [ (key + '_' + k, v) for k, v in process(value).items() ] 16 | else: 17 | return [ (key, value) ] 18 | 19 | items = [ item for k, v in record.items() for item in expand(k, v) ] 20 | 21 | return dict(items) 22 | 23 | #teste = (desaninhar_dict(criar_dict(dataDict))) 24 | # teste['lista_Qtd_Atrasos'] = teste['lista_Qtd_Atrasos'][0] 25 | # teste['lista_Tempo_Atrasos'] = teste['lista_Tempo_Atrasos'][0] 26 | 27 | #print(teste) 28 | 29 | def process(self,record): 30 | dict_ = {} 31 | dict_['airport'] = record['airport'] 32 | dict_['lista_Qtd_Atrasos'] = record['lista_Qtd_Atrasos'][0] 33 | dict_['lista_Tempo_Atrasos'] = record['lista_Tempo_Atrasos'][0] 34 | return(dict_) 35 | 36 | print(teste(teste)) 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | # def criar_dict2(x): 49 | # dict_ = {} 50 | # dict_['airport'] = str(x[0]) 51 | # dict_['lista_Qtd_Atrasos'] = str(x[1]) 52 | # dict_['lista_Qtd_Atrasos'] = str(x[2]) 53 | # return(dict_) 54 | 55 | # print(criar_dict2(teste)) 56 | 57 | # def criar_lista(x): 58 | # lista = list(x.values()) 59 | # return lista 60 | # print(criar_lista(teste)) -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.1 - Setup Colab.py: -------------------------------------------------------------------------------- 1 | pip install apache-beam[interactive] 2 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.10 - ParDo.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | class Filter(beam.DoFn): 6 | def process(self,record): 7 | if int(record[8]) > 0: 8 | return [record] 9 | 10 | Delayed_time = ( 11 | p1 12 | | "Import Data time" >> beam.io.ReadFromText(r"C:\Users\cassi\Google 
Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 13 | | "Split by comma time" >> beam.Map(lambda record: record.split(',')) 14 | | "Filter Delays time" >> beam.ParDo(Filter()) 15 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 16 | | "Sum by key time" >> beam.CombinePerKey(sum) 17 | # | "Print Results" >> beam.Map(print) 18 | ) 19 | 20 | Delayed_num = ( 21 | p1 22 | | "Import Data" >> beam.io.ReadFromText(r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso_EN\flights_sample.csv", skip_header_lines = 1) 23 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 24 | | "Filter Delays" >> beam.ParDo(Filter()) 25 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 26 | | "Count by key" >> beam.combiners.Count.PerKey() 27 | # | "Print Results" >> beam.Map(print) 28 | ) 29 | 30 | Delay_table = ( 31 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 32 | | beam.CoGroupByKey() 33 | | beam.Map(print) 34 | ) 35 | 36 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.2 - Create.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | p1 | "Tuple" >> beam.Create( [ ("Cassio",32) , ("Vics",21) ] ) | beam.Map(print) #tuple 6 | p1 | "List" >> beam.Create ( [ 1,2,3 ] ) | beam.Map(print) #list 7 | 8 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.2 - Read Transform.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | # Read files 8 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 9 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 10 | | "Print Results" >> beam.Map(print) 11 | ) 12 | 13 | p1.run() 14 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.3 - Write Transform.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Write Results" >> beam.io.WriteToText("Flights.txt") 10 | ) 11 | 12 | p1.run() 13 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.4 - FlatMap.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Collection = ( 6 | p1 7 | |beam.io.ReadFromText('poem.txt') 8 | |beam.FlatMap(lambda record: record.split(' ')) 9 | |beam.io.WriteToText('result.txt') 10 | ) 11 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.4 - Map.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Print Results" >> beam.Map(print) 10 | ) 11 | 12 | 
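# the steps above only build the pipeline graph; Map emits exactly one output per
# input element (unlike FlatMap), and nothing executes until run() is called below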
p1.run() 13 | -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.5 - Filter Lambda.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | voos = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter By LA Flights" >> beam.Filter(lambda record: record[3] == "LAX") 10 | | "Print Results" >> beam.Map(print) 11 | ) 12 | 13 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.5 - Filter Lista.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | words=['quatro','um'] 4 | 5 | def FindWords( i ): 6 | if i in words: 7 | return True 8 | 9 | p1 = beam.Pipeline() 10 | 11 | Collection = ( 12 | p1 13 | |beam.io.ReadFromText('Poem.txt') 14 | |beam.FlatMap(lambda record: record.split(' ')) 15 | |beam.Filter(FindWords) 16 | |beam.io.WriteToText('results.txt') 17 | ) 18 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.6 - Flatten.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p = beam.Pipeline() 4 | 5 | black = ('Adão','Jesus','Mike') 6 | White = ('Tulio','Mary','Joca') 7 | first_nations = ('Vic','Marta','Tom') 8 | 9 | black_pc = p | "Creating Pcollection black" >> beam.Create(black) 10 | White_pc = p | "Creating Pcollection White" >> beam.Create(White) 11 | first_nations_pc = p | "Creating Pcollection first_nations" >> beam.Create(first_nations) 12 | 13 | people = ( 14 | (black_pc,White_pc,first_nations_pc) 15 | | beam.Flatten() 16 | | beam.Map(print)) 17 | p.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.7 - CombinePerKey.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Delayed_time = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter Delays" >> beam.Filter(lambda record: int(record[8]) > 0 ) 10 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 11 | | "Sum by key" >> beam.CombinePerKey(sum) 12 | | "Print Results" >> beam.Map(print) 13 | ) 14 | 15 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.8 - Combiners.Count.Perkey.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Delayed_num = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter Delays" >> beam.Filter(lambda record: int(record[8]) > 0 ) 10 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 11 | | "Count Per key" >> beam.combiners.Count.PerKey() 12 | | "Print Results" >> beam.Map(print) 13 | ) 14 | 15 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/3.9 - 
CoGroupByKey.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | 3 | p1 = beam.Pipeline() 4 | 5 | Delayed_time = ( 6 | p1 7 | | "Import Data" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 8 | | "Split by comma" >> beam.Map(lambda record: record.split(',')) 9 | | "Filter Delays" >> beam.Filter(lambda record: int(record[8]) > 0 ) 10 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 11 | | "Sum by key" >> beam.CombinePerKey(sum) 12 | # | "Print Results" >> beam.Map(print) 13 | ) 14 | 15 | Delayed_num = ( 16 | p1 17 | | "Import Data Num" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines = 1) 18 | | "Split by comma Num" >> beam.Map(lambda record: record.split(',')) 19 | | "Filter Delays Num" >> beam.Filter(lambda record: int(record[8]) > 0 ) 20 | | "Create a key-value Num" >> beam.Map(lambda record: (record[4],int(record[8]))) 21 | | "Combine by key" >> beam.combiners.Count.PerKey() 22 | # | "Print Results" >> beam.Map(print) 23 | ) 24 | 25 | Delay_table = ( 26 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 27 | | beam.CoGroupByKey() 28 | | beam.Map(print) 29 | ) 30 | 31 | p1.run() -------------------------------------------------------------------------------- /Apache Beam/Main Functions/Poem.txt: -------------------------------------------------------------------------------- 1 | Há quatro quadros três e três quadros quatro. 2 | Sendo que quatro destes quadros são quadrados, 3 | um dos quadros quatro e três dos quadros três. 4 | Os três quadros que não são quadrados, 5 | são dois dos quadros quatro e um dos quadros três. -------------------------------------------------------------------------------- /Apache Beam/Main Functions/voos_sample.csv: -------------------------------------------------------------------------------- 1 | 2019-04-27,19805,1,JFK,LAX,854,-6,1217,2,355,2475,1 2 | 2019-04-27,19805,2,LAX,JFK,944,14,1736,-29,269,2475,2 3 | 2019-04-27,19805,3,JFK,LAX,1224,-6,1614,39,371,2475,3 4 | 2019-04-27,19805,4,LAX,JFK,1240,25,2028,-27,264,2475,4 5 | 2019-04-27,19805,5,DFW,HNL,1300,-5,1650,15,510,3784,5 6 | 2019-04-27,19805,6,OGG,DFW,1901,126,640,95,385,3711,6 7 | 2019-04-27,19805,7,DFW,OGG,1410,125,1743,138,497,3711,7 8 | 2019-04-27,19805,8,HNL,DFW,1659,4,458,-22,398,3784,8 9 | 2019-04-27,19805,9,JFK,LAX,648,-7,1029,19,365,2475,9 10 | 2019-04-27,19805,10,LAX,JFK,2156,21,556,1,265,2475,10 11 | 2019-04-27,19805,12,LAX,JFK,1113,-2,1910,-40,267,2475,11 12 | 2019-04-27,19805,14,OGG,LAX,2235,5,618,-17,270,2486,12 13 | 2019-04-27,19805,15,BOS,ORD,611,-9,756,-19,129,867,13 14 | 2019-04-27,19805,16,SFO,JFK,1312,17,2107,-33,268,2586,14 15 | 2019-04-27,19805,17,ATL,MIA,630,-5,813,-17,83,594,15 16 | 2019-04-27,19805,18,SFO,JFK,22,112,833,88,288,2586,16 17 | 2019-04-27,19805,19,JFK,LAX,1024,-6,1353,18,359,2475,17 18 | 2019-04-27,19805,20,SFO,JFK,1715,135,130,120,277,2586,18 19 | 2019-04-27,19805,21,JFK,LAX,1906,-4,2246,16,359,2475,19 20 | 2019-04-27,19805,22,LAX,JFK,1458,-2,2336,11,272,2475,20 21 | -------------------------------------------------------------------------------- /Apache Beam/README.md: -------------------------------------------------------------------------------- 1 | # Apache Beam 2 | Files regarding my course sold on udemy. Tey are available in English and Portuguese. 
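All of the scripts described below follow the same read -> transform -> write shape; a minimal sketch of that pattern, using the flights sample file from these lessons (the output name is just an example):

```python
import apache_beam as beam

# build and run a small batch pipeline on the local DirectRunner
with beam.Pipeline() as p:
    (
        p
        | "Read" >> beam.io.ReadFromText("flights_sample.csv", skip_header_lines=1)
        | "Split" >> beam.Map(lambda line: line.split(","))
        | "Keep delayed flights" >> beam.Filter(lambda rec: int(rec[8]) > 0)
        | "Key by airport" >> beam.Map(lambda rec: (rec[4], int(rec[8])))
        | "Total delay per airport" >> beam.CombinePerKey(sum)
        | "Write" >> beam.io.WriteToText("delays_output")
    )
```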
3 | s 4 | ## https://www.udemy.com/user/cassio-alessandro-de-bolba/ 5 | 6 | ### Folders Description 7 | Main Functions -> study case with most important functions 8 | Bach Processing -> processing data usgin functions learned in Main Functions, to process data on GCP 9 | Streaming Piocessing -> processing data usgin functions learned in Main Functions, to process data on GC 10 | -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.2.0 - Data Generator.py: -------------------------------------------------------------------------------- 1 | #pip install google-cloud-pubsub 2 | 3 | import csv 4 | import time 5 | from google.cloud import pubsub_v1 6 | import os 7 | 8 | service_account_key = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 9 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= service_account_key 10 | 11 | topic = 'projects/dataflowproject-299811/topics/MeuTopico' 12 | publisher = pubsub_v1.PublisherClient() 13 | 14 | input = r"C:\Users\cassi\Google Drive\GCP\Dataflow Course\Meu_Curso\Seção 3 - Principais Transfromações\voos_sample.csv" 15 | 16 | with open(input, 'rb') as file: 17 | for row in file: 18 | print('Publishing in Topic') 19 | publisher.publish(topic,row) 20 | time.sleep(1) -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.2.1 - Voos Streaming DF + Pubsub.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions 4 | 5 | pipeline_options = { 6 | 'project': 'dataflowproject-299811' , 7 | 'runner': 'DataflowRunner', 8 | 'region': 'southamerica-east1', 9 | 'job_name': 'cassio', 10 | 'output': 'gs://template_dataflow_curso/saida', 11 | 'staging_location': 'gs://template_dataflow_curso/staging', 12 | 'input': 'gs://template_dataflow_curso/entrada/voos_sample.csv', 13 | 'temp_location': 'gs://template_dataflow_curso/staging', 14 | 'template_location': 'gs://template_dataflow_curso/template/streaming_job_voos', 15 | 'streaming' : True } 16 | pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 17 | p1 = beam.Pipeline(options=pipeline_options) 18 | 19 | serviceAccount = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 20 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 21 | 22 | subscription = 'projects/dataflowproject-299811/subscriptions/MinhaSubs' 23 | topic = 'projects/dataflowproject-299811/topics/saida' 24 | 25 | class split_lines(beam.DoFn): 26 | def process(self,record): 27 | return [record.split(',')] 28 | 29 | class Filter(beam.DoFn): 30 | def process(self,record): 31 | if int(record[8]) > 0: 32 | return [record] 33 | 34 | 35 | pcollection_input = ( 36 | p1 | 'Read from pubsub topic' >> beam.io.ReadFromPubSub(subscription= subscription) 37 | ) 38 | 39 | Delayed_time = ( 40 | pcollection_input 41 | # p1 42 | # | "Import Data time" >> beam.io.ReadFromText(r"gs://template_dataflow_curso/entrada/voos_sample.csv", skip_header_lines = 1) 43 | | "Split by comma time" >> beam.ParDo(split_lines()) 44 | | "Filter Delays time" >> beam.ParDo(Filter()) 45 | | "Create a key-value time" >> beam.Map(lambda record: (record[4],int(record[8]))) 46 | | "Sum by key time" >> beam.CombinePerKey(sum) 47 | # | "Print Results" >> beam.Map(print) 48 | ) 49 | 50 | Delayed_num = ( 51 | pcollection_input 
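    # this branch reuses the same Pub/Sub PCollection read above, so every incoming
    # message fans out into both the delay-sum branch and this count branch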
52 | # p1 53 | # |"Import Data" >> beam.io.ReadFromText(r"gs://template_dataflow_curso/entrada/voos_sample.csv", skip_header_lines = 1) 54 | | "Split by comma" >> beam.ParDo(split_lines()) 55 | | "Filter Delays" >> beam.ParDo(Filter()) 56 | | "Create a key-value" >> beam.Map(lambda record: (record[4],int(record[8]))) 57 | | "Count by key" >> beam.combiners.Count.PerKey() 58 | # | "Print Results" >> beam.Map(print) 59 | ) 60 | 61 | Delay_table = ( 62 | {'Delayed_num':Delayed_num,'Delayed_time':Delayed_time} 63 | | "join" >> beam.CoGroupByKey() 64 | # | beam.Map(print) 65 | # | beam.io.WriteToText(r"gs://template_dataflow_curso/saida/Voos_atrados_qtd.csv") 66 | | "Converting to byte String" >> beam.Map(lambda row: (''.join(row).encode('utf-8')) ) 67 | | "Writting to Topic" >> beam.io.WriteToPubSub(topic) 68 | ) 69 | 70 | p.run() 71 | -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.4 - Janelas e Noções de Tempo para Streaming.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Apache Beam/Streaming Processing/5.4 - Janelas e Noções de Tempo para Streaming.pptx -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.5 - Tumbling Window DF + BQ.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions,StandardOptions 4 | from apache_beam import window 5 | import time 6 | 7 | pipeline_options = { 8 | 'project': 'dataflowproject-299811' , 9 | 'runner': 'DataflowRunner', 10 | 'region': 'southamerica-east1', 11 | 'job_name': 'cassio', 12 | 'output': 'gs://template_dataflow_curso/saida', 13 | 'staging_location': 'gs://template_dataflow_curso/staging', 14 | 'input': 'gs://template_dataflow_curso/entrada/voos_sample.csv', 15 | 'temp_location': 'gs://template_dataflow_curso/temp', 16 | 'template_location': 'gs://template_dataflow_curso/template/streaming_job_df_bq_voos', 17 | 'streaming' : True, 18 | 'enable_streaming_engine' : True, 19 | 'save_main_session': True } 20 | pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 21 | p1 = beam.Pipeline(options=pipeline_options) 22 | 23 | # OPTIONS PARA EXECUTAR STREAMING LOCAL 24 | # options= PipelineOptions() 25 | # options.view_as(StandardOptions).streaming= True 26 | # p1 = beam.Pipeline(options=options) 27 | 28 | serviceAccount = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 29 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 30 | 31 | subscription = 'projects/dataflowproject-299811/subscriptions/MinhaSubs' 32 | 33 | class separar_linhas(beam.DoFn): 34 | def process(self,record): 35 | return [record.decode("utf-8").split(',')] 36 | 37 | class filtro(beam.DoFn): 38 | def process(self,record): 39 | if int(record[8]) > 0: 40 | return [record] 41 | 42 | table_schema = 'airport:STRING, lista_Qtd_Atrasos:INTEGER' 43 | tabela = 'dataflowproject-299811:voos_dataflow.tabela_voos_tumbling' 44 | 45 | 46 | Qtd_Atrasos = ( 47 | p1 48 | | "Ler da subcription" >> beam.io.ReadFromPubSub(subscription= subscription) 49 | | "Separar por Vírgulas Qtd" >> beam.ParDo(separar_linhas()) 50 | | "Timestamp Customizada" >> beam.Map(lambda record: beam.window.TimestampedValue(record, time.time())) 
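    # time.time() stamps each element with processing time, so the 10-second
    # FixedWindows applied a few steps below group records by arrival time rather
    # than by anything inside the flight record itself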
51 | | "Pegar voos com Qtd" >> beam.ParDo(filtro()) 52 | | "Criar par Qtd" >> beam.Map(lambda record: (record[4],int(record[8]))) 53 | | "Window" >> beam.WindowInto(window.FixedWindows(10)) 54 | | "Contar por key" >> beam.combiners.Count.PerKey() 55 | | "Dicionário" >> beam.Map(lambda record:({'airport':record[0],'lista_Qtd_Atrasos':int(record[1])})) 56 | # | "Mostrar Resultados QTD" >> beam.Map(print) 57 | | beam.io.WriteToBigQuery( 58 | tabela, 59 | schema=table_schema, 60 | write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 61 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 62 | custom_gcs_temp_location = 'gs://template_dataflow_curso/staging' ) 63 | ) 64 | 65 | result = p1.run() 66 | result.wait_until_finish() -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/5.6 - Sliding Window DF + BQ.py: -------------------------------------------------------------------------------- 1 | import apache_beam as beam 2 | import os 3 | from apache_beam.options.pipeline_options import PipelineOptions,StandardOptions 4 | from apache_beam import window 5 | import time 6 | 7 | # pipeline_options = { 8 | # 'project': 'dataflowproject-299811' , 9 | # 'runner': 'DataflowRunner', 10 | # 'region': 'southamerica-east1', 11 | # 'job_name': 'cassio', 12 | # 'output': 'gs://template_dataflow_curso/saida', 13 | # 'staging_location': 'gs://template_dataflow_curso/staging', 14 | # 'input': 'gs://template_dataflow_curso/entrada/voos_sample.csv', 15 | # 'temp_location': 'gs://template_dataflow_curso/temp', 16 | # 'template_location': 'gs://template_dataflow_curso/template/streaming_job_df_bq_voos', 17 | # 'streaming' : True, 18 | # 'enable_streaming_engine' : True, 19 | # 'save_main_session': True } 20 | # pipeline_options = PipelineOptions.from_dictionary(pipeline_options) 21 | # p1 = beam.Pipeline(options=pipeline_options) 22 | 23 | ## OPTIONS PARA EXECUTAR STREAMING LOCAL 24 | options= PipelineOptions() 25 | options.view_as(StandardOptions).streaming= True 26 | p1 = beam.Pipeline(options=options) 27 | 28 | serviceAccount = r'C:\Users\cassi\Google Drive\GCP\Dataflow Course\testes\dataflowproject-299811-5207946866a4.json' 29 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 30 | 31 | subscription = 'projects/dataflowproject-299811/subscriptions/MinhaSubs' 32 | 33 | class separar_linhas(beam.DoFn): 34 | def process(self,record): 35 | return [record.decode("utf-8").split(',')] 36 | 37 | class filtro(beam.DoFn): 38 | def process(self,record): 39 | if int(record[8]) > 0: 40 | return [record] 41 | 42 | table_schema = 'airport:STRING, lista_Qtd_Atrasos:INTEGER' 43 | tabela = 'dataflowproject-299811:voos_dataflow.tabela_voos_tumbling' 44 | 45 | 46 | Qtd_Atrasos = ( 47 | p1 48 | | "Ler da subcription" >> beam.io.ReadFromPubSub(subscription= subscription) 49 | | "Separar por Vírgulas Qtd" >> beam.ParDo(separar_linhas()) 50 | | "Timestamp Customizada" >> beam.Map(lambda record: beam.window.TimestampedValue(record, time.time())) 51 | | "Pegar voos com Qtd" >> beam.ParDo(filtro()) 52 | | "Criar par Qtd" >> beam.Map(lambda record: (record[4],int(record[8]))) 53 | | "Window" >> beam.WindowInto(window.SlidingWindows(10,5)) 54 | | "Contar por key" >> beam.combiners.Count.PerKey() 55 | | "Dicionário" >> beam.Map(lambda record:({'airport':record[0],'lista_Qtd_Atrasos':int(record[1])})) 56 | | "Mostrar Resultados QTD" >> beam.Map(print) 57 | # | beam.io.WriteToBigQuery( 58 | # tabela, 59 | # schema=table_schema, 60 | # 
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, 61 | # create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 62 | # custom_gcs_temp_location = 'gs://template_dataflow_curso/staging' ) 63 | ) 64 | 65 | result = p1.run() 66 | result.wait_until_finish() -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/Sliding_processor.py: -------------------------------------------------------------------------------- 1 | pip install apache_beam 2 | 3 | pip install google-cloud-pubsub 4 | 5 | import apache_beam as beam 6 | from apache_beam.options.pipeline_options import PipelineOptions,StandardOptions 7 | import os 8 | from apache_beam import window 9 | from apache_beam.transforms.combiners import Count 10 | import time 11 | 12 | 13 | serviceAccount = '/content/vivid-now-271806-e22933a07e8a.json' 14 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= serviceAccount 15 | 16 | input_subscription = 'projects/vivid-now-271806/subscriptions/movie_subscription' 17 | 18 | options= PipelineOptions() 19 | options.view_as(StandardOptions).streaming= True 20 | 21 | p = beam.Pipeline(options=options) 22 | 23 | comedy_movies = 'projects/vivid-now-271806/topics/comedy_movies' 24 | 25 | def format(element): 26 | (movie,rating)=element 27 | return "{r} rating for movieID {MID} in 10 seconds".format(f= rating, MID=movie).encode('utf-8') 28 | 29 | pubsub_pipeline = ( 30 | p 31 | | 'Read from pubsub topic' >> beam.io.ReadFromPubSub(subscription= input_subscription) 32 | # decodificar e split 33 | | 'Split the records by comma' >> beam.Map(lambda row: row.decode("utf-8").split(',')) 34 | # definir a coluna, ou criar uma coluna. Criando em cada linha uma timestamp 35 | | 'Timestamp Customizada ' >> beam.Map(lambda row: beam.window.TimestampedValue(row, time.time())) 36 | # Criar um key value pair, onde indico a coluna id do filme e ratings, para contar quantas ratings recebidas por id de filme 37 | | 'Form Key Value Pair' >> beam.Map(lambda row: (row[1],float(row[2]))) 38 | # defino minha janela, primeiro parametro é janela, segundo é intervalo de janelas 39 | | 'Window' >> beam.WindowInto(window.SlidingWindows(4,2)) 40 | # contar ratings por chave 41 | | 'Count the ratings' >> Count.PerKey() 42 | # codificar 43 | #| 'Converting to byte String' >> beam.Map(lambda row: (''.join(row).encode('utf-8')) ) 44 | | 'format' >> beam.Map(format) 45 | | 'Publish to output topic' >> beam.io.WriteToPubSub(comedy_movies) 46 | ) 47 | result = p.run() 48 | result.wait_until_finish() -------------------------------------------------------------------------------- /Apache Beam/Streaming Processing/streaming janelas.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Apache Beam/Streaming Processing/streaming janelas.xlsx -------------------------------------------------------------------------------- /Data-Eng-Bootcamp/19 - 21 game.scala: -------------------------------------------------------------------------------- 1 | // DISCLAIMER 2 | // BELOW ARE SNIPPETS OF THE CODE REGRADDING THE 21 GAME CARD DONEDURING THE COURSE 3 | // THE STILL NEEDS TO BE CREATED 4 | 5 | // Define immutable variables for clubs 2♣ through 4♣ 6 | val twoClubs: Int = 2 7 | val threeClubs: Int = 3 8 | val fourClubs: Int = 4 9 | val aceClubs = 1 10 | val aceDiamonds = 1 11 | val aceHearts = 1 12 | val aceSpades = 1 13 | 14 | // Define immutable 
variables for player names 15 | val playerA: String = "Alex" 16 | val playerB: String = "Chen" 17 | val playerC: String = "Marta" 18 | 19 | // Creating the players list 20 | val players1 = List("Alex","Chen") 21 | val players2 = List("Vic","Cassio") 22 | val allPlayers = players1 ::: players2 23 | 24 | // Choose 25 | val hand = 26 | 27 | // define the functio taking hand as int, then insinde {} id the function 28 | def bust(hand: int) = { 29 | hand > 21 30 | } 31 | 32 | // function to compare and show the biggest hand 33 | def maxHand(handA: Int, handB: Int): Int = { 34 | if (bust(handA) & bust(handB)) println(0) 35 | else if (bust(handA)) println(handB) 36 | else if (bust(handB)) println(handA) 37 | else if (handA > handB) println(handA) 38 | else handB 39 | } 40 | 41 | // Create, parameterize, and initialize an array for a round of Twenty-One 42 | // In this exercise it is done at same time, different from previous one 43 | val hands = Array(tenClubs + fourDiamonds, 44 | nineSpades + nineHearts, 45 | twoClubs + threeSpades) 46 | 47 | // Inform a player where their current hand stands 48 | val informPlayer: String = { 49 | if (hand > 21) 50 | "Bust! :(" 51 | else if (hand == 21) 52 | "Twenty-One! :)" 53 | else 54 | "Hit or stay?" 55 | } 56 | 57 | // Find the number of points that will cause a bust 58 | def pointsToBust(hand: Int) = { 59 | // If the hand is a bust, 0 points remain 60 | if (bust(hand)) 61 | println(0) 62 | // Otherwise, calculate the difference between 21 and the current hand 63 | else 64 | println(21 - hand) 65 | } 66 | 67 | // Create list with five hands of Twenty-One 68 | var hands = List(16, 21, 8, 25, 4) 69 | 70 | // Loop through hands, finding each hand's number of points to bust 71 | hands.foreach(pointsToBust) 72 | -------------------------------------------------------------------------------- /Databricks/Readme.md: -------------------------------------------------------------------------------- 1 | # Databricks 2 | This folder hold my studies in Databricks and some sample projects. 3 | The studies were mostly don from Databrick Academy notebooks and also some workshops. 
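The Scala exercises in this folder read multiline JSON files into DataFrames; roughly the same call in PySpark (a sketch only, assuming it runs inside a Databricks notebook where `spark` is already defined) looks like:

```python
# read one of the sample files used in the Scala exercises into a DataFrame;
# multiline is needed because each JSON document spans several lines
df_people = (
    spark.read
    .option("multiline", "true")
    .json("/FileStore/tables/people.json")
)
df_people.printSchema()
```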
4 | 5 | So far, I used the following languages within Databricks (as you check in the notebooks in this repository): 6 | * SQL 7 | * Spark / PySpark 8 | * Scala -------------------------------------------------------------------------------- /Databricks/Scala Exercises/People_vs_Execises.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | // READING FROM FILE STORE 3 | //File location and type 4 | val path_people = "/FileStore/tables/people.json" 5 | val path_exercises = "/FileStore/tables/exercises.json" 6 | val file_type = "json" 7 | 8 | // COMMAND ---------- 9 | 10 | // using multiline due to json file format 11 | val dfPeople = spark.read.option("multiline","true").json(path_people) 12 | 13 | val dfExercisesRaw = spark.read.option("multiline","true").json(sc.parallelize(path_exercises)) 14 | 15 | // COMMAND ---------- 16 | 17 | //val explode = dfExercises.withColumn("exercises",explode($"exercises")) 18 | //display(explode) 19 | // 20 | //val dfExercises = spark.read.option("multiline","true").schema(exercisesSchema).json(path_exercises) 21 | 22 | // COMMAND ---------- 23 | 24 | new_df = old_df.withColumn("name",explode("user_info.name")) 25 | .withColumn("last_name",explode("user_info.last_name")) 26 | 27 | // COMMAND ---------- 28 | 29 | import org.apache.spark.sql.functions._ 30 | 31 | val dfExercisesExp = dfExercisesRaw 32 | .withColumn("date",explode($"exercises.date")) 33 | .withColumn("exercise_end_time",explode($"exercises.exercise_end_time")) 34 | .withColumn("exercise_start_time",explode($"exercises.exercise_start_time")) 35 | .withColumn("user",explode($"exercises.user")) 36 | .withColumn("exercise_rating", explode($"exercises.metadata.exercise_rating")) 37 | .withColumn("heart_rate_samples",explode($"exercises.heart_rate_samples")) 38 | .drop($"exercises") 39 | 40 | 41 | // COMMAND ---------- 42 | 43 | val test = dfExercisesExp.select($"date",$"user",$"heart_rate_samples.*") 44 | display(test) 45 | //display(test.select(explode('heart_rate_samples) as (Seq("x", "y")))) 46 | 47 | 48 | // COMMAND ---------- 49 | 50 | 51 | 52 | // COMMAND ---------- 53 | 54 | val test = dfExercisesExp.select($"date",$"user",explode_outer($"heart_rate_samples")) 55 | display(test) 56 | 57 | // COMMAND ---------- 58 | 59 | //unpivot 60 | //val unPivotDF = test.select($"user", 61 | //expr("stack(4, '11:15', '11:15','11:20','11:20') as (Country)")) 62 | //.where("Total is not null") 63 | //unPivotDF.show() 64 | 65 | // COMMAND ---------- 66 | 67 | display(test.select(explode('heart_rate_samples) as (Seq("x", "y")))) 68 | 69 | // COMMAND ---------- 70 | 71 | //val unpivotedDf = test 72 | // .selectExpr("date","user","stack(1,'11:15','11:15')") 73 | // .withColumnRenamed("col0","device") // default name of this column is col0 74 | 75 | // COMMAND ---------- 76 | 77 | //display(test.select($"heart_rate_samples.*")) 78 | -------------------------------------------------------------------------------- /Databricks/Scala Exercises/readme.md: -------------------------------------------------------------------------------- 1 | There are 2 JSON files. The first JSON file (people.json) contains information about individuals. The second JSON (exercises.json) contains exercise data for those individuals. An individual is identified by the “user” key which is connected to “id” key in people.json file. 
Write a Spark pipeline in Scala that derives the following information from this data: 2 | 3 | Person report that contains the following for each unique person 4 | * Total number of exercises for person 5 | (count of start time group by user id) 6 | * Average duration of exercise for person 7 | (avg (end - start time) group by user id) 8 | * Average exercise rating for person 9 | (avg exercise_rating group by user id) 10 | * Average heart rate during exercise for person 11 | 12 | Exercise report containing the following info on each exercise: 13 | * Starting/ending time of exercise and duration 14 | * Hourly average heart rate 15 | * Hourly minimum/maximum heart rate 16 | 17 | Process the JSON data and create at least the above reports (”PersonReport” and “Exercises”). -------------------------------------------------------------------------------- /Docker/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/.DS_Store -------------------------------------------------------------------------------- /Docker/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/.DS_Store -------------------------------------------------------------------------------- /Docker/img/Containerrization PID.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/Containerrization PID.png -------------------------------------------------------------------------------- /Docker/img/container-layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/container-layer.png -------------------------------------------------------------------------------- /Docker/img/docker-compose-versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/docker-compose-versions.png -------------------------------------------------------------------------------- /Docker/img/docker-networks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/docker-networks.png -------------------------------------------------------------------------------- /Docker/img/docker-volume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/docker-volume.png -------------------------------------------------------------------------------- /Docker/img/port-mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/port-mapping.png -------------------------------------------------------------------------------- /Docker/img/voting-app-diagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Docker/img/voting-app-diagram.png -------------------------------------------------------------------------------- /Docker/python-sample-app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python 2 | 3 | COPY my-app.py my-app.py 4 | COPY requirements.txt requirements.txt 5 | 6 | RUN pip install -r requirements.txt 7 | 8 | CMD python my-app.py 9 | 10 | -------------------------------------------------------------------------------- /Docker/python-sample-app/my-app.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | def get_weather_info(): 4 | 5 | city = "Porto Alegre" 6 | 7 | api_key = "868a26a88dcad371f4205a319f26be8c" 8 | 9 | url = "http://api.openweathermap.org/data/2.5/weather?q="+ str(city) +"&appid="+ api_key 10 | 11 | json_data = requests.get(url).json() 12 | 13 | return print(f"Current Temperature for {str(city)} is {json_data['main']['temp']/10}") 14 | 15 | 16 | if __name__ == "__main__": 17 | 18 | get_weather_info() -------------------------------------------------------------------------------- /Docker/python-sample-app/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/.DS_Store -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/readme.md: -------------------------------------------------------------------------------- 1 | *THESE ARE NOTES FROM THE BOOK, FOR FURTHER CHECKS.* 2 | 3 | # FUNDAMENTALS OF DATA ENGINEERING 4 | 5 | "we unapologetically take a cloud-first approach. We view the cloud as a fundamentally transformative development that will endure for decades; most on-premises data systems and workloads will eventually move to cloud hosting. We assume that infrastructure and systems are ephemeral and scalable, and that data engineers will lean toward deploying managed services in the cloud". 
6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/src/DE_lifecycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/src/DE_lifecycle.png -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/src/DE_stakeholders.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/src/DE_stakeholders.png -------------------------------------------------------------------------------- /Fundamentals of Data Engineering/src/monolith_vs_ms_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Fundamentals of Data Engineering/src/monolith_vs_ms_arch.png -------------------------------------------------------------------------------- /Git/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Git/.DS_Store -------------------------------------------------------------------------------- /Git/gitlab-ci-my-first-pipeline.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - build 3 | - test 4 | 5 | build: 6 | stage: build 7 | script: 8 | - echo "Building" 9 | - mkdir build 10 | - touch build/info.txt 11 | artifacts: 12 | paths: 13 | - build/ 14 | 15 | test: 16 | stage: test 17 | script: 18 | - echo "Testing" 19 | - test -f "build/info.txt" 20 | -------------------------------------------------------------------------------- /Git/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Git/img/.DS_Store -------------------------------------------------------------------------------- /Git/img/CI CD Pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Git/img/CI CD Pipeline.png -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # Start from a Node.js 10 (LTS) image 3 | FROM node:10 4 | # Specify the directory inside the image in which all commands will run 5 | WORKDIR /usr/src/app 6 | # Copy package files and install dependencies 7 | COPY package*.json ./ 8 | RUN npm install 9 | # Copy all of the app files into the image 10 | COPY . .
11 | # The default command to run when starting the container 12 | CMD [ "npm", "start" ] -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simple-node", 3 | "version": "1.0.0", 4 | "description": "A sample simple application for Kubernetes Up & Running", 5 | "main": "server.js", 6 | "scripts": { 7 | "start": "node server.js" }, 8 | "author": "" } -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/2/server.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var app = express(); 3 | app.get('/', function (req, res) { 4 | res.send('Hello World!'); 5 | }); 6 | app.listen(3000, function () { console.log('Listening on port 3000!'); console.log(' http://localhost:3000'); 7 | }); -------------------------------------------------------------------------------- /Kubernetes/Kubernetes Up and Running/kubernetes_up_and_running.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Up and Running 2 | official [repo](https://github.com/kubernetes-up-and-running) 3 | 4 | ## 1. Introduction 5 | The first chapter emphasizes the main concepts of Kubernetes and why it is changing the way developers work, highlighting the most important improvements this technology brought to teams. 6 | * Velocity 7 | * The Value of Immutability 8 | * Declarative Configuration 9 | * Self-Healing Systems 10 | * Scaling Your Service and Your Teams 11 | * Decoupling 12 | * Easy Scaling for Applications and Clusters 13 | * Scaling Development Teams with Microservices 14 | * Separation of Concerns for Consistency and Scaling 15 | * Abstracting Your Infrastructure 16 | * Efficiency 17 | 18 | ## 2. Creating and Running Containers 19 | * Kubernetes is meant for creating, deploying and managing distributed applications in containers 20 | * Applications are generally composed of a language runtime, libraries and the code 21 | * The traditional method of running multiple programs on the same server OS can run into trouble with conflicting dependencies 22 | * As the previous chapter noted, immutability is a big advantage in solving this problem 23 | * Docker helps with building, packaging and sharing images 24 | * Docker is the most common image format; another is OCI 25 | 26 | ### 2.1 Container Image 27 | * It is a binary package of a container technology (like Docker) that encapsulates all files necessary to run a program in an OS 28 | * This image can be built locally or pulled from a container registry (like Docker Hub) 29 | * Container images are constructed with a series of filesystem layers, where each layer inherits and modifies the layers that came before it 30 | * One container image can be based on another image, and so on 31 | * 2 types of containers: 32 | * System Containers -> try to mimic a full system, as a VM does (no longer used that much) 33 | * Application Containers -> run a single program, offering the right granularity of isolation and easy scalability 34 | 35 | ### 2.2 Building Images with Docker 36 | We will use the application container approach to build an image. Install Docker first. 37 | 38 | #### 2.2.1 Dockerfiles 39 | * Build the directory structure as in folder 2 40 | * navigate via the command line to folder 2 41 | * run 42 | * docker build -t simple-node .
43 | * docker run --rm -p 3000:3000 simple-node -------------------------------------------------------------------------------- /Kubernetes/img/kubernetes-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Kubernetes/img/kubernetes-architecture.png -------------------------------------------------------------------------------- /Kubernetes/k8s_minikube.md: -------------------------------------------------------------------------------- 1 | # K8s on Minikube - fast lab setup 2 | video: https://www.youtube.com/watch?v=X48VuDVv0do 3 | repo from video: https://gitlab.com/nanuchi/youtube-tutorial-series/-/blob/master/basic-kubectl-commands/cli-commands.md 4 | 5 | ## 1. Set up minikube + virtualbox 6 | This is an alternative way of running Docker without using Docker Desktop, which is no longer free for business use. 7 | For study purposes, you can still use Docker Desktop. 8 | Thanks to my workmate Sergei, who introduced me to the approach below. 9 | * Install Docker 10 | ``` 11 | brew install docker 12 | brew install docker-compose 13 | ``` 14 | * Install [VirtualBox](https://minikube.sigs.k8s.io/docs/drivers/virtualbox/) 15 | 16 | For Mac users, you might need to allow the Oracle virtual machine to run in: System Preferences -> Security 17 | * Install [Minikube](https://minikube.sigs.k8s.io/docs/start/). 18 | * Start the cluster using: 19 | ``` 20 | minikube start --container-runtime=docker --vm=true --driver=virtualbox --memory=2g 21 | ``` 22 | *Hint: when starting the cluster, you can adjust its settings according to your needs. See the options [here](https://minikube.sigs.k8s.io/docs/commands/start/).* 23 | 24 | * Run the following command in your shell to point it to minikube's Docker environment: 25 | ``` 26 | eval $(minikube docker-env) 27 | ``` 28 | * Now you can run Docker images: 29 | ``` 30 | docker run ... 31 | ``` 32 | * If at some moment you want to stop the cluster (e.g. to free unused resources): 33 | ``` 34 | minikube stop 35 | ``` 36 | * Check if minikube is working 37 | ``` 38 | minikube status 39 | ``` 40 | If it is not working, delete it and start again 41 | 42 | ## 2. Kubectl 43 | Now we have a cluster running on minikube, with Kubernetes and kubectl installed.
44 | Use the kubectl CLI to interact with Kubernetes: 45 | ``` 46 | # check nodes 47 | kubectl get nodes 48 | 49 | # check pods 50 | kubectl get pod 51 | 52 | # check services 53 | kubectl get services 54 | 55 | # create a deployment - deployment manages pods 56 | kubectl create deployment nginx-depl --image=nginx 57 | 58 | # See the deployments 59 | kubectl get deployment 60 | 61 | # managed by the deployment; you should not need to manage it directly 62 | kubectl get replicaset 63 | 64 | # edit the deployment file 65 | kubectl edit deployment nginx-depl 66 | ``` 67 | Layers of abstraction: 68 | Deployment manages a Replica Set > Replica Set manages Pods > Pod is an abstraction over containers 69 | 70 | ### 2.1 Debugging an Application 71 | First, let's create a MongoDB deployment to check some logs: 72 | ``` 73 | kubectl create deployment mongo-depl --image=mongo 74 | ``` 75 | 2 useful commands for debugging: 76 | ``` 77 | # get the logs of the pod to check what is happening inside it 78 | kubectl logs {pod-name} 79 | 80 | # exec into the pod to get a shell inside it 81 | kubectl exec -it {pod-name} -- /bin/bash 82 | ``` 83 | 84 | ### 2.2 Delete Deployment 85 | ``` 86 | kubectl delete deployment mongo-depl 87 | ``` 88 | 89 | ### 2.3 Configuration File 90 | So far we deployed using only a few deployment options, but you can specify many configurations in the deployment command, which quickly becomes unwieldy. Instead, k8s lets you configure deployments via YAML config files applied with the apply command. -------------------------------------------------------------------------------- /Kubernetes/readme.md: -------------------------------------------------------------------------------- 1 | ## Kubernetes 2 | Repo containing studies and practice code with Kubernetes 3 | 4 | ### Description 5 | k8s_and_data -> Notes on Big Data deployed on K8S 6 | Kubernetes Up and Running -> Notes on the book with the same name 7 | k8s_on_minikube -> fast test of K8s on minikube -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Engineering (Projects and Notes) 2 | Folder to showcase my study notes and projects in the Data Engineering area. 3 | 4 | ## Databricks 5 | Contains notes and some scripts created. 6 | ### [LAB DATABRICKS FROM A TO Z](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/LAB%20DATABRICKS%20FROM%20A%20TO%20Z%20.md) 7 | ### [SCALA ETL Part 1 - Data Extraction](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/SCALA%20ETL%20Part%201%20-%20Data%20Extraction.md) 8 | ### [SPARK ETL Part 1 - Data Extraction](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/SPARK%20ETL%20Part%201%20-%20Data%20Extraction.md) 9 | ### [SPARK SQL](https://github.com/cassiobolba/Data-Engineering/blob/master/Databricks/SPARK%20SQL.md) 10 | 11 | ## Data-Eng-Track-Bootcamp 12 | This folder holds my studies on the Data Engineering with Python track on DataCamp. 13 | ### [2 - Introduction to Data Egineering](https://github.com/cassiobolba/Data-Engineering/blob/master/Data-Eng-Track-Bootcamp/2%20-%20Introduction%20to%20Data%20Egineering.md) 14 | ### [3 - Data Ingestion with Pandas](https://github.com/cassiobolba/Python/blob/master/Python-Datacamp/3%20-%20Data%20Ingestion%20with%20Pandas.md) 15 | ### [3.5 - Software Eng.
in Python - Clean Code](https://github.com/cassiobolba/Python/blob/master/Python-Datacamp/3.5%20-%20Software%20Eng.%20in%20Python%20-%20Clean%20Code.md) 16 | ### [6 - Introduction to Shell Script.md](https://github.com/cassiobolba/Data-Engineering/blob/master/Data-Eng-Track-Bootcamp/6%20-%20Introduction%20to%20Shell%20Script.md) 17 | ### [17 - Introduction Scala.md](https://github.com/cassiobolba/Data-Engineering/blob/master/Data-Eng-Track-Bootcamp/17%20-%20Introduction%20to%20Scala.md) 18 | 19 | 20 | -------------------------------------------------------------------------------- /Snowflake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Snowflake/.DS_Store -------------------------------------------------------------------------------- /Snowflake/curated-links-list.md: -------------------------------------------------------------------------------- 1 | # Curated List of Smnowflake Links to Study 2 | 3 | ## Youtube Channels 4 | | link | Status| Notes on | 5 | |----------|:-------------:|------:| 6 | | https://www.youtube.com/@DataEngineering/playlists | | | 7 | 8 | 9 | ## General Articles 10 | | link | Status| Notes on | 11 | |----------|:-------------:|------:| 12 | | https://airbyte.com/blog/snowflake-data-cloud | | | 13 | 14 | 15 | ## Snowpark 16 | | link | Status| Notes on | 17 | |----------|:-------------:|------:| 18 | | https://medium.com/snowflake/your-cheatsheet-to-snowflake-snowpark-dataframes-using-python-e5ec8709d5d7| | | 19 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/COPY.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | $1 3 | ,$2 4 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20190916.export.csv ; 5 | 6 | CREATE DATABASE GDELT; 7 | 8 | CREATE SCHEMA EVENTS; 9 | 10 | CREATE TABLE IF NOT EXISTS GDELT.EVENTS.EVENTS_FULL ( 11 | 12 | GLOBALEVENTID INT 13 | 14 | ,SQLDATE varchar 15 | 16 | ,MonthYear varchar 17 | 18 | ,Year varchar 19 | 20 | ,FractionDate varchar 21 | 22 | ,Actor1Code varchar 23 | 24 | ,Actor1Name varchar 25 | 26 | ,Actor1CountryCode varchar 27 | 28 | ,Actor1KnownGroupCode varchar 29 | 30 | ,Actor1EthnicCode varchar 31 | 32 | ,Actor1Religion1Code varchar 33 | 34 | ,Actor1Religion2Code varchar 35 | 36 | ,Actor1Type1Code varchar 37 | 38 | ,Actor1Type2Code varchar 39 | 40 | ,Actor1Type3Code varchar 41 | 42 | ,Actor2Code varchar 43 | 44 | ,Actor2Name varchar 45 | 46 | ,Actor2CountryCode varchar 47 | 48 | ,Actor2KnownGroupCode varchar 49 | 50 | ,Actor2EthnicCode varchar 51 | 52 | ,Actor2Religion1Code varchar 53 | 54 | ,Actor2Religion2Code varchar 55 | 56 | ,Actor2Type1Code varchar 57 | 58 | ,Actor2Type2Code varchar 59 | 60 | ,Actor2Type3Code varchar 61 | 62 | ,IsRootEvent varchar 63 | 64 | ,EventCode varchar 65 | 66 | ,EventBaseCode varchar 67 | 68 | ,EventRootCode varchar 69 | 70 | ,QuadClass varchar 71 | 72 | ,GoldsteinScale varchar 73 | 74 | ,NumMentions varchar 75 | 76 | ,NumSources varchar 77 | 78 | ,NumArticles varchar 79 | 80 | ,AvgTone varchar 81 | 82 | ,Actor1Geo_Type varchar 83 | 84 | ,Actor1Geo_FullName varchar 85 | 86 | ,Actor1Geo_CountryCode varchar 87 | 88 | ,Actor1Geo_ADM1Code varchar 89 | 90 | ,Actor1Geo_Lat varchar 91 | 92 | ,Actor1Geo_Long varchar 93 | 94 | ,Actor1Geo_FeatureID varchar 95 | 96 | ,Actor2Geo_Type varchar 97 | 98 | ,Actor2Geo_FullName varchar 99 | 100 | 
,Actor2Geo_CountryCode varchar 101 | 102 | ,Actor2Geo_ADM1Code varchar 103 | 104 | ,Actor2Geo_Lat varchar 105 | 106 | ,Actor2Geo_Long varchar 107 | 108 | ,Actor2Geo_FeatureID varchar 109 | 110 | ,ActionGeo_Type varchar 111 | 112 | ,ActionGeo_FullName varchar 113 | 114 | ,ActionGeo_CountryCode varchar 115 | 116 | ,ActionGeo_ADM1Code varchar 117 | 118 | ,ActionGeo_Lat varchar 119 | 120 | ,ActionGeo_Long varchar 121 | 122 | ,ActionGeo_FeatureID varchar 123 | 124 | ,DATEADDED varchar 125 | 126 | ,SOURCEURL varchar 127 | 128 | ); 129 | 130 | 131 | COPY INTO GDELT.EVENTS.EVENTS_FULL 132 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events 133 | file_format = ( type = 'csv' field_delimiter = '\t') 134 | pattern = '.*2019091.*' 135 | ; 136 | 137 | select * from GDELT.EVENTS.EVENTS_FULL; 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/FILE FORMAT.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE SCHEMA FILE_FORMATS; 2 | 3 | CREATE OR REPLACE FILE FORMAT MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT 4 | FIELD_DELIMITER = '\t' 5 | TYPE = CSV 6 | --SKIP_HEADER=1 7 | ; 8 | 9 | DESC FILE FORMAT FILE_FORMATS.CSV_TAB_FMT; 10 | 11 | LIST @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/ ; 12 | 13 | COPY INTO GDELT.EVENTS.EVENTS_FULL 14 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20130922.export.csv 15 | file_format = MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT 16 | ; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/FLATTEN.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT 2 | RAW_FILE:id id 3 | ,array_size(RAW_FILE:multiMedia) size 4 | from medias.youtube.statistics_raw ; 5 | 6 | SELECT distinct 7 | RAW_FILE:id::int id 8 | ,RAW_FILE:createdAt createdAt 9 | ,RAW_FILE:description::string description 10 | ,RAW_FILE:likeDislike.dislikes dislikes 11 | ,RAW_FILE:likeDislike.likes likes 12 | ,RAW_FILE:likeDislike.userAction user_action 13 | ,f.value:id multimedia_id 14 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 15 | table(flatten(RAW_FILE:multiMedia)) f 16 | --where RAW_FILE:id::int = 2114 17 | 18 | 19 | ; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/INSERT.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE OR REPLACE TABLE MEDIAS.YOUTUBE.STATISTICS AS 3 | SELECT distinct 4 | RAW_FILE:id::int id 5 | ,TO_TIMESTAMP(RAW_FILE:createdAt) createdAt 6 | ,RAW_FILE:description::string description 7 | ,RAW_FILE:likeDislike.dislikes::INT dislikes 8 | ,RAW_FILE:likeDislike.likes::INT likes 9 | ,RAW_FILE:likeDislike.userAction::INT user_action 10 | ,f.value:id::INT multimedia_id 11 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 12 | table(flatten(RAW_FILE:multiMedia)) f 13 | ; 14 | 15 | SELECT COUNT(*) FROM MEDIAS.YOUTUBE.STATISTICS; 16 | 17 | INSERT INTO MEDIAS.YOUTUBE.STATISTICS 18 | SELECT distinct 19 | RAW_FILE:id::int id 20 | ,TO_TIMESTAMP(RAW_FILE:createdAt) createdAt 21 | ,RAW_FILE:description::string description 22 | ,RAW_FILE:likeDislike.dislikes::INT dislikes 23 | ,RAW_FILE:likeDislike.likes::INT likes 24 | ,RAW_FILE:likeDislike.userAction::INT user_action 25 | ,f.value:id::INT multimedia_id 26 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 27 | table(flatten(RAW_FILE:multiMedia)) f ; 
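Editor's note: rerunning the INSERT above reloads the same parsed rows, so MEDIAS.YOUTUBE.STATISTICS can accumulate duplicates. A minimal sketch of one way to deduplicate after the fact, assuming (id, multimedia_id) is the logical key of the table (an assumption, not something the course scripts state):

```sql
-- Hedged sketch: keep one row per (id, multimedia_id) pair.
-- Assumes these two columns form the logical key of STATISTICS.
CREATE OR REPLACE TABLE MEDIAS.YOUTUBE.STATISTICS AS
SELECT *
FROM MEDIAS.YOUTUBE.STATISTICS
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY id, multimedia_id
    ORDER BY createdAt DESC
) = 1;
```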
-------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/LOAD JSON.sql: -------------------------------------------------------------------------------- 1 | -- CRIAR STAGE > CARREGAR DADOS BRUTOS > ANALISE E PARSE > CRIAR O COMANDO COPY 2 | 3 | CREATE OR REPLACE STAGE MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 4 | URL= 's3://snowflake-series/' 5 | STORAGE_INTEGRATION = S3_INT; 6 | 7 | LIST @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES; 8 | 9 | CREATE OR REPLACE FILE FORMAT MANAGE_DB.FILE_FORMATS.JSON_FMT 10 | TYPE = JSON; 11 | 12 | CREATE DATABASE MEDIAS; 13 | 14 | CREATE SCHEMA MEDIAS.YOUTUBE; 15 | 16 | CREATE OR REPLACE table MEDIAS.YOUTUBE.STATISTICS_RAW ( 17 | raw_file variant); 18 | 19 | COPY INTO MEDIAS.YOUTUBE.STATISTICS_RAW 20 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 21 | file_format= MANAGE_DB.FILE_FORMATS.JSON_FMT 22 | files = ('youtube_data.json'); 23 | 24 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW ; 25 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/PARSE JSON.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 2 | 3 | SELECT raw_file:createdAt 4 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 5 | 6 | SELECT distinct 7 | $1:id::int id 8 | ,$1:createdAt createdAt 9 | ,$1:description::string description 10 | ,$1:likeDislike.dislikes dislikes 11 | ,$1:likeDislike.likes likes 12 | ,$1:likeDislike.userAction user_action 13 | ,RAW_FILE:multiMedia[0].name name 14 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW 15 | where RAW_FILE:id::int = 2134 16 | union all 17 | SELECT distinct 18 | $1:id::int id 19 | ,$1:createdAt createdAt 20 | ,$1:description::string description 21 | ,$1:likeDislike.dislikes dislikes 22 | ,$1:likeDislike.likes likes 23 | ,$1:likeDislike.userAction user_action 24 | ,RAW_FILE:multiMedia[1].name name 25 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW 26 | where RAW_FILE:id::int = 2134 27 | 28 | 29 | ; 30 | 31 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/STAGES.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE DATABASE MANAGE_DB; 2 | 3 | USE DATABASE MANAGE_DB; 4 | CREATE SCHEMA EXTERNAL_STAGES; 5 | 6 | USE SCHEMA EXTERNAL_STAGES; 7 | 8 | CREATE OR REPLACE STAGE GDELT_EVENTS 9 | URL = 's3://gdelt-open-data/' ; 10 | 11 | LIST @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS ; 12 | 13 | DESC STAGE MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS ; 14 | 15 | ALTER STAGE MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS 16 | SET CREDENTIALS=(aws_key_id='cab' aws_secret_key='123') -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/STORAGE INTEGRATION.sql: -------------------------------------------------------------------------------- 1 | --arn:aws:iam::749634257170:role/series-snowflake 2 | 3 | CREATE SCHEMA MANAGE_DB.STORAGE_INTERGRATION; 4 | 5 | USE DATABASE MANAGE_DB; 6 | USE SCHEMA STORAGE_INTERGRATION; 7 | 8 | 9 | ; 10 | create or replace storage integration S3_INT 11 | TYPE = EXTERNAL_STAGE 12 | STORAGE_PROVIDER = S3 13 | ENABLED = TRUE 14 | STORAGE_AWS_ROLE_ARN = 'arn:aws:iam::749634257170:role/series-snowflake' 15 | STORAGE_ALLOWED_LOCATIONS = ('s3://snowflake-series/') 16 | COMMENT = 'My first integration' 17 | ; 18 | 19 | DESC INTEGRATION S3_INT; 20 | 21 | 22 | 23 | 
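Editor's note on the DESC INTEGRATION step above: for an S3 storage integration, the output includes STORAGE_AWS_IAM_USER_ARN and STORAGE_AWS_EXTERNAL_ID, which have to be copied into the trust policy of the AWS IAM role before any stage built on the integration can read the bucket. A small hedged sketch of the follow-up steps; the role name LOADER is hypothetical and not part of the course scripts:

```sql
-- Values needed on the AWS side (trust policy of the IAM role):
DESC INTEGRATION S3_INT;  -- note STORAGE_AWS_IAM_USER_ARN and STORAGE_AWS_EXTERNAL_ID

-- Hedged sketch: let a non-ACCOUNTADMIN role build stages on top of the integration.
-- LOADER is a hypothetical role name.
GRANT USAGE ON INTEGRATION S3_INT TO ROLE LOADER;
```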
-------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/TRANSFORMATION WITH COPY.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE GDELT.EVENTS.EVENTS_URL AS 2 | SELECT 3 | $1::INT GLOBALEVENTID 4 | ,TO_DATE($2,'YYYYMMDD') AS SQLDATE 5 | ,$3 MONTHYEAR 6 | ,$58 SOURCEURL 7 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20190916.export.csv 8 | (file_format => MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT) 9 | ; 10 | 11 | COPY INTO GDELT.EVENTS.EVENTS_URL FROM ( 12 | SELECT 13 | $1::INT GLOBALEVENTID 14 | ,TO_DATE($2,'YYYYMMDD') AS SQLDATE 15 | ,$3 MONTHYEAR 16 | ,$58 SOURCEURL 17 | FROM @MANAGE_DB.EXTERNAL_STAGES.GDELT_EVENTS/events/20190917.export.csv 18 | (file_format => MANAGE_DB.FILE_FORMATS.CSV_TAB_FMT) 19 | ); 20 | 21 | -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/automacao/SNOWPIPE.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA MANAGE_DB.PIPES; 2 | 3 | CREATE PIPE MANAGE_DB.PIPES.JSON_PIPE 4 | AUTO_INGEST = TRUE 5 | AS 6 | COPY INTO MEDIAS.YOUTUBE.STATISTICS_RAW 7 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 8 | file_format= MANAGE_DB.FILE_FORMATS.JSON_FMT; 9 | 10 | DESC PIPE JSON_PIPE; 11 | 12 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 13 | 14 | TRUNCATE TABLE MEDIAS.YOUTUBE.STATISTICS_RAW; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/automacao/TASK TREE.sql: -------------------------------------------------------------------------------- 1 | SHOW TASKS; 2 | 3 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 4 | TRUNCATE TABLE MEDIAS.YOUTUBE.STATISTICS_RAW; 5 | 6 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS; 7 | 8 | CREATE OR REPLACE TASK LOAD_STATISTICS 9 | WAREHOUSE = COMPUTE_WH 10 | COMMENT = 'SEGUNDA TAREFA' 11 | AFTER LOAD_RAW 12 | AS 13 | INSERT INTO MEDIAS.YOUTUBE.STATISTICS 14 | SELECT distinct 15 | RAW_FILE:id::int id 16 | ,TO_TIMESTAMP(RAW_FILE:createdAt) createdAt 17 | ,RAW_FILE:description::string description 18 | ,RAW_FILE:likeDislike.dislikes::INT dislikes 19 | ,RAW_FILE:likeDislike.likes::INT likes 20 | ,RAW_FILE:likeDislike.userAction::INT user_action 21 | ,f.value:id::INT multimedia_id 22 | FROM MEDIAS.YOUTUBE.STATISTICS_RAW , 23 | table(flatten(RAW_FILE:multiMedia)) f ; 24 | 25 | SHOW TASKS; 26 | 27 | ALTER TASK LOAD_RAW RESUME; 28 | ALTER TASK LOAD_STATISTICS RESUME; 29 | 30 | 31 | 32 | select * 33 | from table(information_schema.task_history()) 34 | order by scheduled_time desc ; -------------------------------------------------------------------------------- /Snowflake/the-snowflake-series-course/automacao/TASK.SQL: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE SCHEMA MANAGE_DB.TASKS; 2 | 3 | CREATE OR REPLACE TASK LOAD_RAW 4 | WAREHOUSE = COMPUTE_WH 5 | SCHEDULE = '1 MINUTE' 6 | COMMENT = 'HAHA' 7 | AS 8 | COPY INTO MEDIAS.YOUTUBE.STATISTICS_RAW 9 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_SERIES 10 | file_format= MANAGE_DB.FILE_FORMATS.JSON_FMT 11 | ; 12 | 13 | SHOW TASKS; 14 | 15 | SELECT * FROM MEDIAS.YOUTUBE.STATISTICS_RAW; 16 | 17 | TRUNCATE TABLE MEDIAS.YOUTUBE.STATISTICS_RAW; 18 | 19 | ALTER TASK LOAD_RAW RESUME; 20 | 21 | ALTER TASK LOAD_RAW SUSPEND; 22 | -------------------------------------------------------------------------------- 
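Editor's note complementing the interval schedule used in TASK.SQL above: Snowflake tasks also accept a CRON expression, and a task can be triggered manually on demand. A brief sketch; the 07:00 UTC schedule is only an illustrative choice:

```sql
-- Same load task, but on a CRON schedule instead of a fixed interval (example time).
-- A task must be suspended before its definition can be altered.
ALTER TASK LOAD_RAW SUSPEND;
ALTER TASK LOAD_RAW SET SCHEDULE = 'USING CRON 0 7 * * * UTC';
ALTER TASK LOAD_RAW RESUME;

-- Run it once on demand without waiting for the schedule.
EXECUTE TASK LOAD_RAW;
```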
/Snowflake/the-snowflake-series-course/test.sql: -------------------------------------------------------------------------------- 1 | select 2 | * 3 | from 4 | customer 5 | limit 6 | 100 7 | ; -------------------------------------------------------------------------------- /Snowflake/ws/extras/extras.sql: -------------------------------------------------------------------------------- 1 | -- Time Travel 2 | --Use-case: Update data (by mistake) 3 | UPDATE OUR_FIRST_DB.public.test 4 | SET FIRST_NAME = 'Joyen' ; 5 | 6 | -- see all is wrong 7 | SELECT * FROM OUR_FIRST_DB.public.test; 8 | 9 | --Using time travel: Method 1 - 1.5 minutes back (offset in seconds) 10 | SELECT * FROM OUR_FIRST_DB.public.test at (OFFSET => -60*1.5); 11 | 12 | -- see all is back to normal 13 | SELECT * FROM OUR_FIRST_DB.public.test; 14 | 15 | 16 | DROP DATABASE ; 17 | 18 | UNDROP DATABASE ; 19 | 20 | 21 | -- Zero-Copy Clone 22 | CREATE DATABASE MEDIAS_DEV CLONE MEDIAS; 23 | -- BEFORE (TIMESTAMP => 1231516) 24 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/1-storage-integration.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE DATABASE MANAGE_DB ; 2 | 3 | CREATE SCHEMA IF NOT EXISTS MANAGE_DB.STORAGE_INTERGRATION; 4 | 5 | USE DATABASE MANAGE_DB; 6 | USE SCHEMA STORAGE_INTERGRATION; 7 | 8 | 9 | CREATE STORAGE INTEGRATION AWS_S3_INT 10 | TYPE = EXTERNAL_STAGE 11 | STORAGE_PROVIDER = S3 12 | ENABLED = TRUE 13 | STORAGE_AWS_ROLE_ARN = 'arn:aws:iam::749634257170:role/snowflake-ws' 14 | STORAGE_ALLOWED_LOCATIONS = ('s3://snowflake-ws/') 15 | --STORAGE_BLOCKED_LOCATIONS = ('s3://mybucket3/path3/', 's3://mybucket4/path4/'); 16 | COMMENT = 'Integration to AWS s3' ; 17 | 18 | -- AZURE EXAMPLE 19 | -- CREATE STORAGE INTEGRATION AZURE_BLOB_INT 20 | -- TYPE = EXTERNAL_STAGE 21 | -- STORAGE_PROVIDER = 'AZURE' 22 | -- ENABLED = TRUE 23 | -- AZURE_TENANT_ID = '' 24 | -- STORAGE_ALLOWED_LOCATIONS = ('*') 25 | -- STORAGE_BLOCKED_LOCATIONS = ('azure://myaccount.blob.core.windows.net/mycontainer/path3/'); 26 | 27 | SHOW INTEGRATIONS ; 28 | 29 | DESC INTEGRATION AWS_S3_INT; 30 | 31 | 32 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/2-stage-and-file-format.sql: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------- 2 | STAGE 3 | CREATE STAGE > LOAD RAW DATA > ANALYZE AND PARSE > CREATE THE COPY COMMAND 4 | -------------------------------------------------------------*/ 5 | 6 | USE DATABASE MANAGE_DB; 7 | 8 | CREATE SCHEMA IF NOT EXISTS EXTERNAL_STAGES; 9 | 10 | USE SCHEMA EXTERNAL_STAGES; 11 | 12 | CREATE OR REPLACE STAGE MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 13 | URL= 's3://snowflake-ws/' 14 | --FILE_FORMAT = ( TYPE = CSV | JSON | AVRO | ORC | PARQUET | XML ) 15 | --CREDENTIALS = ( AWS_KEY_ID = '' AWS_SECRET_KEY = '' ) 16 | --ENCRYPTION = ( TYPE = 'AWS_CSE' MASTER_KEY = '' ) 17 | --COPY_OPTIONS = ( ) 18 | --TAG = '' 19 | STORAGE_INTEGRATION = AWS_S3_INT; 20 | 21 | LIST @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP; 22 | 23 | DESC STAGE SNOWFLAKE_WORKSHOP; 24 | 25 | SHOW STAGES LIKE '%WORKSHOP%'; 26 | 27 | 28 | /*------------------------------------------------------------- 29 | FILE FORMAT 30 | -------------------------------------------------------------*/ 31 | 32 | CREATE SCHEMA FILE_FORMATS; 33 | USE SCHEMA FILE_FORMATS; 34 | 35 | CREATE OR REPLACE FILE FORMAT
MANAGE_DB.FILE_FORMATS.JSON_FMT 36 | TYPE = JSON 37 | -- COMPRESSION = AUTO | GZIP | BZ2 | BROTLI | ZSTD | DEFLATE | RAW_DEFLATE | NONE 38 | -- DATE_FORMAT = '' | AUTO 39 | -- TIME_FORMAT = '' | AUTO 40 | -- TIMESTAMP_FORMAT = '' | AUTO 41 | -- BINARY_FORMAT = HEX | BASE64 | UTF8 42 | TRIM_SPACE = TRUE 43 | -- NULL_IF = ( '' ) 44 | -- FILE_EXTENSION = '' 45 | -- ENABLE_OCTAL = TRUE | FALSE 46 | ALLOW_DUPLICATE = TRUE 47 | -- STRIP_OUTER_ARRAY = TRUE | FALSE 48 | -- STRIP_NULL_VALUES = TRUE | FALSE 49 | REPLACE_INVALID_CHARACTERS = TRUE 50 | -- IGNORE_UTF8_ERRORS = TRUE | FALSE 51 | -- SKIP_BYTE_ORDER_MARK = TRUE | FALSE 52 | 53 | -- TYPE = PARQUET 54 | -- COMPRESSION = AUTO | LZO | SNAPPY | NONE 55 | -- SNAPPY_COMPRESSION = TRUE | FALSE 56 | -- BINARY_AS_TEXT = TRUE | FALSE 57 | -- TRIM_SPACE = TRUE | FALSE 58 | -- NULL_IF = ( '' ) 59 | ; 60 | 61 | SHOW FILE FORMATS; 62 | 63 | DESC FILE FORMAT JSON_FMT; 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/3-copy-command.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 2 | (file_format => MANAGE_DB.FILE_FORMATS.JSON_FMT); 3 | 4 | CREATE DATABASE MEDIAS; 5 | 6 | CREATE SCHEMA MEDIAS.YOUTUBE; 7 | 8 | CREATE OR REPLACE TABLE MEDIAS.YOUTUBE.RAW ( 9 | RAW_FILE VARIANT); 10 | 11 | COPY INTO MEDIAS.YOUTUBE.RAW -- ( RAW_FILE ) 12 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 13 | FILE_FORMAT= MANAGE_DB.FILE_FORMATS.JSON_FMT 14 | -- PATTERN = '.*error.json' 15 | -- FILES = ('youtube_data.json') 16 | ON_ERROR = SKIP_FILE --CONTINUE | SKIP_FILE | SKIP_FILE_ | 'SKIP_FILE_%' | ABORT_STATEMENT 17 | -- SIZE_LIMIT = 18 | -- PURGE = TRUE 19 | -- RETURN_FAILED_ONLY = TRUE | FALSE 20 | -- MATCH_BY_COLUMN_NAME = CASE_SENSITIVE | CASE_INSENSITIVE | NONE 21 | -- ENFORCE_LENGTH = TRUE | FALSE 22 | -- TRUNCATECOLUMNS = TRUE | FALSE 23 | -- FORCE = TRUE 24 | ; 25 | 26 | SELECT * FROM MEDIAS.YOUTUBE.RAW ; 27 | 28 | TRUNCATE TABLE MEDIAS.YOUTUBE.RAW ; 29 | 30 | 31 | -------------------------------------------------------------------------------- /Snowflake/ws/ingestion/4-snowpipe.sql: -------------------------------------------------------------------------------- 1 | USE DATABASE MANAGE_DB; 2 | 3 | CREATE OR REPLACE SCHEMA MANAGE_DB.PIPES; 4 | 5 | USE SCHEMA PIPES; 6 | 7 | CREATE OR REPLACE PIPE MANAGE_DB.PIPES.YOUTUBE_RAW 8 | AUTO_INGEST = TRUE 9 | -- ERROR_INTEGRATION = -- Required only when configuring Snowpipe to send error notifications to a cloud messaging service. 10 | -- AWS_SNS_TOPIC = '' 11 | -- INTEGRATION = '' -- Required only when configuring AUTO_INGEST for Google Cloud Storage or Microsoft Azure stages. 
12 | COMMENT = 'Pipe to autoingest youtube data from S3' 13 | AS 14 | COPY INTO MEDIAS.YOUTUBE.RAW 15 | FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 16 | FILE_FORMAT= MANAGE_DB.FILE_FORMATS.JSON_FMT 17 | ON_ERROR = SKIP_FILE 18 | --FORCE = TRUE 19 | ; 20 | 21 | DESC PIPE YOUTUBE_RAW; 22 | 23 | SELECT * FROM MEDIAS.YOUTUBE.RAW ; 24 | 25 | 26 | /*------------------------------------------------------------- 27 | ERROR HANDLING 28 | -------------------------------------------------------------*/ 29 | 30 | -- Validate pipe is actually working 31 | SELECT SYSTEM$PIPE_STATUS('YOUTUBE_RAW'); 32 | 33 | -- Snowpipe error message 34 | -- sometime can give some general error message 35 | SELECT * FROM TABLE(VALIDATE_PIPE_LOAD( 36 | PIPE_NAME => 'MANAGE_DB.PIPES.YOUTUBE_RAW', 37 | START_TIME => DATEADD(YEAR,-2,CURRENT_TIMESTAMP()) 38 | )); 39 | 40 | -- COPY command history from table to see error massage 41 | -- here we have more details to understand the error 42 | SELECT * FROM TABLE (INFORMATION_SCHEMA.COPY_HISTORY( 43 | TABLE_NAME => 'MEDIAS.YOUTUBE.RAW', 44 | START_TIME => DATEADD(YEAR,-2,CURRENT_TIMESTAMP()))) 45 | -- END_TIME => 46 | ; 47 | 48 | select * 49 | from table(information_schema.pipe_usage_history( 50 | date_range_start=> DATEADD(YEAR,-2,CURRENT_TIMESTAMP()), 51 | pipe_name=>'MANAGE_DB.PIPES.YOUTUBE_RAW')); --14 dias 52 | 53 | -- Pause pipes 54 | ALTER PIPE MANAGE_DB.PIPES.YOUTUBE_RAW SET PIPE_EXECUTION_PAUSED = false; 55 | 56 | ALTER PIPE MANAGE_DB.PIPES.YOUTUBE_RAW REFRESH; --if pipe was created after file ingestion 57 | 58 | select * from MEDIAS.YOUTUBE.RAW; 59 | /*------------------------------------------------------------- 60 | MANAGING PIPES 61 | -------------------------------------------------------------*/ 62 | 63 | -- Manage pipes -- 64 | DESC PIPE MANAGE_DB.PIPES.YOUTUBE_RAW; 65 | 66 | SHOW PIPES; 67 | 68 | SHOW PIPES LIKE '%YOUTUBE%'; 69 | 70 | SHOW PIPES IN DATABASE MANAGE_DB; 71 | 72 | SHOW PIPES IN SCHEMA MANAGE_DB.PIPES; 73 | 74 | SHOW PIPES LIKE '%YOUTUBE%' IN DATABASE MANAGE_DB; -------------------------------------------------------------------------------- /Snowflake/ws/transformation/3-duplicates.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM MEDIAS.YOUTUBE.EVENTS; 2 | 3 | SELECT * FROM MEDIAS.YOUTUBE.RAW; 4 | 5 | MERGE INTO MEDIAS.YOUTUBE.EVENTS tgt 6 | USING ( 7 | SELECT distinct 8 | RAW_FILE:id::int id 9 | ,TO_TIMESTAMP(RAW_FILE:createdAt) created_at 10 | ,RAW_FILE:description::string description 11 | ,RAW_FILE:likeDislike.dislikes::int dislikes 12 | ,RAW_FILE:likeDislike.likes::int likes 13 | ,RAW_FILE:likeDislike.userAction::int user_action 14 | ,RAW_FILE:commentCount::int comment_count 15 | ,RAW_FILE:feedsComment::string feeds_comment 16 | ,RAW_FILE:location::string location 17 | ,RAW_FILE:mediatype::int media_type 18 | ,RAW_FILE:name::string name 19 | ,RAW_FILE:profilePicture::string profile_picture 20 | ,RAW_FILE:title::string title 21 | ,RAW_FILE:userId::int user_id 22 | FROM MEDIAS.YOUTUBE.RAW 23 | ) src 24 | ON src.id = tgt.id 25 | WHEN NOT MATCHED THEN 26 | INSERT 27 | ( 28 | id 29 | ,created_at 30 | ,description 31 | ,dislikes 32 | ,likes 33 | ,user_action 34 | ,comment_count 35 | ,feeds_comment 36 | ,location 37 | ,media_type 38 | ,name 39 | ,profile_picture 40 | ,title 41 | ,user_id 42 | ) 43 | VALUES 44 | ( 45 | src.id 46 | ,src.created_at 47 | ,src.description 48 | ,src.dislikes 49 | ,src.likes 50 | ,src.user_action 51 | ,src.comment_count 52 | ,src.feeds_comment 53 | ,src.location 54 
| ,src.media_type 55 | ,src.name 56 | ,src.profile_picture 57 | ,src.title 58 | ,src.user_id 59 | ); 60 | -------------------------------------------------------------------------------- /Snowflake/ws/transformation/5-streams+tasks.sql: -------------------------------------------------------------------------------- 1 | ------- Automatate the updates using tasks -- 2 | CREATE OR REPLACE TASK all_data_changes 3 | WAREHOUSE = COMPUTE_WH 4 | SCHEDULE = '1 MINUTE' 5 | WHEN SYSTEM$STREAM_HAS_DATA('CARS_STREAM') -- condition to only run when stream has data 6 | AS 7 | MERGE INTO CARS_STOCK_CURATED F -- Target table to merge changes from source table 8 | USING ( SELECT STRE.* 9 | ,ST.location 10 | ,ST.employees 11 | FROM CARS_STREAM STRE 12 | JOIN VENDORS ST 13 | ON STRE.vendor_id = ST.vendor_id 14 | ) S 15 | ON F.id=S.id 16 | WHEN MATCHED -- DELETE condition 17 | AND S.METADATA$ACTION ='DELETE' 18 | AND S.METADATA$ISUPDATE = 'FALSE' 19 | THEN DELETE 20 | WHEN MATCHED -- UPDATE condition 21 | AND S.METADATA$ACTION ='INSERT' 22 | AND S.METADATA$ISUPDATE = 'TRUE' 23 | THEN UPDATE 24 | SET f.car_model = s.car_model, 25 | f.price = s.price, 26 | f.in_stock= s.in_stock, 27 | f.vendor_id=s.vendor_id 28 | WHEN NOT MATCHED 29 | AND S.METADATA$ACTION ='INSERT' 30 | THEN INSERT 31 | ( 32 | id 33 | ,car_model 34 | ,price 35 | ,vendor_id 36 | ,in_stock 37 | ,employees 38 | ,location 39 | ) 40 | values 41 | ( 42 | s.id 43 | ,s.car_model 44 | ,s.price 45 | ,s.vendor_id 46 | ,s.in_stock 47 | ,s.employees 48 | ,s.location 49 | ); 50 | 51 | -- resume the task because they are by default not started 52 | ALTER TASK all_data_changes RESUME; 53 | ALTER TASK all_data_changes SUSPEND; 54 | 55 | -- check if task is created 56 | SHOW TASKS; 57 | 58 | 59 | -- Change data to test 60 | INSERT INTO CARS_STG VALUES (11,'rural',50,1,2); 61 | 62 | DELETE FROM CARS_STG 63 | WHERE car_model = 'mercedes'; 64 | 65 | 66 | -- Verify results 67 | -- stage should be changed 68 | SELECT * FROM CARS_STG; 69 | -- stream shoudl have the data changed (if the task did not run yet) 70 | SELECT * FROM CARS_STREAM; 71 | -- after task run (1min) final table should have the new updates 72 | SELECT * FROM CARS_STOCK_CURATED; 73 | 74 | 75 | -- Verify the history 76 | select * 77 | from table(information_schema.task_history()) 78 | order by name asc,scheduled_time desc; 79 | 80 | show tasks; -------------------------------------------------------------------------------- /Snowflake/ws/transformation/6-refactoring.sql: -------------------------------------------------------------------------------- 1 | COPY INTO MEDIAS.YOUTUBE.RAW 2 | FROM 3 | ( 4 | 5 | 6 | 7 | ) 8 | 9 | @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP 10 | FILE_FORMAT= MANAGE_DB.FILE_FORMATS.JSON_FMT 11 | -- PATTERN = '.*error.json' 12 | -- FILES = ('youtube_data.json') 13 | ON_ERROR = SKIP_FILE --CONTINUE | SKIP_FILE | SKIP_FILE_ | 'SKIP_FILE_%' | ABORT_STATEMENT 14 | -- SIZE_LIMIT = 15 | -- PURGE = TRUE 16 | -- RETURN_FAILED_ONLY = TRUE | FALSE 17 | -- MATCH_BY_COLUMN_NAME = CASE_SENSITIVE | CASE_INSENSITIVE | NONE 18 | -- ENFORCE_LENGTH = TRUE | FALSE 19 | -- TRUNCATECOLUMNS = TRUE | FALSE 20 | -- FORCE = TRUE 21 | 22 | 23 | COPY INTO OUR_FIRST_DB.PUBLIC.ORDERS_EX 24 | FROM (select 25 | s.$1, 26 | s.$2, 27 | s.$3, 28 | CASE WHEN CAST(s.$3 as int) < 0 THEN 'not profitable' ELSE 'profitable' END 29 | from @MANAGE_DB.external_stages.aws_stage s) 30 | file_format= (type = csv field_delimiter=',' skip_header=1) 31 | files=('OrderDetails.csv'); 
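Editor's note: the first COPY INTO in 6-refactoring.sql above is left with an empty subquery as a refactoring placeholder. A minimal hedged completion, assuming the intent is the same transformation-style COPY shown in the CSV example that follows it, applied to the raw JSON stage (the column choice of the whole document into the single VARIANT column is an assumption):

```sql
-- Hedged sketch: transformation-style COPY for the JSON stage,
-- loading each document as one VARIANT value into MEDIAS.YOUTUBE.RAW.
COPY INTO MEDIAS.YOUTUBE.RAW
FROM (
    SELECT s.$1
    FROM @MANAGE_DB.EXTERNAL_STAGES.SNOWFLAKE_WORKSHOP s
)
FILE_FORMAT = MANAGE_DB.FILE_FORMATS.JSON_FMT
ON_ERROR = SKIP_FILE;
```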
-------------------------------------------------------------------------------- /Terraform/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/.DS_Store -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/06-organization-and-modules/.DS_Store -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/README.md: -------------------------------------------------------------------------------- 1 | ## Modifications 2 | - remove backend definition 3 | - remove provider definition 4 | 5 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/consul/README.md: -------------------------------------------------------------------------------- 1 | Uses a module from the terraform registry: 2 | 3 | https://github.com/hashicorp/terraform-aws-consul -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/consul/main.tf: -------------------------------------------------------------------------------- 1 | ## THIS SHOWS HOW TO IMPORT AND EXTENAL MODULE FROM TERRAFORM REGISTRY 2 | 3 | terraform { 4 | # Assumes s3 bucket and dynamo DB table already set up 5 | # See /code/03-basics/aws-backend 6 | backend "s3" { 7 | bucket = "devops-directive-tf-state" 8 | key = "06-organization-and-modules/consul/terraform.tfstate" 9 | region = "us-east-1" 10 | dynamodb_table = "terraform-state-locking" 11 | encrypt = true 12 | } 13 | 14 | required_providers { 15 | aws = { 16 | source = "hashicorp/aws" 17 | version = "~> 3.0" 18 | } 19 | } 20 | } 21 | 22 | provider "aws" { 23 | region = "us-east-1" 24 | } 25 | 26 | ############################################################ 27 | ## 28 | ## NOTE: if you are deploying this in your production setup 29 | ## follow the instructions in the github repo on how to modify 30 | ## deploying with the defaults here as an example of the power 31 | ## of modules. 
32 | ## 33 | ## REPO: https://github.com/hashicorp/terraform-aws-consul 34 | ## 35 | ############################################################ 36 | module "consul" { 37 | source = "git@github.com:hashicorp/terraform-aws-consul.git" 38 | } 39 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/compute.tf: -------------------------------------------------------------------------------- 1 | resource "aws_instance" "instance_1" { 2 | ami = var.ami 3 | instance_type = var.instance_type 4 | security_groups = [aws_security_group.instances.name] 5 | user_data = <<-EOF 6 | #!/bin/bash 7 | echo "Hello, World 1" > index.html 8 | python3 -m http.server 8080 & 9 | EOF 10 | } 11 | 12 | resource "aws_instance" "instance_2" { 13 | ami = var.ami 14 | instance_type = var.instance_type 15 | security_groups = [aws_security_group.instances.name] 16 | user_data = <<-EOF 17 | #!/bin/bash 18 | echo "Hello, World 2" > index.html 19 | python3 -m http.server 8080 & 20 | EOF 21 | } 22 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/database.tf: -------------------------------------------------------------------------------- 1 | resource "aws_db_instance" "db_instance" { 2 | allocated_storage = 20 3 | storage_type = "standard" 4 | engine = "postgres" 5 | engine_version = "12.5" 6 | instance_class = "db.t2.micro" 7 | name = var.db_name 8 | username = var.db_user 9 | password = var.db_pass 10 | skip_final_snapshot = true 11 | } 12 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/dns.tf: -------------------------------------------------------------------------------- 1 | resource "aws_route53_zone" "primary" { 2 | count = var.create_dns_zone ? 1 : 0 3 | name = var.domain 4 | } 5 | 6 | data "aws_route53_zone" "primary" { 7 | count = var.create_dns_zone ? 0 : 1 8 | name = var.domain 9 | } 10 | 11 | locals { 12 | dns_zone_id = var.create_dns_zone ? aws_route53_zone.primary[0].zone_id : data.aws_route53_zone.primary[0].zone_id 13 | subdomain = var.environment_name == "production" ? "" : "${var.environment_name}." 
14 | } 15 | 16 | resource "aws_route53_record" "root" { 17 | zone_id = local.dns_zone_id 18 | name = "${local.subdomain}${var.domain}" 19 | type = "A" 20 | 21 | alias { 22 | name = aws_lb.load_balancer.dns_name 23 | zone_id = aws_lb.load_balancer.zone_id 24 | evaluate_target_health = true 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.0" 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/outputs.tf: -------------------------------------------------------------------------------- 1 | output "instance_1_ip_addr" { 2 | value = aws_instance.instance_1.public_ip 3 | } 4 | 5 | output "instance_2_ip_addr" { 6 | value = aws_instance.instance_2.public_ip 7 | } 8 | 9 | output "db_instance_addr" { 10 | value = aws_db_instance.db_instance.address 11 | } 12 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/storage.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "bucket" { 2 | bucket = var.bucket_name 3 | force_destroy = true 4 | versioning { 5 | enabled = true 6 | } 7 | 8 | server_side_encryption_configuration { 9 | rule { 10 | apply_server_side_encryption_by_default { 11 | sse_algorithm = "AES256" 12 | } 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app-module/variables.tf: -------------------------------------------------------------------------------- 1 | # General Variables 2 | 3 | variable "region" { 4 | description = "Default region for provider" 5 | type = string 6 | default = "us-east-1" 7 | } 8 | 9 | variable "app_name" { 10 | description = "Name of the web application" 11 | type = string 12 | default = "web-app" 13 | } 14 | 15 | variable "environment_name" { 16 | description = "Deployment environment (dev/staging/production)" 17 | type = string 18 | default = "dev" 19 | } 20 | 21 | # EC2 Variables 22 | 23 | variable "ami" { 24 | description = "Amazon machine image to use for ec2 instance" 25 | type = string 26 | default = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 27 | } 28 | 29 | variable "instance_type" { 30 | description = "ec2 instance type" 31 | type = string 32 | default = "t2.micro" 33 | } 34 | 35 | # S3 Variables 36 | 37 | variable "bucket_name" { 38 | description = "name of s3 bucket for app data" 39 | type = string 40 | } 41 | 42 | # Route 53 Variables 43 | 44 | variable "create_dns_zone" { 45 | description = "If true, create new route53 zone, if false read existing route53 zone" 46 | type = bool 47 | default = false 48 | } 49 | 50 | variable "domain" { 51 | description = "Domain for website" 52 | type = string 53 | } 54 | 55 | # RDS Variables 56 | 57 | variable "db_name" { 58 | description = "Name of DB" 59 | type = string 60 | } 61 | 62 | variable "db_user" { 63 | description = "Username for DB" 64 | type = string 65 | } 66 | 67 | variable "db_pass" { 68 | description = "Password for DB" 69 | type = string 70 | sensitive = true 71 | } 72 | 73 | 74 | 
-------------------------------------------------------------------------------- /Terraform/06-organization-and-modules/web-app/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | # Assumes s3 bucket and dynamo DB table already set up 3 | # See /code/03-basics/aws-backend 4 | backend "s3" { 5 | bucket = "devops-directive-tf-state" 6 | key = "06-organization-and-modules/web-app/terraform.tfstate" 7 | region = "us-east-1" 8 | dynamodb_table = "terraform-state-locking" 9 | encrypt = true 10 | } 11 | 12 | required_providers { 13 | aws = { 14 | source = "hashicorp/aws" 15 | version = "~> 3.0" 16 | } 17 | } 18 | } 19 | 20 | provider "aws" { 21 | region = "us-east-1" 22 | } 23 | 24 | variable "db_pass_1" { 25 | description = "password for database #1" 26 | type = string 27 | sensitive = true 28 | } 29 | 30 | variable "db_pass_2" { 31 | description = "password for database #2" 32 | type = string 33 | sensitive = true 34 | } 35 | 36 | ## import the module on the root directory 37 | module "web_app_1" { 38 | source = "../web-app-module" 39 | 40 | # Input Variables 41 | bucket_name = "web-app-1-devops-directive-web-app-data" 42 | domain = "devopsdeployed.com" 43 | app_name = "web-app-1" 44 | environment_name = "production" 45 | instance_type = "t2.small" 46 | create_dns_zone = true 47 | db_name = "webapp1db" 48 | db_user = "foo" 49 | db_pass = var.db_pass_1 50 | } 51 | 52 | module "web_app_2" { 53 | source = "../web-app-module" 54 | 55 | # Input Variables 56 | bucket_name = "web-app-2-devops-directive-web-app-data" 57 | domain = "anotherdevopsdeployed.com" 58 | app_name = "web-app-2" 59 | environment_name = "production" 60 | instance_type = "t2.small" 61 | create_dns_zone = true 62 | db_name = "webapp2db" 63 | db_user = "bar" 64 | db_pass = var.db_pass_2 65 | } 66 | -------------------------------------------------------------------------------- /Terraform/2-first-tf-deployment/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.0" 6 | } 7 | } 8 | } 9 | 10 | provider "aws" { 11 | region = "us-east-1" 12 | } 13 | 14 | resource "aws_instance" "example" { 15 | ami = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 16 | instance_type = "t2.micro" 17 | } -------------------------------------------------------------------------------- /Terraform/2-first-tf-deployment/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.1.9", 4 | "serial": 3, 5 | "lineage": "5f6cc133-349c-4042-a6c7-b0d657a26a31", 6 | "outputs": {}, 7 | "resources": [] 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/3-remote-backend/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | ############################################################# 3 | ## AFTER RUNNING TERRAFORM APPLY (WITH LOCAL BACKEND) 4 | ## YOU WILL UNCOMMENT THIS CODE THEN RERUN TERRAFORM INIT 5 | ## TO SWITCH FROM LOCAL BACKEND TO REMOTE AWS BACKEND 6 | ############################################################# 7 | # backend "s3" { 8 | # bucket = "cabolba88-tf-state-dev" # REPLACE WITH YOUR BUCKET NAME 9 | # key = "03-basics/import-bootstrap/terraform.tfstate" # WHERE IN THE BUCKET THE STATE FILE WILL BE 10 | # region = "us-east-1" 11 | # dynamodb_table = 
"terraform-state-locking" # TABLE NAME 12 | # encrypt = true 13 | # } 14 | 15 | required_providers { # CONFIG THE PROVIDER VERSION, CAN CHECK ON TERRAFORM SITE 16 | aws = { 17 | source = "hashicorp/aws" 18 | version = "~> 3.0" 19 | } 20 | } 21 | } 22 | 23 | provider "aws" { # PASSING SOME OPTIONS TO THE PROVIDER 24 | region = "us-east-1" 25 | } 26 | 27 | resource "aws_s3_bucket" "terraform_state" { 28 | bucket = "cabolba88-tf-state-dev" # REPLACE WITH YOUR BUCKET NAME 29 | force_destroy = true 30 | versioning { 31 | enabled = true 32 | } 33 | 34 | server_side_encryption_configuration { 35 | rule { 36 | apply_server_side_encryption_by_default { 37 | sse_algorithm = "AES256" 38 | } 39 | } 40 | } 41 | } 42 | 43 | resource "aws_dynamodb_table" "terraform_locks" { 44 | name = "terraform-state-locking" 45 | billing_mode = "PAY_PER_REQUEST" 46 | hash_key = "LockID" 47 | attribute { 48 | name = "LockID" 49 | type = "S" 50 | } 51 | } -------------------------------------------------------------------------------- /Terraform/3-remote-backend/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.1.9", 4 | "serial": 6, 5 | "lineage": "69d41dc4-b373-9e1f-2967-04141b114b62", 6 | "outputs": {}, 7 | "resources": [] 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/3-web-app/.terraform/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "serial": 2, 4 | "lineage": "ea8bd856-bf98-311f-6a59-28d5805ff650", 5 | "backend": { 6 | "type": "s3", 7 | "config": { 8 | "access_key": null, 9 | "acl": null, 10 | "assume_role_duration_seconds": null, 11 | "assume_role_policy": null, 12 | "assume_role_policy_arns": null, 13 | "assume_role_tags": null, 14 | "assume_role_transitive_tag_keys": null, 15 | "bucket": "cabolba88-tf-state-dev", 16 | "dynamodb_endpoint": null, 17 | "dynamodb_table": "terraform-state-locking", 18 | "encrypt": true, 19 | "endpoint": null, 20 | "external_id": null, 21 | "force_path_style": null, 22 | "iam_endpoint": null, 23 | "key": "03-basics/import-bootstrap/terraform.tfstate", 24 | "kms_key_id": null, 25 | "max_retries": null, 26 | "profile": null, 27 | "region": "us-east-1", 28 | "role_arn": null, 29 | "secret_key": null, 30 | "session_name": null, 31 | "shared_credentials_file": null, 32 | "skip_credentials_validation": null, 33 | "skip_metadata_api_check": null, 34 | "skip_region_validation": null, 35 | "sse_customer_key": null, 36 | "sts_endpoint": null, 37 | "token": null, 38 | "workspace_key_prefix": null 39 | }, 40 | "hash": 1591429616 41 | }, 42 | "modules": [ 43 | { 44 | "path": [ 45 | "root" 46 | ], 47 | "outputs": {}, 48 | "resources": {}, 49 | "depends_on": [] 50 | } 51 | ] 52 | } 53 | -------------------------------------------------------------------------------- /Terraform/3-web-app/web-app-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/3-web-app/web-app-architecture.png -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/another-variable-file.tfvars: -------------------------------------------------------------------------------- 1 | instance_name = "hello-world-2" 2 | 
-------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | # backend "s3" { 3 | # bucket = "devops-directive-tf-state" 4 | # key = "04-variables-and-outputs/examples/terraform.tfstate" 5 | # region = "us-east-1" 6 | # dynamodb_table = "terraform-state-locking" 7 | # encrypt = true 8 | # } 9 | 10 | required_providers { 11 | aws = { 12 | source = "hashicorp/aws" 13 | version = "~> 3.0" 14 | } 15 | } 16 | } 17 | 18 | provider "aws" { 19 | region = "us-east-1" 20 | } 21 | 22 | locals { 23 | extra_tag = "extra-tag" 24 | } 25 | 26 | resource "aws_instance" "instance" { 27 | ami = var.ami #comming from variables.tf 28 | instance_type = var.instance_type 29 | 30 | tags = { 31 | Name = var.instance_name 32 | ExtraTag = local.extra_tag 33 | } 34 | } 35 | 36 | resource "aws_db_instance" "db_instance" { 37 | allocated_storage = 20 38 | storage_type = "gp2" 39 | engine = "postgres" 40 | engine_version = "12.4" 41 | instance_class = "db.t2.micro" 42 | name = "mydb" 43 | username = var.db_user 44 | password = var.db_pass 45 | skip_final_snapshot = true 46 | } 47 | 48 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/outputs.tf: -------------------------------------------------------------------------------- 1 | output "instance_ip_addr" { 2 | value = aws_instance.instance.private_ip 3 | } 4 | 5 | output "db_instance_addr" { 6 | value = aws_db_instance.db_instance.address 7 | } 8 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/terraform.tfvars: -------------------------------------------------------------------------------- 1 | instance_name = "hello-world" 2 | ami = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 3 | instance_type = "t2.micro" -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/examples/variables.tf: -------------------------------------------------------------------------------- 1 | # should specify optional vs required 2 | 3 | variable "instance_name" { 4 | description = "Name of ec2 instance" 5 | type = string 6 | } 7 | 8 | variable "ami" { 9 | description = "Amazon machine image to use for ec2 instance" 10 | type = string 11 | default = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 12 | } 13 | 14 | variable "instance_type" { 15 | description = "ec2 instance type" 16 | type = string 17 | default = "t2.micro" 18 | } 19 | 20 | variable "db_user" { 21 | description = "username for database" 22 | type = string 23 | default = "foo" 24 | } 25 | 26 | variable "db_pass" { 27 | description = "password for database" 28 | type = string 29 | sensitive = true 30 | } 31 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/web-app/outputs.tf: -------------------------------------------------------------------------------- 1 | output "instance_1_ip_addr" { 2 | value = aws_instance.instance_1.public_ip 3 | } 4 | 5 | output "instance_2_ip_addr" { 6 | value = aws_instance.instance_2.public_ip 7 | } 8 | 9 | # output "db_instance_addr" { 10 | # value = aws_db_instance.db_instance.address 11 | # } 12 | -------------------------------------------------------------------------------- 
/Terraform/4-variables-and-outputs/web-app/terraform.tfstate: -------------------------------------------------------------------------------- 1 | { 2 | "version": 4, 3 | "terraform_version": "1.1.9", 4 | "serial": 35, 5 | "lineage": "f60b97d6-2ff5-9c53-a198-fa1e6e8eae80", 6 | "outputs": {}, 7 | "resources": [] 8 | } 9 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/web-app/terraform.tfvars: -------------------------------------------------------------------------------- 1 | bucket_name = "devops-directive-web-app-data" 2 | domain = "devopsdeployed.com" 3 | db_name = "mydb" 4 | db_user = "foo" 5 | # db_pass = "foobarbaz" 6 | -------------------------------------------------------------------------------- /Terraform/4-variables-and-outputs/web-app/variables.tf: -------------------------------------------------------------------------------- 1 | # General Variables 2 | 3 | variable "region" { 4 | description = "Default region for provider" 5 | type = string 6 | default = "us-east-1" 7 | } 8 | 9 | # EC2 Variables 10 | 11 | variable "ami" { 12 | description = "Amazon machine image to use for ec2 instance" 13 | type = string 14 | default = "ami-011899242bb902164" # Ubuntu 20.04 LTS // us-east-1 15 | } 16 | 17 | variable "instance_type" { 18 | description = "ec2 instance type" 19 | type = string 20 | default = "t2.micro" 21 | } 22 | 23 | # S3 Variables 24 | 25 | variable "bucket_name" { 26 | description = "name of s3 bucket for app data" 27 | type = string 28 | } 29 | 30 | # Route 53 Variables 31 | 32 | variable "domain" { 33 | description = "Domain for website" 34 | type = string 35 | } 36 | 37 | # RDS Variables 38 | 39 | variable "db_name" { 40 | description = "Name of DB" 41 | type = string 42 | } 43 | 44 | variable "db_user" { 45 | description = "Username for DB" 46 | type = string 47 | } 48 | 49 | variable "db_pass" { 50 | description = "Password for DB" 51 | type = string 52 | sensitive = true 53 | } 54 | 55 | 56 | -------------------------------------------------------------------------------- /Terraform/img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/img/.DS_Store -------------------------------------------------------------------------------- /Terraform/img/2_2_terraform_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/Terraform/img/2_2_terraform_architecture.png -------------------------------------------------------------------------------- /airbyte-dbt-airflow-snowflake/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/airbyte-dbt-airflow-snowflake/.DS_Store -------------------------------------------------------------------------------- /airbyte-dbt-airflow-snowflake/flags.yml: -------------------------------------------------------------------------------- 1 | flags: 2 | - name: performance.backgroundJsonSchemaValidation 3 | serve: false 4 | - name: heartbeat.failSync 5 | serve: true 6 | - name: connectors.versionOverridesEnabled 7 | serve: true 8 | - name: billing.newTrialPolicy 9 | serve: false 10 | - name: heartbeat-max-seconds-between-messages 11 | serve: 
"10800" 12 | - name: use-new-notification-workflow 13 | serve: false 14 | - name: validation.removeValidationLimit 15 | serve: false 16 | - name: connection.columnSelection 17 | serve: true 18 | - name: refreshSchema.period.hours 19 | serve: 24 20 | - name: concurrent.source.stream.read 21 | serve: false 22 | - name: platform.add-scheduling-jitter 23 | serve: false 24 | - name: connectors.apm-enabled 25 | serve: false 26 | - name: connectors.useIconUrlInApiResponse 27 | serve: true 28 | - name: platform.use-new-schema-update-notification 29 | serve: false 30 | # whether the platform will track destination timeouts. If set to 'true', then the platform will monitor if there has been 31 | # a call to a destination that has taken more than the threshold defined in 'destination-timeout.maxSeconds', and if so, 32 | # the platform will consider the destination to have timed out. 33 | - name: destination-timeout-enabled 34 | serve: true 35 | # If set to 'true' and the platform detects a destination timeout, it will fail the sync. Otherwise, it will 36 | # log a message and the sync will proceed. 37 | - name: destination-timeout.failSync 38 | serve: true 39 | # the time the platform waits before it decides that a destination has timed out. 40 | - name: destination-timeout.seconds 41 | serve: 86400 42 | - name: platform.inject-aws-secrets-to-connector-pods 43 | serve: false 44 | - name: platform.use-workload-api 45 | serve: false 46 | -------------------------------------------------------------------------------- /airbyte-dbt-airflow-snowflake/temporal/dynamicconfig/development.yaml: -------------------------------------------------------------------------------- 1 | # when modifying, remember to update the kube version of this file kube/resources/temporal.yaml 2 | frontend.enableClientVersionCheck: 3 | - value: true 4 | constraints: {} 5 | history.persistenceMaxQPS: 6 | - value: 3000 7 | constraints: {} 8 | frontend.persistenceMaxQPS: 9 | - value: 3000 10 | constraints: {} 11 | frontend.historyMgrNumConns: 12 | - value: 30 13 | constraints: {} 14 | frontend.throttledLogRPS: 15 | - value: 20 16 | constraints: {} 17 | history.historyMgrNumConns: 18 | - value: 50 19 | constraints: {} 20 | system.advancedVisibilityWritingMode: 21 | - value: "off" 22 | constraints: {} 23 | history.defaultActivityRetryPolicy: 24 | - value: 25 | InitialIntervalInSeconds: 1 26 | MaximumIntervalCoefficient: 100.0 27 | BackoffCoefficient: 2.0 28 | MaximumAttempts: 0 29 | history.defaultWorkflowRetryPolicy: 30 | - value: 31 | InitialIntervalInSeconds: 1 32 | MaximumIntervalCoefficient: 100.0 33 | BackoffCoefficient: 2.0 34 | MaximumAttempts: 0 35 | # Limit for responses. This mostly impacts discovery jobs since they have the largest responses. 
36 | limit.blobSize.error: 37 | - value: 15728640 # 15MB 38 | constraints: {} 39 | limit.blobSize.warn: 40 | - value: 10485760 # 10MB 41 | constraints: {} 42 | -------------------------------------------------------------------------------- /dbt/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/dim_customer-3.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | select * from {{ ref('stg_customers')}} 3 | ), 4 | orders as ( 5 | select * from {{ ref('fct_orders')}} 6 | ), 7 | customer_orders as ( 8 | select 9 | customer_id, 10 | min(order_date) as first_order_date, 11 | max(order_date) as most_recent_order_date, 12 | count(order_id) as number_of_orders, 13 | sum(amount) as lifetime_value 14 | from orders 15 | group by 1 16 | ), 17 | final as ( 18 | select 19 | customers.customer_id, 20 | customers.first_name, 21 | customers.last_name, 22 | customer_orders.first_order_date, 23 | customer_orders.most_recent_order_date, 24 | coalesce(customer_orders.number_of_orders, 0) as number_of_orders, 25 | customer_orders.lifetime_value 26 | from customers 27 | left join customer_orders using (customer_id) 28 | ) 29 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/dim_customers-2.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | 3 | select * from {{ ref('stg_customers')}} 4 | 5 | ), 6 | 7 | orders as ( 8 | 9 | select * from {{ ref('stg_orders') }} 10 | 11 | ), 12 | 13 | customer_orders as ( 14 | 15 | select 16 | customer_id, 17 | 18 | min(order_date) as first_order_date, 19 | max(order_date) as most_recent_order_date, 20 | count(order_id) as number_of_orders 21 | 22 | from orders 23 | 24 | group by 1 25 | 26 | ), 27 | 28 | 29 | final as ( 30 | 31 | select 32 | customers.customer_id, 33 | customers.first_name, 34 | customers.last_name, 35 | customer_orders.first_order_date, 36 | customer_orders.most_recent_order_date, 37 | coalesce(customer_orders.number_of_orders, 0) as number_of_orders 38 | 39 | from customers 40 | 41 | left join customer_orders using (customer_id) 42 | 43 | ) 44 | 45 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/dim_customers.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | 3 | select 4 | id as customer_id, 5 | first_name, 6 | last_name 7 | 8 | from raw.jaffle_shop.customers 9 | 10 | ), 11 | 12 | orders as ( 13 | 14 | select 15 | id as order_id, 16 | user_id as customer_id, 17 | order_date, 18 | status 19 | 20 | from raw.jaffle_shop.orders 21 | 22 | ), 23 | 24 | customer_orders as ( 25 | 26 | select 27 | customer_id, 28 | 29 | min(order_date) as first_order_date, 30 | max(order_date) as most_recent_order_date, 31 | count(order_id) as number_of_orders 32 | 33 | from orders 34 | 35 | group by 1 36 | 37 | ), 38 | 39 | 40 | final as ( 41 | 42 | select 43 | customers.customer_id, 44 | customers.first_name, 45 | customers.last_name, 46 | customer_orders.first_order_date, 47 | customer_orders.most_recent_order_date, 48 | coalesce(customer_orders.number_of_orders, 0) as 
number_of_orders 49 | 50 | from customers 51 | 52 | left join customer_orders using (customer_id) 53 | 54 | ) 55 | 56 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/fct_orders.sql: -------------------------------------------------------------------------------- 1 | with orders as ( 2 | select * from {{ ref('stg_orders' )}} 3 | ), 4 | 5 | payments as ( 6 | select * from {{ ref('stg_payments') }} 7 | ), 8 | 9 | order_payments as ( 10 | select 11 | order_id, 12 | sum(case when status = 'success' then amount end) as amount 13 | 14 | from payments 15 | group by 1 16 | ), 17 | 18 | final as ( 19 | 20 | select 21 | orders.order_id, 22 | orders.customer_id, 23 | orders.order_date, 24 | coalesce(order_payments.amount, 0) as amount 25 | 26 | from orders 27 | left join order_payments using (order_id) 28 | ) 29 | 30 | select * from final -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/dbt_and_alatycs_engineer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/dbt_and_alatycs_engineer.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/dbt_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/dbt_workflow.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/ineage_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/ineage_graph.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/img/modern_data_platform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/img/modern_data_platform.png -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/insert_sf_dbt_training_data.sql: -------------------------------------------------------------------------------- 1 | create warehouse transforming; 2 | 3 | create database raw; 4 | 5 | create database analytics; 6 | 7 | create schema raw.jaffle_shop; 8 | 9 | -- create this one directly in the schema 10 | create table raw.jaffle_shop.customers 11 | ( 12 | id integer, 13 | first_name varchar, 14 | last_name varchar 15 | ); 16 | 17 | copy into raw.jaffle_shop.customers (id, first_name, last_name) 18 | from 's3://dbt-tutorial-public/jaffle_shop_customers.csv' 19 | file_format = ( 20 | type = 'CSV' 21 | field_delimiter = ',' 22 | skip_header = 1 23 | ) 24 | ; 25 | 26 | create table raw.jaffle_shop.orders 27 | ( 28 | id integer, 29 | user_id integer, 30 | order_date date, 31 | status varchar, 32 | _etl_loaded_at timestamp default current_timestamp 33 | ); 34 | 35 | copy into raw.jaffle_shop.orders (id, user_id, order_date, status) 36 | from 's3://dbt-tutorial-public/jaffle_shop_orders.csv' 37 | file_format 
= ( 38 | type = 'CSV' 39 | field_delimiter = ',' 40 | skip_header = 1 41 | ) 42 | ; 43 | 44 | create schema raw.stripe; 45 | 46 | create table raw.stripe.payment ( 47 | id integer, 48 | orderid integer, 49 | paymentmethod varchar, 50 | status varchar, 51 | amount integer, 52 | created date, 53 | _batched_at timestamp default current_timestamp 54 | ); 55 | 56 | copy into raw.stripe.payment (id, orderid, paymentmethod, status, amount, created) 57 | from 's3://dbt-tutorial-public/stripe_payments.csv' 58 | file_format = ( 59 | type = 'CSV' 60 | field_delimiter = ',' 61 | skip_header = 1 62 | ) 63 | ; -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/jaffle_shop.md: -------------------------------------------------------------------------------- 1 | {% docs order_status %} 2 | 3 | One of the following values: 4 | 5 | | status | definition | 6 | |----------------|--------------------------------------------------| 7 | | placed | Order placed, not yet shipped | 8 | | shipped | Order has been shipped, not yet been delivered | 9 | | completed | Order has been received by customers | 10 | | return pending | Customer indicated they want to return this item | 11 | | returned | Item has been returned | 12 | 13 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop-2.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-fundamentals-course/src_jaffle_shop-2.yml -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop-3.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: raw 6 | schema: jaffle_shop 7 | tables: 8 | - name: customers 9 | columns: 10 | - name: id 11 | tests: 12 | - unique 13 | - not_null 14 | 15 | - name: orders 16 | columns: 17 | - name: id 18 | tests: 19 | - unique 20 | - not_null 21 | loaded_at_field: _etl_loaded_at 22 | freshness: 23 | warn_after: {count: 12, period: hour} 24 | error_after: {count: 24, period: hour} -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop-4.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | description: A clone of a Postgres application database. 6 | database: raw 7 | schema: jaffle_shop 8 | tables: 9 | - name: customers 10 | description: Raw customers data. 11 | columns: 12 | - name: id 13 | description: Primary key for customers. 14 | tests: 15 | - unique 16 | - not_null 17 | 18 | - name: orders 19 | description: Raw orders data. 20 | columns: 21 | - name: id 22 | description: Primary key for orders. 
23 | tests: 24 | - unique 25 | - not_null 26 | loaded_at_field: _etl_loaded_at 27 | freshness: 28 | warn_after: {count: 12, period: hour} 29 | error_after: {count: 24, period: hour} -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_jaffle_shop.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: jaffle_shop 5 | database: raw 6 | 7 | schema: jaffle_shop 8 | tables: 9 | - name: customers 10 | - name: orders -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/src_stripe.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: stripe 5 | database: raw 6 | schema: stripe 7 | tables: 8 | - name: payment 9 | loaded_at_field: _batched_at 10 | freshness: 11 | warn_after: {count: 12,period: hour } -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_customers.sql: -------------------------------------------------------------------------------- 1 | with customers as ( 2 | 3 | select 4 | id as customer_id, 5 | first_name, 6 | last_name 7 | 8 | --from raw.jaffle_shop.customers -- to begin the course 9 | from {{ source('jaffle_shop','customers') }} -- used in the source chapter 10 | ) 11 | 12 | select * from customers 13 | -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_jaffle_shop-2.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_customers 5 | description: my description table 6 | columns: 7 | - name: customer_id 8 | description: my description column 9 | tests: 10 | - unique 11 | - not_null 12 | 13 | - name: stg_orders 14 | description: my description table 15 | columns: 16 | - name: order_id 17 | description: my description column 18 | tests: 19 | - unique 20 | - not_null 21 | - name: status 22 | description: "{{ doc('order_status') }}" 23 | tests: 24 | - accepted_values: 25 | values: 26 | - completed 27 | - shipped 28 | - returned 29 | - return_pending 30 | - placed 31 | - name: customer_id 32 | description: my description column 33 | tests: 34 | - relationships: 35 | to: ref('stg_customers') 36 | field: customer_id -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_jaffle_shop.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_customers 5 | columns: 6 | - name: customer_id 7 | tests: 8 | - unique 9 | - not_null 10 | 11 | - name: stg_orders 12 | columns: 13 | - name: order_id 14 | tests: 15 | - unique 16 | - not_null 17 | - name: status 18 | tests: 19 | - accepted_values: 20 | values: 21 | - completed 22 | - shipped 23 | - returned 24 | - return_pending 25 | - placed 26 | - name: customer_id 27 | tests: 28 | - relationships: 29 | to: ref('stg_customers') 30 | field: customer_id -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_orders.sql: -------------------------------------------------------------------------------- 1 | with orders as ( 2 | 3 | select 4 | id as order_id, 5 | user_id as customer_id, 6 | order_date, 7 | status 8 | 9 | --from raw.jaffle_shop.orders -- to begin the course 10 | from {{ 
source('jaffle_shop','orders') }} -- used in the source chapter 11 | ) 12 | 13 | select * from orders -------------------------------------------------------------------------------- /dbt/dbt-fundamentals-course/stg_payments.sql: -------------------------------------------------------------------------------- 1 | select 2 | id as payment_id, 3 | orderid as order_id, 4 | paymentmethod as payment_method, 5 | status, 6 | 7 | -- amount is stored in cents, convert it to dollars 8 | amount / 100 as amount, 9 | created as created_at 10 | 11 | from raw.stripe.payment -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/.dbt/.user.yml: -------------------------------------------------------------------------------- 1 | id: 02e7707b-ea50-432c-b85b-33224e71baf1 2 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/.dbt/profiles.yml: -------------------------------------------------------------------------------- 1 | dbt_project: 2 | outputs: 3 | dev: 4 | account: SUPYPQL.OM48075 5 | database: AIRBNB 6 | password: 7 | role: transform 8 | schema: DEV 9 | threads: 1 10 | type: snowflake 11 | user: dbt 12 | warehouse: COMPUTE_WH 13 | target: dev 14 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/.DS_Store -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/analyses/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/analyses/full_moon_no_sleep.sql: -------------------------------------------------------------------------------- 1 | WITH fullmoon_reviews AS ( 2 | SELECT * FROM {{ ref('mart_fullmoon_reviews') }} 3 | ) 4 | SELECT 5 | is_full_moon, 6 | review_sentiment, 7 | COUNT(*) as reviews 8 | FROM 9 | fullmoon_reviews 10 | GROUP BY 11 | is_full_moon, 12 | review_sentiment 13 | ORDER BY 14 | is_full_moon, 15 | review_sentiment -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/assets/input_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/assets/input_schema.png -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/dbt_project.yml: -------------------------------------------------------------------------------- 1 | # Name your project! Project names should contain only lowercase characters 2 | # and underscores. A good package name should reflect your organization's 3 | # name or the intended use of these models 4 | name: 'dbt_project' 5 | version: '1.0.0' 6 | config-version: 2 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'dbt_project' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `model-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | asset-paths: ["assets"] 21 | 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_packages" 25 | 26 | 27 | # Configuring models 28 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 29 | 30 | # In this example config, we tell dbt to build all models in the example/ 31 | # directory as views. These settings can be overridden in the individual model 32 | # files using the `{{ config(...) }}` macro. 
33 | models: 34 | dbt_project: 35 | # Config indicated by + and applies to all files under models/example/ 36 | +post-hook: 37 | - "GRANT SELECT ON {{ this }} TO ROLE REPORTER" #this will be executed after every model run 38 | +materialized: view 39 | dim: 40 | +materialized: table 41 | src: 42 | +materialized: ephemeral 43 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/no_nulls_in_columns.sql: -------------------------------------------------------------------------------- 1 | {% macro no_nulls_in_columns(model) %} 2 | SELECT * FROM {{ model }} WHERE 3 | {% for col in adapter.get_columns_in_relation(model) -%} 4 | {{ col.column }} IS NULL OR 5 | {% endfor %} 6 | FALSE 7 | {% endmacro %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/macros/positive_values.sql: -------------------------------------------------------------------------------- 1 | {% test positive_value(model, column_name) %} 2 | SELECT 3 | * 4 | FROM 5 | {{ model }} 6 | WHERE 7 | {{ column_name}} < 1 8 | {% endtest %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/dim/dim_hosts_cleansed.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view' 4 | ) 5 | }} 6 | 7 | WITH src_hosts AS ( 8 | SELECT 9 | * 10 | FROM 11 | {{ ref('src_hosts') }} 12 | ) 13 | SELECT 14 | host_id, 15 | NVL( 16 | host_name, 17 | 'Anonymous' 18 | ) AS host_name, 19 | is_superhost, 20 | created_at, 21 | updated_at 22 | FROM 23 | src_hosts -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/dim/dim_listings_cleansed.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view' 4 | ) 5 | }} 6 | 7 | WITH src_listings AS ( 8 | SELECT 9 | * 10 | FROM 11 | {{ ref('src_listings') }} 12 | ) 13 | SELECT 14 | listing_id, 15 | listing_name, 16 | room_type, 17 | CASE 18 | WHEN minimum_nights = 0 THEN 1 19 | ELSE minimum_nights 20 | END AS minimum_nights, 21 | host_id, 22 | REPLACE( 23 | price_str, 24 | '$' 25 | ) :: NUMBER( 26 | 10, 27 | 2 28 | ) AS price, 29 | created_at, 30 | updated_at 31 | FROM 32 | src_listings -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/dim/dim_listings_w_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | l AS ( 3 | SELECT 4 | * 5 | FROM 6 | {{ ref('dim_listings_cleansed') }} 7 | ), 8 | h AS ( 9 | SELECT * 10 | FROM {{ ref('dim_hosts_cleansed') }} 11 | ) 12 | 13 | SELECT 14 | l.listing_id, 15 | l.listing_name, 16 | l.room_type, 17 | l.minimum_nights, 18 | l.price, 19 | l.host_id, 20 | h.host_name, 21 | h.is_superhost as host_is_superhost, 22 | l.created_at, 23 | GREATEST(l.updated_at, h.updated_at) as updated_at 24 
| FROM l 25 | LEFT JOIN h ON (h.host_id = l.host_id) -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/docs.md: -------------------------------------------------------------------------------- 1 | {% docs dim_listing_cleansed_minimum_nights %} 2 | Minimum number of nights required to rent this property. 3 | 4 | Keep in mind that old listings might have `minimum_nights` set 5 | to 0 in the source tables. Our cleansing algorithm updates this to `1`. 6 | 7 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/fct/fct_reviews.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | on_schema_change='fail' 5 | ) 6 | }} 7 | -- set so the run fails if the schema changes 8 | 9 | 10 | WITH src_reviews AS ( 11 | SELECT * FROM {{ ref('src_reviews') }} 12 | ) 13 | SELECT 14 | {{ dbt_utils.generate_surrogate_key(['listing_id', 'review_date', 'reviewer_name', 'review_text']) }} 15 | AS review_id, 16 | * 17 | FROM src_reviews 18 | WHERE review_text is not null 19 | 20 | -- condition for the append: only rows whose review_date is greater than the maximum review_date already in this model 21 | {% if is_incremental() %} 22 | AND review_date > (select max(review_date) from {{ this }}) 23 | {% endif %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/mart/mart_fullmoon_reviews.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized = 'table', 3 | ) }} 4 | 5 | WITH fct_reviews AS ( 6 | SELECT * FROM {{ ref('fct_reviews') }} 7 | ), 8 | full_moon_dates AS ( 9 | SELECT * FROM {{ ref('seed_full_moon_dates') }} 10 | ) 11 | 12 | SELECT 13 | r.*, 14 | CASE 15 | WHEN fm.full_moon_date IS NULL THEN 'not full moon' 16 | ELSE 'full moon' 17 | END AS is_full_moon 18 | FROM 19 | fct_reviews 20 | r 21 | LEFT JOIN full_moon_dates 22 | fm 23 | ON (TO_DATE(r.review_date) = DATEADD(DAY, 1, fm.full_moon_date)) -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/overview.md: -------------------------------------------------------------------------------- 1 | {% docs __overview__ %} 2 | # Airbnb pipeline 3 | 4 | Hey, welcome to our Airbnb pipeline documentation! 5 | 6 | Here is the schema of our input data: 7 | ![input schema](assets/input_schema.png) 8 | 9 | {% enddocs %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dim_listings_cleansed 5 | description: Cleansed table which contains Airbnb listings. 6 | columns: 7 | 8 | - name: listing_id 9 | description: Primary key for the listing 10 | tests: 11 | - unique 12 | - not_null 13 | 14 | - name: host_id 15 | description: The host's id. References the host table.
16 | tests: 17 | - not_null 18 | - relationships: 19 | to: ref('dim_hosts_cleansed') 20 | field: host_id 21 | 22 | - name: room_type 23 | description: Type of the apartment / room 24 | tests: 25 | - accepted_values: 26 | values: ['Entire home/apt', 'Private room', 'Shared room', 'Hotel room'] 27 | 28 | - name: minimum_nights 29 | description: '{{ doc("dim_listing_cleansed_minimum_nights") }}' # this pulls the description from an md doc 30 | tests: 31 | - positive_value 32 | 33 | - name: dim_hosts_cleansed 34 | description: Cleansed table which contains Airbnb hosts. 35 | columns: 36 | 37 | - name: host_id 38 | description: The host's id. References the host table. 39 | tests: 40 | - not_null 41 | - unique 42 | 43 | - name: host_name 44 | description: Host full name. 45 | tests: 46 | - not_null 47 | 48 | - name: is_superhost 49 | description: Flag to identify host's category. 50 | tests: 51 | - accepted_values: 52 | values: ['t', 'f'] 53 | 54 | - name: created_at 55 | description: Date the data was entered in the table. 56 | 57 | - name: updated_at 58 | description: Date the data was updated in the table. -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: airbnb 5 | schema: raw 6 | tables: 7 | - name: listings 8 | identifier: raw_listings 9 | 10 | - name: hosts 11 | identifier: raw_hosts 12 | 13 | - name: reviews 14 | identifier: raw_reviews 15 | loaded_at_field: date # refers to a column in the table that records when the data was loaded 16 | freshness: 17 | warn_after: {count: 1, period: hour} 18 | error_after: {count: 24, period: hour} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/src/src_hosts.sql: -------------------------------------------------------------------------------- 1 | WITH raw_hosts AS ( 2 | SELECT 3 | * 4 | FROM 5 | --AIRBNB.RAW.RAW_HOSTS --reading directly from the table, not included in lineage 6 | {{ source ('airbnb','hosts') }} 7 | 8 | ) 9 | SELECT 10 | id AS host_id, 11 | NAME AS host_name, 12 | is_superhost, 13 | created_at, 14 | updated_at 15 | FROM 16 | raw_hosts -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/src/src_listings.sql: -------------------------------------------------------------------------------- 1 | WITH raw_listings AS ( 2 | SELECT 3 | * 4 | FROM 5 | --AIRBNB.RAW.RAW_LISTINGS 6 | {{ source ('airbnb','listings') }} 7 | ) 8 | SELECT 9 | id AS listing_id, 10 | name AS listing_name, 11 | listing_url, 12 | room_type, 13 | minimum_nights, 14 | host_id, 15 | price AS price_str, 16 | created_at, 17 | updated_at 18 | FROM 19 | raw_listings -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/models/src/src_reviews.sql: -------------------------------------------------------------------------------- 1 | WITH raw_reviews AS ( 2 | SELECT 3 | * 4 | FROM 5 | --AIRBNB.RAW.RAW_REVIEWS 6 | {{ source ('airbnb','reviews') }} 7 | ) 8 | SELECT 9 | listing_id, 10 | date AS review_date, 11 | reviewer_name, 12 | comments AS review_text, 13 | sentiment AS review_sentiment 14 | FROM 15 | raw_reviews --------------------------------------------------------------------------------
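The `schema.yml` and `sources.yml` files above only declare tests, descriptions, and freshness rules; they are exercised through the dbt CLI. Below is a minimal sketch of a wrapper that runs those checks for this project, assuming the dbt CLI is installed and a working profile (for example the Snowflake profile shown earlier) is configured.

```python
# Hedged sketch: run the dbt checks declared in schema.yml / sources.yml via the CLI.
# Assumes dbt is installed and a profile for this project is configured.
import subprocess

def dbt(*args: str) -> None:
    """Run a dbt command and raise if it exits with a non-zero status."""
    subprocess.run(["dbt", *args], check=True)

dbt("source", "freshness")                         # evaluates the loaded_at_field warn/error windows
dbt("test", "--select", "dim_listings_cleansed")   # unique, not_null, relationships, accepted_values, positive_value
dbt("docs", "generate")                            # picks up the description and {{ doc(...) }} blocks
```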
/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | sha1_hash: a158c48c59c2bb7d729d2a4e215aabe5bb4f3353 5 | -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/seeds/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/snapshots/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/snapshots/scd_raw_listings.sql: -------------------------------------------------------------------------------- 1 | {% snapshot scd_raw_listings %} 2 | 3 | {{ 4 | config( 5 | target_schema='DEV', 6 | unique_key='id', 7 | strategy='timestamp', 8 | updated_at='updated_at', 9 | invalidate_hard_deletes=True 10 | ) 11 | }} 12 | 13 | select * FROM {{ source('airbnb', 'listings') }} 14 | 15 | {% endsnapshot %} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/.gitkeep -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/consistent_created_at.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM {{ ref('dim_listings_cleansed') }} l 2 | INNER JOIN {{ ref('fct_reviews') }} r 3 | USING (listing_id) 4 | WHERE l.created_at >= r.review_date -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/dim_listings_minimum_nights.sql: -------------------------------------------------------------------------------- 1 | 2 | SELECT 3 | * 4 | FROM 5 | {{ ref('dim_listings_cleansed') }} 6 | WHERE minimum_nights < 1 7 | LIMIT 10 -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/dbt-course-udemy/dbt_project/tests/no_nulls_in_dim_linstings.sql: -------------------------------------------------------------------------------- 1 | {{ no_nulls_in_columns(ref('dim_listings_cleansed')) }} -------------------------------------------------------------------------------- /dbt/dbt-udemy-course/readme.md: 
-------------------------------------------------------------------------------- 1 | ## 2024 DBT course - udemy 2 | [Course Link](https://github.com/nordquant/complete-dbt-bootcamp-zero-to-hero/blob/main/_course_resources/course-resources.md) 3 | [Snowflake Instance](https://supypql-om48075.snowflakecomputing.com/console/login) 4 | 5 | ### [Setup the course](./setup.md) 6 | ### [Models](./models.md) -------------------------------------------------------------------------------- /pytest/first_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | def my_func(x): 4 | return x + 5 5 | 6 | def test_func(): 7 | assert my_func(3) == 8 8 | 9 | # to run the test go to console -> pytest first_test.py 10 | -------------------------------------------------------------------------------- /pytest/gtfs_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def service_date(): 7 | return "20220901" 8 | 9 | 10 | @pytest.fixture 11 | def boarding_time(): 12 | return "2022-09-01T10:00:00+00:00" 13 | 14 | 15 | @pytest.fixture 16 | def ticketing_trip_id(): 17 | return "direct:169396293:1:10" 18 | 19 | 20 | @pytest.fixture 21 | def from_ticketing_stop_time_id(): 22 | return "88-1" 23 | 24 | 25 | @pytest.fixture 26 | def to_ticketing_stop_time_id(): 27 | return "94-10" 28 | 29 | 30 | @pytest.fixture 31 | def generate_url_to_test( 32 | service_date 33 | ,boarding_time 34 | ,ticketing_trip_id 35 | ,from_ticketing_stop_time_id 36 | ,to_ticketing_stop_time_id 37 | ): 38 | # hit this endpoint 39 | url = 'gtfs-dev.flix.tech/ticketing/web-url?' \ 40 | f'service_date=["{service_date}"]&' \ 41 | f'boarding_time=["{boarding_time}"]&' \ 42 | f'ticketing_trip_id=["{ticketing_trip_id}"]&' \ 43 | f'from_ticketing_stop_time_id=["{from_ticketing_stop_time_id}"]&' \ 44 | f'to_ticketing_stop_time_id=["{to_ticketing_stop_time_id}"]' 45 | return url 46 | 47 | 48 | @pytest.fixture 49 | def generate_expected_url( service_date 50 | #,boarding_time 51 | ,ticketing_trip_id 52 | ,from_ticketing_stop_time_id 53 | ,to_ticketing_stop_time_id 54 | ): 55 | 56 | departureCity = from_ticketing_stop_time_id.split("-")[0] 57 | departureStation = from_ticketing_stop_time_id.split("-")[1] 58 | arrivalCity = to_ticketing_stop_time_id.split("-")[0] 59 | arrivalStation = to_ticketing_stop_time_id.split("-")[1] 60 | rideDate = '.'.join([service_date[6:],service_date[4:6],service_date[:4]]) 61 | uid = ticketing_trip_id.replace(":","%3A") 62 | 63 | expected_url = 'https://shop.global.flixbus.com/search?'
\ 64 | f'departureCity={departureCity}&' \ 65 | f'departureStation={departureStation}&' \ 66 | f'arrivalCity={arrivalCity}&' \ 67 | f'arrivalStation={arrivalStation}&' \ 68 | f'rideDate={rideDate}&' \ 69 | 'adult=1&' \ 70 | 'children=0&' \ 71 | 'bike_slot=0&' \ 72 | 'currency=EUR&' \ 73 | f'uid={uid}&' \ 74 | 'reserve=1' 75 | 76 | return expected_url 77 | 78 | 79 | @pytest.mark.gtfs 80 | def test_func(generate_expected_url,generate_url_to_test): 81 | 82 | # trigger lambda with generate_url_to_test and fetch response 83 | # response = requests.get(generate_url_to_test).json 84 | response = { 85 | "statusCode": 302, 86 | "headers": { 87 | "Location": f"https://shop.global.flixbus.com/search?departureCity=88&departureStation=1&arrivalCity=94&arrivalStation=10&rideDate=01.09.2022&adult=1&children=0&bike_slot=0&currency=EUR&uid=direct%3A169396293%3A1%3A10&reserve=1" 88 | } 89 | } 90 | 91 | assert response["statusCode"] == 302 92 | assert response["headers"]["Location"] == generate_expected_url 93 | -------------------------------------------------------------------------------- /pytest/multiple_tests.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.one 5 | def test_method1(): 6 | x = 5 7 | y = 10 8 | assert x == y 9 | 10 | 11 | @pytest.mark.two 12 | def test_method2(): 13 | x = 5 14 | y = 10 15 | assert x+5 == y 16 | 17 | # to run each test go to console -> py.test multiple_tests.py -m one -v 18 | -------------------------------------------------------------------------------- /pytest/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | gtfs: mark a test as a webtest. -------------------------------------------------------------------------------- /pytest/readme.md: -------------------------------------------------------------------------------- 1 | # Pytest Study 2 | -------------------------------------------------------------------------------- /pytest/test_api.py: -------------------------------------------------------------------------------- 1 | from urllib import response 2 | import pytest 3 | import requests 4 | import json 5 | 6 | def test_valid_login(): 7 | url = "https://reqres.in/api/login" 8 | data = {'email':'eve.holt@reqres.in','password':'cityslicka'} 9 | response = requests.get(url,data=data) 10 | token = json.loads(response.text) 11 | print(token) 12 | assert response.status_code == 200 13 | assert token["page"] == 1 14 | -------------------------------------------------------------------------------- /pytest/test_fixtures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # fixtures are code or mocks you run before your tests 4 | @pytest.fixture 5 | def numbers(): 6 | return [10,15,20] 7 | 8 | @pytest.mark.one 9 | def test_one(numbers): 10 | assert numbers[0] == 10 11 | 12 | @pytest.mark.two 13 | def test_two(numbers): 14 | assert numbers[1] == 10 15 | -------------------------------------------------------------------------------- /pytest/test_parametrized.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # pass multiple parameters to a test 4 | @pytest.mark.parametrize("x,y,z",[(10,20,200),(10,10,200)]) 5 | def test_one(x,y,z): 6 | assert x * y == z -------------------------------------------------------------------------------- /spark_on_google_colab.py:
-------------------------------------------------------------------------------- 1 | # Download JRE Headless version to Notebook. 2 | !apt-get install openjdk-8-jdk-headless -qq > /dev/null 3 | 4 | # Download Spark with Hadoop installation zip file and unzip it for further use. 5 | !wget -q https://downloads.apache.org/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz 6 | !tar xf spark-3.0.2-bin-hadoop2.7.tgz 7 | 8 | # Set the Javahome and Sparkhome variables. 9 | import os 10 | os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" 11 | os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7" 12 | 13 | # Install and Initialize findspark library. 14 | !pip install -q findspark 15 | import findspark 16 | findspark.find() 17 | findspark.init() 18 | 19 | # Create Spark and SQLContext Sessions. 20 | from pyspark.sql import SparkSession 21 | spark = SparkSession.builder\ 22 | .master("local")\ 23 | .appName("Colab")\ 24 | .config('spark.ui.port', '4050')\ 25 | .getOrCreate() 26 | 27 | from pyspark.sql import SQLContext 28 | sqlContext = SQLContext(spark) 29 | spark -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 1 - Pipeline.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 1 - Pipeline.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 2 - Star Schema.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 2 - Star Schema.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 3 - Parallel Computing.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 3 - Parallel Computing.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 4 - DAG Example Air Flow.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 4 - DAG Example Air Flow.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 5 - MPP.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 5 - MPP.JPG -------------------------------------------------------------------------------- /src/img/1 - Intro to data Engineering/fig 6 - AirFlow UI.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/1 - Intro to data Engineering/fig 6 - AirFlow UI.JPG -------------------------------------------------------------------------------- /src/img/17 - 
Introduction to Relational DB/1_n_relationship.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/1_n_relationship.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Relational DB/ENTITY_MODELS.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/ENTITY_MODELS.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Relational DB/n_n_relationships.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/n_n_relationships.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Relational DB/postgree_datatypes.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Relational DB/postgree_datatypes.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Scala/21_game_points.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Scala/21_game_points.jpg
--------------------------------------------------------------------------------

/src/img/17 - Introduction to Scala/Scala_Interpreter.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/17 - Introduction to Scala/Scala_Interpreter.jpg
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 1 - Dataframe.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 1 - Dataframe.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 2 - Loading Excel.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 2 - Loading Excel.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 3 - Datetime Table.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 3 - Datetime Table.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 4 - Datetime Formatting.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 4 - Datetime Formatting.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 5 - JSON Object Oriented.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 5 - JSON Object Oriented.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 6 - JSON Column Oriented.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 6 - JSON Column Oriented.JPG
--------------------------------------------------------------------------------

/src/img/2 - Streamlined data with pandas/fig 7 - Yelp Documentation.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/2 - Streamlined data with pandas/fig 7 - Yelp Documentation.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 1 - Local package structure.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 1 - Local package structure.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 1 - PEP and non PEP codes.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 1 - PEP and non PEP codes.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 3 - Portable package structure.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 3 - Portable package structure.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 4 - Anatomy of classes.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 4 - Anatomy of classes.JPG
--------------------------------------------------------------------------------

/src/img/3 - Software Engineering in Python/fig 5 - Inheritance.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/3 - Software Engineering in Python/fig 5 - Inheritance.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 1 - paste command.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 1 - paste command.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 2 - wrap up manipulating data.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 2 - wrap up manipulating data.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 3 - multiple actions loop.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 3 - multiple actions loop.JPG
--------------------------------------------------------------------------------

/src/img/5 - Introduction to Shell Script/fig 4 - nano interface.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/5 - Introduction to Shell Script/fig 4 - nano interface.JPG
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 1/imperative_vs_functional_Programing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 1/imperative_vs_functional_Programing.jpg
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 1/spark drop malformed mode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 1/spark drop malformed mode.jpg
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 1/spark permissive mode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 1/spark permissive mode.jpg
--------------------------------------------------------------------------------

/src/img/SCALA ETL Part 2/image norrmalization.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/SCALA ETL Part 2/image norrmalization.jpg
--------------------------------------------------------------------------------

/src/img/Weather_Data_Pipeline/Weather API v1.0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/Weather_Data_Pipeline/Weather API v1.0.jpg
--------------------------------------------------------------------------------

/src/img/kafka_fundamentals/broker.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/kafka_fundamentals/broker.jpg
--------------------------------------------------------------------------------

/src/img/kafka_fundamentals/kafka architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/kafka_fundamentals/kafka architecture.jpg
--------------------------------------------------------------------------------

/src/img/kafka_fundamentals/topics.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cassiobolba/Data-Engineering/2795d615ffe66a2b1cae7495953f50440551fff3/src/img/kafka_fundamentals/topics.jpg
--------------------------------------------------------------------------------