├── terraform ├── permissions │ ├── role_glueJobs.json │ └── policy_glueJobs.json ├── redshift.tf ├── main.tf ├── backend.tf ├── terraform.tfvars ├── crawler.tf ├── dms-replication-instance-module.tf ├── variables.tf ├── rds_database.tf ├── buckets.tf ├── dms-resources.tf └── iam.tf ├── ingestao-rds ├── model.py └── app.py ├── README.md └── processing └── job-spark-app-emr-redshift.py /terraform/permissions/role_glueJobs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": "sts:AssumeRole", 6 | "Principal": { 7 | "Service": "glue.amazonaws.com" 8 | }, 9 | "Effect": "Allow", 10 | "Sid": "" 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /terraform/redshift.tf: -------------------------------------------------------------------------------- 1 | resource "aws_redshift_cluster" "redshift-cluster" { 2 | cluster_identifier = "redshift-cluster" 3 | database_name = "coins" 4 | master_username = "awsuser" 5 | master_password = "U%SlC7*Y807n" 6 | node_type = "dc2.large" 7 | cluster_type = "single-node" 8 | skip_final_snapshot = true 9 | } -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 4.0" 6 | } 7 | } 8 | } 9 | 10 | # Configure the AWS Provider 11 | provider "aws" { 12 | region = "us-east-1" 13 | } 14 | 15 | provider "aws" { 16 | alias = "region-us-east-2" 17 | region = "us-east-2" 18 | } -------------------------------------------------------------------------------- /terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "s3" { 3 | # Edit the bucket name and region 4 | bucket = "stack-terraform-backend" 5 | key = "global/s3/terraform.tfstate" 6 | region = "us-east-1" 7 | 8 | # Edit the bucket name and region 9 | dynamodb_table = "terraform-locks" 10 | encrypt = true 11 | } 12 | } -------------------------------------------------------------------------------- /terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project_name = "bootcampde" 2 | environment = "prod" 3 | bucket_names = ["raw","processed","curated","scripts"] 4 | db_username = "postgres" 5 | db_password = "Stack2022!" 
key_pair_name = "pair-bootcamp"
db_port = 5432
db_server_name = "mysql-stagingdb.chlqfmqqbzxx.us-east-1.rds.amazonaws.com"
vpc_group_name = "vpc-group-name-dms"
vpc_group_description = "Allow access to environment"
vpc_id = ["vpc-0761bfe5fcd29ef7b"]
subnet_id = ["subnet-0338eec367c32a38d","subnet-033f327fa63c99d4f"]
security_group_id_list = ["sg-0b41a6d6dda8ae746"]
athena_database_name = "default"
--------------------------------------------------------------------------------
/terraform/crawler.tf:
--------------------------------------------------------------------------------

resource "aws_glue_crawler" "crawler_processed" {
  database_name = var.athena_database_name
  name          = "${var.project_name}-crawler-processed-${var.environment}"
  role          = aws_iam_role.glue_job.arn

  delta_target {
    delta_tables   = ["s3://processed-stack-bootcampde/tb_coins/"]
    write_manifest = "true"
  }
}

resource "aws_glue_crawler" "crawler_curated" {
  database_name = var.athena_database_name
  name          = "${var.project_name}-crawler-curated-${var.environment}"
  role          = aws_iam_role.glue_job.arn

  delta_target {
    delta_tables   = ["s3://curated-stack-bootcampde/coins_circulating_supply/","s3://curated-stack-bootcampde/top10_prices_2022/"]
    write_manifest = "true"
  }
}
--------------------------------------------------------------------------------
/terraform/dms-replication-instance-module.tf:
--------------------------------------------------------------------------------
data "aws_partition" "current" {}
data "aws_region" "current" {}
data "aws_caller_identity" "current" {}


module "dms" {
  source  = "terraform-aws-modules/dms/aws"
  version = "~> 1.6"

  # Subnet group
  repl_subnet_group_name        = var.vpc_group_name
  repl_subnet_group_description = var.vpc_group_description
  repl_subnet_group_subnet_ids  = var.subnet_id

  # Instance
  repl_instance_apply_immediately      = true
  repl_instance_multi_az               = false
  repl_instance_class                  = "dms.t3.micro"
  repl_instance_id                     = "${var.project_name}-dms-instance-${var.environment}"
  repl_instance_publicly_accessible    = false
  repl_instance_vpc_security_group_ids = var.security_group_id_list

  depends_on = [aws_s3_bucket.buckets-stack, aws_db_instance.PostgrelSQL-01]
}
--------------------------------------------------------------------------------
/terraform/variables.tf:
--------------------------------------------------------------------------------
variable "environment" {
  description = "Deployment environment (e.g. prod)"
}

variable "project_name" {
  description = "Project name"
}

variable "bucket_names" {
  type = list(string)
}

variable "db_username" {
  type = string
}

variable "db_password" {
  type      = string
  sensitive = true
}
variable "db_port" {
  type = string
}

variable "vpc_group_name" {
  type = string
}
variable "vpc_group_description" {
  type = string
}

variable "subnet_id" {
  type = list(string)
}

variable "vpc_id" {
  type = list(string)
}
variable "security_group_id_list" {
  type = list(string)
}


variable "athena_database_name" {
  type = string
}

variable "key_pair_name" {
  type = string
}


variable "db_server_name" {
  type = string
}
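Note: `db_password` is currently committed in *terraform.tfvars*. If you prefer not to keep credentials in the repository, one option (a sketch; adapt it to your own workflow) is to remove `db_password` from *terraform.tfvars* and export it before running Terraform, since Terraform also reads any environment variable named TF_VAR_ followed by the variable name:

        # supply the password at runtime instead of committing it
        export TF_VAR_db_password='your-password-here'
        terraform plan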

--------------------------------------------------------------------------------
/ingestao-rds/model.py:
--------------------------------------------------------------------------------

from sqlalchemy import create_engine, Column, Integer, String, Float, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Coins(Base):
    __tablename__ = 'tb_coins'  # required when the model inherits from Base
    id = Column(Integer, primary_key=True)  # a primary key is required
    name = Column(String)
    symbol = Column(String)
    data_added = Column(Text)
    last_updated = Column(Text)
    price = Column(Float)
    volume_24h = Column(Float)
    circulating_supply = Column(Float)
    total_supply = Column(Float)
    max_supply = Column(Float)
    percent_change_1h = Column(Float)
    percent_change_24h = Column(Float)
    percent_change_7d = Column(Float)

    @staticmethod
    def start():
        db_string = "postgresql://postgres:*8xayZycAE5m@server01.csddqd4agbf7.us-east-1.rds.amazonaws.com/coins"
        engine = create_engine(db_string)
        Session = sessionmaker(bind=engine)
        session = Session()
        Base.metadata.create_all(engine)
        print('\nTable created on database')
        return session, engine

--------------------------------------------------------------------------------
/terraform/rds_database.tf:
--------------------------------------------------------------------------------
resource "aws_security_group" "access-rds-port" {
  name = "access_rds"

  ingress {
    from_port = 5432
    to_port   = 5432
    protocol  = "tcp"

    # Please restrict your ingress to only necessary IPs and ports.
    # Opening to 0.0.0.0/0 can lead to security vulnerabilities.
    cidr_blocks = ["0.0.0.0/0"]

  }
  tags = {
    security_group = "access-rds-port"
  }
}

resource "aws_db_instance" "PostgrelSQL-01" {
  db_name        = "coins"
  engine         = "postgres"
  engine_version = "13.7"
  instance_class = "db.t3.micro"
  username       = var.db_username
  password       = var.db_password
  #parameter_group_name = "default.mysql5.7"
  skip_final_snapshot = true

  # resource identifier
  identifier = "${var.project_name}-rds-database-${var.environment}"

  # Storage options
  allocated_storage     = 50
  max_allocated_storage = 100

  # allow remote access
  vpc_security_group_ids = [aws_security_group.access-rds-port.id]
  publicly_accessible    = true
}

# output connection information
output "address_endpoint" {
  value = aws_db_instance.PostgrelSQL-01.address
}
output "db_user_admin" {
  value = aws_db_instance.PostgrelSQL-01.username
}
output "instance_port" {
  value = aws_db_instance.PostgrelSQL-01.port
}
--------------------------------------------------------------------------------
/terraform/buckets.tf:
--------------------------------------------------------------------------------
resource "aws_s3_bucket" "buckets-stack" {
  count         = length(var.bucket_names)
  bucket        = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
  force_destroy = true
  tags = {
    Bucket_Name  = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
    environment  = var.environment
    Cost_Center  = "TI"
    Project_Name = var.project_name
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "bucket-encryption" {
  count  = length(var.bucket_names)
  bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
  depends_on = [
    aws_s3_bucket.buckets-stack
  ]
}

resource "aws_s3_bucket_acl" "bucket_acl" {
  count  = length(var.bucket_names)
  bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
  acl    = "private"
  depends_on = [
    aws_s3_bucket.buckets-stack
  ]

}

resource "aws_s3_bucket_public_access_block" "public_access_block" {
  count  = length(var.bucket_names)
  bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
  depends_on = [
    aws_s3_bucket.buckets-stack
  ]
}
--------------------------------------------------------------------------------
/terraform/dms-resources.tf:
--------------------------------------------------------------------------------
resource "aws_dms_endpoint" "postgresql_endpoint" {
  endpoint_id   = "${var.project_name}-endpoint-source-${var.environment}"
  endpoint_type = "source"
  engine_name   = "postgres"
  username      = var.db_username
  password      = var.db_password
  port          = var.db_port
  database_name = aws_db_instance.PostgrelSQL-01.db_name
  server_name   = aws_db_instance.PostgrelSQL-01.address
  ssl_mode      = "none"
  depends_on    = [aws_s3_bucket.buckets-stack, aws_db_instance.PostgrelSQL-01]
}

resource "aws_dms_endpoint" "s3_endpoint" {
  endpoint_id   = "${var.project_name}-endpoint-target-${var.environment}"
  endpoint_type = "target"
  engine_name   = "s3"
  ssl_mode      = "none"
  extra_connection_attributes = "IncludeOpForFullLoad=True;TimestampColumnName=TIMESTAMP;AddColumnName=True"

  s3_settings {
    bucket_name             = aws_s3_bucket_public_access_block.public_access_block[0].bucket
    service_access_role_arn = aws_iam_role.s3_role.arn
    add_column_name         = true
    cdc_path                = "cdc"
    timestamp_column_name   = "TIMESTAMP"
  }
  depends_on = [aws_s3_bucket.buckets-stack]
}

resource "aws_dms_replication_task" "replication-task1" {
  migration_type           = "full-load"
  replication_instance_arn = module.dms.replication_instance_arn
  replication_task_id      = "${var.project_name}-replication-task-${var.environment}"
  source_endpoint_arn      = aws_dms_endpoint.postgresql_endpoint.endpoint_arn
  target_endpoint_arn      = aws_dms_endpoint.s3_endpoint.endpoint_arn
  table_mappings           = "{\"rules\":[{\"rule-type\":\"selection\",\"rule-id\":\"1\",\"rule-name\":\"1\",\"object-locator\":{\"schema-name\":\"%\",\"table-name\":\"%\"},\"rule-action\":\"include\"}]}"
  tags = {
    Name = "${var.project_name}-replication-task-${var.environment}"
  }
  depends_on = [aws_s3_bucket.buckets-stack]
}

--------------------------------------------------------------------------------
/terraform/iam.tf:
--------------------------------------------------------------------------------
resource "aws_iam_role" "glue_job" {
  name               = "${var.project_name}-glue-job-role"
  path               = "/"
  description        = "Provides write permissions to CloudWatch Logs and S3 Full Access"
  assume_role_policy = file("permissions/role_glueJobs.json")
}

resource "aws_iam_policy" "glue_job_policy" {
  name        = "${var.project_name}-glue-job-policy"
  path        = "/"
  description = "Provides write permissions to CloudWatch Logs and S3 Full Access"
  policy      = file("permissions/policy_glueJobs.json")
}

resource "aws_iam_role_policy_attachment" "glue_job1" {
  role       = aws_iam_role.glue_job.name
  policy_arn = aws_iam_policy.glue_job_policy.arn
}

resource "aws_iam_role" "s3_role" {
  name        = "dms-s3-role"
  description = "Role used by DMS to write migrated data to S3"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "DMSAssume"
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "dms.${data.aws_partition.current.dns_suffix}"
        }
      },
    ]
  })

  inline_policy {
    name = "dms-s3-role"

    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Sid      = "DMSS3"
          Action   = ["s3:*"]
          Effect   = "Allow"
          Resource = "*"
        }
      ]
    })
  }
}

data "aws_iam_policy_document" "dms_assume_role" {
  statement {
    actions = ["sts:AssumeRole"]

    principals {
      identifiers = ["dms.amazonaws.com"]
      type        = "Service"
    }
  }
}

resource "aws_iam_role" "dms-vpc-role" {
  assume_role_policy = data.aws_iam_policy_document.dms_assume_role.json
  name               = "dmsvpc-role"
}


resource "aws_iam_role" "dms-cloudwatch-logs-role" {
  assume_role_policy = data.aws_iam_policy_document.dms_assume_role.json
  name               = "dms-cloudwatch-logs-role1"
}

resource "aws_iam_role_policy_attachment" "dms-cloudwatch-logs-role-AmazonDMSCloudWatchLogsRole" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonDMSCloudWatchLogsRole"
  role       = aws_iam_role.dms-cloudwatch-logs-role.name
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Data Engineering Bootcamp

### To run the ingestion application:

1. Provision the RDS PostgreSQL instance on AWS, as covered in class.
2. Create a database, for example: coins
3. Configure model.py with the name of the table to be created, for example: tb_coins.
4. Edit the `db_string` variable with the RDS endpoint on AWS.
5. Edit app.py, passing the API key as the `key` argument of the get_data function.
6. Run the application to consume the API and persist the data in the database.


### To run the Spark application:
1. Start the Amazon EMR cluster.

2. Go to the `processing` directory.

3. Copy the application to the server using `scp`, for example:

        scp -i ~/Downloads/pair-bootcamp.pem job-spark-app-emr-redshift.py hadoop@ec2-54-90-3-194.compute-1.amazonaws.com:/home/hadoop/

4. Connect to the master node using `ssh`, for example:

        ssh -i ~/Downloads/pair-bootcamp.pem hadoop@ec2-54-90-3-194.compute-1.amazonaws.com

*Note*: Before running the application, check that the Redshift cluster is up; if it is not, edit the application and set the `flag_write_redshift` variable to False.

5. Run the spark-submit command, for example:

        spark-submit --packages io.delta:delta-core_2.12:2.0.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" --jars /usr/share/aws/redshift/jdbc/RedshiftJDBC.jar,/usr/share/aws/redshift/spark-redshift/lib/spark-redshift.jar,/usr/share/aws/redshift/spark-redshift/lib/spark-avro.jar,/usr/share/aws/redshift/spark-redshift/lib/minimal-json.jar job-spark-app-emr-redshift.py


### To provision resources with Terraform:
1. Go to the `terraform` directory.

2. Install `terraform`.

3. Install the `aws-cli`. See: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html

4. Authenticate to AWS with:

        aws configure

5. Before provisioning the resources, create the S3 bucket used as the state *backend* and the DynamoDB lock table (an example is shown right after this README).

6. Edit *terraform.tfvars* and *variables.tf* with the details of your infrastructure.

7. Provision the resources with:

        terraform init

        terraform plan

        terraform apply
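If you have not created the backend resources of step 5 yet, one way to do it — a minimal sketch using the AWS CLI and the names already configured in `backend.tf` (bucket `stack-terraform-backend`, table `terraform-locks`, region `us-east-1`) — is:

        # create the S3 bucket that will store the Terraform state
        aws s3api create-bucket --bucket stack-terraform-backend --region us-east-1

        # enable versioning so earlier state files can be recovered
        aws s3api put-bucket-versioning --bucket stack-terraform-backend --versioning-configuration Status=Enabled

        # create the DynamoDB table used for state locking (LockID is the hash key the S3 backend expects)
        aws dynamodb create-table --table-name terraform-locks --attribute-definitions AttributeName=LockID,AttributeType=S --key-schema AttributeName=LockID,KeyType=HASH --billing-mode PAY_PER_REQUEST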
--------------------------------------------------------------------------------
/processing/job-spark-app-emr-redshift.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# set up the Spark application
spark = SparkSession \
    .builder \
    .appName("job-1-spark") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\
    .getOrCreate()

# set the application log level; use INFO only for DEV [INFO, ERROR]
spark.sparkContext.setLogLevel("ERROR")

def read_csv(bucket, path):
    # read the data from the Data Lake
    df = spark.read.format("csv")\
        .option("header", "True")\
        .option("inferSchema","True")\
        .csv(f"{bucket}/{path}")
    # show the data read from the raw zone
    print ("\nImprime os dados lidos da raw:")
    df.show(5)
    # print the dataframe schema
    print ("\nImprime o schema do dataframe lido da raw:")
    df.printSchema()
    return df

def read_delta(bucket, path):
    df = spark.read.format("delta")\
        .load(f"{bucket}/{path}")
    return df

def write_processed(bucket, path, col_partition, data_format, mode):
    # write the data read from raw as Delta into the processing zone (uses the global df read above)
    print ("\nEscrevendo os dados lidos da raw para delta na processing zone...")
    try:
        df.write.format(data_format)\
            .partitionBy(col_partition)\
            .mode(mode)\
            .save(f"{bucket}/{path}")
        print (f"Dados escritos na processed com sucesso!")
        return 0
    except Exception as err:
        print (f"Falha para escrever dados na processed: {err}")
        return 1

def write_curated(bucket, path, dataframe, data_format, mode):
    # write the processed data to the curated zone
    print ("\nEscrevendo os dados na curated zone...")
    try:
        dataframe.write.format(data_format)\
            .mode(mode)\
            .save(f"{bucket}/{path}")
        print (f"Dados escritos na curated com sucesso!")
        return 0
    except Exception as err:
        print (f"Falha para escrever dados na curated: {err}")
        return 1

def write_redshift(url_jdbc, table_name, dataframe):
    try:
        dataframe.write.format("jdbc")\
            .options(url=url_jdbc,
                     driver='com.amazon.redshift.jdbc42.Driver',
                     user='awsuser',
                     password='U%SlC7*Y807n',
                     dbtable=table_name
                     )\
            .mode('overwrite')\
            .save()
        print (f"Dados escritos no Redshift com sucesso!")
        return 0
    except Exception as err:
        print (f"Falha para escrever dados no Redshift: {err}")
        return 1


def analytics_tables(bucket, dataframe, table_name, flag_write_redshift, url_jdbc):
    # create a view to work with SQL
    dataframe.createOrReplaceTempView(table_name)
    # process the data according to the business rules
    df_query1 = dataframe.groupBy("name") \
        .agg(sum("circulating_supply").alias("circulating_supply")) \
        .sort(desc("circulating_supply")) \
        .limit(10)
    df_query2 = dataframe.select(col('name'),col('symbol'),col('price'))\
        .sort(desc("price"))\
        .limit(10)
    # show the resulting dataframes
    print ("\n Top 10 Cryptomoedas com maior fornecimento de circulação no mercado\n")
    df_query1.show()
    print ("\n Top 10 Cryptomoedas com preços mais altos de 2022\n")
    df_query2.show()
    write_curated(bucket,"coins_circulating_supply",df_query1,"delta","overwrite")
    write_curated(bucket,"top10_prices_2022",df_query2,"delta","overwrite")

    if flag_write_redshift:
        write_redshift(url_jdbc, "coins_circulating_supply", df_query1)
        write_redshift(url_jdbc,"top10_prices_2022",df_query2)


# Read the data from the raw zone
df = read_csv('s3a://raw-stack-bootcampde','public/tb_coins/')

# Create a year column to partition the data
df = df.withColumn("year", year(df.data_added))

# Process the data and write it to the processed layer
write_processed("s3a://processed-stack-bootcampde","tb_coins","year","delta","overwrite")

# Read the data from the processed layer and write it to the curated layer
df = read_delta("s3a://processed-stack-bootcampde","tb_coins")

flag_write_redshift = True
url_jdbc = "jdbc:redshift://redshift-cluster-1.cufcxu0ztur8.us-east-1.redshift.amazonaws.com:5439/dev"
analytics_tables("s3a://curated-stack-bootcampde",df,"tb_coins", flag_write_redshift, url_jdbc)

# stop the application
spark.stop()
--------------------------------------------------------------------------------
/ingestao-rds/app.py:
--------------------------------------------------------------------------------
import json

import pandas as pd
from requests import Session
from sqlalchemy.ext.declarative import declarative_base

from model import Coins

def check_if_valid_data(df: pd.DataFrame) -> bool:

    # Check if dataframe is empty
    if df.empty:
        print("\nDataframe empty. Finishing execution")
        return False

    # Check for nulls
    if df.symbol.isnull().any():
        raise Exception("\nSymbol is Null or the value is empty")

    # Check for nulls
    if df.price.isnull().any():
        raise Exception("\nPrice is Null or the value is empty")

    # Check for nulls
    if df.data_added.isnull().any():
        raise Exception("\ndata_added is Null or the value is empty")

    return True

def load_data(table_name, coins_df, session_db, engine_db):

    # validate
    if check_if_valid_data(coins_df):
        print("\nData valid, proceed to Load stage")

    # load data on database
    try:
        coins_df.to_sql(table_name, engine_db, index=False, if_exists='append')
        print ('\nData Loaded on Database')

    except Exception as err:
        print(f"\nFailed to load data into database: {err}")

    session_db.commit()
    session_db.close()
    print("\nDatabase session closed successfully")
    return session_db

def get_data(session_db, engine_db, start, limit, convert, key, url):

    # set the amount of data requested from the API
    parameters = {
        'start': start,
        'limit': limit,
        'convert': convert
    }

    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': key,
    }

    session = Session()
    session.headers.update(headers)

    name = []
    symbol = []
    data_added = []
    last_updated = []
    price = []
    volume_24h = []
    circulating_supply = []
    total_supply = []
    max_supply = []
    percent_change_1h = []
    percent_change_24h = []
    percent_change_7d = []

    try:
        response = session.get(url, params=parameters)
        data = json.loads(response.text)

        print ('\n')
        for coin in data['data']:
            name.append(coin['name'])
            symbol.append(coin['symbol'])
            data_added.append(coin['date_added'])
            last_updated.append(coin['last_updated'])
            circulating_supply.append(coin['circulating_supply'])
            total_supply.append(coin['total_supply'])
            max_supply.append(coin['max_supply'])
            price.append(coin['quote']['USD']['price'])
            volume_24h.append(coin['quote']['USD']['volume_24h'])
            percent_change_1h.append(coin['quote']['USD']['percent_change_1h'])
            percent_change_24h.append(coin['quote']['USD']['percent_change_24h'])
            percent_change_7d.append(coin['quote']['USD']['percent_change_7d'])


        # Prepare a dictionary in order to turn it into a pandas dataframe below
        coin_dict = {
            "name" : name,
            "symbol": symbol,
            "data_added" : data_added,
            "last_updated" : last_updated,
            "price": price,
            "volume_24h": volume_24h,
            "circulating_supply" : circulating_supply,
            "total_supply": total_supply,
            "max_supply": max_supply,
            "percent_change_1h": percent_change_1h,
            "percent_change_24h": percent_change_24h,
            "percent_change_7d": percent_change_7d

        }
    except Exception as e:
        print (f'Error getting data from the API: {e}')
        exit(1)

    # create dataframe to structure data
    coins_df = pd.DataFrame(coin_dict, columns = ["name", "symbol", "data_added", "last_updated","price","volume_24h","circulating_supply","total_supply","max_supply","percent_change_1h","percent_change_24h","percent_change_7d"])
    print ("Data on Pandas Dataframe:\n")
    print(coins_df.head(100))

    # call the function to load data on database
    load_data('tb_coins',coins_df,
session_db, engine_db) 132 | 133 | # Declaration base 134 | Base = declarative_base() 135 | 136 | # Make the coin table 137 | get_session_db, get_engine = Coins.start() 138 | 139 | # call the get_data function and load data on database 140 | get_data(get_session_db, 141 | get_engine, 142 | '1', 143 | '5000', 144 | 'USD', 145 | '7bdc01a6-f004-4c0b-b21c-1c1d3970352f', 146 | 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest') -------------------------------------------------------------------------------- /terraform/permissions/policy_glueJobs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "glue:*", 8 | "redshift:DescribeClusters", 9 | "redshift:DescribeClusterSubnetGroups", 10 | "iam:ListRoles", 11 | "iam:ListUsers", 12 | "iam:ListGroups", 13 | "iam:ListRolePolicies", 14 | "iam:GetRole", 15 | "iam:GetRolePolicy", 16 | "iam:ListAttachedRolePolicies", 17 | "ec2:DescribeSecurityGroups", 18 | "ec2:DescribeSubnets", 19 | "ec2:DescribeVpcs", 20 | "ec2:DescribeVpcEndpoints", 21 | "ec2:DescribeRouteTables", 22 | "ec2:DescribeVpcAttribute", 23 | "ec2:DescribeKeyPairs", 24 | "ec2:DescribeInstances", 25 | "rds:DescribeDBInstances", 26 | "rds:DescribeDBClusters", 27 | "rds:DescribeDBSubnetGroups", 28 | "s3:ListAllMyBuckets", 29 | "s3:ListBucket", 30 | "s3:GetBucketAcl", 31 | "s3:GetBucketLocation", 32 | "cloudformation:DescribeStacks", 33 | "cloudformation:GetTemplateSummary", 34 | "dynamodb:ListTables", 35 | "kms:ListAliases", 36 | "kms:DescribeKey", 37 | "cloudwatch:GetMetricData", 38 | "cloudwatch:ListDashboards" 39 | ], 40 | "Resource": [ 41 | "*" 42 | ] 43 | }, 44 | { 45 | "Effect": "Allow", 46 | "Action": [ 47 | "s3:GetObject", 48 | "s3:PutObject" 49 | ], 50 | "Resource": [ 51 | "arn:aws:s3:::*" 52 | ] 53 | }, 54 | { 55 | "Effect": "Allow", 56 | "Action": [ 57 | "tag:GetResources" 58 | ], 59 | "Resource": [ 60 | "*" 61 | ] 62 | }, 63 | { 64 | "Effect": "Allow", 65 | "Action": [ 66 | "s3:CreateBucket", 67 | "s3:PutBucketPublicAccessBlock" ], 68 | "Resource": [ 69 | "arn:aws:s3:::*" 70 | ] 71 | }, 72 | { 73 | "Effect": "Allow", 74 | "Action": [ 75 | "logs:GetLogEvents" 76 | ], 77 | "Resource": [ 78 | "arn:aws:logs:*:*:/aws-glue/*" 79 | ] 80 | }, 81 | { 82 | "Effect": "Allow", 83 | "Action": [ 84 | "cloudformation:CreateStack", 85 | "cloudformation:DeleteStack" 86 | ], 87 | "Resource": "arn:aws:cloudformation:*:*:stack/aws-glue*/*" 88 | }, 89 | { 90 | "Effect": "Allow", 91 | "Action": [ 92 | "ec2:RunInstances" 93 | ], 94 | "Resource": [ 95 | "arn:aws:ec2:*:*:instance/*", 96 | "arn:aws:ec2:*:*:key-pair/*", 97 | "arn:aws:ec2:*:*:image/*", 98 | "arn:aws:ec2:*:*:security-group/*", 99 | "arn:aws:ec2:*:*:network-interface/*", 100 | "arn:aws:ec2:*:*:subnet/*", 101 | "arn:aws:ec2:*:*:volume/*" 102 | ] 103 | }, 104 | { 105 | "Effect": "Allow", 106 | "Action": [ 107 | "ec2:TerminateInstances", 108 | "ec2:CreateTags", 109 | "ec2:DeleteTags" 110 | ], 111 | "Resource": [ 112 | "arn:aws:ec2:*:*:instance/*" 113 | ], 114 | "Condition": { 115 | "StringLike": { 116 | "ec2:ResourceTag/aws:cloudformation:stack-id": "arn:aws:cloudformation:*:*:stack/*/*" 117 | }, 118 | "StringEquals": { 119 | "ec2:ResourceTag/aws:cloudformation:logical-id": "ZeppelinInstance" 120 | } 121 | } 122 | }, 123 | { 124 | "Action": [ 125 | "iam:PassRole" 126 | ], 127 | "Effect": "Allow", 128 | "Resource": "arn:aws:iam::*:role/AWSGlueServiceRole*", 129 | "Condition": { 130 | 
"StringLike": { 131 | "iam:PassedToService": [ 132 | "glue.amazonaws.com" 133 | ] 134 | } 135 | } 136 | }, 137 | { 138 | "Action": [ 139 | "iam:PassRole" 140 | ], 141 | "Effect": "Allow", 142 | "Resource": "arn:aws:iam::*:role/AWSGlueServiceNotebookRole*", 143 | "Condition": { 144 | "StringLike": { 145 | "iam:PassedToService": [ 146 | "ec2.amazonaws.com" 147 | ] 148 | } 149 | } 150 | }, 151 | { 152 | "Action": [ 153 | "iam:PassRole" 154 | ], 155 | "Effect": "Allow", 156 | "Resource": [ 157 | "arn:aws:iam::*:role/service-role/AWSGlueServiceRole*" 158 | ], 159 | "Condition": { 160 | "StringLike": { 161 | "iam:PassedToService": [ 162 | "glue.amazonaws.com" 163 | ] 164 | } 165 | } 166 | } 167 | ] 168 | } --------------------------------------------------------------------------------