├── terraform
│   ├── permissions
│   │   ├── role_glueJobs.json
│   │   └── policy_glueJobs.json
│   ├── redshift.tf
│   ├── main.tf
│   ├── backend.tf
│   ├── terraform.tfvars
│   ├── crawler.tf
│   ├── dms-replication-instance-module.tf
│   ├── variables.tf
│   ├── rds_database.tf
│   ├── buckets.tf
│   ├── dms-resources.tf
│   └── iam.tf
├── ingestao-rds
│   ├── model.py
│   └── app.py
├── README.md
└── processing
    └── job-spark-app-emr-redshift.py
/terraform/permissions/role_glueJobs.json:
--------------------------------------------------------------------------------
1 | {
2 | "Version": "2012-10-17",
3 | "Statement": [
4 | {
5 | "Action": "sts:AssumeRole",
6 | "Principal": {
7 | "Service": "glue.amazonaws.com"
8 | },
9 | "Effect": "Allow",
10 | "Sid": ""
11 | }
12 | ]
13 | }
--------------------------------------------------------------------------------
/terraform/redshift.tf:
--------------------------------------------------------------------------------
1 | resource "aws_redshift_cluster" "redshift-cluster" {
2 | cluster_identifier = "redshift-cluster"
3 | database_name = "coins"
4 | master_username = "awsuser"
5 | master_password = "U%SlC7*Y807n"
6 | node_type = "dc2.large"
7 | cluster_type = "single-node"
8 | skip_final_snapshot = true
9 | }
--------------------------------------------------------------------------------
/terraform/main.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | aws = {
4 | source = "hashicorp/aws"
5 | version = "~> 4.0"
6 | }
7 | }
8 | }
9 |
10 | # Configure the AWS Provider
11 | provider "aws" {
12 | region = "us-east-1"
13 | }
14 |
15 | provider "aws" {
16 | alias = "region-us-east-2"
17 | region = "us-east-2"
18 | }
--------------------------------------------------------------------------------
/terraform/backend.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | backend "s3" {
3 | # Edit the bucket name and region
4 | bucket = "stack-terraform-backend"
5 | key = "global/s3/terraform.tfstate"
6 | region = "us-east-1"
7 |
8 | # Edit the DynamoDB table name
9 | dynamodb_table = "terraform-locks"
10 | encrypt = true
11 | }
12 | }
--------------------------------------------------------------------------------
/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | project_name = "bootcampde"
2 | environment = "prod"
3 | bucket_names = ["raw","processed","curated","scripts"]
4 | db_username = "postgres"
5 | db_password = "Stack2022!"
6 | key_pair_name = "pair-bootcamp"
7 | db_port = 5432
8 | db_server_name = "mysql-stagingdb.chlqfmqqbzxx.us-east-1.rds.amazonaws.com"
9 | vpc_group_name = "vpc-group-name-dms"
10 | vpc_group_description = "Allow access to environment"
11 | vpc_id = ["vpc-0761bfe5fcd29ef7b"]
12 | subnet_id = ["subnet-0338eec367c32a38d","subnet-033f327fa63c99d4f"]
13 | security_group_id_list = ["sg-0b41a6d6dda8ae746"]
14 | athena_database_name = "default"
--------------------------------------------------------------------------------
/terraform/crawler.tf:
--------------------------------------------------------------------------------
1 |
2 | resource "aws_glue_crawler" "crawler_processed" {
3 | database_name = var.athena_database_name
4 | name = "${var.project_name}-crawler-processed-${var.environment}"
5 | role = aws_iam_role.glue_job.arn
6 |
7 | delta_target {
8 | delta_tables = ["s3://processed-stack-bootcampde/tb_coins/"]
9 | write_manifest = "true"
10 | }
11 | }
12 |
13 | resource "aws_glue_crawler" "crawler_curated" {
14 | database_name = var.athena_database_name
15 | name = "${var.project_name}-crawler-curated-${var.environment}"
16 | role = aws_iam_role.glue_job.arn
17 |
18 | delta_target {
19 | delta_tables = ["s3://curated-stack-bootcampde/coins_circulating_supply/","s3://curated-stack-bootcampde/top10_prices_2022/"]
20 | write_manifest = "true"
21 | }
22 | }
--------------------------------------------------------------------------------
/terraform/dms-replication-instance-module.tf:
--------------------------------------------------------------------------------
1 | data "aws_partition" "current" {}
2 | data "aws_region" "current" {}
3 | data "aws_caller_identity" "current" {}
4 |
5 |
6 | module "dms" {
7 | source = "terraform-aws-modules/dms/aws"
8 | version = "~> 1.6"
9 |
10 | # Subnet group
11 | repl_subnet_group_name = var.vpc_group_name
12 | repl_subnet_group_description = var.vpc_group_description
13 | repl_subnet_group_subnet_ids = var.subnet_id
14 |
15 | # Instance
16 | repl_instance_apply_immediately = true
17 | repl_instance_multi_az = false
18 | repl_instance_class = "dms.t3.micro"
19 | repl_instance_id = "${var.project_name}-dms-instance-${var.environment}"
20 | repl_instance_publicly_accessible = false
21 | repl_instance_vpc_security_group_ids = var.security_group_id_list
22 |
23 | depends_on = [aws_s3_bucket.buckets-stack, aws_db_instance.PostgrelSQL-01]
24 | }
--------------------------------------------------------------------------------
/terraform/variables.tf:
--------------------------------------------------------------------------------
1 | variable "environment" {
2 | description = "setup the environment"
3 | }
4 |
5 | variable "project_name" {
6 | description = "Nome do projeto"
7 | }
8 |
9 | variable "bucket_names" {
10 | type = list(string)
11 | }
12 |
13 | variable "db_username" {
14 | type = string
15 | }
16 |
17 | variable "db_password" {
18 | type = string
19 | sensitive = true
20 | }
21 | variable "db_port" {
22 | type = string
23 | }
24 |
25 | variable "vpc_group_name" {
26 | type = string
27 | }
28 | variable "vpc_group_description" {
29 | type = string
30 | }
31 |
32 | variable "subnet_id" {
33 | type = list(string)
34 | }
35 |
36 | variable "vpc_id" {
37 | type = list(string)
38 | }
39 | variable "security_group_id_list" {
40 | type = list(string)
41 | }
42 |
43 |
44 | variable "athena_database_name" {
45 | type = string
46 | }
47 |
48 | variable "key_pair_name" {
49 | type = string
50 | }
51 |
52 |
53 | variable "db_server_name" {
54 | type = string
55 | }
56 |
--------------------------------------------------------------------------------
/ingestao-rds/model.py:
--------------------------------------------------------------------------------
1 |
2 | from sqlite3 import Timestamp
3 | from pandas import DatetimeTZDtype
4 | from sqlalchemy import create_engine, Column, Integer, String, DateTime, Float, Text
5 | from sqlalchemy.ext.declarative import declarative_base
6 | from sqlalchemy.orm import sessionmaker
7 |
8 | Base = declarative_base()
9 |
10 |
11 | class Coins(Base):
12 | __tablename__ = 'tb_coins' # table name; required when using the declarative Base
13 | id = Column(Integer, primary_key=True) # primary key is required
14 | name = Column(String)
15 | symbol = Column(String)
16 | data_added = Column(Text)
17 | last_updated = Column(Text)
18 | price = Column(Float)
19 | volume_24h = Column(Float)
20 | circulating_supply = Column(Float)
21 | total_supply = Column(Float)
22 | max_supply = Column(Float)
23 |
24 | percent_change_1h = Column(Float)
25 | percent_change_24h = Column(Float)
26 | percent_change_7d = Column(Float)
27 |
28 | def start():
29 | db_string = "postgresql://postgres:*8xayZycAE5m@server01.csddqd4agbf7.us-east-1.rds.amazonaws.com/coins"
30 | engine = create_engine(db_string)
31 | Session = sessionmaker(bind=engine)
32 | session = Session()
33 | Base.metadata.create_all(engine)
34 | print ('\nTable created on database')
35 | return session, engine
36 |
--------------------------------------------------------------------------------
/terraform/rds_database.tf:
--------------------------------------------------------------------------------
1 | resource "aws_security_group" "access-rds-port" {
2 | name = "access_rds"
3 |
4 | ingress {
5 | from_port = 5432
6 | to_port = 5432
7 | protocol = "tcp"
8 |
9 | # Please restrict your ingress to only necessary IPs and ports.
10 | # Opening to 0.0.0.0/0 can lead to security vulnerabilities.
11 | cidr_blocks = ["0.0.0.0/0"]
12 |
13 | }
14 | tags = {
15 | security_group = "access-rds-port"
16 | }
17 | }
18 |
19 | resource "aws_db_instance" "PostgrelSQL-01" {
20 | db_name = "coins"
21 | engine = "postgres"
22 | engine_version = "13.7"
23 | instance_class = "db.t3.micro"
24 | username = var.db_username
25 | password = var.db_password
26 | #parameter_group_name = "default.mysql5.7"
27 | skip_final_snapshot = true
28 |
29 | # resource identifier
30 | identifier = "${var.project_name}-rds-database-${var.environment}"
31 |
32 | # Storage options
33 | allocated_storage = 50
34 | max_allocated_storage = 100
35 |
36 | # allow remote access
37 | vpc_security_group_ids = [aws_security_group.access-rds-port.id]
38 | publicly_accessible = "true"
39 | }
40 |
41 | # print information
42 | output "address_endpoint" {
43 | value = aws_db_instance.PostgrelSQL-01.address
44 | }
45 | output "db_user_admin" {
46 | value = aws_db_instance.PostgrelSQL-01.username
47 | }
48 | output "instance_port" {
49 | value = aws_db_instance.PostgrelSQL-01.port
50 | }
--------------------------------------------------------------------------------
/terraform/buckets.tf:
--------------------------------------------------------------------------------
1 | resource "aws_s3_bucket" "buckets-stack" {
2 | count = length(var.bucket_names)
3 | bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
4 | force_destroy = true
5 | tags = {
6 | Bucket_Name = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
7 | environment = var.environment
8 | Cost_Center = "TI"
9 | Project_Name = var.project_name
10 | }
11 | }
12 |
13 | resource "aws_s3_bucket_server_side_encryption_configuration" "bucket-encryption" {
14 | count = length(var.bucket_names)
15 | bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
16 | rule {
17 | apply_server_side_encryption_by_default {
18 | sse_algorithm = "AES256"
19 | }
20 | }
21 | depends_on = [
22 | aws_s3_bucket.buckets-stack
23 | ]
24 | }
25 |
26 | resource "aws_s3_bucket_acl" "bucket_acl" {
27 | count = length(var.bucket_names)
28 | bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
29 | acl = "private"
30 | depends_on = [
31 | aws_s3_bucket.buckets-stack
32 | ]
33 |
34 | }
35 |
36 | resource "aws_s3_bucket_public_access_block" "public_access_block" {
37 | count = length(var.bucket_names)
38 | bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
39 |
40 | block_public_acls = true
41 | block_public_policy = true
42 | ignore_public_acls = true
43 | restrict_public_buckets = true
44 | depends_on = [
45 | aws_s3_bucket.buckets-stack
46 | ]
47 | }
--------------------------------------------------------------------------------
/terraform/dms-resources.tf:
--------------------------------------------------------------------------------
1 | resource "aws_dms_endpoint" "postgresql_endpoint" {
2 | endpoint_id = "${var.project_name}-endpoint-source-${var.environment}"
3 | endpoint_type = "source"
4 | engine_name = "postgres"
5 | username = var.db_username
6 | password = var.db_password
7 | port = var.db_port
8 | database_name = aws_db_instance.PostgrelSQL-01.db_name
9 | server_name = aws_db_instance.PostgrelSQL-01.address
10 | ssl_mode = "none"
11 | depends_on = [aws_s3_bucket.buckets-stack,aws_db_instance.PostgrelSQL-01]
12 | }
13 |
14 | resource "aws_dms_endpoint" "s3_endpoint" {
15 | endpoint_id = "${var.project_name}-endpoint-target-${var.environment}"
16 | endpoint_type = "target"
17 | engine_name = "s3"
18 | ssl_mode = "none"
19 | extra_connection_attributes = "IncludeOpForFullLoad=True;TimestampColumnName=TIMESTAMP;AddColumnName=True"
20 |
21 | s3_settings {
22 | bucket_name = aws_s3_bucket_public_access_block.public_access_block[0].bucket
23 | service_access_role_arn = "arn:aws:iam::395882348933:role/Role-DMS-S3-Access"
24 | add_column_name = true
25 | cdc_path = "cdc"
26 | timestamp_column_name = "TIMESTAMP"
27 | }
28 | depends_on = [aws_s3_bucket.buckets-stack]
29 | }
30 |
31 | resource "aws_dms_replication_task" "replication-task1" {
32 | migration_type = "full-load"
33 | replication_instance_arn = module.dms.replication_instance_arn
34 | replication_task_id = "${var.project_name}-replication-task-${var.environment}"
35 | source_endpoint_arn = aws_dms_endpoint.postgresql_endpoint.endpoint_arn
36 | target_endpoint_arn = aws_dms_endpoint.s3_endpoint.endpoint_arn
37 | table_mappings = "{\"rules\":[{\"rule-type\":\"selection\",\"rule-id\":\"1\",\"rule-name\":\"1\",\"object-locator\":{\"schema-name\":\"%\",\"table-name\":\"%\"},\"rule-action\":\"include\"}]}"
38 | tags = {
39 | Name = "${var.project_name}-replication-task-${var.environment}"
40 | }
41 | depends_on = [aws_s3_bucket.buckets-stack]
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/terraform/iam.tf:
--------------------------------------------------------------------------------
1 | resource "aws_iam_role" "glue_job" {
2 | name = "${var.project_name}-glue-job-role"
3 | path = "/"
4 | description = "Provides write permissions to CloudWatch Logs and S3 Full Access"
5 | assume_role_policy = file("permissions/role_glueJobs.json")
6 | }
7 |
8 | resource "aws_iam_policy" "glue_job_policy" {
9 | name = "${var.project_name}-glue-job-policy"
10 | path = "/"
11 | description = "Provides write permissions to CloudWatch Logs and S3 Full Access"
12 | policy = file("permissions/policy_glueJobs.json")
13 | }
14 |
15 | resource "aws_iam_role_policy_attachment" "glue_job1" {
16 | role = aws_iam_role.glue_job.name
17 | policy_arn = aws_iam_policy.glue_job_policy.arn
18 | }
19 |
20 | resource "aws_iam_role" "s3_role" {
21 | name = "dms-s3-role"
22 | description = "Role used to migrate data from S3 via DMS"
23 |
24 | assume_role_policy = jsonencode({
25 | Version = "2012-10-17"
26 | Statement = [
27 | {
28 | Sid = "DMSAssume"
29 | Action = "sts:AssumeRole"
30 | Effect = "Allow"
31 | Principal = {
32 | Service = "dms.${data.aws_partition.current.dns_suffix}"
33 | }
34 | },
35 | ]
36 | })
37 |
38 | inline_policy {
39 | name = "dms-s3-role"
40 |
41 | policy = jsonencode({
42 | Version = "2012-10-17"
43 | Statement = [
44 | {
45 | Sid = "DMSS3"
46 | Action = ["s3:*"]
47 | Effect = "Allow"
48 | Resource = "*"
49 | }
50 | ]
51 | })
52 | }
53 | }
54 |
55 | data "aws_iam_policy_document" "dms_assume_role" {
56 | statement {
57 | actions = ["sts:AssumeRole"]
58 |
59 | principals {
60 | identifiers = ["dms.amazonaws.com"]
61 | type = "Service"
62 | }
63 | }
64 | }
65 |
66 | resource "aws_iam_role" "dms-vpc-role" {
67 | assume_role_policy = data.aws_iam_policy_document.dms_assume_role.json
68 | name = "dmsvpc-role"
69 | }
70 |
71 |
72 | resource "aws_iam_role" "dms-cloudwatch-logs-role" {
73 | assume_role_policy = data.aws_iam_policy_document.dms_assume_role.json
74 | name = "dms-cloudwatch-logs-role1"
75 | }
76 |
77 | resource "aws_iam_role_policy_attachment" "dms-cloudwatch-logs-role-AmazonDMSCloudWatchLogsRole" {
78 | policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonDMSCloudWatchLogsRole"
79 | role = aws_iam_role.dms-cloudwatch-logs-role.name
80 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering Bootcamp
2 |
3 | ### To run the ingestion application:
4 |
5 | 1. Provision the RDS PostgreSQL instance on AWS as covered in class.
6 | 2. Create a database, for example: coins
7 | 3. Configure the model.py file with the name of the table to be created, for example: tb_coins.
8 | 4. Edit the `db_string` variable with the RDS endpoint on AWS.
9 | 5. Edit the app.py file, passing your API key as the (key) argument of the get_data function (see the example after this list).
10 | 6. Run the application to consume the API and persist the data in the database.
11 |
12 |
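For steps 4–6, this is roughly what the end of `ingestao-rds/app.py` looks like once the key is filled in; the key below is only a placeholder, and `db_string` in `model.py` must already point at your RDS endpoint (step 4):

```python
# bottom of ingestao-rds/app.py -- replace the placeholder with your own CoinMarketCap API key
get_session_db, get_engine = Coins.start()   # creates the tb_coins table if it does not exist

get_data(get_session_db,
         get_engine,
         '1',       # start
         '5000',    # limit
         'USD',     # convert
         '<YOUR_COINMARKETCAP_API_KEY>',
         'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest')
```

Then run it from the `ingestao-rds` directory with `python app.py`.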
13 | ### To run the Spark application:
14 | 1. Spin up Amazon EMR.
15 |
16 | 2. Navigate to the `processing` directory.
17 |
18 | 3. Copy the application to the server using the `scp` command, for example:
19 |
20 | scp -i ~/Downloads/pair-bootcamp.pem job-spark-app-emr-redshift.py hadoop@ec2-54-90-3-194.compute-1.amazonaws.com:/home/hadoop/
21 |
22 | 4. Connect to the master node using `ssh`, for example:
23 |
24 | ssh -i ~/Downloads/pair-bootcamp.pem hadoop@ec2-54-90-3-194.compute-1.amazonaws.com
25 |
26 | *Note*: Before running the application, check that the Redshift cluster is running; if it is not, edit the application and set the `flag_write_redshift` variable to `False`.
27 |
28 | 5. Run the spark-submit command, for example (a quick way to check the curated output follows after this list):
29 | spark-submit --packages io.delta:delta-core_2.12:2.0.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" --jars /usr/share/aws/redshift/jdbc/RedshiftJDBC.jar,/usr/share/aws/redshift/spark-redshift/lib/spark-redshift.jar,/usr/share/aws/redshift/spark-redshift/lib/spark-avro.jar,/usr/share/aws/redshift/spark-redshift/lib/minimal-json.jar job-spark-app-emr-redshift.py
30 |
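To confirm the job wrote the curated tables, a minimal sketch that reads one of them back; it assumes the same curated bucket and table path used by the job and must be submitted with the same `--packages io.delta:delta-core_2.12:2.0.0` option:

```python
# check_curated.py -- read one curated Delta table back and show a few rows
from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName("check-curated")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate())

df = spark.read.format("delta").load("s3a://curated-stack-bootcampde/coins_circulating_supply/")
df.show(10)

spark.stop()
```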
31 |
32 | ### To provision resources with Terraform:
33 | 1. Navigate to the `terraform` directory.
34 |
35 | 2. Install the `terraform` CLI.
36 |
37 | 3. Install the `aws-cli`. See: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
38 |
39 | 4. Authenticate to AWS with the command:
40 |
41 | aws configure
42 |
43 | 5. Before provisioning the resources, create the S3 bucket used as the *backend* and the DynamoDB table for state locking (a sketch follows after this list).
44 |
45 | 6. Edit the *terraform.tfvars* and *variables.tf* files with the details of your infrastructure.
46 |
47 | 7. Provision the resources with the commands:
48 |
49 | terraform init
50 |
51 | terraform plan
52 |
53 | terraform apply
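For step 5, a minimal boto3 sketch that creates the backend resources named in `backend.tf` (bucket `stack-terraform-backend` and DynamoDB table `terraform-locks` in `us-east-1`); it assumes your AWS credentials are already configured and that you keep those names:

```python
# create_backend.py -- S3 bucket for Terraform state and DynamoDB table for state locking
import boto3

s3 = boto3.client("s3", region_name="us-east-1")
s3.create_bucket(Bucket="stack-terraform-backend")  # us-east-1 needs no LocationConstraint
s3.put_bucket_versioning(
    Bucket="stack-terraform-backend",
    VersioningConfiguration={"Status": "Enabled"},   # keep a history of state files
)

dynamodb = boto3.client("dynamodb", region_name="us-east-1")
dynamodb.create_table(
    TableName="terraform-locks",
    KeySchema=[{"AttributeName": "LockID", "KeyType": "HASH"}],
    AttributeDefinitions=[{"AttributeName": "LockID", "AttributeType": "S"}],
    BillingMode="PAY_PER_REQUEST",
)
```

The `LockID` string key is the schema the S3 backend expects for its lock table.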
--------------------------------------------------------------------------------
/processing/job-spark-app-emr-redshift.py:
--------------------------------------------------------------------------------
1 | from os.path import abspath
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.functions import *
4 |
5 | # Spark application setup
6 | spark = SparkSession \
7 | .builder \
8 | .appName("job-1-spark") \
9 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
10 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\
11 | .getOrCreate()
12 |
13 | # set the application log level; use INFO only for DEV [INFO, ERROR]
14 | spark.sparkContext.setLogLevel("ERROR")
15 |
16 | def read_csv(bucket, path):
17 | # read the data from the Data Lake
18 | df = spark.read.format("csv")\
19 | .option("header", "True")\
20 | .option("inferSchema","True")\
21 | .csv(f"{bucket}/{path}")
22 | # show a sample of the data read from the raw zone
23 | print ("\nData read from the raw zone:")
24 | df.show(5)
25 | # print the schema of the dataframe
26 | print ("\nSchema of the dataframe read from the raw zone:")
27 | df.printSchema()
28 | return df
29 |
30 | def read_delta(bucket, path):
31 | df = spark.read.format("delta")\
32 | .load(f"{bucket}/{path}")
33 | return df
34 |
35 | def write_processed(bucket, path, dataframe, col_partition, data_format, mode):
36 | print ("\nWriting the data read from the raw zone as delta to the processed zone...")
37 | try:
38 | dataframe.write.format(data_format)\
39 | .partitionBy(col_partition)\
40 | .mode(mode)\
41 | .save(f"{bucket}/{path}")
42 | print ("Data written to the processed zone successfully!")
43 | return 0
44 | except Exception as err:
45 | print (f"Failed to write data to the processed zone: {err}")
46 | return 1
47 |
48 | def write_curated(bucket, path, dataframe, data_format, mode):
49 | # convert the processed data to delta and write it to the curated zone
50 | print ("\nWriting the data to the curated zone...")
51 | try:
52 | dataframe.write.format(data_format)\
53 | .mode(mode)\
54 | .save(f"{bucket}/{path}")
55 | print ("Data written to the curated zone successfully!")
56 | return 0
57 | except Exception as err:
58 | print (f"Failed to write data to the curated zone: {err}")
59 | return 1
60 |
61 | def write_redshift(url_jdbc, table_name, dataframe):
62 | try:
63 | dataframe.write.format("jdbc")\
64 | .options(url=url_jdbc,
65 | driver='com.amazon.redshift.jdbc42.Driver',
66 | user='awsuser',
67 | password='U%SlC7*Y807n',
68 | dbtable=table_name
69 | )\
70 | .mode('overwrite')\
71 | .save()
72 | print (f"Dados escritos no Redshift com sucesso!")
73 | return 0
74 | except Exception as err:
75 | print (f"Falha para escrever dados no Redshift: {err}")
76 | return 1
77 |
78 |
79 | def analytics_tables(bucket, dataframe, table_name, flag_write_redshift, url_jdbc):
80 | # create a temporary view to work with SQL
81 | dataframe.createOrReplaceTempView(table_name)
82 | # process the data according to the business rules
83 | df_query1 = dataframe.groupBy("name") \
84 | .agg(sum("circulating_supply").alias("circulating_supply")) \
85 | .sort(desc("circulating_supply")) \
86 | .limit(10)
87 | df_query2 = dataframe.select(col('name'),col('symbol'),col('price'))\
88 | .sort(desc("price"))\
89 | .limit(10)
90 | # print the resulting dataframes
91 | print ("\n Top 10 cryptocurrencies with the largest circulating supply on the market\n")
92 | df_query1.show()
93 | print ("\n Top 10 cryptocurrencies with the highest prices in 2022\n")
94 | df_query2.show()
95 | write_curated(f"{bucket}","coins_circulating_supply",df_query1,"delta","overwrite")
96 | write_curated(f"{bucket}","top10_prices_2022",df_query2,"delta","overwrite")
97 |
98 | if flag_write_redshift == True:
99 | write_redshift(url_jdbc, "coins_circulating_supply", df_query1)
100 | write_redshift(url_jdbc,"top10_prices_2022",df_query2)
101 |
102 |
103 | # read the data from the raw zone
104 | df = read_csv('s3a://raw-stack-bootcampde','public/tb_coins/')
105 |
106 | # create a year column to partition the data
107 | df = df.withColumn("year", year(df.data_added))
108 |
109 | # process the data and write it to the processed zone
110 | write_processed("s3a://processed-stack-bootcampde","tb_coins",df,"year","delta","overwrite")
111 |
112 | # read the data from the processed zone and write it to the curated zone below
113 | df = read_delta("s3a://processed-stack-bootcampde","tb_coins")
114 |
115 | flag_write_redshift = True
116 | url_jdbc = "jdbc:redshift://redshift-cluster-1.cufcxu0ztur8.us-east-1.redshift.amazonaws.com:5439/dev"
117 | analytics_tables("s3a://curated-stack-bootcampde",df,"tb_coins", flag_write_redshift, url_jdbc)
118 |
119 | # stop the application
120 | spark.stop()
--------------------------------------------------------------------------------
/ingestao-rds/app.py:
--------------------------------------------------------------------------------
1 | from ast import Str, Try
2 | import json
3 | from sqlite3 import Date
4 | from sqlalchemy import create_engine, Column, Integer, String, DateTime, Float, null
5 | from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
6 | from sqlalchemy.ext.declarative import declarative_base
7 | from sqlalchemy.orm import sessionmaker
8 | from model import Coins
9 | from requests import Request, Session
10 | from datetime import datetime
11 | import pandas as pd
12 |
13 | def check_if_valid_data(df: pd.DataFrame) -> bool:
14 |
15 | # Check if dataframe is empty
16 | if df.empty:
17 | print("\nDataframe empty. Finishing execution")
18 | return False
19 |
20 | # Check for nulls
21 | if df.symbol.empty:
22 | raise Exception("\nSymbol is Null or the value is empty")
23 |
24 | # Check for nulls
25 | if df.price.empty:
26 | raise Exception("\nPrice is Null or the value is empty")
27 |
28 | # Check for nulls
29 | if df.data_added.empty:
30 | raise Exception("\nData is Null or the value is empty")
31 |
32 | return True
33 |
34 | def load_data(table_name, coins_df, session_db, engine_db):
35 |
36 | # validate
37 | if check_if_valid_data(coins_df):
38 | print("\nData valid, proceed to Load stage")
39 |
40 | # load data on database
41 | try:
42 | coins_df.to_sql(table_name, engine_db, index=False, if_exists='append')
43 | print ('\nData Loaded on Database')
44 |
45 | except Exception as err:
46 | print(f"\nFail to load data on database: {err}")
47 |
48 | session_db.commit()
49 | session_db.close()
50 | print("\nClose database successfully")
51 | return session_db
52 |
53 | def get_data(session_db, engine_db, start, limit, convert, key, url):
54 |
55 | # set limit of data from api
56 | parameters = {
57 | 'start': start,
58 | 'limit': limit,
59 | 'convert': convert
60 | }
61 |
62 | headers = {
63 | 'Accepts': 'application/json',
64 | 'X-CMC_PRO_API_KEY': key,
65 | }
66 |
67 | session = Session()
68 | session.headers.update(headers)
69 |
70 | name = []
71 | symbol = []
72 | data_added = []
73 | last_updated = []
74 | price = []
75 | volume_24h = []
76 | circulating_supply = []
77 | total_supply = []
78 | max_supply = []
79 |
80 | percent_change_1h = []
81 | percent_change_24h = []
82 | percent_change_7d = []
83 |
84 | try:
85 | response = session.get(url, params=parameters)
86 | data = json.loads(response.text)
87 |
88 | print ('\n')
89 | for coin in data['data']:
90 | name.append(coin['name'])
91 | symbol.append(coin['symbol'])
92 | data_added.append(coin['date_added'])
93 | last_updated.append(coin['last_updated'])
94 | circulating_supply.append(coin['circulating_supply'])
95 | total_supply.append(coin['total_supply'])
96 | max_supply.append(coin['max_supply'])
97 | price.append(coin['quote']['USD']['price'])
98 | volume_24h.append(coin['quote']['USD']['volume_24h'])
99 | percent_change_1h.append(coin['quote']['USD']['percent_change_1h'])
100 | percent_change_24h.append(coin['quote']['USD']['percent_change_24h'])
101 | percent_change_7d.append(coin['quote']['USD']['percent_change_7d'])
102 |
103 |
104 | # Prepare a dictionary in order to turn it into a pandas dataframe below
105 | coin_dict = {
106 | "name" : name,
107 | "symbol": symbol,
108 | "data_added" : data_added,
109 | "last_updated" : last_updated,
110 | "price": price,
111 | "volume_24h": volume_24h,
112 | "circulating_supply" : circulating_supply,
113 | "total_supply": total_supply,
114 | "max_supply": max_supply,
115 | "volume_24h": volume_24h,
116 | "percent_change_1h": percent_change_1h,
117 | "percent_change_24h": percent_change_24h,
118 | "percent_change_7d": percent_change_7d
119 |
120 | }
121 | except Exception as e:
122 | print (f'Error getting data from the API: {e}')
123 | exit(1)
124 |
125 | # create dataframe to structure data
126 | coins_df = pd.DataFrame(coin_dict, columns = ["name", "symbol", "data_added", "last_updated","price","volume_24h","circulating_supply","total_supply","max_supply","percent_change_1h","percent_change_24h","percent_change_7d"])
127 | print ("Data on Pandas Dataframe:\n")
128 | print(coins_df.head(100))
129 |
130 | # call the function to load data on database
131 | load_data('tb_coins',coins_df, session_db, engine_db)
132 |
133 | # Declaration base
134 | Base = declarative_base()
135 |
136 | # Make the coin table
137 | get_session_db, get_engine = Coins.start()
138 |
139 | # call the get_data function and load data on database
140 | get_data(get_session_db,
141 | get_engine,
142 | '1',
143 | '5000',
144 | 'USD',
145 | '7bdc01a6-f004-4c0b-b21c-1c1d3970352f',
146 | 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest')
--------------------------------------------------------------------------------
/terraform/permissions/policy_glueJobs.json:
--------------------------------------------------------------------------------
1 | {
2 | "Version": "2012-10-17",
3 | "Statement": [
4 | {
5 | "Effect": "Allow",
6 | "Action": [
7 | "glue:*",
8 | "redshift:DescribeClusters",
9 | "redshift:DescribeClusterSubnetGroups",
10 | "iam:ListRoles",
11 | "iam:ListUsers",
12 | "iam:ListGroups",
13 | "iam:ListRolePolicies",
14 | "iam:GetRole",
15 | "iam:GetRolePolicy",
16 | "iam:ListAttachedRolePolicies",
17 | "ec2:DescribeSecurityGroups",
18 | "ec2:DescribeSubnets",
19 | "ec2:DescribeVpcs",
20 | "ec2:DescribeVpcEndpoints",
21 | "ec2:DescribeRouteTables",
22 | "ec2:DescribeVpcAttribute",
23 | "ec2:DescribeKeyPairs",
24 | "ec2:DescribeInstances",
25 | "rds:DescribeDBInstances",
26 | "rds:DescribeDBClusters",
27 | "rds:DescribeDBSubnetGroups",
28 | "s3:ListAllMyBuckets",
29 | "s3:ListBucket",
30 | "s3:GetBucketAcl",
31 | "s3:GetBucketLocation",
32 | "cloudformation:DescribeStacks",
33 | "cloudformation:GetTemplateSummary",
34 | "dynamodb:ListTables",
35 | "kms:ListAliases",
36 | "kms:DescribeKey",
37 | "cloudwatch:GetMetricData",
38 | "cloudwatch:ListDashboards"
39 | ],
40 | "Resource": [
41 | "*"
42 | ]
43 | },
44 | {
45 | "Effect": "Allow",
46 | "Action": [
47 | "s3:GetObject",
48 | "s3:PutObject"
49 | ],
50 | "Resource": [
51 | "arn:aws:s3:::*"
52 | ]
53 | },
54 | {
55 | "Effect": "Allow",
56 | "Action": [
57 | "tag:GetResources"
58 | ],
59 | "Resource": [
60 | "*"
61 | ]
62 | },
63 | {
64 | "Effect": "Allow",
65 | "Action": [
66 | "s3:CreateBucket",
67 | "s3:PutBucketPublicAccessBlock" ],
68 | "Resource": [
69 | "arn:aws:s3:::*"
70 | ]
71 | },
72 | {
73 | "Effect": "Allow",
74 | "Action": [
75 | "logs:GetLogEvents"
76 | ],
77 | "Resource": [
78 | "arn:aws:logs:*:*:/aws-glue/*"
79 | ]
80 | },
81 | {
82 | "Effect": "Allow",
83 | "Action": [
84 | "cloudformation:CreateStack",
85 | "cloudformation:DeleteStack"
86 | ],
87 | "Resource": "arn:aws:cloudformation:*:*:stack/aws-glue*/*"
88 | },
89 | {
90 | "Effect": "Allow",
91 | "Action": [
92 | "ec2:RunInstances"
93 | ],
94 | "Resource": [
95 | "arn:aws:ec2:*:*:instance/*",
96 | "arn:aws:ec2:*:*:key-pair/*",
97 | "arn:aws:ec2:*:*:image/*",
98 | "arn:aws:ec2:*:*:security-group/*",
99 | "arn:aws:ec2:*:*:network-interface/*",
100 | "arn:aws:ec2:*:*:subnet/*",
101 | "arn:aws:ec2:*:*:volume/*"
102 | ]
103 | },
104 | {
105 | "Effect": "Allow",
106 | "Action": [
107 | "ec2:TerminateInstances",
108 | "ec2:CreateTags",
109 | "ec2:DeleteTags"
110 | ],
111 | "Resource": [
112 | "arn:aws:ec2:*:*:instance/*"
113 | ],
114 | "Condition": {
115 | "StringLike": {
116 | "ec2:ResourceTag/aws:cloudformation:stack-id": "arn:aws:cloudformation:*:*:stack/*/*"
117 | },
118 | "StringEquals": {
119 | "ec2:ResourceTag/aws:cloudformation:logical-id": "ZeppelinInstance"
120 | }
121 | }
122 | },
123 | {
124 | "Action": [
125 | "iam:PassRole"
126 | ],
127 | "Effect": "Allow",
128 | "Resource": "arn:aws:iam::*:role/AWSGlueServiceRole*",
129 | "Condition": {
130 | "StringLike": {
131 | "iam:PassedToService": [
132 | "glue.amazonaws.com"
133 | ]
134 | }
135 | }
136 | },
137 | {
138 | "Action": [
139 | "iam:PassRole"
140 | ],
141 | "Effect": "Allow",
142 | "Resource": "arn:aws:iam::*:role/AWSGlueServiceNotebookRole*",
143 | "Condition": {
144 | "StringLike": {
145 | "iam:PassedToService": [
146 | "ec2.amazonaws.com"
147 | ]
148 | }
149 | }
150 | },
151 | {
152 | "Action": [
153 | "iam:PassRole"
154 | ],
155 | "Effect": "Allow",
156 | "Resource": [
157 | "arn:aws:iam::*:role/service-role/AWSGlueServiceRole*"
158 | ],
159 | "Condition": {
160 | "StringLike": {
161 | "iam:PassedToService": [
162 | "glue.amazonaws.com"
163 | ]
164 | }
165 | }
166 | }
167 | ]
168 | }
--------------------------------------------------------------------------------