├── terraform ├── permissions │ ├── role_glueJobs.json │ └── policy_glueJobs.json ├── redshift.tf ├── main.tf ├── backend.tf ├── terraform.tfvars ├── crawler.tf ├── dms-replication-instance-module.tf ├── variables.tf ├── rds_database.tf ├── buckets.tf ├── dms-resources.tf └── iam.tf ├── ingestao-rds ├── model.py └── app.py ├── README.md └── processing └── job-spark-app-emr-redshift.py /terraform/permissions/role_glueJobs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": "sts:AssumeRole", 6 | "Principal": { 7 | "Service": "glue.amazonaws.com" 8 | }, 9 | "Effect": "Allow", 10 | "Sid": "" 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /terraform/redshift.tf: -------------------------------------------------------------------------------- 1 | resource "aws_redshift_cluster" "redshift-cluster" { 2 | cluster_identifier = "redshift-cluster" 3 | database_name = "coins" 4 | master_username = "awsuser" 5 | master_password = "U%SlC7*Y807n" 6 | node_type = "dc2.large" 7 | cluster_type = "single-node" 8 | skip_final_snapshot = true 9 | } -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 4.0" 6 | } 7 | } 8 | } 9 | 10 | # Configure the AWS Provider 11 | provider "aws" { 12 | region = "us-east-1" 13 | } 14 | 15 | provider "aws" { 16 | alias = "region-us-east-2" 17 | region = "us-east-2" 18 | } -------------------------------------------------------------------------------- /terraform/backend.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | backend "s3" { 3 | # Edit the bucket name and region 4 | bucket = "stack-terraform-backend" 5 | key = "global/s3/terraform.tfstate" 6 | region = "us-east-1" 7 | 8 | # Edit the bucket name and region 9 | dynamodb_table = "terraform-locks" 10 | encrypt = true 11 | } 12 | } -------------------------------------------------------------------------------- /terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project_name = "bootcampde" 2 | environment = "prod" 3 | bucket_names = ["raw","processed","curated","scripts"] 4 | db_username = "postgres" 5 | db_password = "Stack2022!" 
key_pair_name = "pair-bootcamp"
db_port = 5432
db_server_name = "mysql-stagingdb.chlqfmqqbzxx.us-east-1.rds.amazonaws.com"
vpc_group_name = "vpc-group-name-dms"
vpc_group_description = "Allow access to environment"
vpc_id = ["vpc-0761bfe5fcd29ef7b"]
subnet_id = ["subnet-0338eec367c32a38d","subnet-033f327fa63c99d4f"]
security_group_id_list = ["sg-0b41a6d6dda8ae746"]
athena_database_name = "default"
--------------------------------------------------------------------------------
/terraform/crawler.tf:
--------------------------------------------------------------------------------

resource "aws_glue_crawler" "crawler_processed" {
  database_name = var.athena_database_name
  name          = "${var.project_name}-crawler-processed-${var.environment}"
  role          = aws_iam_role.glue_job.arn

  delta_target {
    delta_tables   = ["s3://processed-stack-bootcampde/tb_coins/"]
    write_manifest = "true"
  }
}

resource "aws_glue_crawler" "crawler_curated" {
  database_name = var.athena_database_name
  name          = "${var.project_name}-crawler-curated-${var.environment}"
  role          = aws_iam_role.glue_job.arn

  delta_target {
    delta_tables   = ["s3://curated-stack-bootcampde/coins_circulating_supply/","s3://curated-stack-bootcampde/top10_prices_2022/"]
    write_manifest = "true"
  }
}
--------------------------------------------------------------------------------
/terraform/dms-replication-instance-module.tf:
--------------------------------------------------------------------------------
data "aws_partition" "current" {}
data "aws_region" "current" {}
data "aws_caller_identity" "current" {}


module "dms" {
  source  = "terraform-aws-modules/dms/aws"
  version = "~> 1.6"

  # Subnet group
  repl_subnet_group_name        = var.vpc_group_name
  repl_subnet_group_description = var.vpc_group_description
  repl_subnet_group_subnet_ids  = var.subnet_id

  # Instance
  repl_instance_apply_immediately      = true
  repl_instance_multi_az               = false
  repl_instance_class                  = "dms.t3.micro"
  repl_instance_id                     = "${var.project_name}-dms-instance-${var.environment}"
  repl_instance_publicly_accessible    = false
  repl_instance_vpc_security_group_ids = var.security_group_id_list

  depends_on = [aws_s3_bucket.buckets-stack, aws_db_instance.PostgrelSQL-01]
}
--------------------------------------------------------------------------------
/terraform/variables.tf:
--------------------------------------------------------------------------------
variable "environment" {
  description = "Deployment environment (e.g. prod)"
}

variable "project_name" {
  description = "Project name"
}

variable "bucket_names" {
  type = list(string)
}

variable "db_username" {
  type = string
}

variable "db_password" {
  type      = string
  sensitive = true
}
variable "db_port" {
  type = string
}

variable "vpc_group_name" {
  type = string
}
variable "vpc_group_description" {
  type = string
}

variable "subnet_id" {
  type = list(string)
}

variable "vpc_id" {
  type = list(string)
}
variable "security_group_id_list" {
  type = list(string)
}


variable "athena_database_name" {
  type = string
}

variable "key_pair_name" {
  type = string
}


variable "db_server_name" {
  type = string
}
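Note: `db_password` is currently committed in *terraform.tfvars*. If you prefer not to keep credentials in the repository, one option (a sketch; adapt it to your own workflow) is to remove `db_password` from *terraform.tfvars* and export it before running Terraform, since Terraform also reads any environment variable named TF_VAR_ followed by the variable name:

        # supply the password at runtime instead of committing it
        export TF_VAR_db_password='your-password-here'
        terraform plan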

--------------------------------------------------------------------------------
/ingestao-rds/model.py:
--------------------------------------------------------------------------------

from sqlalchemy import create_engine, Column, Integer, String, Float, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Coins(Base):
    __tablename__ = 'tb_coins'  # required when the model inherits from Base
    id = Column(Integer, primary_key=True)  # a primary key is required
    name = Column(String)
    symbol = Column(String)
    data_added = Column(Text)
    last_updated = Column(Text)
    price = Column(Float)
    volume_24h = Column(Float)
    circulating_supply = Column(Float)
    total_supply = Column(Float)
    max_supply = Column(Float)
    percent_change_1h = Column(Float)
    percent_change_24h = Column(Float)
    percent_change_7d = Column(Float)

    @staticmethod
    def start():
        db_string = "postgresql://postgres:*8xayZycAE5m@server01.csddqd4agbf7.us-east-1.rds.amazonaws.com/coins"
        engine = create_engine(db_string)
        Session = sessionmaker(bind=engine)
        session = Session()
        Base.metadata.create_all(engine)
        print('\nTable created on database')
        return session, engine

--------------------------------------------------------------------------------
/terraform/rds_database.tf:
--------------------------------------------------------------------------------
resource "aws_security_group" "access-rds-port" {
  name = "access_rds"

  ingress {
    from_port = 5432
    to_port   = 5432
    protocol  = "tcp"

    # Please restrict your ingress to only necessary IPs and ports.
    # Opening to 0.0.0.0/0 can lead to security vulnerabilities.
    cidr_blocks = ["0.0.0.0/0"]

  }
  tags = {
    security_group = "access-rds-port"
  }
}

resource "aws_db_instance" "PostgrelSQL-01" {
  db_name        = "coins"
  engine         = "postgres"
  engine_version = "13.7"
  instance_class = "db.t3.micro"
  username       = var.db_username
  password       = var.db_password
  #parameter_group_name = "default.mysql5.7"
  skip_final_snapshot = true

  # resource identifier
  identifier = "${var.project_name}-rds-database-${var.environment}"

  # Storage options
  allocated_storage     = 50
  max_allocated_storage = 100

  # allow remote access
  vpc_security_group_ids = [aws_security_group.access-rds-port.id]
  publicly_accessible    = true
}

# output connection information
output "address_endpoint" {
  value = aws_db_instance.PostgrelSQL-01.address
}
output "db_user_admin" {
  value = aws_db_instance.PostgrelSQL-01.username
}
output "instance_port" {
  value = aws_db_instance.PostgrelSQL-01.port
}
--------------------------------------------------------------------------------
/terraform/buckets.tf:
--------------------------------------------------------------------------------
resource "aws_s3_bucket" "buckets-stack" {
  count         = length(var.bucket_names)
  bucket        = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
  force_destroy = true
  tags = {
    Bucket_Name  = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
    environment  = var.environment
    Cost_Center  = "TI"
    Project_Name = var.project_name
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "bucket-encryption" {
  count  = length(var.bucket_names)
  bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
  depends_on = [
    aws_s3_bucket.buckets-stack
  ]
}

resource "aws_s3_bucket_acl" "bucket_acl" {
  count  = length(var.bucket_names)
  bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"
  acl    = "private"
  depends_on = [
    aws_s3_bucket.buckets-stack
  ]

}

resource "aws_s3_bucket_public_access_block" "public_access_block" {
  count  = length(var.bucket_names)
  bucket = "${var.project_name}-${var.bucket_names[count.index]}-${var.environment}"

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
  depends_on = [
    aws_s3_bucket.buckets-stack
  ]
}
--------------------------------------------------------------------------------
/terraform/dms-resources.tf:
--------------------------------------------------------------------------------
resource "aws_dms_endpoint" "postgresql_endpoint" {
  endpoint_id   = "${var.project_name}-endpoint-source-${var.environment}"
  endpoint_type = "source"
  engine_name   = "postgres"
  username      = var.db_username
  password      = var.db_password
  port          = var.db_port
  database_name = aws_db_instance.PostgrelSQL-01.db_name
  server_name   = aws_db_instance.PostgrelSQL-01.address
  ssl_mode      = "none"
  depends_on    = [aws_s3_bucket.buckets-stack, aws_db_instance.PostgrelSQL-01]
}

resource "aws_dms_endpoint" "s3_endpoint" {
  endpoint_id   = "${var.project_name}-endpoint-target-${var.environment}"
  endpoint_type = "target"
  engine_name   = "s3"
  ssl_mode      = "none"
  extra_connection_attributes = "IncludeOpForFullLoad=True;TimestampColumnName=TIMESTAMP;AddColumnName=True"

  s3_settings {
    bucket_name             = aws_s3_bucket_public_access_block.public_access_block[0].bucket
    service_access_role_arn = aws_iam_role.s3_role.arn
    add_column_name         = true
    cdc_path                = "cdc"
    timestamp_column_name   = "TIMESTAMP"
  }
  depends_on = [aws_s3_bucket.buckets-stack]
}

resource "aws_dms_replication_task" "replication-task1" {
  migration_type           = "full-load"
  replication_instance_arn = module.dms.replication_instance_arn
  replication_task_id      = "${var.project_name}-replication-task-${var.environment}"
  source_endpoint_arn      = aws_dms_endpoint.postgresql_endpoint.endpoint_arn
  target_endpoint_arn      = aws_dms_endpoint.s3_endpoint.endpoint_arn
  table_mappings           = "{\"rules\":[{\"rule-type\":\"selection\",\"rule-id\":\"1\",\"rule-name\":\"1\",\"object-locator\":{\"schema-name\":\"%\",\"table-name\":\"%\"},\"rule-action\":\"include\"}]}"
  tags = {
    Name = "${var.project_name}-replication-task-${var.environment}"
  }
  depends_on = [aws_s3_bucket.buckets-stack]
}

--------------------------------------------------------------------------------
/terraform/iam.tf:
--------------------------------------------------------------------------------
resource "aws_iam_role" "glue_job" {
  name               = "${var.project_name}-glue-job-role"
  path               = "/"
  description        = "Provides write permissions to CloudWatch Logs and S3 Full Access"
  assume_role_policy = file("permissions/role_glueJobs.json")
}

resource "aws_iam_policy" "glue_job_policy" {
  name        = "${var.project_name}-glue-job-policy"
  path        = "/"
  description = "Provides write permissions to CloudWatch Logs and S3 Full Access"
  policy      = file("permissions/policy_glueJobs.json")
}

resource "aws_iam_role_policy_attachment" "glue_job1" {
  role       = aws_iam_role.glue_job.name
  policy_arn = aws_iam_policy.glue_job_policy.arn
}

resource "aws_iam_role" "s3_role" {
  name        = "dms-s3-role"
  description = "Role used by DMS to write migrated data to S3"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "DMSAssume"
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "dms.${data.aws_partition.current.dns_suffix}"
        }
      },
    ]
  })

  inline_policy {
    name = "dms-s3-role"

    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Sid      = "DMSS3"
          Action   = ["s3:*"]
          Effect   = "Allow"
          Resource = "*"
        }
      ]
    })
  }
}

data "aws_iam_policy_document" "dms_assume_role" {
  statement {
    actions = ["sts:AssumeRole"]

    principals {
      identifiers = ["dms.amazonaws.com"]
      type        = "Service"
    }
  }
}

resource "aws_iam_role" "dms-vpc-role" {
  assume_role_policy = data.aws_iam_policy_document.dms_assume_role.json
  name               = "dmsvpc-role"
}


resource "aws_iam_role" "dms-cloudwatch-logs-role" {
  assume_role_policy = data.aws_iam_policy_document.dms_assume_role.json
  name               = "dms-cloudwatch-logs-role1"
}

resource "aws_iam_role_policy_attachment" "dms-cloudwatch-logs-role-AmazonDMSCloudWatchLogsRole" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonDMSCloudWatchLogsRole"
  role       = aws_iam_role.dms-cloudwatch-logs-role.name
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Data Engineering Bootcamp

### To run the ingestion application:

1. Provision the RDS PostgreSQL instance on AWS, as covered in class.
2. Create a database, for example: coins
3. Configure model.py with the name of the table to be created, for example: tb_coins.
4. Edit the `db_string` variable with the RDS endpoint on AWS.
5. Edit app.py, passing the API key as the `key` argument of the get_data function.
6. Run the application to consume the API and persist the data in the database.


### To run the Spark application:
1. Start the Amazon EMR cluster.

2. Go to the `processing` directory.

3. Copy the application to the server using `scp`, for example:

        scp -i ~/Downloads/pair-bootcamp.pem job-spark-app-emr-redshift.py hadoop@ec2-54-90-3-194.compute-1.amazonaws.com:/home/hadoop/

4. Connect to the master node using `ssh`, for example:

        ssh -i ~/Downloads/pair-bootcamp.pem hadoop@ec2-54-90-3-194.compute-1.amazonaws.com

*Note*: Before running the application, check that the Redshift cluster is up; if it is not, edit the application and set the `flag_write_redshift` variable to False.

5. Run the spark-submit command, for example:

        spark-submit --packages io.delta:delta-core_2.12:2.0.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" --jars /usr/share/aws/redshift/jdbc/RedshiftJDBC.jar,/usr/share/aws/redshift/spark-redshift/lib/spark-redshift.jar,/usr/share/aws/redshift/spark-redshift/lib/spark-avro.jar,/usr/share/aws/redshift/spark-redshift/lib/minimal-json.jar job-spark-app-emr-redshift.py


### To provision resources with Terraform:
1. Go to the `terraform` directory.

2. Install `terraform`.

3. Install the `aws-cli`. See: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html

4. Authenticate to AWS with:

        aws configure

5. Before provisioning the resources, create the S3 bucket used as the state *backend* and the DynamoDB lock table (an example is shown right after this README).

6. Edit *terraform.tfvars* and *variables.tf* with the details of your infrastructure.

7. Provision the resources with:

        terraform init

        terraform plan

        terraform apply
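If you have not created the backend resources of step 5 yet, one way to do it — a minimal sketch using the AWS CLI and the names already configured in `backend.tf` (bucket `stack-terraform-backend`, table `terraform-locks`, region `us-east-1`) — is:

        # create the S3 bucket that will store the Terraform state
        aws s3api create-bucket --bucket stack-terraform-backend --region us-east-1

        # enable versioning so earlier state files can be recovered
        aws s3api put-bucket-versioning --bucket stack-terraform-backend --versioning-configuration Status=Enabled

        # create the DynamoDB table used for state locking (LockID is the hash key the S3 backend expects)
        aws dynamodb create-table --table-name terraform-locks --attribute-definitions AttributeName=LockID,AttributeType=S --key-schema AttributeName=LockID,KeyType=HASH --billing-mode PAY_PER_REQUEST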
--------------------------------------------------------------------------------
/processing/job-spark-app-emr-redshift.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# set up the Spark application
spark = SparkSession \
    .builder \
    .appName("job-1-spark") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\
    .getOrCreate()

# set the application log level; use INFO only for DEV [INFO, ERROR]
spark.sparkContext.setLogLevel("ERROR")

def read_csv(bucket, path):
    # read the data from the Data Lake
    df = spark.read.format("csv")\
        .option("header", "True")\
        .option("inferSchema","True")\
        .csv(f"{bucket}/{path}")
    # show the data read from the raw zone
    print ("\nImprime os dados lidos da raw:")
    df.show(5)
    # print the dataframe schema
    print ("\nImprime o schema do dataframe lido da raw:")
    df.printSchema()
    return df

def read_delta(bucket, path):
    df = spark.read.format("delta")\
        .load(f"{bucket}/{path}")
    return df

def write_processed(bucket, path, col_partition, data_format, mode):
    # write the data read from raw as Delta into the processing zone (uses the global df read above)
    print ("\nEscrevendo os dados lidos da raw para delta na processing zone...")
    try:
        df.write.format(data_format)\
            .partitionBy(col_partition)\
            .mode(mode)\
            .save(f"{bucket}/{path}")
        print (f"Dados escritos na processed com sucesso!")
        return 0
    except Exception as err:
        print (f"Falha para escrever dados na processed: {err}")
        return 1

def write_curated(bucket, path, dataframe, data_format, mode):
    # write the processed data to the curated zone
    print ("\nEscrevendo os dados na curated zone...")
    try:
        dataframe.write.format(data_format)\
            .mode(mode)\
            .save(f"{bucket}/{path}")
        print (f"Dados escritos na curated com sucesso!")
        return 0
    except Exception as err:
        print (f"Falha para escrever dados na curated: {err}")
        return 1

def write_redshift(url_jdbc, table_name, dataframe):
    try:
        dataframe.write.format("jdbc")\
            .options(url=url_jdbc,
                     driver='com.amazon.redshift.jdbc42.Driver',
                     user='awsuser',
                     password='U%SlC7*Y807n',
                     dbtable=table_name
                     )\
            .mode('overwrite')\
            .save()
        print (f"Dados escritos no Redshift com sucesso!")
        return 0
    except Exception as err:
        print (f"Falha para escrever dados no Redshift: {err}")
        return 1


def analytics_tables(bucket, dataframe, table_name, flag_write_redshift, url_jdbc):
    # create a view to work with SQL
    dataframe.createOrReplaceTempView(table_name)
    # process the data according to the business rules
    df_query1 = dataframe.groupBy("name") \
        .agg(sum("circulating_supply").alias("circulating_supply")) \
        .sort(desc("circulating_supply")) \
        .limit(10)
    df_query2 = dataframe.select(col('name'),col('symbol'),col('price'))\
        .sort(desc("price"))\
        .limit(10)
    # show the resulting dataframes
    print ("\n Top 10 Cryptomoedas com maior fornecimento de circulação no mercado\n")
    df_query1.show()
    print ("\n Top 10 Cryptomoedas com preços mais altos de 2022\n")
    df_query2.show()
    write_curated(bucket,"coins_circulating_supply",df_query1,"delta","overwrite")
    write_curated(bucket,"top10_prices_2022",df_query2,"delta","overwrite")

    if flag_write_redshift:
        write_redshift(url_jdbc, "coins_circulating_supply", df_query1)
        write_redshift(url_jdbc,"top10_prices_2022",df_query2)


# Read the data from the raw zone
df = read_csv('s3a://raw-stack-bootcampde','public/tb_coins/')

# Create a year column to partition the data
df = df.withColumn("year", year(df.data_added))

# Process the data and write it to the processed layer
write_processed("s3a://processed-stack-bootcampde","tb_coins","year","delta","overwrite")

# Read the data from the processed layer and write it to the curated layer
df = read_delta("s3a://processed-stack-bootcampde","tb_coins")

flag_write_redshift = True
url_jdbc = "jdbc:redshift://redshift-cluster-1.cufcxu0ztur8.us-east-1.redshift.amazonaws.com:5439/dev"
analytics_tables("s3a://curated-stack-bootcampde",df,"tb_coins", flag_write_redshift, url_jdbc)

# stop the application
spark.stop()
--------------------------------------------------------------------------------
/ingestao-rds/app.py:
--------------------------------------------------------------------------------
import json

import pandas as pd
from requests import Session
from sqlalchemy.ext.declarative import declarative_base

from model import Coins

def check_if_valid_data(df: pd.DataFrame) -> bool:

    # Check if dataframe is empty
    if df.empty:
        print("\nDataframe empty. Finishing execution")
        return False

    # Check for nulls
    if df.symbol.isnull().any():
        raise Exception("\nSymbol is Null or the value is empty")

    # Check for nulls
    if df.price.isnull().any():
        raise Exception("\nPrice is Null or the value is empty")

    # Check for nulls
    if df.data_added.isnull().any():
        raise Exception("\ndata_added is Null or the value is empty")

    return True

def load_data(table_name, coins_df, session_db, engine_db):

    # validate
    if check_if_valid_data(coins_df):
        print("\nData valid, proceed to Load stage")

    # load data on database
    try:
        coins_df.to_sql(table_name, engine_db, index=False, if_exists='append')
        print ('\nData Loaded on Database')

    except Exception as err:
        print(f"\nFailed to load data into database: {err}")

    session_db.commit()
    session_db.close()
    print("\nDatabase session closed successfully")
    return session_db

def get_data(session_db, engine_db, start, limit, convert, key, url):

    # set the amount of data requested from the API
    parameters = {
        'start': start,
        'limit': limit,
        'convert': convert
    }

    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': key,
    }

    session = Session()
    session.headers.update(headers)

    name = []
    symbol = []
    data_added = []
    last_updated = []
    price = []
    volume_24h = []
    circulating_supply = []
    total_supply = []
    max_supply = []
    percent_change_1h = []
    percent_change_24h = []
    percent_change_7d = []

    try:
        response = session.get(url, params=parameters)
        data = json.loads(response.text)

        print ('\n')
        for coin in data['data']:
            name.append(coin['name'])
            symbol.append(coin['symbol'])
            data_added.append(coin['date_added'])
            last_updated.append(coin['last_updated'])
            circulating_supply.append(coin['circulating_supply'])
            total_supply.append(coin['total_supply'])
            max_supply.append(coin['max_supply'])
            price.append(coin['quote']['USD']['price'])
            volume_24h.append(coin['quote']['USD']['volume_24h'])
            percent_change_1h.append(coin['quote']['USD']['percent_change_1h'])
            percent_change_24h.append(coin['quote']['USD']['percent_change_24h'])
            percent_change_7d.append(coin['quote']['USD']['percent_change_7d'])


        # Prepare a dictionary in order to turn it into a pandas dataframe below
        coin_dict = {
            "name" : name,
            "symbol": symbol,
            "data_added" : data_added,
            "last_updated" : last_updated,
            "price": price,
            "volume_24h": volume_24h,
            "circulating_supply" : circulating_supply,
            "total_supply": total_supply,
            "max_supply": max_supply,
            "percent_change_1h": percent_change_1h,
            "percent_change_24h": percent_change_24h,
            "percent_change_7d": percent_change_7d

        }
    except Exception as e:
        print (f'Error getting data from the API: {e}')
        exit(1)

    # create dataframe to structure data
    coins_df = pd.DataFrame(coin_dict, columns = ["name", "symbol", "data_added", "last_updated","price","volume_24h","circulating_supply","total_supply","max_supply","percent_change_1h","percent_change_24h","percent_change_7d"])
    print ("Data on Pandas Dataframe:\n")
    print(coins_df.head(100))

    # call the function to load data on database
    load_data('tb_coins',coins_df,
session_db, engine_db) 132 | 133 | # Declaration base 134 | Base = declarative_base() 135 | 136 | # Make the coin table 137 | get_session_db, get_engine = Coins.start() 138 | 139 | # call the get_data function and load data on database 140 | get_data(get_session_db, 141 | get_engine, 142 | '1', 143 | '5000', 144 | 'USD', 145 | '7bdc01a6-f004-4c0b-b21c-1c1d3970352f', 146 | 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest') -------------------------------------------------------------------------------- /terraform/permissions/policy_glueJobs.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "glue:*", 8 | "redshift:DescribeClusters", 9 | "redshift:DescribeClusterSubnetGroups", 10 | "iam:ListRoles", 11 | "iam:ListUsers", 12 | "iam:ListGroups", 13 | "iam:ListRolePolicies", 14 | "iam:GetRole", 15 | "iam:GetRolePolicy", 16 | "iam:ListAttachedRolePolicies", 17 | "ec2:DescribeSecurityGroups", 18 | "ec2:DescribeSubnets", 19 | "ec2:DescribeVpcs", 20 | "ec2:DescribeVpcEndpoints", 21 | "ec2:DescribeRouteTables", 22 | "ec2:DescribeVpcAttribute", 23 | "ec2:DescribeKeyPairs", 24 | "ec2:DescribeInstances", 25 | "rds:DescribeDBInstances", 26 | "rds:DescribeDBClusters", 27 | "rds:DescribeDBSubnetGroups", 28 | "s3:ListAllMyBuckets", 29 | "s3:ListBucket", 30 | "s3:GetBucketAcl", 31 | "s3:GetBucketLocation", 32 | "cloudformation:DescribeStacks", 33 | "cloudformation:GetTemplateSummary", 34 | "dynamodb:ListTables", 35 | "kms:ListAliases", 36 | "kms:DescribeKey", 37 | "cloudwatch:GetMetricData", 38 | "cloudwatch:ListDashboards" 39 | ], 40 | "Resource": [ 41 | "*" 42 | ] 43 | }, 44 | { 45 | "Effect": "Allow", 46 | "Action": [ 47 | "s3:GetObject", 48 | "s3:PutObject" 49 | ], 50 | "Resource": [ 51 | "arn:aws:s3:::*" 52 | ] 53 | }, 54 | { 55 | "Effect": "Allow", 56 | "Action": [ 57 | "tag:GetResources" 58 | ], 59 | "Resource": [ 60 | "*" 61 | ] 62 | }, 63 | { 64 | "Effect": "Allow", 65 | "Action": [ 66 | "s3:CreateBucket", 67 | "s3:PutBucketPublicAccessBlock" ], 68 | "Resource": [ 69 | "arn:aws:s3:::*" 70 | ] 71 | }, 72 | { 73 | "Effect": "Allow", 74 | "Action": [ 75 | "logs:GetLogEvents" 76 | ], 77 | "Resource": [ 78 | "arn:aws:logs:*:*:/aws-glue/*" 79 | ] 80 | }, 81 | { 82 | "Effect": "Allow", 83 | "Action": [ 84 | "cloudformation:CreateStack", 85 | "cloudformation:DeleteStack" 86 | ], 87 | "Resource": "arn:aws:cloudformation:*:*:stack/aws-glue*/*" 88 | }, 89 | { 90 | "Effect": "Allow", 91 | "Action": [ 92 | "ec2:RunInstances" 93 | ], 94 | "Resource": [ 95 | "arn:aws:ec2:*:*:instance/*", 96 | "arn:aws:ec2:*:*:key-pair/*", 97 | "arn:aws:ec2:*:*:image/*", 98 | "arn:aws:ec2:*:*:security-group/*", 99 | "arn:aws:ec2:*:*:network-interface/*", 100 | "arn:aws:ec2:*:*:subnet/*", 101 | "arn:aws:ec2:*:*:volume/*" 102 | ] 103 | }, 104 | { 105 | "Effect": "Allow", 106 | "Action": [ 107 | "ec2:TerminateInstances", 108 | "ec2:CreateTags", 109 | "ec2:DeleteTags" 110 | ], 111 | "Resource": [ 112 | "arn:aws:ec2:*:*:instance/*" 113 | ], 114 | "Condition": { 115 | "StringLike": { 116 | "ec2:ResourceTag/aws:cloudformation:stack-id": "arn:aws:cloudformation:*:*:stack/*/*" 117 | }, 118 | "StringEquals": { 119 | "ec2:ResourceTag/aws:cloudformation:logical-id": "ZeppelinInstance" 120 | } 121 | } 122 | }, 123 | { 124 | "Action": [ 125 | "iam:PassRole" 126 | ], 127 | "Effect": "Allow", 128 | "Resource": "arn:aws:iam::*:role/AWSGlueServiceRole*", 129 | "Condition": { 130 | 
"StringLike": { 131 | "iam:PassedToService": [ 132 | "glue.amazonaws.com" 133 | ] 134 | } 135 | } 136 | }, 137 | { 138 | "Action": [ 139 | "iam:PassRole" 140 | ], 141 | "Effect": "Allow", 142 | "Resource": "arn:aws:iam::*:role/AWSGlueServiceNotebookRole*", 143 | "Condition": { 144 | "StringLike": { 145 | "iam:PassedToService": [ 146 | "ec2.amazonaws.com" 147 | ] 148 | } 149 | } 150 | }, 151 | { 152 | "Action": [ 153 | "iam:PassRole" 154 | ], 155 | "Effect": "Allow", 156 | "Resource": [ 157 | "arn:aws:iam::*:role/service-role/AWSGlueServiceRole*" 158 | ], 159 | "Condition": { 160 | "StringLike": { 161 | "iam:PassedToService": [ 162 | "glue.amazonaws.com" 163 | ] 164 | } 165 | } 166 | } 167 | ] 168 | } --------------------------------------------------------------------------------