├── .gitignore ├── Makefile ├── README.md ├── _datasets ├── .gitkeep ├── airbnb │ └── .gitkeep └── ecommerce │ └── .gitkeep ├── _dockerfile ├── docker-metastore │ ├── Dockerfile │ ├── conf │ │ └── hive-site.xml │ └── scripts │ │ └── entrypoint.sh └── docker-presto │ ├── Dockerfile │ ├── etc │ ├── catalog │ │ ├── hive.properties │ │ ├── iceberg.properties │ │ └── tpch.properties │ ├── config.properties │ ├── jvm.config │ ├── log.properties │ └── node.properties │ └── scripts │ └── entrypoint.sh ├── _notebook ├── kafka-basic.ipynb ├── spark-jdbc-basic.ipynb ├── spark-metastore-local.ipynb ├── spark-metastore-remote.ipynb └── spark-streaming-data.ipynb ├── _script ├── docker-mysql │ ├── conf │ │ └── my.cnf │ └── sql │ │ ├── 001_create_database.sql │ │ └── 002_create_table.sql └── docker-spark │ ├── apps │ ├── main.py │ └── postgresql-42.2.22.jar │ ├── conf │ └── spark-defaults.conf │ └── data │ └── .gitignore ├── _slide └── .gitignore ├── _volume └── .gitignore ├── docker-compose.aws.yml ├── docker-compose.kafka.yml ├── docker-compose.metastore.yml ├── docker-compose.presto.yml ├── docker-compose.spark.yml ├── docker-compose.storage.yml ├── project-flink └── .gitignore ├── project-kafka └── .gitignore ├── project-spark ├── .gitignore ├── _scripts │ └── mysql-ddl │ │ └── table_property_stat.sql ├── build.gradle ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── module-core │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ └── scala │ │ └── mkt │ │ └── udon │ │ └── core │ │ ├── common │ │ ├── Environment.scala │ │ └── TimeUtil.scala │ │ └── entity │ │ ├── ProductPool.scala │ │ ├── UserEvent.scala │ │ └── UserProfile.scala ├── module-infra-spark │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ └── scala │ │ └── mkt │ │ └── udon │ │ └── infra │ │ └── spark │ │ ├── SparkBase.scala │ │ ├── common │ │ └── Partition.scala │ │ └── storage │ │ ├── DynamoSink.scala │ │ ├── JdbcSink.scala │ │ └── ParquetSink.scala ├── service-batch-discovery │ ├── Makefile │ ├── VERSION │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ ├── resources │ │ ├── .gitignore │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── mkt │ │ └── udon │ │ ├── UdonProductPoolBatch.scala │ │ ├── config │ │ └── UdonProductPoolBatchConfig.scala │ │ └── entity │ │ └── UdonProductPoolEntity.scala ├── service-batch-statistics │ ├── Makefile │ ├── VERSION │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ ├── resources │ │ ├── .gitignore │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── mkt │ │ └── udon │ │ ├── UdonStatBatch.scala │ │ ├── config │ │ └── UdonStatBatchConfig.scala │ │ └── entity │ │ └── UdonStatEntity.scala ├── service-stream-profile │ ├── Makefile │ ├── VERSION │ ├── build.gradle │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── mkt │ │ └── udon │ │ ├── UdonProfileStream.scala │ │ ├── UdonRelayStream.scala │ │ ├── config │ │ ├── UdonProfileStreamConfig.scala │ │ └── 
UdonRelayStreamConfig.scala │ │ └── entity │ │ └── UdonProfileStateFunc.scala └── settings.gradle ├── project-terraform-aws ├── .gitignore ├── _aws-root-iam │ ├── .gitkeep │ ├── _local.tf │ ├── _output.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_iam_common.tf │ └── module-iam-common │ │ ├── _data.tf │ │ ├── _output.tf │ │ ├── _variable.tf │ │ ├── common.basic.iam.tf │ │ ├── common.ec2.iam.tf │ │ ├── common.ec2.profile.tf │ │ ├── common.emr.iam.tf │ │ └── common.emr.profile.tf ├── _aws-root-sg │ ├── .gitkeep │ ├── _data.tf │ ├── _local.tf │ ├── _output.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_sg_data_dev.tf │ └── module-sg-data-dev │ │ ├── _output.tf │ │ ├── _variable.tf │ │ ├── dev.bastion-public.sg.tf │ │ ├── dev.emr-master.sg.tf │ │ ├── dev.emr-serivce.sg.tf │ │ ├── dev.emr-slave.sg.tf │ │ └── dev.rds.sg.tf ├── _aws-root-vpc │ ├── _local.tf │ ├── _output.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_vpc_data_dev.tf │ └── module-vpc-data-dev │ │ ├── _output.tf │ │ ├── _variable.tf │ │ └── dev.data.vpc.tf ├── aws-root-machine-bastion │ ├── _data.ami.tf │ ├── _data.state.tf │ ├── _local.tf │ ├── _provider.tf │ ├── _template │ │ └── template.cloudwatch.sh │ ├── _terraform.tf │ ├── main_bastion_dev.tf │ └── module-bastion-data-dev │ │ ├── _data.bootstrap.tf │ │ ├── _local.tf │ │ ├── _variable.tf │ │ ├── dev.bastion-public-01.cw.tf │ │ └── dev.bastion-public-01.ec2.tf ├── aws-root-machine-eks │ ├── .gitkeep │ ├── _local.tf │ ├── _provider.tf │ └── _terraform.tf ├── aws-root-machine-emr-batch │ ├── .gitkeep │ ├── _data.state.tf │ ├── _local.tf │ ├── _provider.tf │ ├── _template │ │ ├── template.emr-cloudwatch-collect.sh │ │ ├── template.emr-instance-tag.sh │ │ ├── template.emr-spark-batch.json │ │ └── template.emr-system-config.sh │ ├── _terraform.tf │ ├── main_emr_data_dev.tf │ └── module-emr-data-dev │ │ ├── _local.tf │ │ ├── _variable.tf │ │ ├── dev.spark-batch-01.cw.tf │ │ └── dev.spark-batch-01.emr.tf ├── aws-root-machine-emr-presto │ └── .gitkeep ├── aws-root-machine-emr-stream │ └── .gitkeep └── aws-root-storage-rds │ ├── .gitkeep │ ├── _data.state.tf │ ├── _local.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_rds_data_dev.tf │ └── module-rds-data-dev │ ├── _variable.tf │ └── dev.hive-metastore.rds.tf └── project-terraform-gcp └── .gitkeep /.gitignore: -------------------------------------------------------------------------------- 1 | ./idea 2 | .DS_Store 3 | _assets/ 4 | !.gitkeep 5 | _datasets/airbnb/*.csv 6 | _datasets/ecommerce/*.csv 7 | 8 | */.ipynb_checkpoints/ 9 | 10 | derby.log 11 | metastore_db -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | MYSQLCLIENT = mycli 4 | DOCKER_HOST_IP := $(shell ipconfig getifaddr en0) 5 | 6 | ## 7 | ## Jupyter 8 | ## 9 | 10 | .PHONY: jupyter 11 | jupyter: 12 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Preparing docker-compose" 13 | @ echo "-----------------------------------------\n" 14 | @ jupyter lab --ip=127.0.0.1 --port=8080 15 | 16 | ## 17 | ## Compose 18 | ## 19 | 20 | .PHONY: compose.prepare 21 | compose.prepare: 22 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Preparing docker-compose" 23 | @ echo "-----------------------------------------\n" 24 | @ echo "export DOCKER_HOST_IP=$(DOCKER_HOST_IP)" 25 | @ echo "\n-----------------------------------------" 26 | @ echo "" 27 | 28 | .PHONY: compose.storage 29 | compose.storage: 
compose.prepare 30 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 31 | @ docker stop `docker ps -a -q` || true 32 | @ docker rm -f `docker ps -a -q` || true 33 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 34 | @ docker compose -f docker-compose.storage.yml rm -fsv || true 35 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 36 | -f docker-compose.storage.yml \ 37 | up 38 | 39 | .PHONY: compose.spark 40 | compose.spark: compose.prepare 41 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 42 | @ docker stop `docker ps -a -q` || true 43 | @ docker rm -f `docker ps -a -q` || true 44 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 45 | @ docker compose -f docker-compose.spark.yml rm -fsv || true 46 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 47 | -f docker-compose.spark.yml \ 48 | up 49 | 50 | .PHONY: compose.kafka 51 | compose.kafka: compose.prepare 52 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 53 | @ docker stop `docker ps -a -q` || true 54 | @ docker rm -f `docker ps -a -q` || true 55 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 56 | @ docker compose -f docker-compose.kafka.yml rm -fsv || true 57 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 58 | -f docker-compose.kafka.yml \ 59 | up 60 | 61 | .PHONY: compose.metastore 62 | compose.metastore: compose.prepare 63 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 64 | @ docker stop `docker ps -a -q` || true 65 | @ docker rm -f `docker ps -a -q` || true 66 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 67 | @ docker compose -f docker-compose.metastore.yml rm -fsv || true 68 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 69 | -f docker-compose.metastore.yml \ 70 | up --build 71 | 72 | .PHONY: compose.presto 73 | compose.presto: compose.prepare 74 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 75 | @ docker stop `docker ps -a -q` || true 76 | @ docker rm -f `docker ps -a -q` || true 77 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 78 | @ docker compose -f docker-compose.presto.yml rm -fsv || true 79 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 80 | -f docker-compose.presto.yml \ 81 | up --build 82 | 83 | .PHONY: compose.aws 84 | compose.aws: compose.prepare 85 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 86 | @ docker stop `docker ps -a -q` || true 87 | @ docker rm -f `docker ps -a -q` || true 88 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 89 | @ docker compose -f docker-compose.aws.yml rm -fsv || true 90 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 91 | -f docker-compose.aws.yml \ 92 | up --build 93 | 94 | .PHONY: compose.clean 95 | compose.clean: 96 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Starting: Cleaning docker resources" 97 | @ echo "-----------------------------------------\n" 98 | @ docker stop `docker ps -a -q` || true 99 | @ docker rm -f `docker ps -a -q` || true 100 | @ docker rmi -f `docker images --quiet --filter "dangling=true"` || true 101 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 102 | @ rm -rf ./docker-volumes 103 | @ docker network rm `docker network ls -q` || true 104 | @ echo "" 105 | @ rm -rf metastore_db 106 | @ echo "\n-----------------------------------------" 107 | @ echo
"[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Finished: Cleaning docker resources" 108 | 109 | .PHONY: compose.storage-all 110 | compose.storage-all: compose.storage-all 111 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 112 | @ docker stop $(docker ps -a -q) || true 113 | @ docker rm -f $(docker ps -a -q) || true 114 | @ docker volume rm $(docker volume ls -f dangling=true -q) || true 115 | @ docker compose -f docker-compose.aws.yml rm -fsv || true 116 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 117 | -f docker-compose.storage.yml \ 118 | -f docker-compose.aws.yml \ 119 | -f docker-compose.kafka.yml \ 120 | up --build 121 | 122 | ## 123 | ## Storage CLIs 124 | ## 125 | 126 | .PHONY: mysql 127 | mysql: 128 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Connecting to mysql" 129 | @ $(MYSQLCLIENT) -u root -h localhost ad_stat -p root 130 | 131 | .PHONY: redis 132 | redis: 133 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Connecting to redis" 134 | @ redis-cli -a credential 135 | 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Practical Data Pipeline (Code) 2 | 3 | -------------------------------------------------------------------------------- /_datasets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/.gitkeep -------------------------------------------------------------------------------- /_datasets/airbnb/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/airbnb/.gitkeep -------------------------------------------------------------------------------- /_datasets/ecommerce/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/ecommerce/.gitkeep -------------------------------------------------------------------------------- /_dockerfile/docker-metastore/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8u242-jre 2 | 3 | WORKDIR /opt 4 | 5 | ENV HADOOP_VERSION=2.10.1 6 | ENV METASTORE_VERSION=2.3.9 7 | ENV AWS_SDK_VERSION=1.11.271 8 | 9 | ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION} 10 | ENV HIVE_HOME=/opt/apache-hive-${METASTORE_VERSION}-bin 11 | ENV HADOOP_CLASSPATH=/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar:/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar 12 | 13 | # BIN 14 | RUN apt-get update && \ 15 | apt-get upgrade -y && \ 16 | apt-get -qqy install curl && \ 17 | curl -L https://dlcdn.apache.org/hive/hive-${METASTORE_VERSION}/apache-hive-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \ 18 | curl -L https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \ 19 | apt-get install --only-upgrade openssl libssl1.1 && \ 20 | apt-get install -y libk5crypto3 libkrb5-3 libsqlite3-0 21 | 22 | # DEPENDENCY 23 | RUN rm ${HIVE_HOME}/lib/postgresql-9.4.1208.jre7.jar 24 | RUN curl -o 
${HIVE_HOME}/lib/postgresql-9.4.1212.jre7.jar -L https://jdbc.postgresql.org/download/postgresql-9.4.1212.jre7.jar 25 | RUN curl -L https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-8.0.19.tar.gz | tar zxf - && \ 26 | cp mysql-connector-java-8.0.19/mysql-connector-java-8.0.19.jar ${HIVE_HOME}/lib/ && \ 27 | rm -rf mysql-connector-java-8.0.19 28 | 29 | # CONFIG 30 | COPY conf/hive-site.xml ${HIVE_HOME}/conf/hive-site.xml 31 | RUN ls -alh ${HADOOP_HOME}/etc/hadoop/ 32 | RUN ls -alh ${HIVE_HOME}/conf/ 33 | COPY scripts/entrypoint.sh /entrypoint.sh 34 | 35 | # UTILS 36 | ENV TINI_VERSION v0.19.0 37 | RUN apt-get -q update && apt-get -qy install netcat wget 38 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini 39 | RUN chmod +x /tini 40 | 41 | # ENV 42 | ENV TZ=Asia/Seoul 43 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 44 | 45 | # USER 46 | RUN groupadd -r hadoop --gid=1001 && \ 47 | useradd -r -g hadoop --uid=1001 -d ${HIVE_HOME} hadoop && \ 48 | chown hadoop:hadoop -R ${HIVE_HOME} 49 | 50 | USER hadoop 51 | WORKDIR $HIVE_HOME 52 | EXPOSE 9083 53 | 54 | ENTRYPOINT ["/tini", "--"] 55 | CMD ["/entrypoint.sh"] -------------------------------------------------------------------------------- /_dockerfile/docker-metastore/conf/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | hive.metastore.schema.verification 4 | false 5 | 6 | 7 | metastore.warehouse.dir 8 | s3a://spark/warehouse/ 9 | 10 | 11 | javax.jdo.option.ConnectionDriverName 12 | com.mysql.cj.jdbc.Driver 13 | 14 | 15 | 16 | javax.jdo.option.ConnectionURL 17 | jdbc:mysql://mysql:3306/metastore_db?createDatabaseIfNotExist=true 18 | 19 | 20 | 21 | javax.jdo.option.ConnectionUserName 22 | root 23 | 24 | 25 | 26 | javax.jdo.option.ConnectionPassword 27 | root 28 | 29 | 30 | 31 | fs.s3a.access.key 32 | accesskey 33 | 34 | 35 | fs.s3a.secret.key 36 | secretkey 37 | 38 | 39 | fs.s3a.endpoint 40 | http://minio:9000 41 | 42 | 43 | fs.s3a.path.style.access 44 | true 45 | 46 | 47 | fs.s3a.connection.ssl.enabled 48 | false 49 | Enables or disables SSL connections to S3. 
50 | 51 | 52 | fs.s3a.impl 53 | org.apache.hadoop.fs.s3a.S3AFileSystem 54 | The implementation class of the S3A Filesystem 55 | 56 | 57 | -------------------------------------------------------------------------------- /_dockerfile/docker-metastore/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HADOOP_VERSION=2.10.1 4 | export METASTORE_VERSION=2.3.9 5 | export AWS_SDK_VERSION=1.11.271 6 | 7 | export JAVA_HOME=/usr/local/openjdk-8 8 | export HADOOP_CLASSPATH=/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar:/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar 9 | 10 | sleep 10; 11 | 12 | /opt/apache-hive-${METASTORE_VERSION}-bin/bin/schematool -initSchema -dbType mysql || true; 13 | /opt/apache-hive-${METASTORE_VERSION}-bin/bin/hive --service metastore -------------------------------------------------------------------------------- /_dockerfile/docker-presto/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | 3 | 4 | ARG _PRESTO_HOME=/opt/presto 5 | ARG _PRESTO_VERSION=0.265.1 6 | ENV PRESTO_VERSION=${_PRESTO_VERSION} 7 | 8 | RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz 9 | RUN mkdir -p /opt || true 10 | RUN tar -xf presto-server-${PRESTO_VERSION}.tar.gz -C /opt 11 | RUN rm presto-server-${PRESTO_VERSION}.tar.gz 12 | RUN ln -s /opt/presto-server-${PRESTO_VERSION} ${_PRESTO_HOME} 13 | 14 | RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar 15 | RUN mv presto-cli-${PRESTO_VERSION}-executable.jar /usr/local/bin/presto 16 | RUN chmod +x /usr/local/bin/presto 17 | 18 | # UTILS 19 | ENV TINI_VERSION v0.19.0 20 | RUN apt-get update && apt-get install -y wget python less telnet vim zsh netcat 21 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini 22 | RUN chmod +x /tini 23 | 24 | # ENV 25 | ENV TZ=Asia/Seoul 26 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 27 | 28 | # CONFIG 29 | COPY scripts/entrypoint.sh /entrypoint.sh 30 | COPY etc/jvm.config ${_PRESTO_HOME}/etc/jvm.config 31 | 32 | # USER 33 | RUN groupadd -r hadoop --gid=1001 && \ 34 | useradd -r -g hadoop --uid=1001 -d ${_PRESTO_HOME} hadoop && \ 35 | chown hadoop:hadoop -R ${_PRESTO_HOME} 36 | 37 | RUN mkdir -p /var/presto && \ 38 | chown hadoop:hadoop -R /var/presto && \ 39 | chown hadoop:hadoop -R /opt/presto-server-${PRESTO_VERSION} && \ 40 | chown hadoop:hadoop -R ${_PRESTO_HOME}/etc 41 | 42 | USER hadoop 43 | WORKDIR ${_PRESTO_HOME} 44 | EXPOSE 8080 45 | 46 | ENTRYPOINT ["/tini", "--"] 47 | CMD ["/entrypoint.sh"] 48 | -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/catalog/hive.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore.uri=thrift://hive-metastore:9083 -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/catalog/iceberg.properties: -------------------------------------------------------------------------------- 1 | connector.name=iceberg 2 | hive.metastore.uri=thrift://hive-metastore:9083 3 | iceberg.file-format=PARQUET 4 | 
iceberg.compression-codec=SNAPPY 5 | -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | query.max-memory=1GB 5 | query.max-memory-per-node=1GB 6 | query.max-total-memory-per-node=2GB 7 | discovery-server.enabled=true 8 | discovery.uri=http://localhost:8080 -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx4G 3 | -XX:+UseG1GC 4 | -XX:G1HeapRegionSize=32M 5 | -XX:ReservedCodeCacheSize=150M 6 | -XX:+UseGCOverheadLimit 7 | -XX:+ExplicitGCInvokesConcurrent 8 | -XX:+HeapDumpOnOutOfMemoryError 9 | -XX:+ExitOnOutOfMemoryError -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/log.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_dockerfile/docker-presto/etc/log.properties -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=production 2 | node.id=$(NODE_ID) 3 | node.data-dir=/var/presto/data -------------------------------------------------------------------------------- /_dockerfile/docker-presto/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | PRESTO_HOME=${PRESTO_HOME:-/opt/presto} 2 | 3 | PRESTO_COORDINATOR=${PRESTO_COORDINATOR:-} 4 | PRESTO_NODE_ID=${PRESTO_NODE_ID:-} 5 | PRESTO_LOG_LEVEL=${PRESTO_LOG_LEVEL:-INFO} 6 | 7 | PRESTO_HTTP_SERVER_PORT=${PRESTO_HTTP_SERVER_PORT:-8080} 8 | 9 | PRESTO_MAX_MEMORY=${PRESTO_MAX_MEMORY:-20} 10 | PRESTO_MAX_MEMORY_PER_NODE=${PRESTO_MAX_MEMORY_PER_NODE:-1} 11 | PRESTO_MAX_TOTAL_MEMORY_PER_NODE=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE:-2} 12 | PRESTO_HEAP_HEADROOM_PER_NODE=${PRESTO_HEAP_HEADROOM_PER_NODE:-1} 13 | PRESTO_JVM_HEAP_SIZE=${PRESTO_JVM_HEAP_SIZE:-4} 14 | 15 | create_config_node() { 16 | ( 17 | echo "node.environment=production" 18 | echo "node.id=${PRESTO_NODE_ID}" 19 | echo "node.data-dir=/var/presto/data" 20 | ) >${PRESTO_HOME}/etc/node.properties 21 | } 22 | 23 | change_config_jvm() { 24 | sed -i "s/-Xmx.*G/-Xmx${PRESTO_JVM_HEAP_SIZE}G/" ${PRESTO_HOME}/etc/jvm.config 25 | } 26 | 27 | create_config_log() { 28 | ( 29 | echo "com.facebook.presto=${PRESTO_LOG_LEVEL}" 30 | ) >${PRESTO_HOME}/etc/log.config 31 | } 32 | 33 | create_config_coordinator() { 34 | ( 35 | echo "coordinator=true" 36 | echo "node-scheduler.include-coordinator=false" 37 | echo "http-server.http.port=${PRESTO_HTTP_SERVER_PORT}" 38 | echo "query.max-memory=${PRESTO_MAX_MEMORY}GB" 39 | echo "query.max-memory-per-node=${PRESTO_MAX_MEMORY_PER_NODE}GB" 40 | echo "query.max-total-memory-per-node=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE}GB" 41 | echo 
"memory.heap-headroom-per-node=${PRESTO_HEAP_HEADROOM_PER_NODE}GB" 42 | echo "discovery-server.enabled=true" 43 | echo "discovery.uri=http://localhost:${PRESTO_HTTP_SERVER_PORT}" 44 | ) >${PRESTO_HOME}/etc/config.properties 45 | } 46 | 47 | create_config_worker() { 48 | ( 49 | echo "coordinator=false" 50 | echo "http-server.http.port=${PRESTO_HTTP_SERVER_PORT}" 51 | echo "query.max-memory=${PRESTO_MAX_MEMORY}GB" 52 | echo "query.max-memory-per-node=${PRESTO_MAX_MEMORY_PER_NODE}GB" 53 | echo "query.max-total-memory-per-node=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE}GB" 54 | echo "memory.heap-headroom-per-node=${PRESTO_HEAP_HEADROOM_PER_NODE}GB" 55 | echo "discovery.uri=http://${PRESTO_COORDINATOR}:${PRESTO_HTTP_SERVER_PORT}" 56 | ) >${PRESTO_HOME}/etc/config.properties 57 | } 58 | 59 | create_config_node 60 | create_config_log 61 | change_config_jvm 62 | if [ -z "${PRESTO_COORDINATOR}" ] 63 | then 64 | create_config_coordinator; 65 | else 66 | create_config_worker; 67 | fi 68 | 69 | env 70 | 71 | cat ${PRESTO_HOME}/etc/node.properties 72 | cat ${PRESTO_HOME}/etc/config.properties 73 | cat ${PRESTO_HOME}/etc/jvm.config 74 | 75 | 76 | /opt/presto/bin/launcher run 77 | -------------------------------------------------------------------------------- /_notebook/spark-metastore-remote.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "397cef09-bd27-4769-9f70-7ad80803cbd7", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "3.8.10 (default, Nov 14 2021, 21:32:59) \n", 14 | "[Clang 12.0.5 (clang-1205.0.22.9)]\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import sys\n", 20 | "\n", 21 | "print(sys.version)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "2bae673f-f186-4e08-b7ee-23c236771a35", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "SPARK_HOME = \"/Users/kun/github/spark/spark-3.1.2-bin-hadoop-3.2.2\"" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "id": "8491d5dd-2c63-4fd0-9bd6-cdeba9b970d9", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import findspark\n", 42 | "\n", 43 | "findspark.init(SPARK_HOME)\n", 44 | "#findspark.add_packages([\"org.apache.hadoop:hadoop-aws:3.2.2\", \"com.amazonaws:aws-java-sdk-bundle:1.11.375\"])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "7167ea4d-2fb3-450c-82b2-c6362b454820", 50 | "metadata": {}, 51 | "source": [ 52 | "### Spark Session 생성\n", 53 | "\n", 54 | "로컬모드에서 실행할 Spark Session 을 만듭니다. (`.master(\"local[*]\")`)\n", 55 | "- 일반적인 Spark 설정은 `$SPARK_HOME/conf/spark-defaults.conf` 내에서 세팅해 공통환경으로 사용합니다. 다만 이 예제에서는 보여주기 위해 SparkConf 를 이용해 설정합니다.\n", 56 | "- Hive Metastore URI 등 HMS 관련 설정은 `$SPARK_HOME/conf/hive-site.conf` 내에서 세팅해 공통 환경으로 사용합니다.\n", 57 | "- 이 예제에서는 Minio 를 사용하므로 Access Key, Secret Key 를 사용합니다. AWS 위에서 실행된다면 [AWS Instance Profile](https://docs.aws.amazon.com/ko_kr/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles.html) 을 이용할 수 있으므로 키를 세팅하지 않습니다." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "id": "27587697-2e5c-4301-bc98-82389915b35c", 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stderr", 68 | "output_type": "stream", 69 | "text": [ 70 | "21/11/29 15:41:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 71 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 72 | "Setting default log level to \"WARN\".\n", 73 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 74 | "21/11/29 15:41:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "from pyspark.sql import SparkSession\n", 80 | "\n", 81 | "\n", 82 | "spark = SparkSession \\\n", 83 | " .builder \\\n", 84 | " .master(\"local[*]\") \\\n", 85 | " .appName(\"example-app\") \\\n", 86 | " .config(\"spark.hadoop.fs.s3a.access.key\", \"accesskey\")\\\n", 87 | " .config(\"spark.hadoop.fs.s3a.secret.key\", \"secretkey\")\\\n", 88 | " .config(\"spark.hadoop.fs.s3a.endpoint\", \"http://localhost:9000\")\\\n", 89 | " .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\")\\\n", 90 | " .config(\"spark.hadoop.fs.s3a.connection.ssl.enabled\",\"false\")\\\n", 91 | " .config(\"spark.hadoop.fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\\\n", 92 | " .enableHiveSupport() \\\n", 93 | " .getOrCreate()\n", 94 | " \n", 95 | "spark.sparkContext.setSystemProperty(\"com.amazonaws.services.s3.enableV4\", \"true\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "52107cb8-9741-422e-b50a-d9cd830f5ab0", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "spark.sparkContext.getConf().getAll()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "id": "b5c1bd63-5298-44fc-8681-974f2b9e7d50", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "21/11/26 01:44:27 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.\n" 119 | ] 120 | }, 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "DataFrame[]" 125 | ] 126 | }, 127 | "execution_count": 7, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "spark.sql(\"\"\"\n", 134 | "CREATE TABLE student (\n", 135 | " id INT, \n", 136 | " name STRING, \n", 137 | " age INT\n", 138 | ") \n", 139 | "STORED AS PARQUET\n", 140 | "LOCATION 's3a://udon-data/lake/student/'\n", 141 | "\"\"\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 17, 147 | "id": "0f83e0e6-8337-4922-ab2f-9e13d7b7089f", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "df = spark.read.format(\"csv\").load(\"s3a://udon-data-lake/marketing_campaign.csv\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "6ccea55e-6ba0-4414-8588-e1968bddd92b", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "pyspark", 166 | "language": "python", 167 | "name": "pyspark" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.8.10" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 5 184 | } 185 | -------------------------------------------------------------------------------- /_script/docker-mysql/conf/my.cnf: 
-------------------------------------------------------------------------------- 1 | [client] 2 | default-character-set = utf8mb4 3 | 4 | [mysql] 5 | default-character-set = utf8mb4 6 | 7 | [mysqld] 8 | character-set-client-handshake = FALSE 9 | character-set-server = utf8mb4 10 | collation-server = utf8mb4_unicode_ci 11 | default-storage-engine=InnoDB 12 | default-time-zone = '+09:00' 13 | -------------------------------------------------------------------------------- /_script/docker-mysql/sql/001_create_database.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE pipeline; 2 | -------------------------------------------------------------------------------- /_script/docker-mysql/sql/002_create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `ListingMeta` 2 | ( 3 | -- primary key 4 | `listing_id` BIGINT UNSIGNED NOT NULL PRIMARY KEY, 5 | `listing_name` VARCHAR(240) NULL, 6 | `listing_desc` TEXT NULL, 7 | `listing_summary` TEXT NULL, 8 | `listing_url` TEXT NULL, 9 | 10 | -- FK columns 11 | 12 | -- common 13 | `created_at` datetime DEFAULT CURRENT_TIMESTAMP NOT NULL 14 | 15 | ) ENGINE = InnoDB 16 | DEFAULT CHARSET = utf8mb4 17 | COLLATE = utf8mb4_unicode_ci; 18 | -------------------------------------------------------------------------------- /_script/docker-spark/apps/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import col,date_format 3 | 4 | def init_spark(): 5 | sql = SparkSession.builder\ 6 | .appName("trip-app")\ 7 | .config("spark.jars", "/opt/spark-apps/postgresql-42.2.22.jar")\ 8 | .getOrCreate() 9 | sc = sql.sparkContext 10 | return sql,sc 11 | 12 | def main(): 13 | url = "jdbc:postgresql://storage-postgres:5432/postgres" 14 | properties = { 15 | "user": "postgres", 16 | "password": "root", 17 | "driver": "org.postgresql.Driver" 18 | } 19 | file = "/opt/spark-data/MTA_2014_08_01.csv" 20 | sql,sc = init_spark() 21 | 22 | df = sql.read.load(file,format = "csv", inferSchema="true", sep="\t", header="true") \ 23 | .withColumn("report_hour",date_format(col("time_received"),"yyyy-MM-dd HH:00:00")) \ 24 | .withColumn("report_date",date_format(col("time_received"),"yyyy-MM-dd")) 25 | 26 | # Filter invalid coordinates 27 | df.where("latitude <= 90 AND latitude >= -90 AND longitude <= 180 AND longitude >= -180") \ 28 | .where("latitude != 0.000000 OR longitude != 0.000000 ") \ 29 | .write \ 30 | .jdbc(url=url, table="mta_reports", mode='append', properties=properties) 31 | # DataFrameWriter.jdbc() performs the write itself, so no additional save() call is needed 32 | 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /_script/docker-spark/apps/postgresql-42.2.22.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_script/docker-spark/apps/postgresql-42.2.22.jar -------------------------------------------------------------------------------- /_script/docker-spark/conf/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.eventLog.dir file:/tmp/spark-events 2 | spark.eventLog.enabled true 3 | spark.history.fs.logDirectory file:/tmp/spark-events -------------------------------------------------------------------------------- /_script/docker-spark/data/.gitignore:
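
The docker-spark pieces above are wired together by docker-compose.spark.yml, which mounts _script/docker-spark/apps and _script/docker-spark/data into the containers as /opt/spark-apps and /opt/spark-data; main.py expects /opt/spark-data/MTA_2014_08_01.csv and writes to the storage-postgres service. As a minimal sketch (not part of the repository scripts), the job could be submitted after running make compose.spark roughly as follows; the /spark install path is an assumption based on the bde2020 images, and the CSV must first be placed in _script/docker-spark/data, which is gitignored:

# hypothetical submission command; adjust the spark-submit path if your image differs
docker exec -it spark-master /spark/bin/spark-submit \
  --master spark://spark-master:7077 \
  --jars /opt/spark-apps/postgresql-42.2.22.jar \
  /opt/spark-apps/main.py
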
-------------------------------------------------------------------------------- 1 | *.csv 2 | -------------------------------------------------------------------------------- /_slide/.gitignore: -------------------------------------------------------------------------------- 1 | practical-aws-pipeline/ 2 | practical-spark -------------------------------------------------------------------------------- /_volume/.gitignore: -------------------------------------------------------------------------------- 1 | docker-minio/ -------------------------------------------------------------------------------- /docker-compose.aws.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | dynamodb-local: 4 | image: amazon/dynamodb-local:latest 5 | container_name: dynamodb-local 6 | ports: 7 | - "8000:8000" 8 | 9 | dynamodb-admin: 10 | image: aaronshaf/dynamodb-admin 11 | ports: 12 | - "8001:8001" 13 | environment: 14 | DYNAMO_ENDPOINT: "http://dynamodb-local:8000" 15 | AWS_REGION: "ap-northeast-2" 16 | AWS_ACCESS_KEY_ID: accesskey 17 | AWS_SECRET_ACCESS_KEY: secretkey 18 | depends_on: 19 | - dynamodb-local 20 | 21 | minio: 22 | image: minio/minio:latest 23 | container_name: minio 24 | environment: 25 | - MINIO_ACCESS_KEY=accesskey 26 | - MINIO_SECRET_KEY=secretkey 27 | - MINIO_ROOT_USER=admin 28 | - MINIO_ROOT_PASSWORD=admin12345 29 | volumes: 30 | - ./_volume/docker-minio:/data 31 | ports: 32 | - "9000:9000" 33 | - "9001:9001" 34 | command: server /data --console-address ":9001" 35 | 36 | minio-script: 37 | image: minio/mc 38 | container_name: minio-script 39 | depends_on: 40 | - minio 41 | entrypoint: > 42 | /bin/sh -c " 43 | sleep 10s; 44 | /usr/bin/mc alias set myminio http://minio:9000 admin admin12345; 45 | /usr/bin/mc mb myminio/udon-data-lake || true; 46 | /usr/bin/mc admin user add myminio accesskey accesskey || true; 47 | /usr/bin/mc admin policy set myminio readwrite user=accesskey || true; 48 | exit 0; 49 | " 50 | -------------------------------------------------------------------------------- /docker-compose.kafka.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | zookeeper: 4 | image: confluentinc/cp-zookeeper:6.2.1 5 | hostname: zookeeper 6 | container_name: zookeeper 7 | ports: 8 | - "2181:2181" 9 | environment: 10 | ZOOKEEPER_CLIENT_PORT: 2181 11 | ZOOKEEPER_TICK_TIME: 2000 12 | 13 | broker: 14 | image: confluentinc/cp-kafka:6.2.1 15 | hostname: broker 16 | container_name: broker 17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - "29092:29092" 21 | - "9092:9092" 22 | - "9101:9101" 23 | environment: 24 | KAFKA_BROKER_ID: 1 25 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 26 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 29 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 30 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 31 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 32 | KAFKA_JMX_PORT: 9101 33 | KAFKA_JMX_OPTS: -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Djava.rmi.server.hostname=kafka0 -Dcom.sun.management.jmxremote.rmi.port=9101 34 | KAFKA_JMX_HOSTNAME: localhost 35 | 36 | schema-registry: 37 | image: confluentinc/cp-schema-registry:6.2.1 38 | hostname: schema-registry 39 | container_name: 
schema-registry 40 | depends_on: 41 | - broker 42 | ports: 43 | - "8081:8081" 44 | environment: 45 | SCHEMA_REGISTRY_HOST_NAME: schema-registry 46 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'broker:29092' 47 | SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081 48 | 49 | kafka-ui: 50 | image: provectuslabs/kafka-ui:latest 51 | container_name: kafka-ui 52 | depends_on: 53 | - broker 54 | - zookeeper 55 | - schema-registry 56 | ports: 57 | - "8080:8080" 58 | restart: always 59 | environment: 60 | - KAFKA_CLUSTERS_0_NAME=local 61 | - KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=broker:29092 62 | - KAFKA_CLUSTERS_0_ZOOKEEPER=zookeeper:2181 63 | - KAFKA_CLUSTERS_0_SCHEMAREGISTRY=schema-registry:8081 64 | - KAFKA_CLUSTERS_0_JMXPORT=9101 -------------------------------------------------------------------------------- /docker-compose.metastore.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | mysql: 4 | image: mysql:8 5 | container_name: mysql 6 | restart: always 7 | ports: 8 | - "3306:3306" 9 | environment: 10 | - MYSQL_DATABASE=metastore_db 11 | - MYSQL_ROOT_PASSWORD=root 12 | - LANG=C.UTF-8 13 | volumes: 14 | - ./_script/docker-mysql/conf/:/etc/mysql/conf.d 15 | - ./_script/docker-mysql/sql/:/docker-entrypoint-initdb.d 16 | command: --sql_mode='' 17 | security_opt: 18 | - seccomp:unconfined 19 | 20 | minio: 21 | image: minio/minio:latest 22 | container_name: minio 23 | environment: 24 | - MINIO_ACCESS_KEY=accesskey 25 | - MINIO_SECRET_KEY=secretkey 26 | - MINIO_ROOT_USER=admin 27 | - MINIO_ROOT_PASSWORD=admin12345 28 | volumes: 29 | - ./_volume/docker-minio:/data 30 | ports: 31 | - "9000:9000" 32 | - "9001:9001" 33 | command: server /data --console-address ":9001" 34 | 35 | minio-script: 36 | image: minio/mc 37 | container_name: minio-script 38 | depends_on: 39 | - minio 40 | entrypoint: > 41 | /bin/sh -c " 42 | /usr/bin/mc alias set myminio http://minio:9000 admin admin12345; 43 | /usr/bin/mc mb myminio/udon-data-lake || true; 44 | # /usr/bin/mc admin user add myminio accesskey secretkey || true; 45 | # /usr/bin/mc admin policy set myminio readwrite user=accesskey || true; 46 | exit 0; 47 | " 48 | 49 | hive-metastore: 50 | container_name: hive-metastore 51 | build: 52 | context: _dockerfile/docker-metastore 53 | dockerfile: Dockerfile 54 | command: 55 | - /bin/sh 56 | - -c 57 | - | 58 | sleep 10; 59 | /entrypoint.sh 60 | ports: 61 | - "9083:9083" 62 | depends_on: 63 | - mysql 64 | - minio 65 | -------------------------------------------------------------------------------- /docker-compose.presto.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | presto-coordinator: 4 | container_name: presto-coordinator 5 | build: 6 | context: _dockerfile/docker-presto 7 | dockerfile: Dockerfile 8 | environment: 9 | - PRESTO_NODE_ID=presto-coordinator 10 | ports: 11 | - "8889:8080" 12 | volumes: 13 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog 14 | 15 | presto-worker-01: 16 | container_name: presto-worker-01 17 | build: 18 | context: _dockerfile/docker-presto 19 | dockerfile: Dockerfile 20 | environment: 21 | - PRESTO_COORDINATOR=presto-coordinator 22 | - PRESTO_NODE_ID=presto-worker-01 23 | volumes: 24 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog 25 | depends_on: 26 | - presto-coordinator 27 | command: 28 | - /bin/sh 29 | - -c 30 | - | 31 | sleep 20; 32 | /entrypoint.sh 33 | 34 | presto-worker-02: 35 | container_name: 
presto-worker-02 36 | build: 37 | context: _dockerfile/docker-presto 38 | dockerfile: Dockerfile 39 | environment: 40 | - PRESTO_COORDINATOR=presto-coordinator 41 | - PRESTO_NODE_ID=presto-worker-02 42 | volumes: 43 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog 44 | depends_on: 45 | - presto-coordinator 46 | command: 47 | - /bin/sh 48 | - -c 49 | - | 50 | sleep 20; 51 | /entrypoint.sh 52 | -------------------------------------------------------------------------------- /docker-compose.spark.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | spark-master: 4 | image: bde2020/spark-master:3.1.1-hadoop3.2 5 | container_name: spark-master 6 | ports: 7 | - "8080:8080" 8 | - "7077:7077" 9 | - "4040:4040" 10 | volumes: 11 | - ./_script/docker-spark/apps:/opt/spark-apps 12 | - ./_script/docker-spark/data:/opt/spark-data 13 | - ./_script/docker-spark/conf:/spark/conf 14 | - /tmp/spark-events-local:/tmp/spark-events 15 | environment: 16 | - INIT_DAEMON_STEP=setup_spark 17 | 18 | spark-worker-1: 19 | image: bde2020/spark-worker:3.1.1-hadoop3.2 20 | container_name: spark-worker-1 21 | depends_on: 22 | - spark-master 23 | ports: 24 | - "8081:8081" 25 | volumes: 26 | - ./_script/docker-spark/apps:/opt/spark-apps 27 | - ./_script/docker-spark/data:/opt/spark-data 28 | - ./_script/docker-spark/conf:/spark/conf 29 | - /tmp/spark-events-local:/tmp/spark-events 30 | environment: 31 | - "SPARK_MASTER=spark://spark-master:7077" 32 | spark-worker-2: 33 | 34 | image: bde2020/spark-worker:3.1.1-hadoop3.2 35 | container_name: spark-worker-2 36 | depends_on: 37 | - spark-master 38 | ports: 39 | - "8082:8081" 40 | volumes: 41 | - ./_script/docker-spark/apps:/opt/spark-apps 42 | - ./_script/docker-spark/data:/opt/spark-data 43 | - ./_script/docker-spark/conf:/spark/conf 44 | - /tmp/spark-events-local:/tmp/spark-events 45 | environment: 46 | - "SPARK_MASTER=spark://spark-master:7077" 47 | 48 | spark-history-server: 49 | image: bde2020/spark-history-server:3.1.1-hadoop3.2 50 | container_name: spark-history-server 51 | depends_on: 52 | - spark-master 53 | ports: 54 | - "18081:18081" 55 | volumes: 56 | - ./_script/docker-spark/apps:/opt/spark-apps 57 | - ./_script/docker-spark/data:/opt/spark-data 58 | - ./_script/docker-spark/conf:/spark/conf 59 | - /tmp/spark-events-local:/tmp/spark-events 60 | storage-postgres: 61 | image: postgres:11.7-alpine 62 | container_name: storage-postgres 63 | depends_on: 64 | - spark-master 65 | ports: 66 | - "5432:5432" 67 | environment: 68 | - POSTGRES_PASSWORD=root -------------------------------------------------------------------------------- /docker-compose.storage.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | mysql: 4 | image: mysql:8 5 | container_name: mysql 6 | restart: always 7 | ports: 8 | - 3306:3306 9 | environment: 10 | - MYSQL_DATABASE=pipeline 11 | - MYSQL_ROOT_PASSWORD=root 12 | - LANG=C.UTF-8 13 | volumes: 14 | - ./_script/docker-mysql/conf/:/etc/mysql/conf.d 15 | - ./_script/docker-mysql/sql/:/docker-entrypoint-initdb.d 16 | command: --sql_mode='' 17 | 18 | redis: 19 | image: redis:5 20 | container_name: redis 21 | restart: always 22 | command: redis-server # --requirepass credential 23 | ports: 24 | - 6379:6379 25 | 26 | -------------------------------------------------------------------------------- /project-flink/.gitignore:
-------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,scala,java,intellij+iml 4 | 5 | ### Intellij+iml ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # AWS User-specific 17 | .idea/**/aws.xml 18 | 19 | # Generated files 20 | .idea/**/contentModel.xml 21 | 22 | # Sensitive or high-churn files 23 | .idea/**/dataSources/ 24 | .idea/**/dataSources.ids 25 | .idea/**/dataSources.local.xml 26 | .idea/**/sqlDataSources.xml 27 | .idea/**/dynamic.xml 28 | .idea/**/uiDesigner.xml 29 | .idea/**/dbnavigator.xml 30 | 31 | # Gradle 32 | .idea/**/gradle.xml 33 | .idea/**/libraries 34 | 35 | # Gradle and Maven with auto-import 36 | # When using Gradle or Maven with auto-import, you should exclude module files, 37 | # since they will be recreated, and may cause churn. Uncomment if using 38 | # auto-import. 39 | # .idea/artifacts 40 | # .idea/compiler.xml 41 | # .idea/jarRepositories.xml 42 | # .idea/modules.xml 43 | # .idea/*.iml 44 | # .idea/modules 45 | # *.iml 46 | # *.ipr 47 | 48 | # CMake 49 | cmake-build-*/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | 78 | # Android studio 3.1+ serialized cache file 79 | .idea/caches/build_file_checksums.ser 80 | 81 | ### Intellij+iml Patch ### 82 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 83 | 84 | *.iml 85 | modules.xml 86 | .idea/misc.xml 87 | *.ipr 88 | 89 | ### Java ### 90 | # Compiled class file 91 | *.class 92 | 93 | # Log file 94 | *.log 95 | 96 | # BlueJ files 97 | *.ctxt 98 | 99 | # Mobile Tools for Java (J2ME) 100 | .mtj.tmp/ 101 | 102 | # Package Files # 103 | *.jar 104 | *.war 105 | *.nar 106 | *.ear 107 | *.zip 108 | *.tar.gz 109 | *.rar 110 | 111 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 112 | hs_err_pid* 113 | 114 | ### Scala ### 115 | 116 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 117 | 118 | ### Gradle ### 119 | .gradle 120 | build/ 121 | 122 | # Ignore Gradle GUI config 123 | gradle-app.setting 124 | 125 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 126 | !gradle-wrapper.jar 127 | 128 | # Cache of project 129 | .gradletasknamecache 130 | 131 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 132 | # gradle/wrapper/gradle-wrapper.properties 133 | 134 | ### Gradle Patch ### 135 | **/build/ 136 | 137 | # Eclipse Gradle plugin generated files 138 | # Eclipse Core 139 | .project 140 | # JDT-specific (Eclipse Java Development 
Tools) 141 | .classpath 142 | 143 | # End of https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml -------------------------------------------------------------------------------- /project-kafka/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,kotlin,java,intellij+iml,scala 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,kotlin,java,intellij+iml,scala 4 | 5 | ### Intellij+iml ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # AWS User-specific 17 | .idea/**/aws.xml 18 | 19 | # Generated files 20 | .idea/**/contentModel.xml 21 | 22 | # Sensitive or high-churn files 23 | .idea/**/dataSources/ 24 | .idea/**/dataSources.ids 25 | .idea/**/dataSources.local.xml 26 | .idea/**/sqlDataSources.xml 27 | .idea/**/dynamic.xml 28 | .idea/**/uiDesigner.xml 29 | .idea/**/dbnavigator.xml 30 | 31 | # Gradle 32 | .idea/**/gradle.xml 33 | .idea/**/libraries 34 | 35 | # Gradle and Maven with auto-import 36 | # When using Gradle or Maven with auto-import, you should exclude module files, 37 | # since they will be recreated, and may cause churn. Uncomment if using 38 | # auto-import. 39 | # .idea/artifacts 40 | # .idea/compiler.xml 41 | # .idea/jarRepositories.xml 42 | # .idea/modules.xml 43 | # .idea/*.iml 44 | # .idea/modules 45 | # *.iml 46 | # *.ipr 47 | 48 | # CMake 49 | cmake-build-*/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | 78 | # Android studio 3.1+ serialized cache file 79 | .idea/caches/build_file_checksums.ser 80 | 81 | ### Intellij+iml Patch ### 82 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 83 | 84 | *.iml 85 | modules.xml 86 | .idea/misc.xml 87 | *.ipr 88 | 89 | ### Java ### 90 | # Compiled class file 91 | *.class 92 | 93 | # Log file 94 | *.log 95 | 96 | # BlueJ files 97 | *.ctxt 98 | 99 | # Mobile Tools for Java (J2ME) 100 | .mtj.tmp/ 101 | 102 | # Package Files # 103 | *.jar 104 | *.war 105 | *.nar 106 | *.ear 107 | *.zip 108 | *.tar.gz 109 | *.rar 110 | 111 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 112 | hs_err_pid* 113 | 114 | ### Kotlin ### 115 | # Compiled class file 116 | 117 | # Log file 118 | 119 | # BlueJ files 120 | 121 | # Mobile Tools for Java (J2ME) 122 | 123 | # Package Files # 124 | 125 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 126 | 127 | ### Scala ### 128 | 129 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 130 | 131 | ### Gradle ### 132 | .gradle 133 | build/ 134 | 135 | # Ignore 
Gradle GUI config 136 | gradle-app.setting 137 | 138 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 139 | !gradle-wrapper.jar 140 | 141 | # Cache of project 142 | .gradletasknamecache 143 | 144 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 145 | # gradle/wrapper/gradle-wrapper.properties 146 | 147 | ### Gradle Patch ### 148 | **/build/ 149 | 150 | # Eclipse Gradle plugin generated files 151 | # Eclipse Core 152 | .project 153 | # JDT-specific (Eclipse Java Development Tools) 154 | .classpath 155 | 156 | # End of https://www.toptal.com/developers/gitignore/api/gradle,kotlin,java,intellij+iml,scala -------------------------------------------------------------------------------- /project-spark/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,scala,java,intellij+iml 3 | 4 | _volumes/** 5 | 6 | ### Intellij+iml ### 7 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 8 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 9 | 10 | # User-specific stuff 11 | .idea/**/workspace.xml 12 | .idea/**/tasks.xml 13 | .idea/**/usage.statistics.xml 14 | .idea/**/dictionaries 15 | .idea/**/shelf 16 | 17 | # AWS User-specific 18 | .idea/**/aws.xml 19 | 20 | # Generated files 21 | .idea/**/contentModel.xml 22 | 23 | # Sensitive or high-churn files 24 | .idea/**/dataSources/ 25 | .idea/**/dataSources.ids 26 | .idea/**/dataSources.local.xml 27 | .idea/**/sqlDataSources.xml 28 | .idea/**/dynamic.xml 29 | .idea/**/uiDesigner.xml 30 | .idea/**/dbnavigator.xml 31 | 32 | # Gradle 33 | .idea/**/gradle.xml 34 | .idea/**/libraries 35 | 36 | # Gradle and Maven with auto-import 37 | # When using Gradle or Maven with auto-import, you should exclude module files, 38 | # since they will be recreated, and may cause churn. Uncomment if using 39 | # auto-import. 
40 | # .idea/artifacts 41 | # .idea/compiler.xml 42 | # .idea/jarRepositories.xml 43 | # .idea/modules.xml 44 | # .idea/*.iml 45 | # .idea/modules 46 | # *.iml 47 | # *.ipr 48 | 49 | # CMake 50 | cmake-build-*/ 51 | 52 | # Mongo Explorer plugin 53 | .idea/**/mongoSettings.xml 54 | 55 | # File-based project format 56 | *.iws 57 | 58 | # IntelliJ 59 | out/ 60 | 61 | # mpeltonen/sbt-idea plugin 62 | .idea_modules/ 63 | 64 | # JIRA plugin 65 | atlassian-ide-plugin.xml 66 | 67 | # Cursive Clojure plugin 68 | .idea/replstate.xml 69 | 70 | # Crashlytics plugin (for Android Studio and IntelliJ) 71 | com_crashlytics_export_strings.xml 72 | crashlytics.properties 73 | crashlytics-build.properties 74 | fabric.properties 75 | 76 | # Editor-based Rest Client 77 | .idea/httpRequests 78 | 79 | # Android studio 3.1+ serialized cache file 80 | .idea/caches/build_file_checksums.ser 81 | 82 | ### Intellij+iml Patch ### 83 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 84 | 85 | *.iml 86 | modules.xml 87 | .idea/misc.xml 88 | *.ipr 89 | 90 | ### Java ### 91 | # Compiled class file 92 | *.class 93 | 94 | # Log file 95 | *.log 96 | 97 | # BlueJ files 98 | *.ctxt 99 | 100 | # Mobile Tools for Java (J2ME) 101 | .mtj.tmp/ 102 | 103 | # Package Files # 104 | *.jar 105 | *.war 106 | *.nar 107 | *.ear 108 | *.zip 109 | *.tar.gz 110 | *.rar 111 | 112 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 113 | hs_err_pid* 114 | 115 | ### Scala ### 116 | 117 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 118 | 119 | ### Gradle ### 120 | .gradle 121 | build/ 122 | 123 | # Ignore Gradle GUI config 124 | gradle-app.setting 125 | 126 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 127 | !gradle-wrapper.jar 128 | 129 | # Cache of project 130 | .gradletasknamecache 131 | 132 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 133 | # gradle/wrapper/gradle-wrapper.properties 134 | 135 | ### Gradle Patch ### 136 | **/build/ 137 | 138 | # Eclipse Gradle plugin generated files 139 | # Eclipse Core 140 | .project 141 | # JDT-specific (Eclipse Java Development Tools) 142 | .classpath 143 | 144 | # End of https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml -------------------------------------------------------------------------------- /project-spark/_scripts/mysql-ddl/table_property_stat.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE pipeline.property_stat 2 | ( 3 | property_id BIGINT UNSIGNED NOT NULL, 4 | property_type VARCHAR(30) NOT NULL, 5 | lat DOUBLE(40, 10) NOT NULL, 6 | lng DOUBLE(40, 10) NOT NULL, 7 | 8 | count_review_all BIGINT UNSIGNED NOT NULL, 9 | score_review_all DOUBLE(10, 5) NOT NULL, 10 | 11 | count_review BIGINT UNSIGNED NOT NULL, 12 | count_sales BIGINT UNSIGNED NOT NULL, 13 | price_sales BIGINT UNSIGNED NOT NULL, 14 | 15 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL, 16 | updated_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL, 17 | 18 | part TIMESTAMP NOT NULL COMMENT '데이터 파티션', 19 | 20 | PRIMARY KEY (property_id, part), 21 | INDEX idx_property_stat_combined (part, property_id) 22 | 23 | ) ENGINE = InnoDB 24 | DEFAULT CHARSET = utf8mb4 25 | COLLATE = utf8mb4_unicode_ci; 26 | 27 | -------------------------------------------------------------------------------- /project-spark/build.gradle: 
-------------------------------------------------------------------------------- 1 | buildscript { 2 | ext { 3 | gradleShadowVersion = '6.1.0' 4 | gradleTestLoggerVersion = '2.1.0' 5 | gradleScalaTestVersion = '0.30' 6 | gradleVersioningPluginVersion = '2.8.2' 7 | gradleAvroPluginVersion = '1.2.0' 8 | } 9 | 10 | repositories { 11 | mavenCentral() 12 | jcenter() 13 | 14 | maven { url "https://plugins.gradle.org/m2/" } 15 | maven { url 'https://repo.spring.io/plugins-release' } 16 | maven { 17 | name "typesafe-maven-release" 18 | url "https://repo.typesafe.com/typesafe/maven-releases" 19 | } 20 | maven { 21 | name "Spark Packages Repo" 22 | url "https://dl.bintray.com/spark-packages/maven" 23 | } 24 | maven { 25 | name "Confluent" 26 | url "https://packages.confluent.io/maven/" 27 | } 28 | maven { 29 | name "jitpack" 30 | url 'https://jitpack.io' 31 | } 32 | ivy { 33 | name "typesafe-ivy-release" 34 | url "https://repo.typesafe.com/typesafe/ivy-releases" 35 | layout "ivy" 36 | } 37 | } 38 | 39 | dependencies { 40 | classpath "com.github.jengelman.gradle.plugins:shadow:${gradleShadowVersion}" 41 | classpath "gradle.plugin.net.nemerosa:versioning:${gradleVersioningPluginVersion}" 42 | classpath "com.github.davidmc24.gradle.plugin:gradle-avro-plugin:${gradleAvroPluginVersion}" 43 | 44 | // classpath "gradle.plugin.com.github.maiflai:gradle-scalatest:${gradleScalaTestVersion}" 45 | // classpath "com.adarshr:gradle-test-logger-plugin:${gradleTestLoggerVersion}" 46 | } 47 | } 48 | 49 | allprojects { 50 | apply plugin: 'idea' 51 | apply plugin: 'java' 52 | apply plugin: 'java-library' 53 | apply plugin: 'scala' 54 | 55 | // apply plugin: 'com.adarshr.test-logger' 56 | // apply plugin: "com.github.maiflai.scalatest" 57 | 58 | repositories { 59 | mavenCentral() 60 | maven { url "https://jcenter.bintray.com" } 61 | maven { 62 | name "Confluent" 63 | url "https://packages.confluent.io/maven/" 64 | } 65 | } 66 | 67 | ext { 68 | // Scala 69 | scalaVersionRevision = "12" 70 | 71 | // Spark 72 | scalaSparkVersion = "2.12" 73 | sparkVersion = "3.2.0" 74 | confluentVersion = "5.3.4" 75 | 76 | // Flink 77 | kafkaClientVersion = "2.6.2" 78 | 79 | // MySQL 80 | mysqlDriverVersion = "8.0.27" 81 | 82 | // AWS 83 | awsSdkVersion = "1.11.901" 84 | awsHadoopVersion = "3.3.1" 85 | 86 | // Utility 87 | typesafeConfigVersion = "1.3.3" 88 | shapelessVersion = "2.3.3" 89 | pureconfigVersion = "0.17.0" 90 | json4sVersion = '3.6.5' 91 | avroVersion = '1.10.2' 92 | semverVresion = '2.2.0' 93 | scalaHttpVersion = "2.0.0-RC6" 94 | 95 | // Logging 96 | slf4jVersion = "1.7.30" 97 | log4jVersion = "2.16.0" 98 | 99 | // Test 100 | scalaTestVersion = "3.2.5" 101 | junit5Version = "5.5.2" 102 | } 103 | 104 | dependencies { 105 | implementation("org.apache.commons:commons-lang3:3.12.0") 106 | 107 | implementation("com.typesafe:config:${typesafeConfigVersion}") 108 | implementation("com.github.pureconfig:pureconfig_${scalaSparkVersion}:${pureconfigVersion}") 109 | implementation("com.vdurmont:semver4j:${semverVresion}") 110 | 111 | // test 112 | testImplementation("org.scalatest:scalatest_${scalaSparkVersion}:${scalaTestVersion}") 113 | testImplementation "org.junit.platform:junit-platform-launcher:1.7.1" 114 | testRuntimeOnly "org.junit.platform:junit-platform-engine:1.7.1" 115 | testImplementation("org.junit.jupiter:junit-jupiter-api:${junit5Version}") 116 | testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:${junit5Version}") 117 | testRuntimeOnly "co.helmethair:scalatest-junit-runner:0.1.8" 118 | } 119 | } 120 | 121 | 
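// The ext { } block in allprojects above acts as a shared version catalog: each module interpolates
// these properties into its own dependency coordinates (see module-infra-spark/build.gradle further
// down, e.g. api("org.apache.spark:spark-sql_${scalaSparkVersion}:${sparkVersion}")), so bumping a
// Spark or Kafka version happens in exactly one place.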
subprojects { 122 | targetCompatibility = 1.8 123 | sourceCompatibility = 1.8 124 | [compileJava, compileTestJava]*.options.collect { 125 | options -> options.encoding = 'UTF-8' 126 | } 127 | 128 | task wrapper(type: Wrapper) { 129 | gradleVersion = '6.8.1' 130 | } 131 | 132 | tasks.withType(ScalaCompile) { 133 | configure(scalaCompileOptions.forkOptions) { 134 | memoryMaximumSize = '2g' 135 | jvmArgs = ['-XX:MaxMetaspaceSize=512m'] 136 | } 137 | } 138 | 139 | compileScala { 140 | targetCompatibility = "1.8" 141 | sourceCompatibility = "1.8" 142 | scalaCompileOptions.additionalParameters = [""] // -opt:l:method 143 | } 144 | compileTestScala { 145 | scalaCompileOptions.additionalParameters = ["-Yrangepos"] 146 | } 147 | 148 | test { 149 | useJUnitPlatform { 150 | includeEngines 'scalatest' 151 | testLogging { 152 | events("passed", "skipped", "failed") 153 | } 154 | } 155 | 156 | filter { 157 | includeTestsMatching "*Spec" 158 | } 159 | } 160 | 161 | configurations { 162 | localCompile { 163 | transitive = true 164 | } 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /project-spark/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0.2-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | 86 | # Determine the Java command to use to start the JVM. 87 | if [ -n "$JAVA_HOME" ] ; then 88 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 89 | # IBM's JDK on AIX uses strange locations for the executables 90 | JAVACMD="$JAVA_HOME/jre/sh/java" 91 | else 92 | JAVACMD="$JAVA_HOME/bin/java" 93 | fi 94 | if [ ! -x "$JAVACMD" ] ; then 95 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 96 | 97 | Please set the JAVA_HOME variable in your environment to match the 98 | location of your Java installation." 99 | fi 100 | else 101 | JAVACMD="java" 102 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 103 | 104 | Please set the JAVA_HOME variable in your environment to match the 105 | location of your Java installation." 106 | fi 107 | 108 | # Increase the maximum file descriptors if we can. 109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 110 | MAX_FD_LIMIT=`ulimit -H -n` 111 | if [ $? -eq 0 ] ; then 112 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 113 | MAX_FD="$MAX_FD_LIMIT" 114 | fi 115 | ulimit -n $MAX_FD 116 | if [ $? 
-ne 0 ] ; then 117 | warn "Could not set maximum file descriptor limit: $MAX_FD" 118 | fi 119 | else 120 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 121 | fi 122 | fi 123 | 124 | # For Darwin, add options to specify how the application appears in the dock 125 | if $darwin; then 126 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 127 | fi 128 | 129 | # For Cygwin or MSYS, switch paths to Windows format before running java 130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 131 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 132 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 133 | 134 | JAVACMD=`cygpath --unix "$JAVACMD"` 135 | 136 | # We build the pattern for arguments to be converted via cygpath 137 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 138 | SEP="" 139 | for dir in $ROOTDIRSRAW ; do 140 | ROOTDIRS="$ROOTDIRS$SEP$dir" 141 | SEP="|" 142 | done 143 | OURCYGPATTERN="(^($ROOTDIRS))" 144 | # Add a user-defined pattern to the cygpath arguments 145 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 146 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 147 | fi 148 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 149 | i=0 150 | for arg in "$@" ; do 151 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 152 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 153 | 154 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 155 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 156 | else 157 | eval `echo args$i`="\"$arg\"" 158 | fi 159 | i=`expr $i + 1` 160 | done 161 | case $i in 162 | 0) set -- ;; 163 | 1) set -- "$args0" ;; 164 | 2) set -- "$args0" "$args1" ;; 165 | 3) set -- "$args0" "$args1" "$args2" ;; 166 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;; 167 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 168 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 169 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 170 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 171 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 172 | esac 173 | fi 174 | 175 | # Escape application args 176 | save () { 177 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 178 | echo " " 179 | } 180 | APP_ARGS=`save "$@"` 181 | 182 | # Collect all arguments for the java command, following the shell quoting and substitution rules 183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 184 | 185 | exec "$JAVACMD" "$@" 186 | -------------------------------------------------------------------------------- /project-spark/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/module-core/build.gradle: -------------------------------------------------------------------------------- 1 | dependencies { 2 | // custom 3 | // https://mvnrepository.com/artifact/org.apache.flink/flink-avro-confluent-registry 4 | // https://mvnrepository.com/artifact/org.apache.avro/avro 5 | api("org.apache.avro:avro:${avroVersion}") 6 | 7 | api("org.json4s:json4s-jackson_${scalaSparkVersion}:${json4sVersion}") 8 | api("org.json4s:json4s-ext_${scalaSparkVersion}:${json4sVersion}") 9 | 10 | // logging 11 | api("org.apache.logging.log4j:log4j-api:${log4jVersion}") 12 | api("org.apache.logging.log4j:log4j-core:${log4jVersion}") 13 | api("org.apache.logging.log4j:log4j-slf4j-impl:${log4jVersion}") 14 | api("org.slf4j:slf4j-log4j12:${slf4jVersion}") 15 | } 16 | -------------------------------------------------------------------------------- /project-spark/module-core/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/module-core/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/module-core/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/module-core/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/common/Environment.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.common 2 | 3 | import pureconfig.generic.ProductHint 4 | import pureconfig.{CamelCase, ConfigFieldMapping, ConfigReader, ConfigSource} 5 | 6 | import scala.reflect.ClassTag 7 | 8 | object Environment { 9 | /** deployment */ 10 | private val LOCAL = "LOCAL" 11 | private val DEVELOPMENT = "DEV" 12 | private val STAGING = "STAGE" 13 | private val PRODUCTION = "PROD" 14 | 15 | /** testing */ 16 | private val UNIT = "UNIT" 17 | private val INTEGRATION = "INTEGRATION" 18 | 19 | private val mode = { 20 | var env: String = LOCAL 21 | 22 | val extractedEnv = System.getenv("PIPELINE_MODE") 23 | if (extractedEnv != null) { 24 | env = extractedEnv.toLowerCase() 25 | } 26 | 27 | env 28 | } 29 | 30 | def isLocalMode(): Boolean = { 31 | mode == LOCAL 32 | } 33 | 34 | /** 35 | * pureconfig 내에서 camel-case 사용을 위한 implicit 변수 생성 36 | * - https://pureconfig.github.io/docs/overriding-behavior-for-case-classes.html#field-mappings 37 | */ 38 | def buildConfigHint[T](): ProductHint[T] = { 39 | return ProductHint[T](ConfigFieldMapping(CamelCase, CamelCase)) 40 | } 41 | 42 | /** 43 | * 모드에 따라 다른 설정값 로딩하기 위한 함수 44 | */ 45 | def getConfigOrThrow[T: ClassTag : ConfigReader]()(implicit productHint: ProductHint[T]): T = { 46 | val config = ConfigSource.default.at(mode).loadOrThrow[T] 47 | config 48 | } 49 | 50 | def getConfigOrThrowForApp[T: ClassTag : ConfigReader](app: String)(implicit productHint: ProductHint[T]): T = { 51 | val config = ConfigSource.default.at(mode).at(app).loadOrThrow[T] 52 | config 53 | } 54 | } 55 | -------------------------------------------------------------------------------- 
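Environment above is the single configuration entry point for the Spark jobs further down (for example UdonProductPoolBatch): PIPELINE_MODE selects the top-level HOCON block (LOCAL when unset), buildConfigHint keeps the keys camelCase, and getConfigOrThrow materializes the active block into a case class through pureconfig. The following is a minimal sketch of that wiring; the mkt.udon.example package, MyJobConfig, and its two fields are illustrative placeholders rather than classes from this repository, and the active block in application.conf is assumed to define matching keys.

package mkt.udon.example

import mkt.udon.core.common.Environment
import pureconfig.generic.auto._

// Illustrative config class: field names must match the HOCON keys of the active block (camelCase)
case class MyJobConfig(dynamoTable: String, expireDays: Int)

object MyJobConfigExample {
  def main(args: Array[String]): Unit = {
    // camelCase field mapping, needed because pureconfig defaults to kebab-case keys
    implicit val configHint = Environment.buildConfigHint[MyJobConfig]()

    // Loads the block selected by PIPELINE_MODE (the LOCAL block when the variable is unset)
    val config = Environment.getConfigOrThrow[MyJobConfig]()
    println(config)
  }
}

Because the application.conf files further down declare each key twice, once with a literal default and once with a ${?ENV_VAR} override, a deployment can adjust individual fields through environment variables without editing the file.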
/project-spark/module-core/src/main/scala/mkt/udon/core/common/TimeUtil.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.common 2 | 3 | import java.time.format.DateTimeFormatter 4 | import java.time.{Instant, LocalDate, LocalDateTime, ZoneOffset} 5 | 6 | object TimeUtil { 7 | 8 | /** 9 | * @param partition 'yyyyMMdd' formatted String 10 | */ 11 | def convertPartitionToDateString(partition: String): String = { 12 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd") 13 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy-MM-dd") 14 | val parsed = LocalDate.parse(partition, formatterInput) 15 | 16 | return parsed.format(formatterOutput) 17 | } 18 | 19 | /** 20 | * @param partition 'yyyyMMdd' formatted String 21 | */ 22 | def convertPartitionToDateSlashString(partition: String): String = { 23 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd") 24 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy/MM/dd") 25 | val parsed = LocalDate.parse(partition, formatterInput) 26 | 27 | return parsed.format(formatterOutput) 28 | } 29 | 30 | /** 31 | * @param partition 'yyyyMMdd' formatted String 32 | */ 33 | def convertPartitionToSqlTimestamp(partition: String): java.sql.Timestamp = { 34 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd") 35 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy/MM/dd") 36 | val parsed = LocalDate.parse(partition, formatterInput).atStartOfDay() 37 | 38 | return java.sql.Timestamp.valueOf(parsed) 39 | } 40 | 41 | /** 42 | * @param raw Assume the passed parameter has UTC timezone 43 | */ 44 | def convertStringToEpochMillis(raw: String): Long = { 45 | val formatterInput = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") 46 | val parsed = LocalDateTime.parse(raw.substring(0, 19), formatterInput) 47 | 48 | return parsed.atZone(ZoneOffset.UTC).toInstant.toEpochMilli 49 | } 50 | 51 | def getExpireEpochSeconds(expireDays: Int): Long = { 52 | val updatedAt = Instant.now().toEpochMilli 53 | val expireTtl = (updatedAt + (expireDays * 86400 * 1000)) / 1000 54 | return expireTtl 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/entity/ProductPool.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.entity 2 | 3 | case class ProductPoolElement(id: String, rank: Long) 4 | 5 | case class ProductPool(specifier: String, elements: List[ProductPoolElement], elementCount: Long) 6 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/entity/UserEvent.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.entity 2 | 3 | import mkt.udon.core.common.TimeUtil 4 | import org.json4s.{DefaultFormats, Formats} 5 | import org.json4s.jackson.Serialization 6 | 7 | case class UserEvent(eventTime: Long, eventType: String, userId: String, productId: String, price: Double) { 8 | def convertToUserEventView(): UserEventView = { 9 | UserEventView(eventTime, productId) 10 | } 11 | 12 | def convertToUserEventOrder(): UserEventOrder = { 13 | UserEventOrder(eventTime, productId, price) 14 | } 15 | } 16 | 17 | case class UserEventRaw(event_time: String, event_type: String, product_id: String, price: Double, user_id: String) { 18 | def convert(): UserEvent = { 19 | val eventTime = 
TimeUtil.convertStringToEpochMillis(event_time) 20 | UserEvent(eventTime = eventTime, eventType = event_type, userId = user_id, productId = product_id, price = price) 21 | } 22 | } 23 | 24 | object UserEvent { 25 | def convertFromRaw(raw: String): UserEvent = { 26 | implicit val default: Formats = DefaultFormats.preservingEmptyValues 27 | val parsed = Serialization.read[UserEventRaw](raw) 28 | parsed.convert() 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/entity/UserProfile.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.entity 2 | 3 | import mkt.udon.core.entity.UserProfile.{EVENT_ORDER, EVENT_VIEW} 4 | 5 | /** 6 | * User Profile 에 저장될 View Event 입니다. 7 | */ 8 | case class UserEventView(eventTime: Long, productId: String) 9 | /** 10 | * User Profile 에 저장될 Order Event 입니다. 11 | */ 12 | case class UserEventOrder(eventTime: Long, productId: String, price: Double) 13 | 14 | /** 15 | * Dynamo 등의 Storage 에 저장될 수 있는 User Profile 입니다. 16 | * 17 | * totalOrderPrice 와 같이 사용자에 대한 전체 이벤트에 집계를 수행할수도 있습니다. 18 | * eventOrder 등의 경우에는 List 타입이고 무한히 늘어날 수 없으므로 최근 N 개만 저장합니다. 19 | * 20 | * @param specifier 사용자 ID 21 | * @param eventView 최근 상품 방문 이벤트 목록 22 | * @param eventOrder 최근 상품 주문 이벤트 목록 23 | */ 24 | case class UserProfile(specifier: String, 25 | 26 | var eventView: List[UserEventView] = List(), 27 | var eventOrder: List[UserEventOrder] = List()) { 28 | 29 | def update(userEvent: UserEvent, 30 | maxCountView: Int, maxCountOrder: Int): UserProfile = { 31 | 32 | if (userEvent.eventType == EVENT_VIEW) handleView(userEvent.convertToUserEventView(), maxCountView) 33 | else if (userEvent.eventType == EVENT_ORDER) handleOrder(userEvent.convertToUserEventOrder(), maxCountOrder) 34 | 35 | return this 36 | } 37 | 38 | def handleView(eventRecent: UserEventView, maxCount: Int) = { 39 | val merged = (eventView :+ eventRecent) 40 | val sorted = merged.sortBy(x => -x.eventTime).take(maxCount) 41 | 42 | eventView = sorted 43 | } 44 | 45 | def handleOrder(eventRecent: UserEventOrder, maxCount: Int) = { 46 | val merged = (eventOrder :+ eventRecent) 47 | val sorted = merged.sortBy(x => -x.eventTime).take(maxCount) 48 | 49 | eventOrder = sorted 50 | } 51 | 52 | } 53 | 54 | object UserProfile { 55 | val EVENT_VIEW = "view" 56 | val EVENT_ORDER = "order" 57 | 58 | def buildEmpty(userId: String): UserProfile = { 59 | UserProfile(specifier = userId, eventView = List(), eventOrder = List()) 60 | } 61 | } -------------------------------------------------------------------------------- /project-spark/module-infra-spark/build.gradle: -------------------------------------------------------------------------------- 1 | dependencies { 2 | 3 | // shared 4 | implementation project(path: ':module-core') 5 | 6 | // TODO: 클러스터모드에서 동작시에는 기본 라이브러리는 Jar 에 포함될 필요가 없습니다. 7 | // spark: 8 | // - use provided dependencies when building in CI. 9 | // - use compile dependencies for local testing 10 | // if (System.env.PIPELINE_BRANCH) { 11 | // ... 
12 | // } 13 | 14 | api("org.apache.spark:spark-core_${scalaSparkVersion}:${sparkVersion}") { 15 | } 16 | api("org.apache.spark:spark-sql_${scalaSparkVersion}:${sparkVersion}") 17 | api("org.apache.spark:spark-hive_${scalaSparkVersion}:${sparkVersion}") 18 | api("org.apache.spark:spark-mllib_${scalaSparkVersion}:${sparkVersion}") 19 | api("org.apache.spark:spark-streaming_${scalaSparkVersion}:${sparkVersion}") 20 | api("org.apache.spark:spark-avro_${scalaSparkVersion}:${sparkVersion}") 21 | api("org.apache.spark:spark-sql-kafka-0-10_${scalaSparkVersion}:${sparkVersion}") 22 | 23 | api("org.apache.hadoop:hadoop-aws:${awsHadoopVersion}") 24 | api("org.apache.spark:spark-hadoop-cloud_${scalaSparkVersion}:${sparkVersion}") 25 | api("com.amazonaws:aws-java-sdk:${awsSdkVersion}") 26 | 27 | api("mysql:mysql-connector-java:${mysqlDriverVersion}") 28 | 29 | api("org.apache.avro:avro:$avroVersion") 30 | api("org.apache.kafka:kafka-clients:${kafkaClientVersion}") 31 | api("za.co.absa:abris_${scalaSparkVersion}:4.2.0") 32 | api("io.confluent:kafka-avro-serializer:$confluentVersion") { 33 | exclude group: "org.apache.kafka", module: "kafka-clients" 34 | } 35 | } -------------------------------------------------------------------------------- /project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/SparkBase.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark 2 | 3 | import mkt.udon.core.common.Environment 4 | import org.apache.log4j.LogManager 5 | import org.apache.spark.sql.SparkSession 6 | 7 | trait SparkBase { 8 | 9 | val logger = LogManager.getRootLogger 10 | var session: SparkSession = null 11 | 12 | def driver(session: SparkSession): Unit 13 | 14 | def buildSession(): SparkSession = { 15 | var sessionBuilder = SparkSession.builder().enableHiveSupport() 16 | 17 | if (Environment.isLocalMode()) { 18 | sessionBuilder = sessionBuilder.master("local[*]") 19 | sessionBuilder = sessionBuilder.config("spark.sql.crossJoin.enabled", true) 20 | 21 | } 22 | 23 | session = sessionBuilder.getOrCreate() 24 | setupHadoopEnvironment(session) 25 | 26 | session 27 | } 28 | 29 | /** 30 | * 실제 Production 환경에서는 31 | * - 설정은 Cluster 의 spark-defaults.conf 환경을 따릅니다. 32 | * - AWS Key 는 Machine 의 IAM Role 을 이용합니다. 33 | * 34 | * 아래 코드에서는 로컬 테스팅을 위해 해당 설정들을 직접 세팅합니다. 
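   *
   * In short: on a real cluster these settings come from spark-defaults.conf and the machine's
   * IAM role, so this method returns immediately outside local mode; the commented-out
   * fs.s3a.access.key / fs.s3a.secret.key lines below are only a convenience for local runs.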
35 | */ 36 | def setupHadoopEnvironment(session: SparkSession): Unit = { 37 | if (!Environment.isLocalMode()) return 38 | 39 | val hadoopConf = session.sparkContext.hadoopConfiguration 40 | 41 | hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 42 | hadoopConf.set("fs.s3.canned.acl", "BucketOwnerFullControl") 43 | // hadoopConf.set("fs.s3a.access.key", accessKey) 44 | // hadoopConf.set("fs.s3a.secret.key", secretKey) 45 | } 46 | 47 | def main(args: Array[String]): Unit = { 48 | session = buildSession() 49 | 50 | try { 51 | driver(session) 52 | } catch { 53 | case t: Throwable => 54 | logger.error("Application failed due to", t) 55 | session.stop() 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/common/Partition.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.common 2 | 3 | object Partition { 4 | val PARTITION_KEY = "part" 5 | } 6 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/DynamoSink.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.storage 2 | 3 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder 4 | import com.amazonaws.services.dynamodbv2.document.{DynamoDB, Item, Table} 5 | import mkt.udon.core.common.TimeUtil 6 | import org.apache.spark.sql.Dataset 7 | import org.json4s.jackson.JsonMethods.parse 8 | import org.json4s.jackson.Serialization.write 9 | import org.json4s.{DefaultFormats, Extraction, FieldSerializer, Formats, JLong, JObject} 10 | 11 | import java.time.Instant 12 | 13 | object DynamoSink { 14 | 15 | def writePartition[T](dynamoTable: String, 16 | dynamoRegion: String, 17 | expireDays: Int, 18 | dsTarget: Dataset[T], 19 | expireFieldName: String = "expireTtl", 20 | updateFieldName: String = "updatedAt" 21 | )(implicit m: Manifest[T]): Unit = { 22 | 23 | dsTarget.foreachPartition((iter: Iterator[T]) => { 24 | val dynamoClient = AmazonDynamoDBClientBuilder.standard().withRegion(dynamoRegion).build(); 25 | val dynamoDB = new DynamoDB(dynamoClient) 26 | val client = dynamoDB.getTable(dynamoTable) 27 | 28 | while (iter.hasNext) { 29 | val cur = iter.next() 30 | implicit val default: Formats = DefaultFormats.preservingEmptyValues + FieldSerializer[T]() 31 | 32 | val updatedAt = Instant.now().toEpochMilli 33 | val expireTtl = TimeUtil.getExpireEpochSeconds(expireDays) 34 | 35 | val json = Extraction.decompose(cur) 36 | .merge(JObject(updateFieldName -> JLong(updatedAt))) 37 | .merge(JObject(expireFieldName -> JLong(expireTtl))) 38 | .snakizeKeys 39 | 40 | val stringified = write(json) 41 | val request = Item.fromJSON(stringified) 42 | 43 | client.putItem(request) 44 | } 45 | }) 46 | } 47 | 48 | def putItem[A](dynamoClient: Table, 49 | item: A, 50 | expireDays: Int, 51 | expireFieldName: String = "expireTtl", 52 | updateFieldName: String = "updatedAt")(implicit m: Manifest[A]): Unit = { 53 | 54 | // FieldSerializer 는 `private` 필드 사용시 패키지 명 까지 필드 이름에 포함되므로 사용에 유의 55 | // Scala Enum 값 변환을 위해서는 EnumNameSerializer 가 필요하나 저장용 Case Class 에서 일반적으로 String 으로 사용 56 | implicit val default: Formats = DefaultFormats.preservingEmptyValues + FieldSerializer[A]() 57 | 58 | val updatedAt = Instant.now().toEpochMilli 59 | val expireTtl = 
TimeUtil.getExpireEpochSeconds(expireDays) 60 | 61 | val json = Extraction.decompose(item) 62 | .merge(JObject("updatedAt" -> JLong(updatedAt))) 63 | .merge(JObject("expireTtl" -> JLong(expireTtl))) 64 | .camelizeKeys 65 | 66 | val stringified = write(json) 67 | val request = Item.fromJSON(stringified) 68 | 69 | dynamoClient.putItem(request) 70 | } 71 | 72 | def getItem[A](dynamoClient: Table, 73 | keyName: String, keyValue: String)(implicit m: Manifest[A]): Option[A] = { 74 | 75 | val responseItem = dynamoClient.getItem(keyName, keyValue) 76 | 77 | if (responseItem == null) None 78 | else { 79 | implicit val format = DefaultFormats.preservingEmptyValues 80 | val raw = responseItem.toJSON 81 | val parsed = parse(raw).camelizeKeys 82 | val converted = parsed.extract[A] 83 | Some(converted) 84 | } 85 | } 86 | 87 | def buildClient(dynamoTable: String, dynamoRegion: String): Table = { 88 | val dynamoClient = AmazonDynamoDBClientBuilder.standard().withRegion(dynamoRegion).build(); 89 | val dynamoDB = new DynamoDB(dynamoClient) 90 | val client = dynamoDB.getTable(dynamoTable) 91 | return client 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/JdbcSink.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.storage 2 | 3 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 4 | 5 | import java.sql.{Connection, DriverManager} 6 | 7 | object JdbcSink { 8 | 9 | val DRIVER = "com.mysql.cj.jdbc.Driver" 10 | 11 | def write(session: SparkSession, dfTarget: Dataset[Row], 12 | jdbcUrl: String, jdbcTable: String, 13 | jdbcUsername: String, jdbcPassword: String, 14 | ): Unit = { 15 | 16 | dfTarget 17 | .write 18 | .mode("append") 19 | .format("jdbc") 20 | .option("driver", DRIVER) 21 | .option("url", jdbcUrl) 22 | .option("user", jdbcUsername) 23 | .option("password", jdbcPassword) 24 | .option("dbtable", jdbcTable) 25 | .option("truncate", "false") 26 | .save() 27 | } 28 | 29 | def delete(jdbcUrl: String, jdbcTable: String, 30 | jdbcUsername: String, jdbcPassword: String, 31 | partitionColName: String, partitionColValue: java.sql.Timestamp): Unit = { 32 | 33 | var connection: Connection = null 34 | 35 | try { 36 | Class.forName(DRIVER) 37 | connection = DriverManager.getConnection(jdbcUrl, jdbcUsername, jdbcPassword) 38 | 39 | // remove rows which are already existing and having the same partition value 40 | val query = s"DELETE FROM ${jdbcTable} WHERE `${partitionColName}` = ?" 
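      // Binding the partition value through a PreparedStatement (below) avoids hand-formatting the
      // timestamp into the SQL text; combined with the append-mode write() above, this
      // delete-then-insert sequence makes re-running the same partition idempotent.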
41 | val preparedStatement = connection.prepareStatement(query) 42 | preparedStatement.setTimestamp(1, partitionColValue) 43 | preparedStatement.execute() 44 | 45 | } catch { 46 | case e: Exception => 47 | throw e; 48 | 49 | } finally { 50 | if (connection != null) connection.close() 51 | } 52 | 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/ParquetSink.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.storage 2 | 3 | import mkt.udon.core.common.TimeUtil 4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 5 | 6 | object ParquetSink { 7 | 8 | 9 | def write(session: SparkSession, 10 | dfTarget: DataFrame, 11 | parquetLocation: String, 12 | parquetSaveMode: SaveMode): Unit = { 13 | 14 | dfTarget 15 | .write 16 | .mode(parquetSaveMode) 17 | .options(Map( 18 | ("parquet.enable.dictionary", "true"), 19 | ("parquet.block.size", s"${32 * 1024 * 1024}"), 20 | ("parquet.page.size", s"${2 * 1024 * 1024}"), 21 | ("parquet.dictionary.page.size", s"${8 * 1024 * 1024}") 22 | )) 23 | .parquet(parquetLocation) 24 | } 25 | 26 | /** * 27 | * Partition Value 로 부터 저장할 Parquet Location 을 빌드합니다. 28 | * 29 | * @param s3Prefix 30 | * @param partitionValue yyyyMMdd 를 가정 31 | */ 32 | def buildLocation(prefix: String, partition: String): String = { 33 | val partitionPath = TimeUtil.convertPartitionToDateSlashString(partition) 34 | return s"${prefix}/${partitionPath}" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | VERSION = $(shell cat ./VERSION) 4 | MODULE = service-batch-discovery 5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact 6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar 7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar 8 | 9 | .PHONY: test 10 | test: 11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 12 | @ echo "" 13 | 14 | @ ../gradlew :$(MODULE):test 15 | 16 | .PHONY: build 17 | build: 18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 19 | @ echo "" 20 | 21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar 22 | 23 | .PHONY: deploy 24 | deploy: 25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)" 26 | @ echo "" 27 | 28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT) 29 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1-SNAPSHOT -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/build.gradle: -------------------------------------------------------------------------------- 1 | def versionValue = file("VERSION").text.trim() 2 | project.version = versionValue 3 | 4 | apply plugin: 'application' 5 | apply plugin: 'com.github.johnrengelman.shadow' 6 | 7 | dependencies { 8 | // shared 9 | implementation project(path: ':module-core') 10 | implementation project(path: ':module-infra-spark') 11 | 12 | // custom 13 | // TODO 14 | } 15 | 16 | mainClassName = 'test' 17 | run.classpath = sourceSets.main.runtimeClasspath 18 | 19 | jar { 20 | manifest { 21 | 
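        // Written into META-INF/MANIFEST.MF so a deployed jar can be traced back to its module name,
        // the version read from the VERSION file above, and the JDK used to build it.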
attributes( 22 | "Implementation-Title": project.name, 23 | "Implementation-Version": project.version, 24 | "Build-Jdk": System.getProperty('java.version'), 25 | ) 26 | } 27 | 28 | } 29 | 30 | shadowJar { 31 | zip64 = true 32 | exclude 'META-INF/**' 33 | baseName = project.name 34 | mergeServiceFiles() 35 | } 36 | 37 | assemble.dependsOn(shadowJar) -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 
50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/resources/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | LOCAL { 2 | dynamoTable = "service-dev-product-pool" 3 | dynamoTable = ${?DYNAMO_TABLE} 4 | dynamoRegion = "ap-northeast-2" 5 | dynamoRegion = ${?DYNAMO_REGION} 6 | dynamoPartitionCount = 3 7 | dynamoPartitionCount = ${?DYNAMO_PARTITION_COUNT} 8 | 9 | parquetPrefix = "s3://practical-data-pipeline/udon-data-lake/udon-db/property_product_pool" 10 | parquetPrefix = ${?PARQUET_PREFIX} 11 | parquetWriteMode = "Overwrite" 12 | parquetWriteMode = ${?PARQUET_WRITE_MODE} 13 | parquetPartitionCount = 2 14 | parquetPartitionCount = ${?PARQUET_PARTITION_COUNT} 15 | 16 | partitionSnapshot = "20200201" 17 | partitionSnapshot = ${?PARTITION_SNAPSHOT} 18 | partitionMetricStart = "20200201" 19 | partitionMetricStart = ${?PARTITION_METRIC_START} 20 | partitionMetricEnd = "20200201" 21 | partitionMetricEnd = ${?PARTITION_METRIC_END} 22 | 23 | maxElementCount = 20 24 | maxElementCount = ${?MAX_ELEMENT_COUNT} 25 | expireDays = 10 26 | expireDays = ${?EXPIRE_DAYS} 27 | } 28 | 29 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=INFO, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n 24 | 25 | # Set the default spark-shell/spark-sql log level to WARN. When running the 26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite 27 | # the root logger's log level, so that the user can have different defaults 28 | # for the shell and regular Spark apps. 29 | log4j.logger.org.apache.spark.repl.Main=WARN 30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN 31 | 32 | # Settings to quiet third party logs that are too verbose 33 | log4j.logger.org.sparkproject.jetty=WARN 34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 37 | log4j.logger.org.apache.parquet=ERROR 38 | log4j.logger.parquet=ERROR 39 | 40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 43 | 44 | # For deploying Spark ThriftServer 45 | # SPARK-34128?Suppress undesirable TTransportException warnings involved in THRIFT-4805 46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter 47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message 48 | log4j.appender.console.filter.1.AcceptOnMatch=false -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/scala/mkt/udon/UdonProductPoolBatch.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonProductPoolBatchConfig 4 | import mkt.udon.core.common.Environment 5 | import mkt.udon.entity.UdonProductPoolEntity 6 | import mkt.udon.infra.spark.SparkBase 7 | import mkt.udon.infra.spark.storage.{DynamoSink, ParquetSink} 8 | import org.apache.log4j.LogManager 9 | import org.apache.spark.sql.functions.lit 10 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 11 | import pureconfig.generic.auto._ 12 | 13 | object UdonProductPoolBatch extends SparkBase { 14 | override val logger = LogManager.getLogger(this.getClass.getName) 15 | 16 | override def driver(session: SparkSession): Unit = { 17 | 18 | /** 19 | * 환경변수 추출 및 설정 20 | */ 21 | implicit val configHint = Environment.buildConfigHint[UdonProductPoolBatchConfig]() 22 | val config = Environment.getConfigOrThrow[UdonProductPoolBatchConfig] 23 | 24 | /** 25 | * 데이터 추출 및 가공 26 | */ 27 | val partitionSnapshot = config.partitionSnapshot 28 | val 
partitionMetricStart = config.partitionMetricStart 29 | val partitionMetricEnd = config.partitionMetricEnd 30 | val dfUserEvent = readUserEvent(session = session, 31 | partitionMetricStart = partitionMetricStart, partitionMetricEnd = partitionMetricEnd) 32 | 33 | val dsResult = UdonProductPoolEntity.convert( 34 | session, 35 | dfUserEvent = dfUserEvent, 36 | maxElementCount = config.maxElementCount) 37 | 38 | /** 39 | * 데이터 저장: Parquet 40 | * 41 | * `part` 를 파티션 컬럼으로 지정해 추가합니다. 42 | * Hive Static Partitioning 을 이용하면 Hive 로 읽을 경우엔 파티셔닝 컬럼이 자동으로 SELECT 시에 붙지만, 43 | * Parquet 를 직접 읽을 경우엔 존재하지 않으므로 Parquet 를 직접 읽는 사용자를 위해 추가합니다. 44 | */ 45 | val dfPersistedParquet = dsResult.withColumn("part", lit(partitionSnapshot)) 46 | .repartition(config.parquetPartitionCount) 47 | val parquetLocation = ParquetSink.buildLocation(config.parquetPrefix, partitionSnapshot) 48 | ParquetSink.write(session, dfPersistedParquet, parquetLocation, SaveMode.valueOf(config.parquetWriteMode)) 49 | 50 | /** 51 | * 데이터 저장: Dynamo 52 | */ 53 | DynamoSink.writePartition(config.dynamoTable, config.dynamoRegion, config.expireDays, dsResult) 54 | } 55 | 56 | def readUserEvent(session: SparkSession, 57 | partitionMetricStart: String, partitionMetricEnd: String): DataFrame = { 58 | 59 | if (Environment.isLocalMode()) { 60 | val resourcePath = getClass.getClassLoader.getResource("ecommerce.csv").getPath 61 | 62 | val df = session.read.format("csv") 63 | .option("inferSchema", "true") 64 | .option("header", "true") 65 | .load(resourcePath) 66 | 67 | return df 68 | } 69 | 70 | return session.sql( 71 | s""" 72 | |SELECT * 73 | |FROM airbnb_db.user_client_event 74 | |WHERE part BETWEEN ${partitionMetricStart} AND ${partitionMetricEnd} 75 | |""".stripMargin) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/scala/mkt/udon/config/UdonProductPoolBatchConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonProductPoolBatchConfig(dynamoTable: String, dynamoRegion: String, dynamoPartitionCount: String, 4 | parquetPrefix: String, parquetWriteMode: String, parquetPartitionCount: Int, 5 | partitionSnapshot: String, 6 | partitionMetricStart: String, 7 | partitionMetricEnd: String, 8 | maxElementCount: Int, expireDays: Int) 9 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/scala/mkt/udon/entity/UdonProductPoolEntity.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.entity 2 | 3 | import mkt.udon.core.entity.{ProductPool, ProductPoolElement} 4 | import org.apache.spark.sql.expressions.Window 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 7 | 8 | object UdonProductPoolEntity { 9 | 10 | val EVENT_VIEW = "view" 11 | val EVENT_CART = "cart" 12 | val EVENT_ORDER = "purchase" 13 | 14 | def convert(session: SparkSession, dfUserEvent: DataFrame, 15 | maxElementCount: Int): Dataset[ProductPool] = { 16 | 17 | import session.implicits._ 18 | 19 | val dfFiltered = dfUserEvent.selectExpr("product_id", "user_id", "user_session") 20 | .where(col("event_type").isInCollection(List(EVENT_VIEW))) 21 | 22 | /** 23 | * 상품과 상품들을 연관짓기 위해 사용자 Session 을 사용합니다. 
이 방법의 기본적인 가정은 24 | * - 사용자가 의도를 가진 채로 상품을 탐색하는 하나의 Session 동안에는 '연관된 상품' 을 보았을 거라 가정하고 25 | * - 하나의 세션 내에서 같이 본 상품은 사용자 관점에서 유의미 하게 비슷할거라는 가설을 가지고 있습니다. 26 | * 27 | * 서비스에 나가는 추천들은 실제로는 더 복잡한 모델을 이용하고 정제되어 있는 많은 Feature 를 이용하지만, 28 | * 여기에서는 가장 기본적인 데이터 가공을 통해 상품 Pool 을 구성하기 위해 위에서 언급한 방법을 이용합니다. 29 | * 30 | * 이 방법을 응용하면 Search Together, View Together, Cart Together, Order Together 와 같은 상품 Pool 을 만들 수 있습니다. 31 | * 혹은 각각의 Unique Session ID 혹은 Unique User ID, 단순 Count 등을 Feature 로 삼아 통계적으로 각 Feature 의 비율을 조합해 내보낼 수도 있습니다. 32 | * 33 | * 사용자의 행위 기반 외에도 도메인이 숙박이라면 상품 메타 정보 (거리, 가격) 등의 메트릭 유사도를 추가할 수 있습니다. 34 | */ 35 | val dfJoined = dfFiltered.alias("L") 36 | .join( 37 | dfFiltered.alias("R"), 38 | col("L.user_session") === col("R.user_session") && 39 | col("L.product_id") =!= col("R.product_id"), 40 | "inner" 41 | ) 42 | .selectExpr( 43 | "L.product_id as product_id", 44 | "R.product_id as product_id_other", 45 | "L.user_session" 46 | ) 47 | 48 | // 순위 생성 및 maxElementCount 를 이용해 필터링 49 | val windowRank = Window.partitionBy(col("product_id")).orderBy(col("count_session_uniq").desc) 50 | val dfGrouped = dfJoined 51 | .groupBy("product_id", "product_id_other") 52 | .agg(countDistinct("user_session").as("count_session_uniq")) 53 | .withColumn("rank", row_number().over(windowRank)) 54 | .where(col("rank") <= lit(maxElementCount)) 55 | 56 | // 배열로 만들기 위해 UDF 를 통해 Case Class 로 변경 57 | // 주의사항: Spark 의 'collect_list' 는 순서를 보존하지 않으므로 Rank 값 없이 리스트화 하면 상품의 순서가 보존되지 않을 수 있습니다. 58 | val udfElementize = udf((id: String, rank: Long) => 59 | ProductPoolElement(id = id, rank = rank)) 60 | val dfConverted = dfGrouped 61 | .withColumn("element", udfElementize(col("product_id_other"), col("rank"))) 62 | .groupBy("product_id") 63 | .agg(collect_list("element").as("elements"), count("*").as("element_count")) 64 | 65 | 66 | return dfConverted.selectExpr("product_id as specifier", "elements", "element_count as elementCount").as[ProductPool] 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | VERSION = $(shell cat ./VERSION) 4 | MODULE = service-batch-statistics 5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact 6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar 7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar 8 | 9 | .PHONY: test 10 | test: 11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 12 | @ echo "" 13 | 14 | @ ../gradlew :$(MODULE):test 15 | 16 | .PHONY: build 17 | build: 18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 19 | @ echo "" 20 | 21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar 22 | 23 | .PHONY: deploy 24 | deploy: 25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)" 26 | @ echo "" 27 | 28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT) 29 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1-SNAPSHOT -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/build.gradle: -------------------------------------------------------------------------------- 1 | def versionValue = file("VERSION").text.trim() 2 | project.version = versionValue 3 | 4 | apply plugin: 
'application' 5 | apply plugin: 'com.github.johnrengelman.shadow' 6 | 7 | dependencies { 8 | // shared 9 | implementation project(path: ':module-core') 10 | implementation project(path: ':module-infra-spark') 11 | } 12 | 13 | mainClassName = 'test' 14 | run.classpath = sourceSets.main.runtimeClasspath 15 | 16 | jar { 17 | manifest { 18 | attributes( 19 | "Implementation-Title": project.name, 20 | "Implementation-Version": project.version, 21 | "Build-Jdk": System.getProperty('java.version'), 22 | ) 23 | } 24 | 25 | } 26 | 27 | shadowJar { 28 | zip64 = true 29 | exclude 'META-INF/**' 30 | baseName = project.name 31 | // Spark SQL Streaming 은 META-INF 를 조합해 Datasource 여부를 판별하므로 Uber Jar 로는 해결이 불가능하고, 32 | // - https://stackoverflow.com/questions/48011941/why-does-formatkafka-fail-with-failed-to-find-data-source-kafka-even-wi 33 | // - https://stackoverflow.com/questions/32887966/shadow-plugin-gradle-what-does-mergeservicefiles-do 34 | mergeServiceFiles() 35 | } 36 | 37 | assemble.dependsOn(shadowJar) -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/resources/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | LOCAL { 2 | jdbcHost = "localhost" 3 | jdbcHost = ${?JDBC_HOST} 4 | jdbcPort = 3306 5 | jdbcPort = ${?JDBC_PORT} 6 | jdbcUsername = "root" 7 | jdbcUsername = ${?JDBC_USERNAME} 8 | jdbcPassword = "root" 9 | jdbcPassword = ${?JDBC_PASSWORD} 10 | jdbcSchema = "pipeline" 11 | jdbcSchema = ${?JDBC_SCHEMA} 12 | jdbcTable = "property_stat" 13 | jdbcTable = ${?JDBC_TABLE} 14 | jdbcPartitionCount = 2 15 | jdbcPartitionCount = ${?JDBC_PARTITION_COUNT} 16 | 17 | parquetPrefix = "s3://practical-data-pipeline/udon-data-lake/udon-db/property_stat" 18 | parquetPrefix = ${?PARQUET_PREFIX} 19 | parquetWriteMode = "Overwrite" 20 | parquetWriteMode = ${?PARQUET_WRITE_MODE} 21 | parquetPartitionCount = 5 22 | parquetPartitionCount = ${?PARQUET_PARTITION_COUNT} 23 | 24 | partitionSnapshot = "20191129" 25 | partitionSnapshot = ${?PARTITION_SNAPSHOT} 26 | } 27 | 28 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=INFO, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n 24 | 25 | # Set the default spark-shell/spark-sql log level to WARN. When running the 26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite 27 | # the root logger's log level, so that the user can have different defaults 28 | # for the shell and regular Spark apps. 29 | log4j.logger.org.apache.spark.repl.Main=WARN 30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN 31 | 32 | # Settings to quiet third party logs that are too verbose 33 | log4j.logger.org.sparkproject.jetty=WARN 34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 37 | log4j.logger.org.apache.parquet=ERROR 38 | log4j.logger.parquet=ERROR 39 | 40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 43 | 44 | # For deploying Spark ThriftServer 45 | # SPARK-34128?Suppress undesirable TTransportException warnings involved in THRIFT-4805 46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter 47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message 48 | log4j.appender.console.filter.1.AcceptOnMatch=false -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/scala/mkt/udon/UdonStatBatch.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonStatBatchConfig 4 | import mkt.udon.core.common.{Environment, TimeUtil} 5 | import mkt.udon.entity.UdonStatEntity 6 | import mkt.udon.infra.spark.SparkBase 7 | import mkt.udon.infra.spark.storage.{JdbcSink, ParquetSink} 8 | import org.apache.log4j.LogManager 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 11 | import pureconfig.generic.auto._ 12 | 13 | object UdonStatBatch extends SparkBase { 14 | override val logger = LogManager.getLogger(this.getClass.getName) 15 | 16 | override def driver(session: SparkSession): Unit = { 17 | /** 18 | * 환경변수 추출 및 설정 19 | */ 20 | implicit val configHint = Environment.buildConfigHint[UdonStatBatchConfig]() 21 | val config = 
Environment.getConfigOrThrow[UdonStatBatchConfig] 22 | 23 | /** 24 | * 데이터 추출 및 가공 25 | */ 26 | val partition = config.partitionSnapshot 27 | val dfPropertyMeta = readPropertyMeta(partition, session) 28 | val dfPropertySales = readPropertySales(partition, session) 29 | val dfPropertyReview = readPropertyReview(partition, session) 30 | 31 | var dfResult = UdonStatEntity.convert(session, partition, 32 | dfPropertyMeta = dfPropertyMeta, 33 | dfPropertySales = dfPropertySales, 34 | dfPropertyReview = dfPropertyReview) 35 | 36 | // 사이즈가 작을 경우 추가적인 연산을 위해 캐싱할 수 있습니다. 37 | dfResult = dfResult.cache() 38 | 39 | /** 40 | * 데이터 저장: Parquet 41 | * 42 | * `part` 를 파티션 컬럼으로 지정해 추가합니다. 43 | * Hive Static Partitioning 을 이용하면 Hive 로 읽을 경우엔 파티셔닝 컬럼이 자동으로 SELECT 시에 붙지만, 44 | * Parquet 를 직접 읽을 경우엔 존재하지 않으므로 Parquet 를 직접 읽는 사용자를 위해 추가합니다. 45 | */ 46 | val dfPersistedParquet = dfResult.withColumn("part", lit(partition)) 47 | .repartition(config.parquetPartitionCount) 48 | val parquetLocation = ParquetSink.buildLocation(config.parquetPrefix, partition) 49 | ParquetSink.write(session, dfPersistedParquet, parquetLocation, SaveMode.valueOf(config.parquetWriteMode)) 50 | 51 | /** 52 | * 데이터 저장: JDBC 53 | * 54 | * `part` 를 파티션 컬럼으로 지정해 추가합니다. Hive 테이블과 달라질 수 있기 때문에 별도 가공을 수행합니다. 55 | */ 56 | val connectionUrl = s"jdbc:mysql://${config.jdbcHost}:${config.jdbcPort}/${config.jdbcSchema}" 57 | val partitionColumns = List(col("property_id")) 58 | 59 | val jdbcPartitionValue = TimeUtil.convertPartitionToSqlTimestamp(partition) 60 | val dfPersistedJdbc = dfResult.withColumn("part", lit(jdbcPartitionValue)) 61 | .repartition(config.jdbcPartitionCount, partitionColumns: _*) 62 | 63 | JdbcSink.delete(jdbcUrl = connectionUrl, jdbcTable = config.jdbcTable, 64 | jdbcUsername = config.jdbcUsername, jdbcPassword = config.jdbcPassword, 65 | partitionColName = "part", partitionColValue = jdbcPartitionValue 66 | ) 67 | 68 | JdbcSink.write(session, dfPersistedJdbc, 69 | jdbcUrl = connectionUrl, jdbcTable = config.jdbcTable, 70 | jdbcUsername = config.jdbcUsername, jdbcPassword = config.jdbcPassword) 71 | } 72 | 73 | def readPropertyMeta(partition: String, session: SparkSession): DataFrame = { 74 | 75 | if (Environment.isLocalMode()) { 76 | val resourcePath = getClass.getClassLoader.getResource("airbnb_listings.csv").getPath 77 | 78 | val df = session.read.format("csv") 79 | .option("inferSchema", "true") 80 | .option("header", "true") 81 | .option("quote", "\"") 82 | .option("escape", "\"") 83 | .option("sep", ",") 84 | .option("multiline", "true") 85 | .load(resourcePath) 86 | 87 | return df 88 | } 89 | 90 | return session.sql( 91 | s""" 92 | |SELECT * 93 | |FROM airbnb_db.property_meta 94 | |WHERE part = ${partition} 95 | |""".stripMargin) 96 | } 97 | 98 | def readPropertySales(partition: String, session: SparkSession): DataFrame = { 99 | 100 | if (Environment.isLocalMode()) { 101 | val resourcePath = getClass.getClassLoader.getResource("airbnb_calendar.csv").getPath 102 | 103 | val df = session.read.format("csv") 104 | .option("inferSchema", "true") 105 | .option("header", "true") 106 | .option("quote", "\"") 107 | .option("escape", "\"") 108 | .option("sep", ",") 109 | .option("multiline", "true") 110 | .load(resourcePath) 111 | 112 | return df 113 | } 114 | 115 | return session.sql( 116 | s""" 117 | |SELECT * 118 | |FROM airbnb_db.property_sales 119 | |WHERE part = ${partition} 120 | |""".stripMargin) 121 | } 122 | 123 | def readPropertyReview(partition: String, session: SparkSession): DataFrame = { 124 | 125 | if 
(Environment.isLocalMode()) { 126 | val resourcePath = getClass.getClassLoader.getResource("airbnb_reviews.csv").getPath 127 | 128 | val df = session.read.format("csv") 129 | .option("inferSchema", "true") 130 | .option("header", "true") 131 | .option("quote", "\"") 132 | .option("escape", "\"") 133 | .option("sep", ",") 134 | .option("multiline", "true") 135 | .load(resourcePath) 136 | 137 | return df 138 | } 139 | 140 | return session.sql( 141 | s""" 142 | |SELECT * 143 | |FROM airbnb_db.property_review 144 | |WHERE part = ${partition} 145 | |""".stripMargin) 146 | } 147 | 148 | /** 149 | * 과제: Hive Create Table DDL 을 Spark 를 이용해 실행해봅니다. 150 | * - 실행하기 위해 Hive Metastore 를 Docker Compose 로 띄우고 151 | * - Hive Metastore URI 를 설정해야 합니다. 152 | */ 153 | def createTable(config: UdonStatBatchConfig, session: SparkSession): Unit = { 154 | if (Environment.isLocalMode()) return 155 | 156 | // TODO: execute create table DDL 157 | } 158 | 159 | /** 160 | * 과제: Hive Create Table DDL 을 Spark 를 이용해 실행해봅니다. 161 | * - 실행하기 위해 Hive Metastore 를 Docker Compose 로 띄우고 162 | * - Hive Metastore URI 를 설정해야 합니다. 163 | */ 164 | def createPartition(config: UdonStatBatchConfig, session: SparkSession): Unit = { 165 | if (Environment.isLocalMode()) return 166 | 167 | // TODO: execute create partition DDL 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/scala/mkt/udon/config/UdonStatBatchConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonStatBatchConfig(jdbcHost: String, jdbcPort: Int, 4 | jdbcUsername: String, jdbcPassword: String, 5 | jdbcSchema: String, jdbcTable: String, 6 | jdbcPartitionCount: Int, 7 | parquetPrefix: String, parquetWriteMode: String, parquetPartitionCount: Int, 8 | partitionSnapshot: String) 9 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/scala/mkt/udon/entity/UdonStatEntity.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.entity 2 | 3 | import mkt.udon.core.common.TimeUtil 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | 8 | object UdonStatEntity { 9 | 10 | def convert(session: SparkSession, partition: String, 11 | dfPropertyMeta: DataFrame, 12 | dfPropertySales: DataFrame, 13 | dfPropertyReview: DataFrame): DataFrame = { 14 | 15 | val partitionDate = TimeUtil.convertPartitionToDateString(partition) 16 | 17 | /** 18 | * 상품 메타 19 | */ 20 | val dfMeta = dfPropertyMeta 21 | .selectExpr("CAST(id AS BIGINT) as property_id", "property_type", "latitude", "longitude") 22 | 23 | /** 24 | * 상품 메트릭 누적 (리뷰) 25 | */ 26 | val dfMetricReviewTotal = dfPropertyMeta 27 | .selectExpr("CAST(id AS BIGINT) as property_id", "number_of_reviews as count_review_all", "review_scores_rating as score_review_all") 28 | 29 | /** 30 | * 상품 메트릭 델타 (리뷰) 31 | */ 32 | val dfMetricReviewDelta = dfPropertyReview 33 | .selectExpr("CAST(listing_id AS BIGINT) as property_id", "CAST(date as DATE) as date") 34 | .where(col("date") === lit(partitionDate).cast(DateType)) 35 | .groupBy("property_id") 36 | .agg(count("*").as("count_review")) 37 | 38 | /** 39 | * 상품 메트릭 델타 (판매) 40 | */ 41 | val dfMetricSalesDelta = dfPropertySales 42 | .selectExpr("CAST(listing_id AS BIGINT) as property_id", "CAST(date as DATE) as date", "price as 
price_raw") 43 | .where(col("date") === lit(partitionDate).cast(DateType)) 44 | .where(col("available") === lit("f")) 45 | .withColumn("price", regexp_extract(col("price_raw"), "[0-9]+.[0-9]+", 0).cast(DoubleType)) 46 | .drop("price_raw") 47 | .groupBy("property_id") 48 | .agg( 49 | count("*").as("count_sales"), 50 | sum("price").as("price_sales") 51 | ) 52 | 53 | /** 54 | * 결과 데이터 프레임 내 2 가지 성격의 데이터가 섞여 있습니다. 55 | * - 누적 데이터 (전체 기간 내 최신 값) 56 | * - 일별 데이터 (해당 일에 대한 변동 값) 57 | * 58 | * 이 데이터를 하나의 결과 테이블로 만드는게 맞을지 / 아니면 Spark Application 과 테이블을 분리하는게 맞을지 논의해 봅시다. 59 | */ 60 | val dfJoined = dfMeta.alias("PROPERTY_META") 61 | .join(dfMetricReviewTotal.alias("METRIC_REVIEW_TOTAL"), 62 | col("PROPERTY_META.property_id") === col("METRIC_REVIEW_TOTAL.property_id"), "left") 63 | .join(dfMetricReviewDelta.alias("METRIC_REVIEW_DELTA"), 64 | col("PROPERTY_META.property_id") === col("METRIC_REVIEW_DELTA.property_id"), "left") 65 | .join(dfMetricSalesDelta.alias("METRIC_SALES_DELTA"), 66 | col("PROPERTY_META.property_id") === col("METRIC_SALES_DELTA.property_id"), "left") 67 | .selectExpr( 68 | "PROPERTY_META.property_id as property_id", 69 | "PROPERTY_META.property_type as property_type", 70 | "PROPERTY_META.latitude as lat", 71 | "PROPERTY_META.longitude as lng", 72 | 73 | "coalesce(METRIC_REVIEW_TOTAL.count_review_all, 0) as count_review_all", 74 | "coalesce(METRIC_REVIEW_TOTAL.score_review_all, 0.0) as score_review_all", 75 | 76 | "coalesce(METRIC_REVIEW_DELTA.count_review, 0) as count_review", 77 | 78 | "coalesce(METRIC_SALES_DELTA.count_sales, 0) as count_sales", 79 | "CAST(coalesce(METRIC_SALES_DELTA.price_sales, 0) AS BIGINT) as price_sales" 80 | ) 81 | 82 | return dfJoined 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | VERSION = $(shell cat ./VERSION) 4 | MODULE = service-stream-profile 5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact 6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar 7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar 8 | 9 | .PHONY: test 10 | test: 11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 12 | @ echo "" 13 | 14 | @ ../gradlew :$(MODULE):test 15 | 16 | .PHONY: build 17 | build: 18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 19 | @ echo "" 20 | 21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar 22 | 23 | .PHONY: deploy 24 | deploy: 25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)" 26 | @ echo "" 27 | 28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT) 29 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1-SNAPSHOT -------------------------------------------------------------------------------- /project-spark/service-stream-profile/build.gradle: -------------------------------------------------------------------------------- 1 | def versionValue = file("VERSION").text.trim() 2 | project.version = versionValue 3 | 4 | apply plugin: 'application' 5 | apply plugin: 'com.github.johnrengelman.shadow' 6 | 7 | dependencies { 8 | // shared 9 | implementation project(path: ':module-core') 10 | implementation project(path: ':module-infra-spark') 11 | } 12 | 13 | mainClassName = 'test' 14 | 
run.classpath = sourceSets.main.runtimeClasspath 15 | 16 | jar { 17 | manifest { 18 | attributes( 19 | "Implementation-Title": project.name, 20 | "Implementation-Version": project.version, 21 | "Build-Jdk": System.getProperty('java.version'), 22 | ) 23 | } 24 | 25 | } 26 | 27 | shadowJar { 28 | zip64 = true 29 | exclude 'META-INF/**' 30 | baseName = project.name 31 | // Spark SQL Streaming 은 META-INF 를 조합해 Datasource 여부를 판별하므로 Uber Jar 로는 해결이 불가능하고, 32 | // - https://stackoverflow.com/questions/48011941/why-does-formatkafka-fail-with-failed-to-find-data-source-kafka-even-wi 33 | // - https://stackoverflow.com/questions/32887966/shadow-plugin-gradle-what-does-mergeservicefiles-do 34 | mergeServiceFiles() 35 | } 36 | 37 | assemble.dependsOn(shadowJar) -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | LOCAL { 2 | UserProfileStream { 3 | checkpointLocation = "/tmp/spark-user-profile" 4 | dynamoTable = "service-dev-user-profile" 5 | dynamoTable = ${?DYNAMO_TABLE} 6 | dynamoRegion = "ap-northeast-2" 7 | dynamoRegion = ${?DYNAMO_REGION} 8 | dynamoExpireDays = 15 9 | dynamoExpireDays = ${?DYNAMO_EXPIRE_DAYS} 10 | dynamoPartitionCount = 3 11 | dynamoPartitionCount = ${?DYNAMO_PARTITION_COUNT} 12 | kafkaBroker = "localhost:9092" 13 | kafkaBroker = ${?KAFKA_BROKER} 14 | kafkaTopic = "user-event" 15 | kafkaTopic = ${?KAFKA_TOPIC} 16 | kafkaConsumerGroup= "user-profile" 17 | kafkaConsumerGroup = ${?KAFKA_CONSUMER_GROUP} 18 | kafkaOffsetStarting= "latest" 19 | kafkaOffsetStarting = ${?KAFKA_OFFSET_STARTING} 20 | maxCountView = 10 21 | maxCountView = ${?MAX_COUNT_VIEW} 22 | maxCountOrder = 10 23 | maxCountOrder = ${?MAX_COUNT_ORDER} 24 | } 25 | 26 | UserRelayStream { 27 | checkpointLocation = "/tmp/spark-user-relay" 28 | sourceKafkaBroker = "localhost:9092" 29 | sourceKafkaBroker = ${?SOURCE_KAFKA_BROKER} 30 | sourceKafkaTopic = "user-event" 31 | sourceKafkaTopic = ${?SOURCE_KAFKA_TOPIC} 32 | sourceKafkaConsumerGroup= "user-event-relay" 33 | sourceKafkaConsumerGroup = ${?SOURCE_KAFKA_CONSUMER_GROUP} 34 | sourceKafkaOffsetStarting= "latest" 35 | sourceKafkaOffsetStarting = ${?SOURCE_KAFKA_OFFSET_STARTING} 36 | 37 | sinkKafkaBroker = "localhost:9092" 38 | sinkKafkaBroker = ${?SINK_KAFKA_BROKER} 39 | sinkKafkaTopic = "user-event-relay" 40 | sinkKafkaTopic = ${?SINK_KAFKA_TOPIC} 41 | } 42 | } 43 | 44 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=INFO, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n 24 | 25 | # Set the default spark-shell/spark-sql log level to WARN. When running the 26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite 27 | # the root logger's log level, so that the user can have different defaults 28 | # for the shell and regular Spark apps. 29 | log4j.logger.org.apache.spark.repl.Main=WARN 30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN 31 | 32 | # Settings to quiet third party logs that are too verbose 33 | log4j.logger.org.sparkproject.jetty=WARN 34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 37 | log4j.logger.org.apache.parquet=ERROR 38 | log4j.logger.parquet=ERROR 39 | 40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 43 | 44 | # For deploying Spark ThriftServer 45 | # SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter 47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message 48 | log4j.appender.console.filter.1.AcceptOnMatch=false -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/UdonProfileStream.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonProfileStreamConfig 4 | import mkt.udon.core.common.Environment 5 | import mkt.udon.core.entity.UserEvent 6 | import mkt.udon.entity.UdonProfileStateFunc 7 | import mkt.udon.infra.spark.SparkBase 8 | import org.apache.log4j.LogManager 9 | import org.apache.spark.sql.functions.col 10 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 11 | import org.apache.spark.sql.{Dataset, SparkSession} 12 | import pureconfig.generic.auto._ 13 | 14 | object UdonProfileStream extends SparkBase { 15 | override val logger = LogManager.getLogger(this.getClass.getName) 16 | 17 | val APP = "UserProfileStream" 18 | 19 | override def driver(session: SparkSession): Unit = { 20 | import session.implicits._ 21 | 22 | /** 23 | * Extract and apply environment variables 24 | */ 25 | implicit val configHint = Environment.buildConfigHint[UdonProfileStreamConfig]() 26 | val config = Environment.getConfigOrThrowForApp[UdonProfileStreamConfig](APP) 27 | 28 | 29 | /** 30 | * Extract and transform data 31 | */ 32 | val dfRaw = session.readStream 33 | .format("kafka") 34 | .option("kafka.bootstrap.servers", config.kafkaBroker) 35 | .option("subscribe", config.kafkaTopic) 36 | .option("groupIdPrefix", config.kafkaConsumerGroup) 37 | .option("startingOffsets", config.kafkaOffsetStarting) 38 | .load()
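// The Kafka source above exposes a fixed schema regardless of the payload: `key` and `value` arrive as
// binary columns alongside `topic`, `partition`, `offset`, `timestamp` and `timestampType`, which is why
// the next step casts `value` to STRING before mapping it onto the case class.
// A minimal sketch of how the raw stream could be inspected locally, assuming the same broker/topic
// settings and a console sink (the `probe` query is illustrative only and not part of this job):
//
//   val probe = dfRaw
//     .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "topic", "partition", "offset")
//     .writeStream
//     .format("console")
//     .outputMode("append")
//     .start()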
39 | 40 | // Converts the stringified JSON into a case class. With Avro this conversion step would not be necessary. 41 | val dfConverted = dfRaw 42 | .selectExpr("CAST(value AS STRING)").as[String] 43 | .map(UserEvent.convertFromRaw) 44 | 45 | /** 46 | * Persist data 47 | */ 48 | val dfWritten = dfConverted.writeStream 49 | .queryName(APP) 50 | .trigger(Trigger.ProcessingTime("1 seconds")) 51 | .outputMode(OutputMode.Append()) 52 | .foreachBatch((dsUserEvent: Dataset[UserEvent], batchId: Long) => { 53 | // Repartition by user so that all events of a given user are processed within a single partition 54 | val repartitioned = dsUserEvent.repartition(config.dynamoPartitionCount, col("userId")) 55 | 56 | // Invoke the per-partition handler. 57 | repartitioned.foreachPartition((iter: Iterator[UserEvent]) => { 58 | UdonProfileStateFunc.handlePartition(config, iter) 59 | }) 60 | }) 61 | .option("checkpointLocation", config.checkpointLocation) 62 | .start() 63 | 64 | dfWritten.awaitTermination() 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/UdonRelayStream.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonRelayStreamConfig 4 | import mkt.udon.core.common.Environment 5 | import mkt.udon.core.entity.UserEvent 6 | import mkt.udon.infra.spark.SparkBase 7 | import org.apache.log4j.LogManager 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 10 | import pureconfig.generic.auto._ 11 | 12 | object UdonRelayStream extends SparkBase { 13 | override val logger = LogManager.getLogger(this.getClass.getName) 14 | 15 | val APP = "UserRelayStream" 16 | 17 | override def driver(session: SparkSession): Unit = { 18 | import session.implicits._ 19 | 20 | /** 21 | * Extract and apply environment variables 22 | */ 23 | implicit val configHint = Environment.buildConfigHint[UdonRelayStreamConfig]() 24 | val config = Environment.getConfigOrThrowForApp[UdonRelayStreamConfig](APP) 25 | 26 | /** 27 | * Extract and transform data 28 | */ 29 | val dfRaw = session.readStream 30 | .format("kafka") 31 | .option("kafka.bootstrap.servers", config.sourceKafkaBroker) 32 | .option("subscribe", config.sourceKafkaTopic) 33 | .option("groupIdPrefix", config.sourceKafkaConsumerGroup) 34 | .option("startingOffsets", config.sourceKafkaOffsetStarting) 35 | .load() 36 | 37 | // Converts the stringified JSON into a case class. With Avro this conversion step would not be necessary. 38 | val dfConverted = dfRaw 39 | .selectExpr("CAST(value AS STRING)").as[String] 40 | .map(UserEvent.convertFromRaw) 41 | 42 | /** 43 | * Persist data 44 | */ 45 | 46 | // Uses UserEvent.userId as the Kafka partition key.
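// Keying the output records by userId means Kafka's default partitioner sends all events of a given
// user to the same partition of the sink topic, so downstream consumers observe each user's events in order.
// `to_json(struct(*))` re-serializes every column of the converted event back into a JSON string for the
// `value` field, which is the shape the Kafka sink expects (a `key` column and a `value` column).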
47 | val dfJson = dfConverted.selectExpr("CAST(userId AS STRING) AS key", "to_json(struct(*)) AS value") 48 | 49 | val dfWritten = dfJson.writeStream 50 | .queryName(APP) 51 | .outputMode(OutputMode.Append()) 52 | .trigger(Trigger.Continuous("1 seconds")) 53 | .format("kafka") 54 | .option("kafka.bootstrap.servers", config.sinkKafkaBroker) 55 | .option("topic", config.sinkKafkaTopic) 56 | .option("checkpointLocation", config.checkpointLocation) 57 | .start() 58 | 59 | dfWritten.awaitTermination() 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/config/UdonProfileStreamConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonProfileStreamConfig(checkpointLocation: String, 4 | dynamoTable: String, 5 | dynamoRegion: String, 6 | dynamoExpireDays: Int, 7 | dynamoPartitionCount: Int, 8 | kafkaBroker: String, 9 | kafkaTopic: String, 10 | kafkaConsumerGroup: String, 11 | kafkaOffsetStarting: String, 12 | maxCountView: Int, 13 | maxCountOrder: Int 14 | ) 15 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/config/UdonRelayStreamConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonRelayStreamConfig(checkpointLocation: String, 4 | sourceKafkaBroker: String, 5 | sourceKafkaTopic: String, 6 | sourceKafkaConsumerGroup: String, 7 | sourceKafkaOffsetStarting: String, 8 | sinkKafkaBroker: String, 9 | sinkKafkaTopic: String) 10 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/entity/UdonProfileStateFunc.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.entity 2 | 3 | import mkt.udon.config.UdonProfileStreamConfig 4 | import mkt.udon.core.entity.{UserEvent, UserProfile} 5 | import mkt.udon.infra.spark.storage.DynamoSink 6 | 7 | object UdonProfileStateFunc { 8 | 9 | def handlePartition(config: UdonProfileStreamConfig, iter: Iterator[UserEvent]): Unit = { 10 | // Dynamo Client 생성 (@ThreadSafe) 11 | val dynamoClient = DynamoSink.buildClient(dynamoTable = config.dynamoTable, dynamoRegion = config.dynamoRegion) 12 | 13 | // 사용자 마다 그룹화 해 사용자별로 이벤트 시간순 정렬을 할 수 있도록 합니다. 14 | val groupedByUser = iter.toList.groupBy(u => u.userId) 15 | groupedByUser.foreach(kv => { 16 | val userId = kv._1 17 | val userEvents = kv._2.sortBy(x => -x.eventTime) // 시간순 내림차순 정렬 18 | 19 | // 사용자 Profile 을 Dynamo 에서 가져오고 없을 경우 만듭니다 20 | val existing = DynamoSink.getItem[UserProfile](dynamoClient, keyName = "specifier", userId) 21 | .getOrElse(UserProfile.buildEmpty(userId)) 22 | 23 | /** 24 | * 추가적으로 더 해볼 수 있는 최적화는, 사용자 이벤트 숫자를 미리 필터링 하는 것입니다. 25 | * 사용자 이벤트 100개 -> config.maxCount 에 의해 미리 필터링해 existing.update 호출 수를 제한할 수 있습니다. 26 | * 다만 사용자 이벤트에 따른 분기가 미리 일어나는 등 관련 로직을 작성해야 합니다 27 | */ 28 | userEvents.foreach(event => { 29 | existing.update(userEvent = event, maxCountView = config.maxCountView, maxCountOrder = config.maxCountOrder) 30 | }) 31 | 32 | /** 33 | * Stream 이나 Batch 가 여러개일 경우 Dynamo 테이블이 많아지면 API 입장에서 Dynamo Call 을 여러번해야 해 문제가 될 수 있습니다. 34 | * 이 때, 같은 성격의 데이터라면 Dynamo Table 을 공유하고 컬럼을 다르게 적재할 수 있습니다. 
35 | * 36 | * 예를 들어, User Profile Table 내에는 37 | * - Kafka 에서 당겨오는 User Event 를 바탕으로 적재하는 Stream User Profile 컬럼과 38 | * - 배치 기반으로 Segment 를 만들어 사용자의 Segment List 를 적재하는 Batch 용 User Profile 컬럼을 만들 수 있습니다. 39 | * - 이 때, Dynamo 1 개의 Row 사이즈에는 제한이 있으므로 너무 많은 컬럼으로 인해 데이터 사이즈가 넘치지 않도록 주의해야 합니다. 40 | * 41 | * 만약 다른 컬럼이 다른 스트림이나 배치에서 업데이트 된다면 Put 대신에 Dynamo Update (Upsert) 를 이용할 수 있습니다. 42 | * - https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_UpdateItem.html 43 | */ 44 | DynamoSink.putItem(dynamoClient, existing, config.dynamoExpireDays) 45 | }) 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /project-spark/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'project-spark' 2 | 3 | include ':module-core', 4 | ':module-infra-spark', 5 | ':service-stream-profile', 6 | ':service-batch-discovery', 7 | ':service-batch-statistics' 8 | 9 | -------------------------------------------------------------------------------- /project-terraform-aws/.gitignore: -------------------------------------------------------------------------------- 1 | *.hcl 2 | .terraform/ 3 | *.lock.info 4 | *.tfstate 5 | *.tfstate.backup 6 | __tf_state/ 7 | .idea/ 8 | 9 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/_aws-root-iam/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_local.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | environment_common = "common" 3 | environment_development = "development" 4 | environment_production = "production" 5 | 6 | region_seoul = "ap-northeast-2" 7 | 8 | team_data = "data" 9 | } -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_output.tf: -------------------------------------------------------------------------------- 1 | output "profile_id_bastion" { 2 | value = module.module-iam-common.profile_id_bastion 3 | } 4 | 5 | output "profile_arn_emr_instance" { 6 | value = module.module-iam-common.profile_arn_emr_instance 7 | } 8 | 9 | output "role_arn_emr_cluster" { 10 | value = module.module-iam-common.role_arn_emr_cluster 11 | } 12 | 13 | output "role_arn_emr_asg" { 14 | value = module.module-iam-common.role_arn_emr_asg 15 | } 16 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_provider.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = local.region_seoul 3 | } -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 | /** 12 | * 테스팅 목적으로 Terraform Backend 를 사용하지 않습니다 13 | */ 14 | 15 | backend "local" { 16 | path = "../__tf_state/_aws-root-iam/terraform.tfstate" 17 | } 18 | } 19 | 20 | 
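# Outside of local testing, a remote backend would normally be used so that state is shared and locked
# across operators. The block below is an illustrative sketch only; the bucket, key and lock-table names
# are placeholders and are not part of this repository.
#
# backend "s3" {
#   bucket         = "example-terraform-state"
#   key            = "aws-root-iam/terraform.tfstate"
#   region         = "ap-northeast-2"
#   dynamodb_table = "example-terraform-lock"
#   encrypt        = true
# }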
-------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/main_iam_common.tf: -------------------------------------------------------------------------------- 1 | module "module-iam-common" { 2 | source = "./module-iam-common" 3 | 4 | environment = local.environment_common 5 | } -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/_data.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy" "managed_dynamo_full" { 2 | arn = "arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess" 3 | } 4 | 5 | data "aws_iam_policy" "managed_kinesis_stream_full" { 6 | arn = "arn:aws:iam::aws:policy/AmazonKinesisFullAccess" 7 | } 8 | 9 | data "aws_iam_policy" "managed_data_scientist" { 10 | arn = "arn:aws:iam::aws:policy/job-function/DataScientist" 11 | } 12 | 13 | data "aws_iam_policy" "managed_s3_full" { 14 | arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess" 15 | } 16 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/_output.tf: -------------------------------------------------------------------------------- 1 | output "profile_id_bastion" { 2 | value = aws_iam_instance_profile.bastion.id 3 | } 4 | 5 | output "profile_arn_emr_instance" { 6 | value = aws_iam_instance_profile.emr_instance.arn 7 | } 8 | 9 | output "role_arn_emr_cluster" { 10 | value = aws_iam_role.emr_cluster.arn 11 | } 12 | 13 | output "role_arn_emr_asg" { 14 | value = aws_iam_role.emr_asg.arn 15 | } 16 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/_variable.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/common.basic.iam.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | instance_purpose_basic = "ec2-basic" 3 | } 4 | 5 | # 6 | # Role, Instance Profile 7 | # 8 | 9 | resource "aws_iam_role" "basic" { 10 | name = "${lower(var.environment)}-${local.instance_purpose_basic}" 11 | 12 | assume_role_policy = <> /var/spool/cron/${user} 17 | chown ${user}:${user} /var/spool/cron/${user} 18 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-bastion/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 | /** 12 | * 테스팅 목적으로 Terraform Backend 를 사용하지 않습니다 13 | */ 14 | 15 | backend "local" { 16 | path = "../__tf_state/_aws-root-machine-bastion/terraform.tfstate" 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-bastion/main_bastion_dev.tf: -------------------------------------------------------------------------------- 1 | module "module-bastion-data-dev" { 2 | source = "./module-bastion-data-dev" 3 | 4 | environment = local.environment_development 5 | team = local.team_data 6 | 7 | bastion_ami = data.aws_ami.amazon_linux_2.id 8 | bastion_profile = 
data.terraform_remote_state.root_iam.outputs.profile_id_bastion 9 | bastion_keypair = local.keypair_infra 10 | 11 | bastion_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_bastion_public_data_dev 12 | 13 | bastion_subnet_id = data.terraform_remote_state.root_vpc.outputs.subnet_id_public_az_a_data_dev 14 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-bastion/module-bastion-data-dev/_data.bootstrap.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "bastion_template_cloudwatch" { 2 | template = file("${path.root}/_template/template.cloudwatch.sh") 3 | 4 | vars = { 5 | user = "ec2-user" 6 | installer = "yum" 7 | agent_version = "1.2.2" 8 | } 9 | } 10 | 11 | 12 | data "template_cloudinit_config" "bastion_user_data" { 13 | gzip = false 14 | base64_encode = true 15 | 16 | # install patches for Amazon Linux 17 | part { 18 | content_type = "text/x-shellscript" 19 | 20 | content = <> /var/spool/cron/hadoop" 17 | sudo chown hadoop:hadoop /var/spool/cron/hadoop 18 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-instance-tag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sleep 15s; 4 | 5 | ls -al /mnt/var/lib/info/ 6 | 7 | echo -e "" 8 | 9 | export IS_MASTER=$(cat /mnt/var/lib/info/instance.json | jq -r ".isMaster") 10 | export INSTANCE_GROUP_ID=$(cat /mnt/var/lib/info/instance.json | jq -r ".instanceGroupId") 11 | export CLUSTER_ID=$(cat /mnt/var/lib/info/job-flow.json | jq -r ".jobFlowId") 12 | export INSTANCE_ID=$(wget -q -O - http://169.254.169.254/latest/meta-data/instance-id) 13 | export INSTANCE_GROUP_TYPE=$(cat /mnt/var/lib/info/job-flow.json | jq -r ".instanceGroups | .[] | select( .instanceGroupId == \"${INSTANCE_GROUP_ID}\") | .instanceRole" | tr a-z A-Z) 14 | 15 | echo -e "IS_MASTER: ${IS_MASTER}" 16 | echo -e "INSTANCE_GROUP_ID: ${INSTANCE_GROUP_ID}" 17 | echo -e "CLUSTER_ID: ${CLUSTER_ID}" 18 | echo -e "INSTANCE_ID: ${INSTANCE_ID}" 19 | echo -e "INSTANCE_GROUP_TYPE: ${INSTANCE_GROUP_TYPE}" 20 | 21 | export CURRENT_TAG_NAME=$(aws ec2 --region ap-northeast-2 describe-tags --filters Name=resource-id,Values=${INSTANCE_ID} | jq -r ".Tags | .[] | select( .Key == \"Name\") | .Value") 22 | export NEW_TAG_NAME="${CURRENT_TAG_NAME}-${INSTANCE_GROUP_TYPE}" 23 | 24 | echo -e "CURRENT_TAG_NAME: ${CURRENT_TAG_NAME}" 25 | echo -e "NEW_TAG_NAME: ${NEW_TAG_NAME}" 26 | 27 | echo -e "aws ec2 create-tags --region ap-northeast-2 --resources ${INSTANCE_ID} --tags Key=Name,Value=${NEW_TAG_NAME}" 28 | 29 | aws ec2 create-tags --region ap-northeast-2 --resources ${INSTANCE_ID} --tags Key=Name,Value=${NEW_TAG_NAME} 30 | 31 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-spark-batch.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "classification": "httpfs-env", 4 | "properties": { 5 | }, 6 | "configurations": [ 7 | { 8 | "classification": "export", 9 | "properties": { 10 | "TZ": "Asia/Seoul" 11 | }, 12 | "configurations": [ 13 | ] 14 | } 15 | ] 16 | }, 17 | { 18 | "classification": "hadoop-kms-env", 19 | "properties": { 20 | }, 21 | "configurations": [ 22 | { 23 | "classification": "export", 24 | "properties": { 25 | "TZ": "Asia/Seoul" 26 | }, 
27 | "configurations": [ 28 | ] 29 | } 30 | ] 31 | }, 32 | { 33 | "classification": "livy-env", 34 | "properties": { 35 | }, 36 | "configurations": [ 37 | { 38 | "classification": "export", 39 | "properties": { 40 | "TZ": "Asia/Seoul" 41 | }, 42 | "configurations": [ 43 | ] 44 | } 45 | ] 46 | }, 47 | { 48 | "classification": "zeppelin-env", 49 | "properties": { 50 | }, 51 | "configurations": [ 52 | { 53 | "classification": "export", 54 | "properties": { 55 | "TZ": "Asia/Seoul" 56 | }, 57 | "configurations": [ 58 | ] 59 | } 60 | ] 61 | }, 62 | { 63 | "classification": "sqoop-env", 64 | "properties": { 65 | }, 66 | "configurations": [ 67 | { 68 | "classification": "export", 69 | "properties": { 70 | "TZ": "Asia/Seoul" 71 | }, 72 | "configurations": [ 73 | ] 74 | } 75 | ] 76 | }, 77 | { 78 | "classification": "oozie-env", 79 | "properties": { 80 | }, 81 | "configurations": [ 82 | { 83 | "classification": "export", 84 | "properties": { 85 | "TZ": "Asia/Seoul" 86 | }, 87 | "configurations": [ 88 | ] 89 | } 90 | ] 91 | }, 92 | { 93 | "classification": "presto-env", 94 | "properties": { 95 | }, 96 | "configurations": [ 97 | { 98 | "classification": "export", 99 | "properties": { 100 | "TZ": "Asia/Seoul" 101 | }, 102 | "configurations": [ 103 | ] 104 | } 105 | ] 106 | }, 107 | { 108 | "classification": "hcatalog-env", 109 | "properties": { 110 | }, 111 | "configurations": [ 112 | { 113 | "classification": "export", 114 | "properties": { 115 | "TZ": "Asia/Seoul" 116 | }, 117 | "configurations": [ 118 | ] 119 | } 120 | ] 121 | }, 122 | { 123 | "classification": "hcatalog-webhcat-env", 124 | "properties": { 125 | }, 126 | "configurations": [ 127 | { 128 | "classification": "export", 129 | "properties": { 130 | "TZ": "Asia/Seoul" 131 | }, 132 | "configurations": [ 133 | ] 134 | } 135 | ] 136 | }, 137 | { 138 | "classification": "hive-env", 139 | "properties": { 140 | }, 141 | "configurations": [ 142 | { 143 | "classification": "export", 144 | "properties": { 145 | "TZ": "Asia/Seoul" 146 | }, 147 | "configurations": [ 148 | ] 149 | } 150 | ] 151 | }, 152 | { 153 | "classification": "mapred-env", 154 | "properties": { 155 | }, 156 | "configurations": [ 157 | { 158 | "classification": "export", 159 | "properties": { 160 | "TZ": "Asia/Seoul" 161 | }, 162 | "configurations": [ 163 | ] 164 | } 165 | ] 166 | }, 167 | { 168 | "classification": "hadoop-env", 169 | "properties": { 170 | }, 171 | "configurations": [ 172 | { 173 | "classification": "export", 174 | "properties": { 175 | "TZ": "Asia/Seoul" 176 | }, 177 | "configurations": [ 178 | ] 179 | } 180 | ] 181 | }, 182 | { 183 | "classification": "hbase-env", 184 | "properties": { 185 | }, 186 | "configurations": [ 187 | { 188 | "classification": "export", 189 | "properties": { 190 | "TZ": "Asia/Seoul" 191 | }, 192 | "configurations": [ 193 | ] 194 | } 195 | ] 196 | }, 197 | { 198 | "classification": "spark-env", 199 | "properties": { 200 | }, 201 | "configurations": [ 202 | { 203 | "classification": "export", 204 | "properties": { 205 | "TZ": "Asia/Seoul" 206 | }, 207 | "configurations": [ 208 | ] 209 | } 210 | ] 211 | }, 212 | { 213 | "Classification": "hive-site", 214 | "Properties": { 215 | "javax.jdo.option.ConnectionURL": "jdbc:mysql:\/\/endpoint:3306\/hive_metastore?createDatabaseIfNotExist=true", 216 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver", 217 | "javax.jdo.option.ConnectionUserName": "root", 218 | "javax.jdo.option.ConnectionPassword": "admin1234" 219 | } 220 | }, 221 | { 222 | "Classification": "spark-hive-site", 223 | 
"Properties": { 224 | "javax.jdo.option.ConnectionURL": "jdbc:mysql:\/\/endpoint:3306\/hive_metastore?createDatabaseIfNotExist=true", 225 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver", 226 | "javax.jdo.option.ConnectionUserName": "root", 227 | "javax.jdo.option.ConnectionPassword": "admin1234" 228 | } 229 | }, 230 | { 231 | "Classification": "capacity-scheduler", 232 | "Properties": { 233 | "yarn.scheduler.capacity.resource-calculator": "org.apache.hadoop.yarn.util.resource.DominantResourceCalculator", 234 | "yarn.scheduler.capacity.maximum-am-resource-percent": "0.8" 235 | }, 236 | "configurations": [ 237 | ] 238 | }, 239 | { 240 | "Classification": "yarn-site", 241 | "Properties": { 242 | "yarn.scheduler.minimum-allocation-vcores": "1", 243 | "yarn.scheduler.maximum-allocation-vcores": "8", 244 | "yarn.node-labels.enabled": "true", 245 | "yarn.node-labels.am.default-node-label-expression": "CORE" 246 | }, 247 | "configurations": [ 248 | ] 249 | }, 250 | { 251 | "classification": "yarn-env", 252 | "properties": { 253 | }, 254 | "configurations": [ 255 | { 256 | "classification": "export", 257 | "properties": { 258 | "TZ": "Asia/Seoul" 259 | }, 260 | "configurations": [ 261 | ] 262 | } 263 | ] 264 | } 265 | ] -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-system-config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo yum -y update 4 | sudo yum -y upgrade 5 | 6 | sudo timedatectl set-timezone Asia/Seoul 7 | 8 | sudo yum -y groupinstall development 9 | sudo yum -y install curl wget jq htop 10 | 11 | sudo sh -c 'echo "fs.inotify.max_user_instances = 8192" > /etc/sysctl.d/98-inotifyfix.conf' 12 | sudo sh -c 'echo "fs.inotify.max_user_watches = 524288" >> /etc/sysctl.d/98-inotifyfix.conf' 13 | sudo sysctl --system 14 | 15 | sudo sh -c 'echo "* soft nofile 65536" > /etc/security/limits.d/50-custom.conf' 16 | sudo sh -c 'echo "* hard nofile 65536" >> /etc/security/limits.d/50-custom.conf' 17 | sudo sh -c 'echo "* soft nproc 200000" >> /etc/security/limits.d/50-custom.conf' 18 | sudo sh -c 'echo "* hard nproc 200000" >> /etc/security/limits.d/50-custom.conf' 19 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 | /** 12 | * 테스팅 목적으로 Terraform Backend 를 사용하지 않습니다 13 | */ 14 | backend "local" { 15 | path = "../__tf_state/_aws-root-machine-emr/terraform.tfstate" 16 | } 17 | } 18 | 19 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/main_emr_data_dev.tf: -------------------------------------------------------------------------------- 1 | module "module-emr-data-dev" { 2 | source = "./module-emr-data-dev" 3 | 4 | environment = local.environment_development 5 | team = local.team_data 6 | 7 | vpc_id = data.terraform_remote_state.root_vpc.outputs.vpc_id_data_dev 8 | emr_subnet = data.terraform_remote_state.root_vpc.outputs.subnet_id_private_az_c_data_dev /** AZ-c */ 9 | 10 | emr_keypair = local.keypair_infra 11 | emr_profile_arn_instance = 
data.terraform_remote_state.root_iam.outputs.profile_arn_emr_instance 12 | emr_role_arn_cluster = data.terraform_remote_state.root_iam.outputs.role_arn_emr_cluster 13 | emr_role_arn_asg = data.terraform_remote_state.root_iam.outputs.role_arn_emr_asg 14 | 15 | emr_master_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_master_managed_data_dev 16 | emr_master_additional_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_master_additional_data_dev 17 | emr_slave_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_slave_managed_data_dev 18 | emr_slave_additional_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_slave_additional_data_dev 19 | emr_service_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_service_managed_data_dev 20 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/_local.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | emr_cluster_spark_batch = "spark-batch" 3 | 4 | emr_release_5_34_0 = "emr-5.34.0" 5 | emr_release_6_5_0 = "emr-6.5.0" 6 | } 7 | 8 | locals { 9 | spot_default_factor = 0.8 10 | 11 | spot_on_demand_price_r5xlarge = 0.304 12 | spot_bid_price_r5xlarge = format("%.2f", tonumber(local.spot_on_demand_price_r5xlarge) * tonumber(local.spot_default_factor)) 13 | 14 | spot_on_demand_price_m5xlarge = 0.236 15 | spot_bid_price_m5_xlarge = format("%.2f", tonumber(local.spot_on_demand_price_m5xlarge) * tonumber(local.spot_default_factor)) 16 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/_variable.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} 2 | variable "team" {} 3 | 4 | 5 | variable "emr_keypair" {} 6 | 7 | variable "emr_profile_arn_instance" {} 8 | variable "emr_role_arn_cluster" {} 9 | variable "emr_role_arn_asg" {} 10 | 11 | variable "vpc_id" {} 12 | variable "emr_subnet" {} 13 | 14 | variable "emr_master_managed_sg_id" {} 15 | variable "emr_master_additional_sg_id" {} 16 | variable "emr_slave_managed_sg_id" {} 17 | variable "emr_slave_additional_sg_id" {} 18 | variable "emr_service_managed_sg_id" {} 19 | 20 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/dev.spark-batch-01.cw.tf: -------------------------------------------------------------------------------- 1 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-CPUUtilization" { 2 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_CPUUtil" 3 | # comparison_operator = "GreaterThanOrEqualToThreshold" 4 | # 5 | # period = "300" 6 | # evaluation_periods = "2" 7 | # datapoints_to_alarm = 2 8 | # 9 | # # second 10 | # statistic = "Average" 11 | # threshold = "80" 12 | # alarm_description = "" 13 | # 14 | # metric_name = "CPUUtilization" 15 | # namespace = "AWS/EC2" 16 | # 17 | # dimensions = { 18 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 19 | # } 20 | # 21 | # actions_enabled = true 22 | # insufficient_data_actions = [] 23 | # ok_actions = [] 24 | # 25 | # alarm_actions = [ 26 | # var.sns_topic_arn_cloudwatch_alarm, 27 | # ] 28 | #} 29 | # 30 | #resource 
"aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-MemUtil" { 31 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_MemUtil" 32 | # comparison_operator = "GreaterThanOrEqualToThreshold" 33 | # 34 | # period = "300" 35 | # evaluation_periods = "2" 36 | # datapoints_to_alarm = 2 37 | # 38 | # # second 39 | # statistic = "Maximum" 40 | # threshold = "80" 41 | # alarm_description = "" 42 | # 43 | # metric_name = "MemoryUtilization" 44 | # namespace = "System/Linux" 45 | # 46 | # dimensions = { 47 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 48 | # } 49 | # 50 | # actions_enabled = true 51 | # 52 | # insufficient_data_actions = [ 53 | # var.sns_topic_arn_cloudwatch_alarm, 54 | # ] 55 | # 56 | # ok_actions = [] 57 | # 58 | # alarm_actions = [ 59 | # var.sns_topic_arn_cloudwatch_alarm, 60 | # ] 61 | #} 62 | # 63 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_Has-SystemCheckFailure" { 64 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-Has_SysCheckFailure" 65 | # comparison_operator = "GreaterThanOrEqualToThreshold" 66 | # 67 | # period = "300" 68 | # evaluation_periods = "1" 69 | # datapoints_to_alarm = 1 70 | # 71 | # # second 72 | # statistic = "Sum" 73 | # threshold = "1" 74 | # alarm_description = "" 75 | # 76 | # metric_name = "StatusCheckFailed" 77 | # namespace = "AWS/EC2" 78 | # 79 | # dimensions = { 80 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 81 | # } 82 | # 83 | # actions_enabled = true 84 | # insufficient_data_actions = [] 85 | # ok_actions = [] 86 | # 87 | # alarm_actions = [ 88 | # var.sns_topic_arn_cloudwatch_alarm, 89 | # ] 90 | #} 91 | # 92 | ## EC2 Custom Metric (Disk, Memory) 93 | # 94 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-RootDiskUtil" { 95 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_RootDiskUtil" 96 | # comparison_operator = "GreaterThanOrEqualToThreshold" 97 | # 98 | # period = "300" 99 | # evaluation_periods = "2" 100 | # datapoints_to_alarm = 2 101 | # 102 | # # second 103 | # statistic = "Maximum" 104 | # threshold = "80" 105 | # alarm_description = "" 106 | # 107 | # metric_name = "DiskSpaceUtilization" 108 | # namespace = "System/Linux" 109 | # 110 | # dimensions = { 111 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 112 | # MountPath = local.emr_cw_root_disk_mount_path 113 | # Filesystem = local.emr_cw_root_disk_mount_fs 114 | # } 115 | # 116 | # actions_enabled = true 117 | # 118 | # insufficient_data_actions = [ 119 | # var.sns_topic_arn_cloudwatch_alarm, 120 | # ] 121 | # 122 | # ok_actions = [] 123 | # 124 | # alarm_actions = [ 125 | # var.sns_topic_arn_cloudwatch_alarm, 126 | # ] 127 | #} 128 | # 129 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-DataDiskUtil" { 130 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_DataDiskUtil" 131 | # comparison_operator = "GreaterThanOrEqualToThreshold" 132 | # 133 | # period = "300" 134 | # evaluation_periods = "2" 135 | # datapoints_to_alarm = 2 136 | # 137 | # # second 138 | # statistic = "Maximum" 139 | # threshold = "80" 140 | # alarm_description = "" 141 
| # 142 | # metric_name = "DiskSpaceUtilization" 143 | # namespace = "System/Linux" 144 | # 145 | # dimensions = { 146 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 147 | # MountPath = local.emr_cw_data_disk_mount_path 148 | # Filesystem = local.emr_cw_data_disk_mount_fs 149 | # } 150 | # 151 | # actions_enabled = true 152 | # 153 | # insufficient_data_actions = [ 154 | # var.sns_topic_arn_cloudwatch_alarm, 155 | # ] 156 | # 157 | # ok_actions = [] 158 | # 159 | # alarm_actions = [ 160 | # var.sns_topic_arn_cloudwatch_alarm, 161 | # ] 162 | #} 163 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-presto/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-machine-emr-presto/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-stream/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-machine-emr-stream/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-storage-rds/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_data.state.tf: -------------------------------------------------------------------------------- 1 | data "terraform_remote_state" "root_iam" { 2 | backend = "local" 3 | 4 | config = { 5 | path = "../__tf_state/_aws-root-iam/terraform.tfstate" 6 | } 7 | } 8 | 9 | data "terraform_remote_state" "root_vpc" { 10 | backend = "local" 11 | 12 | config = { 13 | path = "../__tf_state/_aws-root-vpc/terraform.tfstate" 14 | } 15 | } 16 | 17 | data "terraform_remote_state" "root_sg" { 18 | backend = "local" 19 | 20 | config = { 21 | path = "../__tf_state/_aws-root-sg/terraform.tfstate" 22 | } 23 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_local.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | environment_common = "common" 3 | environment_development = "development" 4 | environment_production = "production" 5 | 6 | region_seoul = "ap-northeast-2" 7 | 8 | team_data = "data" 9 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_provider.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = local.region_seoul 3 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 
| /** 12 | * For testing purposes, a remote Terraform backend is not used here 13 | */ 14 | backend "local" { 15 | path = "../__tf_state/_aws-root-storage-rds/terraform.tfstate" 16 | } 17 | } 18 | 19 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/main_rds_data_dev.tf: -------------------------------------------------------------------------------- 1 | module "module-rds-data-dev" { 2 | source = "./module-rds-data-dev" 3 | 4 | environment = local.environment_development 5 | team = local.team_data 6 | 7 | vpc_id = data.terraform_remote_state.root_vpc.outputs.vpc_id_data_dev 8 | rds_hive_metastore_subnet_list = data.terraform_remote_state.root_vpc.outputs.subnet_list_database_data_dev 9 | rds_hive_metastore_subnet_group = data.terraform_remote_state.root_vpc.outputs.subnet_name_database_data_dev 10 | rds_hive_metastore_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_rds_hive_metastore_data_dev 11 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/_variable.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} 2 | variable "team" {} 3 | 4 | variable "vpc_id" {} 5 | variable "rds_hive_metastore_sg_id" {} 6 | variable "rds_hive_metastore_subnet_group" {} 7 | variable "rds_hive_metastore_subnet_list" {} 8 | 9 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/dev.hive-metastore.rds.tf: -------------------------------------------------------------------------------- 1 | module "rds-hive-metastore-data-development" { 2 | source = "terraform-aws-modules/rds-aurora/aws" 3 | version = "6.1.4" 4 | 5 | name = "hive-metastore-${var.environment}" 6 | engine = "aurora-mysql" 7 | engine_version = "5.7.12" 8 | instance_class = "db.t3.medium" 9 | instances = { 10 | 01 = {} 11 | 02 = {} 12 | } 13 | 14 | storage_encrypted = true 15 | apply_immediately = true 16 | skip_final_snapshot = true 17 | create_monitoring_role = false 18 | 19 | vpc_id = var.vpc_id 20 | db_subnet_group_name = var.rds_hive_metastore_subnet_group 21 | vpc_security_group_ids = [var.rds_hive_metastore_sg_id] 22 | create_db_subnet_group = false 23 | create_security_group = false 24 | 25 | # A fixed value is used here for convenience in the later exercises. 26 | # Be careful: a password specified in Terraform is stored in the Terraform state.
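# Keep this password in sync with "javax.jdo.option.ConnectionPassword" in
# aws-root-machine-emr-batch/_template/template.emr-spark-batch.json, which the EMR
# cluster's Hive Metastore (javax.jdo) connection settings use.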
27 | master_password = "admin1234" 28 | # master_password = random_password.hive-metastore.result 29 | create_random_password = false 30 | 31 | db_parameter_group_name = aws_db_parameter_group.hive-metastore.name 32 | db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.hive-metastore.name 33 | 34 | enabled_cloudwatch_logs_exports = [] 35 | 36 | tags = { 37 | Environment = var.environment 38 | Team = var.team 39 | } 40 | } 41 | 42 | resource "random_password" "hive-metastore" { 43 | length = 10 44 | } 45 | 46 | resource "aws_db_parameter_group" "hive-metastore" { 47 | name = "hive-metastore-aurora-db-57-parameter-group" 48 | family = "aurora-mysql5.7" 49 | description = "hive-metastore-aurora-db-57-parameter-group" 50 | tags = { 51 | Environment = var.environment 52 | Team = var.team 53 | } 54 | } 55 | 56 | resource "aws_rds_cluster_parameter_group" "hive-metastore" { 57 | name = "hive-metastore-aurora-57-cluster-parameter-group" 58 | family = "aurora-mysql5.7" 59 | description = "hive-metastore-aurora-57-cluster-parameter-group" 60 | tags = { 61 | Environment = var.environment 62 | Team = var.team 63 | } 64 | } -------------------------------------------------------------------------------- /project-terraform-gcp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-gcp/.gitkeep --------------------------------------------------------------------------------
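Note: the "endpoint" placeholder in the javax.jdo.option.ConnectionURL of template.emr-spark-batch.json above refers to the writer endpoint of this Aurora Hive Metastore cluster. Below is a minimal sketch, assuming the rds-aurora module's cluster_endpoint output, of how that endpoint could be exported and consumed through the repository's existing terraform_remote_state pattern; the _output.tf files and output names are hypothetical and do not exist in the repository.

# project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/_output.tf (hypothetical)
output "hive_metastore_endpoint" {
  # Writer endpoint of the Aurora cluster defined in dev.hive-metastore.rds.tf
  value = module.rds-hive-metastore-data-development.cluster_endpoint
}

# project-terraform-aws/aws-root-storage-rds/_output.tf (hypothetical)
output "rds_hive_metastore_endpoint_data_dev" {
  value = module.module-rds-data-dev.hive_metastore_endpoint
}

Other roots such as aws-root-machine-emr-batch could then read this value through a data "terraform_remote_state" block pointed at ../__tf_state/_aws-root-storage-rds/terraform.tfstate and substitute it for the hard-coded "endpoint" in the hive-site ConnectionURL.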