├── .gitignore
├── Makefile
├── README.md
├── _datasets
├── .gitkeep
├── airbnb
│ └── .gitkeep
└── ecommerce
│ └── .gitkeep
├── _dockerfile
├── docker-metastore
│ ├── Dockerfile
│ ├── conf
│ │ └── hive-site.xml
│ └── scripts
│ │ └── entrypoint.sh
└── docker-presto
│ ├── Dockerfile
│ ├── etc
│ ├── catalog
│ │ ├── hive.properties
│ │ ├── iceberg.properties
│ │ └── tpch.properties
│ ├── config.properties
│ ├── jvm.config
│ ├── log.properties
│ └── node.properties
│ └── scripts
│ └── entrypoint.sh
├── _notebook
├── kafka-basic.ipynb
├── spark-jdbc-basic.ipynb
├── spark-metastore-local.ipynb
├── spark-metastore-remote.ipynb
└── spark-streaming-data.ipynb
├── _script
├── docker-mysql
│ ├── conf
│ │ └── my.cnf
│ └── sql
│ │ ├── 001_create_database.sql
│ │ └── 002_create_table.sql
└── docker-spark
│ ├── apps
│ ├── main.py
│ └── postgresql-42.2.22.jar
│ ├── conf
│ └── spark-defaults.conf
│ └── data
│ └── .gitignore
├── _slide
└── .gitignore
├── _volume
└── .gitignore
├── docker-compose.aws.yml
├── docker-compose.kafka.yml
├── docker-compose.metastore.yml
├── docker-compose.presto.yml
├── docker-compose.spark.yml
├── docker-compose.storage.yml
├── project-flink
└── .gitignore
├── project-kafka
└── .gitignore
├── project-spark
├── .gitignore
├── _scripts
│ └── mysql-ddl
│ │ └── table_property_stat.sql
├── build.gradle
├── gradle
│ └── wrapper
│ │ ├── gradle-wrapper.jar
│ │ └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── module-core
│ ├── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── gradlew
│ ├── gradlew.bat
│ └── src
│ │ └── main
│ │ └── scala
│ │ └── mkt
│ │ └── udon
│ │ └── core
│ │ ├── common
│ │ ├── Environment.scala
│ │ └── TimeUtil.scala
│ │ └── entity
│ │ ├── ProductPool.scala
│ │ ├── UserEvent.scala
│ │ └── UserProfile.scala
├── module-infra-spark
│ ├── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── gradlew
│ ├── gradlew.bat
│ └── src
│ │ └── main
│ │ └── scala
│ │ └── mkt
│ │ └── udon
│ │ └── infra
│ │ └── spark
│ │ ├── SparkBase.scala
│ │ ├── common
│ │ └── Partition.scala
│ │ └── storage
│ │ ├── DynamoSink.scala
│ │ ├── JdbcSink.scala
│ │ └── ParquetSink.scala
├── service-batch-discovery
│ ├── Makefile
│ ├── VERSION
│ ├── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── gradlew
│ ├── gradlew.bat
│ └── src
│ │ └── main
│ │ ├── resources
│ │ ├── .gitignore
│ │ ├── application.conf
│ │ └── log4j.properties
│ │ └── scala
│ │ └── mkt
│ │ └── udon
│ │ ├── UdonProductPoolBatch.scala
│ │ ├── config
│ │ └── UdonProductPoolBatchConfig.scala
│ │ └── entity
│ │ └── UdonProductPoolEntity.scala
├── service-batch-statistics
│ ├── Makefile
│ ├── VERSION
│ ├── build.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── gradlew
│ ├── gradlew.bat
│ └── src
│ │ └── main
│ │ ├── resources
│ │ ├── .gitignore
│ │ ├── application.conf
│ │ └── log4j.properties
│ │ └── scala
│ │ └── mkt
│ │ └── udon
│ │ ├── UdonStatBatch.scala
│ │ ├── config
│ │ └── UdonStatBatchConfig.scala
│ │ └── entity
│ │ └── UdonStatEntity.scala
├── service-stream-profile
│ ├── Makefile
│ ├── VERSION
│ ├── build.gradle
│ └── src
│ │ └── main
│ │ ├── resources
│ │ ├── application.conf
│ │ └── log4j.properties
│ │ └── scala
│ │ └── mkt
│ │ └── udon
│ │ ├── UdonProfileStream.scala
│ │ ├── UdonRelayStream.scala
│ │ ├── config
│ │ ├── UdonProfileStreamConfig.scala
│ │ └── UdonRelayStreamConfig.scala
│ │ └── entity
│ │ └── UdonProfileStateFunc.scala
└── settings.gradle
├── project-terraform-aws
├── .gitignore
├── _aws-root-iam
│ ├── .gitkeep
│ ├── _local.tf
│ ├── _output.tf
│ ├── _provider.tf
│ ├── _terraform.tf
│ ├── main_iam_common.tf
│ └── module-iam-common
│ │ ├── _data.tf
│ │ ├── _output.tf
│ │ ├── _variable.tf
│ │ ├── common.basic.iam.tf
│ │ ├── common.ec2.iam.tf
│ │ ├── common.ec2.profile.tf
│ │ ├── common.emr.iam.tf
│ │ └── common.emr.profile.tf
├── _aws-root-sg
│ ├── .gitkeep
│ ├── _data.tf
│ ├── _local.tf
│ ├── _output.tf
│ ├── _provider.tf
│ ├── _terraform.tf
│ ├── main_sg_data_dev.tf
│ └── module-sg-data-dev
│ │ ├── _output.tf
│ │ ├── _variable.tf
│ │ ├── dev.bastion-public.sg.tf
│ │ ├── dev.emr-master.sg.tf
│ │ ├── dev.emr-serivce.sg.tf
│ │ ├── dev.emr-slave.sg.tf
│ │ └── dev.rds.sg.tf
├── _aws-root-vpc
│ ├── _local.tf
│ ├── _output.tf
│ ├── _provider.tf
│ ├── _terraform.tf
│ ├── main_vpc_data_dev.tf
│ └── module-vpc-data-dev
│ │ ├── _output.tf
│ │ ├── _variable.tf
│ │ └── dev.data.vpc.tf
├── aws-root-machine-bastion
│ ├── _data.ami.tf
│ ├── _data.state.tf
│ ├── _local.tf
│ ├── _provider.tf
│ ├── _template
│ │ └── template.cloudwatch.sh
│ ├── _terraform.tf
│ ├── main_bastion_dev.tf
│ └── module-bastion-data-dev
│ │ ├── _data.bootstrap.tf
│ │ ├── _local.tf
│ │ ├── _variable.tf
│ │ ├── dev.bastion-public-01.cw.tf
│ │ └── dev.bastion-public-01.ec2.tf
├── aws-root-machine-eks
│ ├── .gitkeep
│ ├── _local.tf
│ ├── _provider.tf
│ └── _terraform.tf
├── aws-root-machine-emr-batch
│ ├── .gitkeep
│ ├── _data.state.tf
│ ├── _local.tf
│ ├── _provider.tf
│ ├── _template
│ │ ├── template.emr-cloudwatch-collect.sh
│ │ ├── template.emr-instance-tag.sh
│ │ ├── template.emr-spark-batch.json
│ │ └── template.emr-system-config.sh
│ ├── _terraform.tf
│ ├── main_emr_data_dev.tf
│ └── module-emr-data-dev
│ │ ├── _local.tf
│ │ ├── _variable.tf
│ │ ├── dev.spark-batch-01.cw.tf
│ │ └── dev.spark-batch-01.emr.tf
├── aws-root-machine-emr-presto
│ └── .gitkeep
├── aws-root-machine-emr-stream
│ └── .gitkeep
└── aws-root-storage-rds
│ ├── .gitkeep
│ ├── _data.state.tf
│ ├── _local.tf
│ ├── _provider.tf
│ ├── _terraform.tf
│ ├── main_rds_data_dev.tf
│ └── module-rds-data-dev
│ ├── _variable.tf
│ └── dev.hive-metastore.rds.tf
└── project-terraform-gcp
└── .gitkeep
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .DS_Store
3 | _assets/
4 | !.gitkeep
5 | _datasets/airbnb/*.csv
6 | _datasets/ecommerce/*.csv
7 |
8 | */.ipynb_checkpoints/
9 |
10 | derby.log
11 | metastore_db
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | TAG = "Makefile"
2 |
3 | MYSQLCLIENT = mycli
4 | DOCKER_HOST_IP := $(shell ipconfig getifaddr en0)
5 |
6 | ##
7 | ## Jupyter
8 | ##
9 |
10 | .PHONY: jupyter
11 | jupyter:
12 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Preparing docker-compose"
13 | @ echo "-----------------------------------------\n"
14 | @ jupyter lab --ip=127.0.0.1 --port=8080
15 |
16 | ##
17 | ## Compose
18 | ##
19 |
20 | .PHONY: compose.prepare
21 | compose.prepare:
22 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Preparing docker-compose"
23 | @ echo "-----------------------------------------\n"
24 | @ echo "export DOCKER_HOST_IP=$(DOCKER_HOST_IP)"
25 | @ echo "\n-----------------------------------------"
26 | @ echo ""
27 |
28 | .PHONY: compose.storage
29 | compose.storage: compose.prepare
30 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose"
31 | @ docker stop `docker ps -a -q` || true
32 | @ docker rm -f `docker ps -a -q` || true
33 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
34 | @ docker compose -f docker-compose.storage.yml rm -fsv || true
35 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \
36 | -f docker-compose.storage.yml \
37 | up
38 |
39 | .PHONY: compose.spark
40 | compose.spark: compose.prepare
41 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose"
42 | @ docker stop `docker ps -a -q` || true
43 | @ docker rm -f `docker ps -a -q` || true
44 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
45 | @ docker compose -f docker-compose.spark.yml rm -fsv || true
46 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \
47 | -f docker-compose.spark.yml \
48 | up
49 |
50 | .PHONY: compose.kafka
51 | compose.kafka: compose.prepare
52 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose"
53 | @ docker stop `docker ps -a -q` || true
54 | @ docker rm -f `docker ps -a -q` || true
55 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
56 | @ docker compose -f docker-compose.kafka.yml rm -fsv || true
57 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \
58 | -f docker-compose.kafka.yml \
59 | up
60 |
61 | .PHONY: compose.metastore
62 | compose.metastore: compose.prepare
63 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose"
64 | @ docker stop `docker ps -a -q` || true
65 | @ docker rm -f `docker ps -a -q` || true
66 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
67 | @ docker compose -f docker-compose.metastore.yml rm -fsv || true
68 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \
69 | -f docker-compose.metastore.yml \
70 | up --build
71 |
72 | .PHONY: compose.presto
73 | compose.presto: compose.prepare
74 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose"
75 | @ docker stop `docker ps -a -q` || true
76 | @ docker rm -f `docker ps -a -q` || true
77 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
78 | @ docker compose -f docker-compose.presto.yml rm -fsv || true
79 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \
80 | -f docker-compose.presto.yml \
81 | up --build
82 |
83 | .PHONY: compose.aws
84 | compose.aws: compose.prepare
85 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose"
86 | @ docker stop `docker ps -a -q` || true
87 | @ docker rm -f `docker ps -a -q` || true
88 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
89 | @ docker compose -f docker-compose.aws.yml rm -fsv || true
90 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \
91 | -f docker-compose.aws.yml \
92 | up --build
93 |
94 | .PHONY: compose.clean
95 | compose.clean:
96 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Starting: Cleaning docker resources"
97 | @ echo "-----------------------------------------\n"
98 | @ docker stop `docker ps -a -q` || true
99 | @ docker rm -f `docker ps -a -q` || true
100 | @ docker rmi -f `docker images --quiet --filter "dangling=true"` || true
101 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
102 | @ rm -rf ./docker-volumes
103 | @ docker network rm `docker network ls -q` || true
104 | @ echo ""
105 | @ rm -rf metastore_db
106 | @ echo "\n-----------------------------------------"
107 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Finished: Cleaning docker resources"
108 |
109 | .PHONY: compose.storage-all
110 | compose.storage-all: compose.prepare
111 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose"
112 | @ docker stop `docker ps -a -q` || true
113 | @ docker rm -f `docker ps -a -q` || true
114 | @ docker volume rm `docker volume ls -f dangling=true -q` || true
115 | @ docker compose -f docker-compose.storage.yml -f docker-compose.aws.yml -f docker-compose.kafka.yml rm -fsv || true
116 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \
117 | -f docker-compose.storage.yml \
118 | -f docker-compose.aws.yml \
119 | -f docker-compose.kafka.yml \
120 | up --build
121 |
122 | ##
123 | ## Storage CLIs
124 | ##
125 |
126 | .PHONY: mysql
127 | mysql:
128 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Connecting to mysql"
129 | @ $(MYSQLCLIENT) -u root -h localhost pipeline -p root
130 |
131 | .PHONY: redis
132 | redis:
133 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Connecting to redis"
134 | @ redis-cli -a credential
135 |
136 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Practical Data Pipeline (Code)
2 |
3 |
--------------------------------------------------------------------------------
/_datasets/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/.gitkeep
--------------------------------------------------------------------------------
/_datasets/airbnb/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/airbnb/.gitkeep
--------------------------------------------------------------------------------
/_datasets/ecommerce/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/ecommerce/.gitkeep
--------------------------------------------------------------------------------
/_dockerfile/docker-metastore/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM openjdk:8u242-jre
2 |
3 | WORKDIR /opt
4 |
5 | ENV HADOOP_VERSION=2.10.1
6 | ENV METASTORE_VERSION=2.3.9
7 | ENV AWS_SDK_VERSION=1.11.271
8 |
9 | ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION}
10 | ENV HIVE_HOME=/opt/apache-hive-${METASTORE_VERSION}-bin
11 | ENV HADOOP_CLASSPATH=/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar:/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar
12 |
13 | # BIN
14 | RUN apt-get update && \
15 | apt-get upgrade -y && \
16 | apt-get -qqy install curl && \
17 | curl -L https://dlcdn.apache.org/hive/hive-${METASTORE_VERSION}/apache-hive-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \
18 | curl -L https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \
19 | apt-get install -y --only-upgrade openssl libssl1.1 && \
20 | apt-get install -y libk5crypto3 libkrb5-3 libsqlite3-0
21 |
22 | # DEPENDENCY
23 | RUN rm ${HIVE_HOME}/lib/postgresql-9.4.1208.jre7.jar
24 | RUN curl -o ${HIVE_HOME}/lib/postgresql-9.4.1212.jre7.jar -L https://jdbc.postgresql.org/download/postgresql-9.4.1212.jre7.jar
25 | RUN curl -L https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-8.0.19.tar.gz | tar zxf - && \
26 | cp mysql-connector-java-8.0.19/mysql-connector-java-8.0.19.jar ${HIVE_HOME}/lib/ && \
27 | rm -rf mysql-connector-java-8.0.19
28 |
29 | # CONFIG
30 | COPY conf/hive-site.xml ${HIVE_HOME}/conf/hive-site.xml
31 | RUN ls -alh ${HADOOP_HOME}/etc/hadoop/
32 | RUN ls -alh ${HIVE_HOME}/conf/
33 | COPY scripts/entrypoint.sh /entrypoint.sh
34 |
35 | # UTILS
36 | ENV TINI_VERSION v0.19.0
37 | RUN apt-get -q update && apt-get -qy install netcat wget
38 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
39 | RUN chmod +x /tini
40 |
41 | # ENV
42 | ENV TZ=Asia/Seoul
43 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
44 |
45 | # USER
46 | RUN groupadd -r hadoop --gid=1001 && \
47 | useradd -r -g hadoop --uid=1001 -d ${HIVE_HOME} hadoop && \
48 | chown hadoop:hadoop -R ${HIVE_HOME}
49 |
50 | USER hadoop
51 | WORKDIR $HIVE_HOME
52 | EXPOSE 9083
53 |
54 | ENTRYPOINT ["/tini", "--"]
55 | CMD ["/entrypoint.sh"]
--------------------------------------------------------------------------------
/_dockerfile/docker-metastore/conf/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |   <property>
4 |     <name>hive.metastore.schema.verification</name>
5 |     <value>false</value>
6 |   </property>
7 |   <property>
8 |     <name>metastore.warehouse.dir</name>
9 |     <value>s3a://spark/warehouse/</value>
10 |   </property>
11 |   <property>
12 |     <name>javax.jdo.option.ConnectionDriverName</name>
13 |     <value>com.mysql.cj.jdbc.Driver</value>
14 |   </property>
15 |   <property>
16 |     <name>javax.jdo.option.ConnectionURL</name>
17 |     <value>jdbc:mysql://mysql:3306/metastore_db?createDatabaseIfNotExist=true</value>
18 |   </property>
19 |   <property>
20 |     <name>javax.jdo.option.ConnectionUserName</name>
21 |     <value>root</value>
22 |   </property>
23 |   <property>
24 |     <name>javax.jdo.option.ConnectionPassword</name>
25 |     <value>root</value>
26 |   </property>
27 |   <property>
28 |     <name>fs.s3a.access.key</name>
29 |     <value>accesskey</value>
30 |   </property>
31 |   <property>
32 |     <name>fs.s3a.secret.key</name>
33 |     <value>secretkey</value>
34 |   </property>
35 |   <property>
36 |     <name>fs.s3a.endpoint</name>
37 |     <value>http://minio:9000</value>
38 |   </property>
39 |   <property>
40 |     <name>fs.s3a.path.style.access</name>
41 |     <value>true</value>
42 |   </property>
43 |   <property>
44 |     <name>fs.s3a.connection.ssl.enabled</name>
45 |     <value>false</value>
46 |     <description>Enables or disables SSL connections to S3.</description>
47 |   </property>
48 |   <property>
49 |     <name>fs.s3a.impl</name>
50 |     <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
51 |     <description>The implementation class of the S3A Filesystem</description>
52 |   </property>
53 | </configuration>
--------------------------------------------------------------------------------
/_dockerfile/docker-metastore/scripts/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | export HADOOP_VERSION=2.10.1
4 | export METASTORE_VERSION=2.3.9
5 | export AWS_SDK_VERSION=1.11.271
6 |
7 | export JAVA_HOME=/usr/local/openjdk-8
8 | export HADOOP_CLASSPATH=/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar:/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar
9 |
10 | sleep 10;
11 |
12 | /opt/apache-hive-${METASTORE_VERSION}-bin/bin/schematool -initSchema -dbType mysql || true;
13 | /opt/apache-hive-${METASTORE_VERSION}-bin/bin/hive --service metastore
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM openjdk:8-jre
2 |
3 |
4 | ARG _PRESTO_HOME=/opt/presto
5 | ARG _PRESTO_VERSION=0.265.1
6 | ENV PRESTO_VERSION=${_PRESTO_VERSION}
7 |
8 | RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz
9 | RUN mkdir -p /opt || true
10 | RUN tar -xf presto-server-${PRESTO_VERSION}.tar.gz -C /opt
11 | RUN rm presto-server-${PRESTO_VERSION}.tar.gz
12 | RUN ln -s /opt/presto-server-${PRESTO_VERSION} ${_PRESTO_HOME}
13 |
14 | RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar
15 | RUN mv presto-cli-${PRESTO_VERSION}-executable.jar /usr/local/bin/presto
16 | RUN chmod +x /usr/local/bin/presto
17 |
18 | # UTILS
19 | ENV TINI_VERSION v0.19.0
20 | RUN apt-get update && apt-get install -y wget python less telnet vim zsh netcat
21 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
22 | RUN chmod +x /tini
23 |
24 | # ENV
25 | ENV TZ=Asia/Seoul
26 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
27 |
28 | # CONFIG
29 | COPY scripts/entrypoint.sh /entrypoint.sh
30 | COPY etc/jvm.config ${_PRESTO_HOME}/etc/jvm.config
31 |
32 | # USER
33 | RUN groupadd -r hadoop --gid=1001 && \
34 | useradd -r -g hadoop --uid=1001 -d ${_PRESTO_HOME} hadoop && \
35 | chown hadoop:hadoop -R ${_PRESTO_HOME}
36 |
37 | RUN mkdir -p /var/presto && \
38 | chown hadoop:hadoop -R /var/presto && \
39 | chown hadoop:hadoop -R /opt/presto-server-${PRESTO_VERSION} && \
40 | chown hadoop:hadoop -R ${_PRESTO_HOME}/etc
41 |
42 | USER hadoop
43 | WORKDIR ${_PRESTO_HOME}
44 | EXPOSE 8080
45 |
46 | ENTRYPOINT ["/tini", "--"]
47 | CMD ["/entrypoint.sh"]
48 |
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/etc/catalog/hive.properties:
--------------------------------------------------------------------------------
1 | connector.name=hive-hadoop2
2 | hive.metastore.uri=thrift://hive-metastore:9083
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/etc/catalog/iceberg.properties:
--------------------------------------------------------------------------------
1 | connector.name=iceberg
2 | hive.metastore.uri=thrift://hive-metastore:9083
3 | iceberg.file-format=PARQUET
4 | iceberg.compression-codec=SNAPPY
5 |
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/etc/catalog/tpch.properties:
--------------------------------------------------------------------------------
1 | connector.name=tpch
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/etc/config.properties:
--------------------------------------------------------------------------------
1 | coordinator=true
2 | node-scheduler.include-coordinator=true
3 | http-server.http.port=8080
4 | query.max-memory=1GB
5 | query.max-memory-per-node=1GB
6 | query.max-total-memory-per-node=2GB
7 | discovery-server.enabled=true
8 | discovery.uri=http://localhost:8080
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/etc/jvm.config:
--------------------------------------------------------------------------------
1 | -server
2 | -Xmx4G
3 | -XX:+UseG1GC
4 | -XX:G1HeapRegionSize=32M
5 | -XX:ReservedCodeCacheSize=150M
6 | -XX:+UseGCOverheadLimit
7 | -XX:+ExplicitGCInvokesConcurrent
8 | -XX:+HeapDumpOnOutOfMemoryError
9 | -XX:+ExitOnOutOfMemoryError
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/etc/log.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_dockerfile/docker-presto/etc/log.properties
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/etc/node.properties:
--------------------------------------------------------------------------------
1 | node.environment=production
2 | node.id=$(NODE_ID)
3 | node.data-dir=/var/presto/data
--------------------------------------------------------------------------------
/_dockerfile/docker-presto/scripts/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | PRESTO_HOME=${PRESTO_HOME:-/opt/presto}
3 | PRESTO_COORDINATOR=${PRESTO_COORDINATOR:-}
4 | PRESTO_NODE_ID=${PRESTO_NODE_ID:-}
5 | PRESTO_LOG_LEVEL=${PRESTO_LOG_LEVEL:-INFO}
6 |
7 | PRESTO_HTTP_SERVER_PORT=${PRESTO_HTTP_SERVER_PORT:-8080}
8 |
9 | PRESTO_MAX_MEMORY=${PRESTO_MAX_MEMORY:-20}
10 | PRESTO_MAX_MEMORY_PER_NODE=${PRESTO_MAX_MEMORY_PER_NODE:-1}
11 | PRESTO_MAX_TOTAL_MEMORY_PER_NODE=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE:-2}
12 | PRESTO_HEAP_HEADROOM_PER_NODE=${PRESTO_HEAP_HEADROOM_PER_NODE:-1}
13 | PRESTO_JVM_HEAP_SIZE=${PRESTO_JVM_HEAP_SIZE:-4}
14 |
15 | create_config_node() {
16 | (
17 | echo "node.environment=production"
18 | echo "node.id=${PRESTO_NODE_ID}"
19 | echo "node.data-dir=/var/presto/data"
20 | ) >${PRESTO_HOME}/etc/node.properties
21 | }
22 |
23 | change_config_jvm() {
24 | sed -i "s/-Xmx.*G/-Xmx${PRESTO_JVM_HEAP_SIZE}G/" ${PRESTO_HOME}/etc/jvm.config
25 | }
26 |
27 | create_config_log() {
28 | (
29 | echo "com.facebook.presto=${PRESTO_LOG_LEVEL}"
30 | ) >${PRESTO_HOME}/etc/log.properties
31 | }
32 |
33 | create_config_coordinator() {
34 | (
35 | echo "coordinator=true"
36 | echo "node-scheduler.include-coordinator=false"
37 | echo "http-server.http.port=${PRESTO_HTTP_SERVER_PORT}"
38 | echo "query.max-memory=${PRESTO_MAX_MEMORY}GB"
39 | echo "query.max-memory-per-node=${PRESTO_MAX_MEMORY_PER_NODE}GB"
40 | echo "query.max-total-memory-per-node=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE}GB"
41 | echo "memory.heap-headroom-per-node=${PRESTO_HEAP_HEADROOM_PER_NODE}GB"
42 | echo "discovery-server.enabled=true"
43 | echo "discovery.uri=http://localhost:${PRESTO_HTTP_SERVER_PORT}"
44 | ) >${PRESTO_HOME}/etc/config.properties
45 | }
46 |
47 | create_config_worker() {
48 | (
49 | echo "coordinator=false"
50 | echo "http-server.http.port=${PRESTO_HTTP_SERVER_PORT}"
51 | echo "query.max-memory=${PRESTO_MAX_MEMORY}GB"
52 | echo "query.max-memory-per-node=${PRESTO_MAX_MEMORY_PER_NODE}GB"
53 | echo "query.max-total-memory-per-node=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE}GB"
54 | echo "memory.heap-headroom-per-node=${PRESTO_HEAP_HEADROOM_PER_NODE}GB"
55 | echo "discovery.uri=http://${PRESTO_COORDINATOR}:${PRESTO_HTTP_SERVER_PORT}"
56 | ) >${PRESTO_HOME}/etc/config.properties
57 | }
58 |
59 | create_config_node
60 | create_config_log
61 | change_config_jvm
62 | if [ -z "${PRESTO_COORDINATOR}" ]
63 | then
64 | create_config_coordinator;
65 | else
66 | create_config_worker;
67 | fi
68 |
69 | env
70 |
71 | cat ${PRESTO_HOME}/etc/node.properties
72 | cat ${PRESTO_HOME}/etc/config.properties
73 | cat ${PRESTO_HOME}/etc/jvm.config
74 |
75 |
76 | /opt/presto/bin/launcher run
77 |
--------------------------------------------------------------------------------
/_notebook/spark-metastore-remote.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "397cef09-bd27-4769-9f70-7ad80803cbd7",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "3.8.10 (default, Nov 14 2021, 21:32:59) \n",
14 | "[Clang 12.0.5 (clang-1205.0.22.9)]\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import sys\n",
20 | "\n",
21 | "print(sys.version)"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "id": "2bae673f-f186-4e08-b7ee-23c236771a35",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "SPARK_HOME = \"/Users/kun/github/spark/spark-3.1.2-bin-hadoop-3.2.2\""
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "id": "8491d5dd-2c63-4fd0-9bd6-cdeba9b970d9",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "import findspark\n",
42 | "\n",
43 | "findspark.init(SPARK_HOME)\n",
44 | "#findspark.add_packages([\"org.apache.hadoop:hadoop-aws:3.2.2\", \"com.amazonaws:aws-java-sdk-bundle:1.11.375\"])"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "id": "7167ea4d-2fb3-450c-82b2-c6362b454820",
50 | "metadata": {},
51 | "source": [
52 | "### Spark Session 생성\n",
53 | "\n",
54 | "로컬모드에서 실행할 Spark Session 을 만듭니다. (`.master(\"local[*]\")`)\n",
55 | "- 일반적인 Spark 설정은 `$SPARK_HOME/conf/spark-defaults.conf` 내에서 세팅해 공통환경으로 사용합니다. 다만 이 예제에서는 보여주기 위해 SparkConf 를 이용해 설정합니다.\n",
56 | "- Hive Metastore URI 등 HMS 관련 설정은 `$SPARK_HOME/conf/hive-site.conf` 내에서 세팅해 공통 환경으로 사용합니다.\n",
57 | "- 이 예제에서는 Minio 를 사용하므로 Access Key, Secret Key 를 사용합니다. AWS 위에서 실행된다면 [AWS Instance Profile](https://docs.aws.amazon.com/ko_kr/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles.html) 을 이용할 수 있으므로 키를 세팅하지 않습니다."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "id": "27587697-2e5c-4301-bc98-82389915b35c",
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stderr",
68 | "output_type": "stream",
69 | "text": [
70 | "21/11/29 15:41:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
71 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
72 | "Setting default log level to \"WARN\".\n",
73 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
74 | "21/11/29 15:41:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "from pyspark.sql import SparkSession\n",
80 | "\n",
81 | "\n",
82 | "spark = SparkSession \\\n",
83 | " .builder \\\n",
84 | " .master(\"local[*]\") \\\n",
85 | " .appName(\"example-app\") \\\n",
86 | " .config(\"spark.hadoop.fs.s3a.access.key\", \"accesskey\")\\\n",
87 | " .config(\"spark.hadoop.fs.s3a.secret.key\", \"secretkey\")\\\n",
88 | " .config(\"spark.hadoop.fs.s3a.endpoint\", \"http://localhost:9000\")\\\n",
89 | " .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\")\\\n",
90 | " .config(\"spark.hadoop.fs.s3a.connection.ssl.enabled\",\"false\")\\\n",
91 | " .config(\"spark.hadoop.fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\\\n",
92 | " .enableHiveSupport() \\\n",
93 | " .getOrCreate()\n",
94 | " \n",
95 | "spark.sparkContext.setSystemProperty(\"com.amazonaws.services.s3.enableV4\", \"true\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "52107cb8-9741-422e-b50a-d9cd830f5ab0",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "spark.sparkContext.getConf().getAll()"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 7,
111 | "id": "b5c1bd63-5298-44fc-8681-974f2b9e7d50",
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stderr",
116 | "output_type": "stream",
117 | "text": [
118 | "21/11/26 01:44:27 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.\n"
119 | ]
120 | },
121 | {
122 | "data": {
123 | "text/plain": [
124 | "DataFrame[]"
125 | ]
126 | },
127 | "execution_count": 7,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "spark.sql(\"\"\"\n",
134 | "CREATE TABLE student (\n",
135 | " id INT, \n",
136 | " name STRING, \n",
137 | " age INT\n",
138 | ") \n",
139 | "STORED AS PARQUET\n",
140 | "LOCATION 's3a://udon-data/lake/student/'\n",
141 | "\"\"\")"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 17,
147 | "id": "0f83e0e6-8337-4922-ab2f-9e13d7b7089f",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "df = spark.read.format(\"csv\").load(\"s3a://udon-data-lake/marketing_campaign.csv\")"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "id": "6ccea55e-6ba0-4414-8588-e1968bddd92b",
158 | "metadata": {},
159 | "outputs": [],
160 | "source": []
161 | }
162 | ],
163 | "metadata": {
164 | "kernelspec": {
165 | "display_name": "pyspark",
166 | "language": "python",
167 | "name": "pyspark"
168 | },
169 | "language_info": {
170 | "codemirror_mode": {
171 | "name": "ipython",
172 | "version": 3
173 | },
174 | "file_extension": ".py",
175 | "mimetype": "text/x-python",
176 | "name": "python",
177 | "nbconvert_exporter": "python",
178 | "pygments_lexer": "ipython3",
179 | "version": "3.8.10"
180 | }
181 | },
182 | "nbformat": 4,
183 | "nbformat_minor": 5
184 | }
185 |
--------------------------------------------------------------------------------
/_script/docker-mysql/conf/my.cnf:
--------------------------------------------------------------------------------
1 | [client]
2 | default-character-set = utf8mb4
3 |
4 | [mysql]
5 | default-character-set = utf8mb4
6 |
7 | [mysqld]
8 | character-set-client-handshake = FALSE
9 | character-set-server = utf8mb4
10 | collation-server = utf8mb4_unicode_ci
11 | default-storage-engine=InnoDB
12 | default-time-zone = '+09:00'
13 |
--------------------------------------------------------------------------------
/_script/docker-mysql/sql/001_create_database.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE pipeline;
2 |
--------------------------------------------------------------------------------
/_script/docker-mysql/sql/002_create_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE `ListingMeta`
2 | (
3 | -- primary key
4 | `listing_id` BIGINT UNSIGNED NOT NULL PRIMARY KEY,
5 | `listing_name` VARCHAR(240) NULL,
6 | `listing_desc` TEXT NULL,
7 | `listing_summary` TEXT NULL,
8 | `listing_url` TEXT NULL,
9 |
10 | -- FK columns
11 |
12 | -- common
13 | `created_at` datetime DEFAULT CURRENT_TIMESTAMP NOT NULL
14 |
15 | ) ENGINE = InnoDB
16 | DEFAULT CHARSET = utf8mb4
17 | COLLATE = utf8mb4_unicode_ci;
18 |
--------------------------------------------------------------------------------
/_script/docker-spark/apps/main.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.functions import col,date_format
3 |
4 | def init_spark():
5 | sql = SparkSession.builder\
6 | .appName("trip-app")\
7 | .config("spark.jars", "/opt/spark-apps/postgresql-42.2.22.jar")\
8 | .getOrCreate()
9 | sc = sql.sparkContext
10 | return sql,sc
11 |
12 | def main():
13 | url = "jdbc:postgresql://storage-postgres:5432/postgres"
14 | properties = {
15 | "user": "postgres",
16 | "password": "root",
17 | "driver": "org.postgresql.Driver"
18 | }
19 | file = "/opt/spark-data/MTA_2014_08_01.csv"
20 | sql,sc = init_spark()
21 |
22 | df = sql.read.load(file,format = "csv", inferSchema="true", sep="\t", header="true") \
23 | .withColumn("report_hour",date_format(col("time_received"),"yyyy-MM-dd HH:00:00")) \
24 | .withColumn("report_date",date_format(col("time_received"),"yyyy-MM-dd"))
25 |
26 | # Filter invalid coordinates
27 | df.where("latitude <= 90 AND latitude >= -90 AND longitude <= 180 AND longitude >= -180") \
28 | .where("latitude != 0.000000 OR longitude != 0.000000 ") \
29 | .write \
30 | .jdbc(url=url, table="mta_reports", mode='append', properties=properties)
31 | # DataFrameWriter.jdbc() executes the write itself, so no trailing .save() is needed
32 |
33 | if __name__ == '__main__':
34 | main()
--------------------------------------------------------------------------------
/_script/docker-spark/apps/postgresql-42.2.22.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_script/docker-spark/apps/postgresql-42.2.22.jar
--------------------------------------------------------------------------------
/_script/docker-spark/conf/spark-defaults.conf:
--------------------------------------------------------------------------------
1 | spark.eventLog.dir file:/tmp/spark-events
2 | spark.eventLog.enabled true
3 | spark.history.fs.logDirectory file:/tmp/spark-events
--------------------------------------------------------------------------------
/_script/docker-spark/data/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 |
--------------------------------------------------------------------------------
/_slide/.gitignore:
--------------------------------------------------------------------------------
1 | practical-aws-pipeline/
2 | practical-spark
--------------------------------------------------------------------------------
/_volume/.gitignore:
--------------------------------------------------------------------------------
1 | docker-minio/
--------------------------------------------------------------------------------
/docker-compose.aws.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 | services:
3 | dynamodb-local:
4 | image: amazon/dynamodb-local:latest
5 | container_name: dynamodb-local
6 | ports:
7 | - "8000:8000"
8 |
9 | dynamodb-admin:
10 | image: aaronshaf/dynamodb-admin
11 | ports:
12 | - "8001:8001"
13 | environment:
14 | DYNAMO_ENDPOINT: "http://dynamodb-local:8000"
15 | AWS_REGION: "ap-northeast-2"
16 | AWS_ACCESS_KEY_ID: accesskey
17 | AWS_SECRET_ACCESS_KEY: secretkey
18 | depends_on:
19 | - dynamodb-local
20 |
21 | minio:
22 | image: minio/minio:latest
23 | container_name: minio
24 | environment:
25 | - MINIO_ACCESS_KEY=accesskey
26 | - MINIO_SECRET_KEY=secretkey
27 | - MINIO_ROOT_USER=admin
28 | - MINIO_ROOT_PASSWORD=admin12345
29 | volumes:
30 | - ./_volume/docker-minio:/data
31 | ports:
32 | - "9000:9000"
33 | - "9001:9001"
34 | command: server /data --console-address ":9001"
35 |
36 | minio-script:
37 | image: minio/mc
38 | container_name: minio-script
39 | depends_on:
40 | - minio
41 | entrypoint: >
42 | /bin/sh -c "
43 | sleep 10s;
44 | /usr/bin/mc alias set myminio http://minio:9000 admin admin12345;
45 | /usr/bin/mc mb myminio/udon-data-lake || true;
46 | /usr/bin/mc admin user add myminio accesskey secretkey || true;
47 | /usr/bin/mc admin policy set myminio readwrite user=accesskey || true;
48 | exit 0;
49 | "
50 |
--------------------------------------------------------------------------------
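
The services above expose DynamoDB Local on localhost:8000 and MinIO's S3 API on localhost:9000. The snippet below is only a minimal connectivity check, not part of the repository: it assumes the boto3 package is installed and reuses the credentials defined in this compose file.

import boto3

# DynamoDB Local accepts any key pair; these match the dynamodb-admin settings above.
dynamodb = boto3.client(
    "dynamodb",
    endpoint_url="http://localhost:8000",
    region_name="ap-northeast-2",
    aws_access_key_id="accesskey",
    aws_secret_access_key="secretkey",
)
print(dynamodb.list_tables()["TableNames"])

# MinIO speaks the S3 API; admin/admin12345 are the root credentials above.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="admin",
    aws_secret_access_key="admin12345",
)
print([b["Name"] for b in s3.list_buckets()["Buckets"]])
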
/docker-compose.kafka.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | zookeeper:
4 | image: confluentinc/cp-zookeeper:6.2.1
5 | hostname: zookeeper
6 | container_name: zookeeper
7 | ports:
8 | - "2181:2181"
9 | environment:
10 | ZOOKEEPER_CLIENT_PORT: 2181
11 | ZOOKEEPER_TICK_TIME: 2000
12 |
13 | broker:
14 | image: confluentinc/cp-kafka:6.2.1
15 | hostname: broker
16 | container_name: broker
17 | depends_on:
18 | - zookeeper
19 | ports:
20 | - "29092:29092"
21 | - "9092:9092"
22 | - "9101:9101"
23 | environment:
24 | KAFKA_BROKER_ID: 1
25 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
26 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
29 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
30 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
31 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
32 | KAFKA_JMX_PORT: 9101
33 | KAFKA_JMX_OPTS: -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Djava.rmi.server.hostname=kafka0 -Dcom.sun.management.jmxremote.rmi.port=9101
34 | KAFKA_JMX_HOSTNAME: localhost
35 |
36 | schema-registry:
37 | image: confluentinc/cp-schema-registry:6.2.1
38 | hostname: schema-registry
39 | container_name: schema-registry
40 | depends_on:
41 | - broker
42 | ports:
43 | - "8081:8081"
44 | environment:
45 | SCHEMA_REGISTRY_HOST_NAME: schema-registry
46 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'broker:29092'
47 | SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081
48 |
49 | kafka-ui:
50 | image: provectuslabs/kafka-ui:latest
51 | container_name: kafka-ui
52 | depends_on:
53 | - broker
54 | - zookeeper
55 | - schema-registry
56 | ports:
57 | - "8080:8080"
58 | restart: always
59 | environment:
60 | - KAFKA_CLUSTERS_0_NAME=local
61 | - KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=broker:29092
62 | - KAFKA_CLUSTERS_0_ZOOKEEPER=zookeeper:2181
63 | - KAFKA_CLUSTERS_0_SCHEMAREGISTRY=schema-registry:8081
64 | - KAFKA_CLUSTERS_0_JMXPORT=9101
--------------------------------------------------------------------------------
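
The broker above advertises a PLAINTEXT_HOST listener on localhost:9092 for clients running on the host. Below is a minimal produce/consume smoke test; it is only an illustration, assumes the kafka-python package is installed, and uses an arbitrary topic name.

from kafka import KafkaConsumer, KafkaProducer

BOOTSTRAP = "localhost:9092"  # PLAINTEXT_HOST listener advertised above

producer = KafkaProducer(bootstrap_servers=BOOTSTRAP)
producer.send("smoke-test", b"hello from the host")
producer.flush()

consumer = KafkaConsumer(
    "smoke-test",
    bootstrap_servers=BOOTSTRAP,
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,  # stop iterating after 5s without new messages
)
for message in consumer:
    print(message.topic, message.offset, message.value)
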
/docker-compose.metastore.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | mysql:
4 | image: mysql:8
5 | container_name: mysql
6 | restart: always
7 | ports:
8 | - "3306:3306"
9 | environment:
10 | - MYSQL_DATABASE=metastore_db
11 | - MYSQL_ROOT_PASSWORD=root
12 | - LANG=C.UTF-8
13 | volumes:
14 | - ./_script/docker-mysql/conf/:/etc/mysql/conf.d
15 | - ./_script/docker-mysql/sql/:/docker-entrypoint-initdb.d
16 | command: --sql_mode=''
17 | security_opt:
18 | - seccomp:unconfined
19 |
20 | minio:
21 | image: minio/minio:latest
22 | container_name: minio
23 | environment:
24 | - MINIO_ACCESS_KEY=accesskey
25 | - MINIO_SECRET_KEY=secretkey
26 | - MINIO_ROOT_USER=admin
27 | - MINIO_ROOT_PASSWORD=admin12345
28 | volumes:
29 | - ./_volume/docker-minio:/data
30 | ports:
31 | - "9000:9000"
32 | - "9001:9001"
33 | command: server /data --console-address ":9001"
34 |
35 | minio-script:
36 | image: minio/mc
37 | container_name: minio-script
38 | depends_on:
39 | - minio
40 | entrypoint: >
41 | /bin/sh -c "
42 | /usr/bin/mc alias set myminio http://minio:9000 admin admin12345;
43 | /usr/bin/mc mb myminio/udon-data-lake || true;
44 | # /usr/bin/mc admin user add myminio accesskey secretkey || true;
45 | # /usr/bin/mc admin policy set myminio readwrite user=accesskey || true;
46 | exit 0;
47 | "
48 |
49 | hive-metastore:
50 | container_name: hive-metastore
51 | build:
52 | context: _dockerfile/docker-metastore
53 | dockerfile: Dockerfile
54 | command:
55 | - /bin/sh
56 | - -c
57 | - |
58 | sleep 10;
59 | /entrypoint.sh
60 | ports:
61 | - "9083:9083"
62 | depends_on:
63 | - mysql
64 | - minio
65 |
--------------------------------------------------------------------------------
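
With this stack running, the Hive Metastore is reachable on thrift://localhost:9083 and MinIO on localhost:9000. The sketch below (similar to _notebook/spark-metastore-remote.ipynb, and assuming pyspark plus the hadoop-aws and aws-java-sdk-bundle jars are available) points a local Spark session at both.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("metastore-smoke-test")
    # Remote Hive Metastore started by this compose file
    .config("spark.hadoop.hive.metastore.uris", "thrift://localhost:9083")
    # MinIO credentials and endpoint, as in conf/hive-site.xml
    .config("spark.hadoop.fs.s3a.access.key", "accesskey")
    .config("spark.hadoop.fs.s3a.secret.key", "secretkey")
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .enableHiveSupport()
    .getOrCreate()
)

spark.sql("SHOW DATABASES").show()
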
/docker-compose.presto.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | presto-coordinator:
4 | container_name: presto-coordinator
5 | build:
6 | context: _dockerfile/docker-presto
7 | dockerfile: Dockerfile
8 | environment:
9 | - PRESTO_NODE_ID=presto-coordinator
10 | ports:
11 | - "8889:8080"
12 | volumes:
13 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog
14 |
15 | presto-worker-01:
16 | container_name: presto-worker-01
17 | build:
18 | context: _dockerfile/docker-presto
19 | dockerfile: Dockerfile
20 | environment:
21 | - PRESTO_COORDINATOR=presto-coordinator
22 | - PRESTO_NODE_ID=presto-worker-01
23 | volumes:
24 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog
25 | depends_on:
26 | - presto-coordinator
27 | command:
28 | - /bin/sh
29 | - -c
30 | - |
31 | sleep 20;
32 | /entrypoint.sh
33 |
34 | presto-worker-02:
35 | container_name: presto-worker-02
36 | build:
37 | context: _dockerfile/docker-presto
38 | dockerfile: Dockerfile
39 | environment:
40 | - PRESTO_COORDINATOR=presto-coordinator
41 | - PRESTO_NODE_ID=presto-worker-02
42 | volumes:
43 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog
44 | depends_on:
45 | - presto-coordinator
46 | command:
47 | - /bin/sh
48 | - -c
49 | - |
50 | sleep 20;
51 | /entrypoint.sh
52 |
--------------------------------------------------------------------------------
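
The coordinator above is published on localhost:8889. One way to query it from Python is sketched below; it assumes the presto-python-client package is installed and uses the built-in tpch catalog defined in _dockerfile/docker-presto/etc/catalog/tpch.properties (the user name is arbitrary).

import prestodb

conn = prestodb.dbapi.connect(
    host="localhost",
    port=8889,          # host port mapped to the coordinator's 8080
    user="hadoop",      # any user name works for this unsecured setup
    catalog="tpch",
    schema="tiny",
)
cursor = conn.cursor()
cursor.execute("SELECT nationkey, name FROM nation LIMIT 5")
for row in cursor.fetchall():
    print(row)
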
/docker-compose.spark.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | spark-master:
4 | image: bde2020/spark-master:3.1.1-hadoop3.2
5 | container_name: spark-master
6 | ports:
7 | - "8080:8080"
8 | - "7077:7077"
9 | - "4040:4040"
10 | volumes:
11 | - ./_script/docker-spark/apps:/opt/spark-apps
12 | - ./_script/docker-spark/data:/opt/spark-data
13 | - ./_script/docker-spark/conf:/spark/conf
14 | - /tmp/spark-events-local:/tmp/spark-events
15 | environment:
16 | - INIT_DAEMON_STEP=setup_spark
17 |
18 | spark-worker-1:
19 | image: bde2020/spark-worker:3.1.1-hadoop3.2
20 | container_name: spark-worker-1
21 | depends_on:
22 | - spark-master
23 | ports:
24 | - "8081:8081"
25 | volumes:
26 | - ./_script/docker-spark/apps:/opt/spark-apps
27 | - ./_script/docker-spark/data:/opt/spark-data
28 | - ./_script/docker-spark/conf:/spark/conf
29 | - /tmp/spark-events-local:/tmp/spark-events
30 | environment:
31 | - "SPARK_MASTER=spark://spark-master:7077"
32 | spark-worker-2:
33 |
34 | image: bde2020/spark-worker:3.1.1-hadoop3.2
35 | container_name: spark-worker-2
36 | depends_on:
37 | - spark-master
38 | ports:
39 | - "8082:8081"
40 | volumes:
41 | - ./_script/docker-spark/apps:/opt/spark-apps
42 | - ./_script/docker-spark/data:/opt/spark-data
43 | - ./_script/docker-spark/conf:/spark/conf
44 | - /tmp/spark-events-local:/tmp/spark-events
45 | environment:
46 | - "SPARK_MASTER=spark://spark-master:7077"
47 |
48 | spark-history-server:
49 | image: bde2020/spark-history-server:3.1.1-hadoop3.2
50 | container_name: spark-history-server
51 | depends_on:
52 | - spark-master
53 | ports:
54 | - "18081:18081"
55 | volumes:
56 | - ./_script/docker-spark/apps:/opt/spark-apps
57 | - ./_script/docker-spark/data:/opt/spark-data
58 | - ./_script/docker-spark/conf:/spark/conf
59 | - /tmp/spark-events-local:/tmp/spark-events
60 | storage-postgres:
61 | image: postgres:11.7-alpine
62 | container_name: storage-postgres
63 | depends_on:
64 | - spark-master
65 | ports:
66 | - "5432:5432"
67 | environment:
68 | - POSTGRES_PASSWORD=root
--------------------------------------------------------------------------------
/docker-compose.storage.yml:
--------------------------------------------------------------------------------
1 | version: '3.6'
2 | services:
3 | mysql:
4 | image: mysql:8
5 | container_name: mysql
6 | restart: always
7 | ports:
8 | - 3306:3306
9 | environment:
10 | - MYSQL_DATABASE=pipeline
11 | - MYSQL_ROOT_PASSWORD=root
12 | - LANG=C.UTF-8
13 | volumes:
14 | - ./_script/docker-mysql/conf/:/etc/mysql/conf.d
15 | - ./_script/docker-mysql/sql/:/docker-entrypoint-initdb.d
16 | command: --sql_mode=''
17 |
18 | redis:
19 | image: redis:5
20 | container_name: redis
21 | restart: always
22 | command: redis-server # --requirepass credential
23 | ports:
24 | - 6379:6379
25 |
26 |
--------------------------------------------------------------------------------
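
A quick way to confirm both storage containers are reachable from the host is sketched below; it is not part of the repository and assumes the pymysql and redis packages are installed.

import pymysql
import redis

# MySQL: the "pipeline" database with root/root, as configured above.
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       password="root", database="pipeline")
with conn.cursor() as cursor:
    cursor.execute("SHOW TABLES")
    print(cursor.fetchall())
conn.close()

# Redis: no password, since --requirepass is commented out above.
r = redis.Redis(host="localhost", port=6379)
print(r.ping())
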
/project-flink/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,scala,java,intellij+iml
4 |
5 | ### Intellij+iml ###
6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
8 |
9 | # User-specific stuff
10 | .idea/**/workspace.xml
11 | .idea/**/tasks.xml
12 | .idea/**/usage.statistics.xml
13 | .idea/**/dictionaries
14 | .idea/**/shelf
15 |
16 | # AWS User-specific
17 | .idea/**/aws.xml
18 |
19 | # Generated files
20 | .idea/**/contentModel.xml
21 |
22 | # Sensitive or high-churn files
23 | .idea/**/dataSources/
24 | .idea/**/dataSources.ids
25 | .idea/**/dataSources.local.xml
26 | .idea/**/sqlDataSources.xml
27 | .idea/**/dynamic.xml
28 | .idea/**/uiDesigner.xml
29 | .idea/**/dbnavigator.xml
30 |
31 | # Gradle
32 | .idea/**/gradle.xml
33 | .idea/**/libraries
34 |
35 | # Gradle and Maven with auto-import
36 | # When using Gradle or Maven with auto-import, you should exclude module files,
37 | # since they will be recreated, and may cause churn. Uncomment if using
38 | # auto-import.
39 | # .idea/artifacts
40 | # .idea/compiler.xml
41 | # .idea/jarRepositories.xml
42 | # .idea/modules.xml
43 | # .idea/*.iml
44 | # .idea/modules
45 | # *.iml
46 | # *.ipr
47 |
48 | # CMake
49 | cmake-build-*/
50 |
51 | # Mongo Explorer plugin
52 | .idea/**/mongoSettings.xml
53 |
54 | # File-based project format
55 | *.iws
56 |
57 | # IntelliJ
58 | out/
59 |
60 | # mpeltonen/sbt-idea plugin
61 | .idea_modules/
62 |
63 | # JIRA plugin
64 | atlassian-ide-plugin.xml
65 |
66 | # Cursive Clojure plugin
67 | .idea/replstate.xml
68 |
69 | # Crashlytics plugin (for Android Studio and IntelliJ)
70 | com_crashlytics_export_strings.xml
71 | crashlytics.properties
72 | crashlytics-build.properties
73 | fabric.properties
74 |
75 | # Editor-based Rest Client
76 | .idea/httpRequests
77 |
78 | # Android studio 3.1+ serialized cache file
79 | .idea/caches/build_file_checksums.ser
80 |
81 | ### Intellij+iml Patch ###
82 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
83 |
84 | *.iml
85 | modules.xml
86 | .idea/misc.xml
87 | *.ipr
88 |
89 | ### Java ###
90 | # Compiled class file
91 | *.class
92 |
93 | # Log file
94 | *.log
95 |
96 | # BlueJ files
97 | *.ctxt
98 |
99 | # Mobile Tools for Java (J2ME)
100 | .mtj.tmp/
101 |
102 | # Package Files #
103 | *.jar
104 | *.war
105 | *.nar
106 | *.ear
107 | *.zip
108 | *.tar.gz
109 | *.rar
110 |
111 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
112 | hs_err_pid*
113 |
114 | ### Scala ###
115 |
116 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
117 |
118 | ### Gradle ###
119 | .gradle
120 | build/
121 |
122 | # Ignore Gradle GUI config
123 | gradle-app.setting
124 |
125 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
126 | !gradle-wrapper.jar
127 |
128 | # Cache of project
129 | .gradletasknamecache
130 |
131 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898
132 | # gradle/wrapper/gradle-wrapper.properties
133 |
134 | ### Gradle Patch ###
135 | **/build/
136 |
137 | # Eclipse Gradle plugin generated files
138 | # Eclipse Core
139 | .project
140 | # JDT-specific (Eclipse Java Development Tools)
141 | .classpath
142 |
143 | # End of https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml
--------------------------------------------------------------------------------
/project-kafka/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,kotlin,java,intellij+iml,scala
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,kotlin,java,intellij+iml,scala
4 |
5 | ### Intellij+iml ###
6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
8 |
9 | # User-specific stuff
10 | .idea/**/workspace.xml
11 | .idea/**/tasks.xml
12 | .idea/**/usage.statistics.xml
13 | .idea/**/dictionaries
14 | .idea/**/shelf
15 |
16 | # AWS User-specific
17 | .idea/**/aws.xml
18 |
19 | # Generated files
20 | .idea/**/contentModel.xml
21 |
22 | # Sensitive or high-churn files
23 | .idea/**/dataSources/
24 | .idea/**/dataSources.ids
25 | .idea/**/dataSources.local.xml
26 | .idea/**/sqlDataSources.xml
27 | .idea/**/dynamic.xml
28 | .idea/**/uiDesigner.xml
29 | .idea/**/dbnavigator.xml
30 |
31 | # Gradle
32 | .idea/**/gradle.xml
33 | .idea/**/libraries
34 |
35 | # Gradle and Maven with auto-import
36 | # When using Gradle or Maven with auto-import, you should exclude module files,
37 | # since they will be recreated, and may cause churn. Uncomment if using
38 | # auto-import.
39 | # .idea/artifacts
40 | # .idea/compiler.xml
41 | # .idea/jarRepositories.xml
42 | # .idea/modules.xml
43 | # .idea/*.iml
44 | # .idea/modules
45 | # *.iml
46 | # *.ipr
47 |
48 | # CMake
49 | cmake-build-*/
50 |
51 | # Mongo Explorer plugin
52 | .idea/**/mongoSettings.xml
53 |
54 | # File-based project format
55 | *.iws
56 |
57 | # IntelliJ
58 | out/
59 |
60 | # mpeltonen/sbt-idea plugin
61 | .idea_modules/
62 |
63 | # JIRA plugin
64 | atlassian-ide-plugin.xml
65 |
66 | # Cursive Clojure plugin
67 | .idea/replstate.xml
68 |
69 | # Crashlytics plugin (for Android Studio and IntelliJ)
70 | com_crashlytics_export_strings.xml
71 | crashlytics.properties
72 | crashlytics-build.properties
73 | fabric.properties
74 |
75 | # Editor-based Rest Client
76 | .idea/httpRequests
77 |
78 | # Android studio 3.1+ serialized cache file
79 | .idea/caches/build_file_checksums.ser
80 |
81 | ### Intellij+iml Patch ###
82 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
83 |
84 | *.iml
85 | modules.xml
86 | .idea/misc.xml
87 | *.ipr
88 |
89 | ### Java ###
90 | # Compiled class file
91 | *.class
92 |
93 | # Log file
94 | *.log
95 |
96 | # BlueJ files
97 | *.ctxt
98 |
99 | # Mobile Tools for Java (J2ME)
100 | .mtj.tmp/
101 |
102 | # Package Files #
103 | *.jar
104 | *.war
105 | *.nar
106 | *.ear
107 | *.zip
108 | *.tar.gz
109 | *.rar
110 |
111 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
112 | hs_err_pid*
113 |
114 | ### Kotlin ###
115 | # Compiled class file
116 |
117 | # Log file
118 |
119 | # BlueJ files
120 |
121 | # Mobile Tools for Java (J2ME)
122 |
123 | # Package Files #
124 |
125 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
126 |
127 | ### Scala ###
128 |
129 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
130 |
131 | ### Gradle ###
132 | .gradle
133 | build/
134 |
135 | # Ignore Gradle GUI config
136 | gradle-app.setting
137 |
138 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
139 | !gradle-wrapper.jar
140 |
141 | # Cache of project
142 | .gradletasknamecache
143 |
144 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898
145 | # gradle/wrapper/gradle-wrapper.properties
146 |
147 | ### Gradle Patch ###
148 | **/build/
149 |
150 | # Eclipse Gradle plugin generated files
151 | # Eclipse Core
152 | .project
153 | # JDT-specific (Eclipse Java Development Tools)
154 | .classpath
155 |
156 | # End of https://www.toptal.com/developers/gitignore/api/gradle,kotlin,java,intellij+iml,scala
--------------------------------------------------------------------------------
/project-spark/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,scala,java,intellij+iml
3 |
4 | _volumes/**
5 |
6 | ### Intellij+iml ###
7 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
8 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
9 |
10 | # User-specific stuff
11 | .idea/**/workspace.xml
12 | .idea/**/tasks.xml
13 | .idea/**/usage.statistics.xml
14 | .idea/**/dictionaries
15 | .idea/**/shelf
16 |
17 | # AWS User-specific
18 | .idea/**/aws.xml
19 |
20 | # Generated files
21 | .idea/**/contentModel.xml
22 |
23 | # Sensitive or high-churn files
24 | .idea/**/dataSources/
25 | .idea/**/dataSources.ids
26 | .idea/**/dataSources.local.xml
27 | .idea/**/sqlDataSources.xml
28 | .idea/**/dynamic.xml
29 | .idea/**/uiDesigner.xml
30 | .idea/**/dbnavigator.xml
31 |
32 | # Gradle
33 | .idea/**/gradle.xml
34 | .idea/**/libraries
35 |
36 | # Gradle and Maven with auto-import
37 | # When using Gradle or Maven with auto-import, you should exclude module files,
38 | # since they will be recreated, and may cause churn. Uncomment if using
39 | # auto-import.
40 | # .idea/artifacts
41 | # .idea/compiler.xml
42 | # .idea/jarRepositories.xml
43 | # .idea/modules.xml
44 | # .idea/*.iml
45 | # .idea/modules
46 | # *.iml
47 | # *.ipr
48 |
49 | # CMake
50 | cmake-build-*/
51 |
52 | # Mongo Explorer plugin
53 | .idea/**/mongoSettings.xml
54 |
55 | # File-based project format
56 | *.iws
57 |
58 | # IntelliJ
59 | out/
60 |
61 | # mpeltonen/sbt-idea plugin
62 | .idea_modules/
63 |
64 | # JIRA plugin
65 | atlassian-ide-plugin.xml
66 |
67 | # Cursive Clojure plugin
68 | .idea/replstate.xml
69 |
70 | # Crashlytics plugin (for Android Studio and IntelliJ)
71 | com_crashlytics_export_strings.xml
72 | crashlytics.properties
73 | crashlytics-build.properties
74 | fabric.properties
75 |
76 | # Editor-based Rest Client
77 | .idea/httpRequests
78 |
79 | # Android studio 3.1+ serialized cache file
80 | .idea/caches/build_file_checksums.ser
81 |
82 | ### Intellij+iml Patch ###
83 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
84 |
85 | *.iml
86 | modules.xml
87 | .idea/misc.xml
88 | *.ipr
89 |
90 | ### Java ###
91 | # Compiled class file
92 | *.class
93 |
94 | # Log file
95 | *.log
96 |
97 | # BlueJ files
98 | *.ctxt
99 |
100 | # Mobile Tools for Java (J2ME)
101 | .mtj.tmp/
102 |
103 | # Package Files #
104 | *.jar
105 | *.war
106 | *.nar
107 | *.ear
108 | *.zip
109 | *.tar.gz
110 | *.rar
111 |
112 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
113 | hs_err_pid*
114 |
115 | ### Scala ###
116 |
117 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
118 |
119 | ### Gradle ###
120 | .gradle
121 | build/
122 |
123 | # Ignore Gradle GUI config
124 | gradle-app.setting
125 |
126 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
127 | !gradle-wrapper.jar
128 |
129 | # Cache of project
130 | .gradletasknamecache
131 |
132 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898
133 | # gradle/wrapper/gradle-wrapper.properties
134 |
135 | ### Gradle Patch ###
136 | **/build/
137 |
138 | # Eclipse Gradle plugin generated files
139 | # Eclipse Core
140 | .project
141 | # JDT-specific (Eclipse Java Development Tools)
142 | .classpath
143 |
144 | # End of https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml
--------------------------------------------------------------------------------
/project-spark/_scripts/mysql-ddl/table_property_stat.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE pipeline.property_stat
2 | (
3 | property_id BIGINT UNSIGNED NOT NULL,
4 | property_type VARCHAR(30) NOT NULL,
5 | lat DOUBLE(40, 10) NOT NULL,
6 | lng DOUBLE(40, 10) NOT NULL,
7 |
8 | count_review_all BIGINT UNSIGNED NOT NULL,
9 | score_review_all DOUBLE(10, 5) NOT NULL,
10 |
11 | count_review BIGINT UNSIGNED NOT NULL,
12 | count_sales BIGINT UNSIGNED NOT NULL,
13 | price_sales BIGINT UNSIGNED NOT NULL,
14 |
15 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL,
16 | updated_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL,
17 |
18 | part TIMESTAMP NOT NULL COMMENT 'data partition',
19 |
20 | PRIMARY KEY (property_id, part),
21 | INDEX idx_property_stat_combined (part, property_id)
22 |
23 | ) ENGINE = InnoDB
24 | DEFAULT CHARSET = utf8mb4
25 | COLLATE = utf8mb4_unicode_ci;
26 |
27 |
--------------------------------------------------------------------------------
/project-spark/build.gradle:
--------------------------------------------------------------------------------
1 | buildscript {
2 | ext {
3 | gradleShadowVersion = '6.1.0'
4 | gradleTestLoggerVersion = '2.1.0'
5 | gradleScalaTestVersion = '0.30'
6 | gradleVersioningPluginVersion = '2.8.2'
7 | gradleAvroPluginVersion = '1.2.0'
8 | }
9 |
10 | repositories {
11 | mavenCentral()
12 | jcenter()
13 |
14 | maven { url "https://plugins.gradle.org/m2/" }
15 | maven { url 'https://repo.spring.io/plugins-release' }
16 | maven {
17 | name "typesafe-maven-release"
18 | url "https://repo.typesafe.com/typesafe/maven-releases"
19 | }
20 | maven {
21 | name "Spark Packages Repo"
22 | url "https://dl.bintray.com/spark-packages/maven"
23 | }
24 | maven {
25 | name "Confluent"
26 | url "https://packages.confluent.io/maven/"
27 | }
28 | maven {
29 | name "jitpack"
30 | url 'https://jitpack.io'
31 | }
32 | ivy {
33 | name "typesafe-ivy-release"
34 | url "https://repo.typesafe.com/typesafe/ivy-releases"
35 | layout "ivy"
36 | }
37 | }
38 |
39 | dependencies {
40 | classpath "com.github.jengelman.gradle.plugins:shadow:${gradleShadowVersion}"
41 | classpath "gradle.plugin.net.nemerosa:versioning:${gradleVersioningPluginVersion}"
42 | classpath "com.github.davidmc24.gradle.plugin:gradle-avro-plugin:${gradleAvroPluginVersion}"
43 |
44 | // classpath "gradle.plugin.com.github.maiflai:gradle-scalatest:${gradleScalaTestVersion}"
45 | // classpath "com.adarshr:gradle-test-logger-plugin:${gradleTestLoggerVersion}"
46 | }
47 | }
48 |
49 | allprojects {
50 | apply plugin: 'idea'
51 | apply plugin: 'java'
52 | apply plugin: 'java-library'
53 | apply plugin: 'scala'
54 |
55 | // apply plugin: 'com.adarshr.test-logger'
56 | // apply plugin: "com.github.maiflai.scalatest"
57 |
58 | repositories {
59 | mavenCentral()
60 | maven { url "https://jcenter.bintray.com" }
61 | maven {
62 | name "Confluent"
63 | url "https://packages.confluent.io/maven/"
64 | }
65 | }
66 |
67 | ext {
68 | // Scala
69 | scalaVersionRevision = "12"
70 |
71 | // Spark
72 | scalaSparkVersion = "2.12"
73 | sparkVersion = "3.2.0"
74 | confluentVersion = "5.3.4"
75 |
76 |         // Kafka
77 | kafkaClientVersion = "2.6.2"
78 |
79 | // MySQL
80 | mysqlDriverVersion = "8.0.27"
81 |
82 | // AWS
83 | awsSdkVersion = "1.11.901"
84 | awsHadoopVersion = "3.3.1"
85 |
86 | // Utility
87 | typesafeConfigVersion = "1.3.3"
88 | shapelessVersion = "2.3.3"
89 | pureconfigVersion = "0.17.0"
90 | json4sVersion = '3.6.5'
91 | avroVersion = '1.10.2'
92 |         semverVersion = '2.2.0'
93 | scalaHttpVersion = "2.0.0-RC6"
94 |
95 | // Logging
96 | slf4jVersion = "1.7.30"
97 | log4jVersion = "2.16.0"
98 |
99 | // Test
100 | scalaTestVersion = "3.2.5"
101 | junit5Version = "5.5.2"
102 | }
103 |
104 | dependencies {
105 | implementation("org.apache.commons:commons-lang3:3.12.0")
106 |
107 | implementation("com.typesafe:config:${typesafeConfigVersion}")
108 | implementation("com.github.pureconfig:pureconfig_${scalaSparkVersion}:${pureconfigVersion}")
109 |         implementation("com.vdurmont:semver4j:${semverVersion}")
110 |
111 | // test
112 | testImplementation("org.scalatest:scalatest_${scalaSparkVersion}:${scalaTestVersion}")
113 | testImplementation "org.junit.platform:junit-platform-launcher:1.7.1"
114 | testRuntimeOnly "org.junit.platform:junit-platform-engine:1.7.1"
115 | testImplementation("org.junit.jupiter:junit-jupiter-api:${junit5Version}")
116 | testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:${junit5Version}")
117 | testRuntimeOnly "co.helmethair:scalatest-junit-runner:0.1.8"
118 | }
119 | }
120 |
121 | subprojects {
122 | targetCompatibility = 1.8
123 | sourceCompatibility = 1.8
124 | [compileJava, compileTestJava]*.options.collect {
125 | options -> options.encoding = 'UTF-8'
126 | }
127 |
128 | task wrapper(type: Wrapper) {
129 | gradleVersion = '6.8.1'
130 | }
131 |
132 | tasks.withType(ScalaCompile) {
133 | configure(scalaCompileOptions.forkOptions) {
134 | memoryMaximumSize = '2g'
135 | jvmArgs = ['-XX:MaxMetaspaceSize=512m']
136 | }
137 | }
138 |
139 | compileScala {
140 | targetCompatibility = "1.8"
141 | sourceCompatibility = "1.8"
142 |         scalaCompileOptions.additionalParameters = [] // e.g. ["-opt:l:method"]
143 | }
144 | compileTestScala {
145 | scalaCompileOptions.additionalParameters = ["-Yrangepos"]
146 | }
147 |
148 | test {
149 | useJUnitPlatform {
150 | includeEngines 'scalatest'
151 | testLogging {
152 | events("passed", "skipped", "failed")
153 | }
154 | }
155 |
156 | filter {
157 | includeTestsMatching "*Spec"
158 | }
159 | }
160 |
161 | configurations {
162 | localCompile {
163 | transitive = true
164 | }
165 | }
166 |
167 | }
168 |
--------------------------------------------------------------------------------
/project-spark/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/project-spark/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0.2-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/project-spark/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | #
4 | # Copyright 2015 the original author or authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # https://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | ##############################################################################
20 | ##
21 | ## Gradle start up script for UN*X
22 | ##
23 | ##############################################################################
24 |
25 | # Attempt to set APP_HOME
26 | # Resolve links: $0 may be a link
27 | PRG="$0"
28 | # Need this for relative symlinks.
29 | while [ -h "$PRG" ] ; do
30 | ls=`ls -ld "$PRG"`
31 | link=`expr "$ls" : '.*-> \(.*\)$'`
32 | if expr "$link" : '/.*' > /dev/null; then
33 | PRG="$link"
34 | else
35 | PRG=`dirname "$PRG"`"/$link"
36 | fi
37 | done
38 | SAVED="`pwd`"
39 | cd "`dirname \"$PRG\"`/" >/dev/null
40 | APP_HOME="`pwd -P`"
41 | cd "$SAVED" >/dev/null
42 |
43 | APP_NAME="Gradle"
44 | APP_BASE_NAME=`basename "$0"`
45 |
46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
48 |
49 | # Use the maximum available, or set MAX_FD != -1 to use that value.
50 | MAX_FD="maximum"
51 |
52 | warn () {
53 | echo "$*"
54 | }
55 |
56 | die () {
57 | echo
58 | echo "$*"
59 | echo
60 | exit 1
61 | }
62 |
63 | # OS specific support (must be 'true' or 'false').
64 | cygwin=false
65 | msys=false
66 | darwin=false
67 | nonstop=false
68 | case "`uname`" in
69 | CYGWIN* )
70 | cygwin=true
71 | ;;
72 | Darwin* )
73 | darwin=true
74 | ;;
75 | MINGW* )
76 | msys=true
77 | ;;
78 | NONSTOP* )
79 | nonstop=true
80 | ;;
81 | esac
82 |
83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
84 |
85 |
86 | # Determine the Java command to use to start the JVM.
87 | if [ -n "$JAVA_HOME" ] ; then
88 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
89 | # IBM's JDK on AIX uses strange locations for the executables
90 | JAVACMD="$JAVA_HOME/jre/sh/java"
91 | else
92 | JAVACMD="$JAVA_HOME/bin/java"
93 | fi
94 | if [ ! -x "$JAVACMD" ] ; then
95 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
96 |
97 | Please set the JAVA_HOME variable in your environment to match the
98 | location of your Java installation."
99 | fi
100 | else
101 | JAVACMD="java"
102 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
103 |
104 | Please set the JAVA_HOME variable in your environment to match the
105 | location of your Java installation."
106 | fi
107 |
108 | # Increase the maximum file descriptors if we can.
109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
110 | MAX_FD_LIMIT=`ulimit -H -n`
111 | if [ $? -eq 0 ] ; then
112 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
113 | MAX_FD="$MAX_FD_LIMIT"
114 | fi
115 | ulimit -n $MAX_FD
116 | if [ $? -ne 0 ] ; then
117 | warn "Could not set maximum file descriptor limit: $MAX_FD"
118 | fi
119 | else
120 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
121 | fi
122 | fi
123 |
124 | # For Darwin, add options to specify how the application appears in the dock
125 | if $darwin; then
126 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
127 | fi
128 |
129 | # For Cygwin or MSYS, switch paths to Windows format before running java
130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
131 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
132 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
133 |
134 | JAVACMD=`cygpath --unix "$JAVACMD"`
135 |
136 | # We build the pattern for arguments to be converted via cygpath
137 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
138 | SEP=""
139 | for dir in $ROOTDIRSRAW ; do
140 | ROOTDIRS="$ROOTDIRS$SEP$dir"
141 | SEP="|"
142 | done
143 | OURCYGPATTERN="(^($ROOTDIRS))"
144 | # Add a user-defined pattern to the cygpath arguments
145 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
146 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
147 | fi
148 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
149 | i=0
150 | for arg in "$@" ; do
151 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
152 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
153 |
154 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
155 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
156 | else
157 | eval `echo args$i`="\"$arg\""
158 | fi
159 | i=`expr $i + 1`
160 | done
161 | case $i in
162 | 0) set -- ;;
163 | 1) set -- "$args0" ;;
164 | 2) set -- "$args0" "$args1" ;;
165 | 3) set -- "$args0" "$args1" "$args2" ;;
166 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;;
167 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
168 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
169 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
170 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
171 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
172 | esac
173 | fi
174 |
175 | # Escape application args
176 | save () {
177 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
178 | echo " "
179 | }
180 | APP_ARGS=`save "$@"`
181 |
182 | # Collect all arguments for the java command, following the shell quoting and substitution rules
183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
184 |
185 | exec "$JAVACMD" "$@"
186 |
--------------------------------------------------------------------------------
/project-spark/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/project-spark/module-core/build.gradle:
--------------------------------------------------------------------------------
1 | dependencies {
2 | // custom
3 | // https://mvnrepository.com/artifact/org.apache.flink/flink-avro-confluent-registry
4 | // https://mvnrepository.com/artifact/org.apache.avro/avro
5 | api("org.apache.avro:avro:${avroVersion}")
6 |
7 | api("org.json4s:json4s-jackson_${scalaSparkVersion}:${json4sVersion}")
8 | api("org.json4s:json4s-ext_${scalaSparkVersion}:${json4sVersion}")
9 |
10 | // logging
11 | api("org.apache.logging.log4j:log4j-api:${log4jVersion}")
12 | api("org.apache.logging.log4j:log4j-core:${log4jVersion}")
13 | api("org.apache.logging.log4j:log4j-slf4j-impl:${log4jVersion}")
14 | api("org.slf4j:slf4j-log4j12:${slf4jVersion}")
15 | }
16 |
--------------------------------------------------------------------------------
/project-spark/module-core/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/module-core/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/project-spark/module-core/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/project-spark/module-core/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/project-spark/module-core/src/main/scala/mkt/udon/core/common/Environment.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.core.common
2 |
3 | import pureconfig.generic.ProductHint
4 | import pureconfig.{CamelCase, ConfigFieldMapping, ConfigReader, ConfigSource}
5 |
6 | import scala.reflect.ClassTag
7 |
8 | object Environment {
9 | /** deployment */
10 | private val LOCAL = "LOCAL"
11 | private val DEVELOPMENT = "DEV"
12 | private val STAGING = "STAGE"
13 | private val PRODUCTION = "PROD"
14 |
15 | /** testing */
16 | private val UNIT = "UNIT"
17 | private val INTEGRATION = "INTEGRATION"
18 |
19 | private val mode = {
20 | var env: String = LOCAL
21 |
22 | val extractedEnv = System.getenv("PIPELINE_MODE")
23 | if (extractedEnv != null) {
24 |       env = extractedEnv.toUpperCase() // normalize to match the upper-case mode keys (LOCAL / DEV / STAGE / PROD)
25 | }
26 |
27 | env
28 | }
29 |
30 | def isLocalMode(): Boolean = {
31 | mode == LOCAL
32 | }
33 |
34 | /**
35 |    * Creates the implicit hint so that pureconfig uses camel-case field mappings
36 | * - https://pureconfig.github.io/docs/overriding-behavior-for-case-classes.html#field-mappings
37 | */
38 | def buildConfigHint[T](): ProductHint[T] = {
39 | return ProductHint[T](ConfigFieldMapping(CamelCase, CamelCase))
40 | }
41 |
42 | /**
43 |    * Loads a different configuration block depending on the deployment mode
44 | */
45 | def getConfigOrThrow[T: ClassTag : ConfigReader]()(implicit productHint: ProductHint[T]): T = {
46 | val config = ConfigSource.default.at(mode).loadOrThrow[T]
47 | config
48 | }
49 |
50 | def getConfigOrThrowForApp[T: ClassTag : ConfigReader](app: String)(implicit productHint: ProductHint[T]): T = {
51 | val config = ConfigSource.default.at(mode).at(app).loadOrThrow[T]
52 | config
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
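
Usage sketch for the config loader above — a minimal example assuming a hypothetical MyJobConfig case class with a matching block (e.g. LOCAL { ... }) in application.conf; names are illustrative and the pattern mirrors the service modules further below.

    import mkt.udon.core.common.Environment
    import pureconfig.generic.auto._

    // hypothetical config; fields map to camel-case keys under the mode block
    case class MyJobConfig(dynamoTable: String, expireDays: Int)

    object MyJobConfigLoader {
      def main(args: Array[String]): Unit = {
        // hint so pureconfig keeps camel-case field names (see buildConfigHint above)
        implicit val hint = Environment.buildConfigHint[MyJobConfig]()

        // loads the block named after PIPELINE_MODE (defaults to LOCAL)
        val config = Environment.getConfigOrThrow[MyJobConfig]()
        println(config.dynamoTable)
      }
    }

--------------------------------------------------------------------------------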
/project-spark/module-core/src/main/scala/mkt/udon/core/common/TimeUtil.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.core.common
2 |
3 | import java.time.format.DateTimeFormatter
4 | import java.time.{Instant, LocalDate, LocalDateTime, ZoneOffset}
5 |
6 | object TimeUtil {
7 |
8 | /**
9 | * @param partition 'yyyyMMdd' formatted String
10 | */
11 | def convertPartitionToDateString(partition: String): String = {
12 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd")
13 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy-MM-dd")
14 | val parsed = LocalDate.parse(partition, formatterInput)
15 |
16 | return parsed.format(formatterOutput)
17 | }
18 |
19 | /**
20 | * @param partition 'yyyyMMdd' formatted String
21 | */
22 | def convertPartitionToDateSlashString(partition: String): String = {
23 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd")
24 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy/MM/dd")
25 | val parsed = LocalDate.parse(partition, formatterInput)
26 |
27 | return parsed.format(formatterOutput)
28 | }
29 |
30 | /**
31 | * @param partition 'yyyyMMdd' formatted String
32 | */
33 | def convertPartitionToSqlTimestamp(partition: String): java.sql.Timestamp = {
34 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd")
35 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy/MM/dd")
36 | val parsed = LocalDate.parse(partition, formatterInput).atStartOfDay()
37 |
38 | return java.sql.Timestamp.valueOf(parsed)
39 | }
40 |
41 | /**
42 | * @param raw Assume the passed parameter has UTC timezone
43 | */
44 | def convertStringToEpochMillis(raw: String): Long = {
45 | val formatterInput = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
46 | val parsed = LocalDateTime.parse(raw.substring(0, 19), formatterInput)
47 |
48 | return parsed.atZone(ZoneOffset.UTC).toInstant.toEpochMilli
49 | }
50 |
51 | def getExpireEpochSeconds(expireDays: Int): Long = {
52 | val updatedAt = Instant.now().toEpochMilli
53 |     val expireTtl = (updatedAt + (expireDays.toLong * 86400 * 1000)) / 1000 // toLong avoids Int overflow for large expireDays
54 | return expireTtl
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
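
A few illustrative conversions (REPL-style), assuming a 'yyyyMMdd' partition string such as "20200201".

    import mkt.udon.core.common.TimeUtil

    val partition = "20200201"

    TimeUtil.convertPartitionToDateString(partition)        // "2020-02-01"
    TimeUtil.convertPartitionToDateSlashString(partition)   // "2020/02/01"
    TimeUtil.convertPartitionToSqlTimestamp(partition)      // java.sql.Timestamp at 2020-02-01 00:00:00
    TimeUtil.convertStringToEpochMillis("2020-02-01 10:30:00 UTC") // epoch millis, input treated as UTC

--------------------------------------------------------------------------------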
/project-spark/module-core/src/main/scala/mkt/udon/core/entity/ProductPool.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.core.entity
2 |
3 | case class ProductPoolElement(id: String, rank: Long)
4 |
5 | case class ProductPool(specifier: String, elements: List[ProductPoolElement], elementCount: Long)
6 |
--------------------------------------------------------------------------------
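
An illustrative instance of the pool entity as written per source product (IDs and ranks are made up).

    import mkt.udon.core.entity.{ProductPool, ProductPoolElement}

    // one record per source product: its related products, ordered by rank
    val pool = ProductPool(
      specifier = "P100",
      elements = List(ProductPoolElement(id = "P200", rank = 1), ProductPoolElement(id = "P300", rank = 2)),
      elementCount = 2)

--------------------------------------------------------------------------------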
/project-spark/module-core/src/main/scala/mkt/udon/core/entity/UserEvent.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.core.entity
2 |
3 | import mkt.udon.core.common.TimeUtil
4 | import org.json4s.{DefaultFormats, Formats}
5 | import org.json4s.jackson.Serialization
6 |
7 | case class UserEvent(eventTime: Long, eventType: String, userId: String, productId: String, price: Double) {
8 | def convertToUserEventView(): UserEventView = {
9 | UserEventView(eventTime, productId)
10 | }
11 |
12 | def convertToUserEventOrder(): UserEventOrder = {
13 | UserEventOrder(eventTime, productId, price)
14 | }
15 | }
16 |
17 | case class UserEventRaw(event_time: String, event_type: String, product_id: String, price: Double, user_id: String) {
18 | def convert(): UserEvent = {
19 | val eventTime = TimeUtil.convertStringToEpochMillis(event_time)
20 | UserEvent(eventTime = eventTime, eventType = event_type, userId = user_id, productId = product_id, price = price)
21 | }
22 | }
23 |
24 | object UserEvent {
25 | def convertFromRaw(raw: String): UserEvent = {
26 | implicit val default: Formats = DefaultFormats.preservingEmptyValues
27 | val parsed = Serialization.read[UserEventRaw](raw)
28 | parsed.convert()
29 | }
30 | }
31 |
32 |
--------------------------------------------------------------------------------
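
A small parsing sketch (REPL-style) using a JSON payload in the same shape as UserEventRaw; the values are illustrative.

    import mkt.udon.core.entity.UserEvent

    // snake_case field names, matching UserEventRaw
    val raw =
      """{"event_time": "2020-02-01 10:30:00 UTC", "event_type": "view",
        | "product_id": "P100", "price": 42.0, "user_id": "U1"}""".stripMargin

    val event = UserEvent.convertFromRaw(raw)
    // event.eventTime is epoch millis (UTC), event.eventType == "view", event.productId == "P100"

--------------------------------------------------------------------------------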
/project-spark/module-core/src/main/scala/mkt/udon/core/entity/UserProfile.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.core.entity
2 |
3 | import mkt.udon.core.entity.UserProfile.{EVENT_ORDER, EVENT_VIEW}
4 |
5 | /**
6 |  * View event stored in the User Profile.
7 | */
8 | case class UserEventView(eventTime: Long, productId: String)
9 | /**
10 |  * Order event stored in the User Profile.
11 | */
12 | case class UserEventOrder(eventTime: Long, productId: String, price: Double)
13 |
14 | /**
15 |  * User Profile that can be persisted to a storage such as Dynamo.
16 |  *
17 |  * Aggregations over all of a user's events, such as totalOrderPrice, could also be kept here.
18 |  * Fields like eventOrder are List types and cannot grow without bound, so only the most recent N entries are stored.
19 |  *
20 |  * @param specifier  user ID
21 |  * @param eventView  recent product view events
22 |  * @param eventOrder recent product order events
23 | */
24 | case class UserProfile(specifier: String,
25 |
26 | var eventView: List[UserEventView] = List(),
27 | var eventOrder: List[UserEventOrder] = List()) {
28 |
29 | def update(userEvent: UserEvent,
30 | maxCountView: Int, maxCountOrder: Int): UserProfile = {
31 |
32 | if (userEvent.eventType == EVENT_VIEW) handleView(userEvent.convertToUserEventView(), maxCountView)
33 | else if (userEvent.eventType == EVENT_ORDER) handleOrder(userEvent.convertToUserEventOrder(), maxCountOrder)
34 |
35 | return this
36 | }
37 |
38 | def handleView(eventRecent: UserEventView, maxCount: Int) = {
39 | val merged = (eventView :+ eventRecent)
40 | val sorted = merged.sortBy(x => -x.eventTime).take(maxCount)
41 |
42 | eventView = sorted
43 | }
44 |
45 | def handleOrder(eventRecent: UserEventOrder, maxCount: Int) = {
46 | val merged = (eventOrder :+ eventRecent)
47 | val sorted = merged.sortBy(x => -x.eventTime).take(maxCount)
48 |
49 | eventOrder = sorted
50 | }
51 |
52 | }
53 |
54 | object UserProfile {
55 | val EVENT_VIEW = "view"
56 | val EVENT_ORDER = "order"
57 |
58 | def buildEmpty(userId: String): UserProfile = {
59 | UserProfile(specifier = userId, eventView = List(), eventOrder = List())
60 | }
61 | }
--------------------------------------------------------------------------------
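
A short sketch (REPL-style) of how a profile accumulates events, keeping only the most recent N entries per list.

    import mkt.udon.core.entity.{UserEvent, UserProfile}

    val profile = UserProfile.buildEmpty(userId = "U1")

    // illustrative events; only the "view" and "order" event types are handled
    val view  = UserEvent(eventTime = 1000L, eventType = "view",  userId = "U1", productId = "P1", price = 0.0)
    val order = UserEvent(eventTime = 2000L, eventType = "order", userId = "U1", productId = "P1", price = 42.0)

    profile.update(view,  maxCountView = 10, maxCountOrder = 5)
    profile.update(order, maxCountView = 10, maxCountOrder = 5)

    // profile.eventView  == List(UserEventView(1000, "P1"))
    // profile.eventOrder == List(UserEventOrder(2000, "P1", 42.0))

--------------------------------------------------------------------------------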
/project-spark/module-infra-spark/build.gradle:
--------------------------------------------------------------------------------
1 | dependencies {
2 |
3 | // shared
4 | implementation project(path: ':module-core')
5 |
6 |     // TODO: when running in cluster mode, these base libraries do not need to be bundled into the Jar.
7 | // spark:
8 | // - use provided dependencies when building in CI.
9 | // - use compile dependencies for local testing
10 | // if (System.env.PIPELINE_BRANCH) {
11 | // ...
12 | // }
13 |
14 | api("org.apache.spark:spark-core_${scalaSparkVersion}:${sparkVersion}") {
15 | }
16 | api("org.apache.spark:spark-sql_${scalaSparkVersion}:${sparkVersion}")
17 | api("org.apache.spark:spark-hive_${scalaSparkVersion}:${sparkVersion}")
18 | api("org.apache.spark:spark-mllib_${scalaSparkVersion}:${sparkVersion}")
19 | api("org.apache.spark:spark-streaming_${scalaSparkVersion}:${sparkVersion}")
20 | api("org.apache.spark:spark-avro_${scalaSparkVersion}:${sparkVersion}")
21 | api("org.apache.spark:spark-sql-kafka-0-10_${scalaSparkVersion}:${sparkVersion}")
22 |
23 | api("org.apache.hadoop:hadoop-aws:${awsHadoopVersion}")
24 | api("org.apache.spark:spark-hadoop-cloud_${scalaSparkVersion}:${sparkVersion}")
25 | api("com.amazonaws:aws-java-sdk:${awsSdkVersion}")
26 |
27 | api("mysql:mysql-connector-java:${mysqlDriverVersion}")
28 |
29 | api("org.apache.avro:avro:$avroVersion")
30 | api("org.apache.kafka:kafka-clients:${kafkaClientVersion}")
31 | api("za.co.absa:abris_${scalaSparkVersion}:4.2.0")
32 | api("io.confluent:kafka-avro-serializer:$confluentVersion") {
33 | exclude group: "org.apache.kafka", module: "kafka-clients"
34 | }
35 | }
--------------------------------------------------------------------------------
/project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/project-spark/module-infra-spark/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/SparkBase.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.infra.spark
2 |
3 | import mkt.udon.core.common.Environment
4 | import org.apache.log4j.LogManager
5 | import org.apache.spark.sql.SparkSession
6 |
7 | trait SparkBase {
8 |
9 | val logger = LogManager.getRootLogger
10 | var session: SparkSession = null
11 |
12 | def driver(session: SparkSession): Unit
13 |
14 | def buildSession(): SparkSession = {
15 | var sessionBuilder = SparkSession.builder().enableHiveSupport()
16 |
17 | if (Environment.isLocalMode()) {
18 | sessionBuilder = sessionBuilder.master("local[*]")
19 | sessionBuilder = sessionBuilder.config("spark.sql.crossJoin.enabled", true)
20 |
21 | }
22 |
23 | session = sessionBuilder.getOrCreate()
24 | setupHadoopEnvironment(session)
25 |
26 | session
27 | }
28 |
29 | /**
30 |    * In a real production environment
31 |    * - configuration follows the cluster's spark-defaults.conf
32 |    * - AWS credentials come from the machine's IAM Role
33 |    *
34 |    * The code below sets these values directly for local testing.
35 | */
36 | def setupHadoopEnvironment(session: SparkSession): Unit = {
37 | if (!Environment.isLocalMode()) return
38 |
39 | val hadoopConf = session.sparkContext.hadoopConfiguration
40 |
41 | hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
42 | hadoopConf.set("fs.s3.canned.acl", "BucketOwnerFullControl")
43 | // hadoopConf.set("fs.s3a.access.key", accessKey)
44 | // hadoopConf.set("fs.s3a.secret.key", secretKey)
45 | }
46 |
47 | def main(args: Array[String]): Unit = {
48 | session = buildSession()
49 |
50 | try {
51 | driver(session)
52 | } catch {
53 | case t: Throwable =>
54 | logger.error("Application failed due to", t)
55 | session.stop()
56 | }
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
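
A minimal job skeleton on top of SparkBase; the service modules below (e.g. UdonProductPoolBatch) are the real examples, this sketch only shows the extension point.

    import mkt.udon.infra.spark.SparkBase
    import org.apache.spark.sql.SparkSession

    object MySampleJob extends SparkBase {
      // SparkBase.main builds the session (master = local[*] when PIPELINE_MODE is unset) and invokes driver
      override def driver(session: SparkSession): Unit = {
        session.range(10).show()
      }
    }

--------------------------------------------------------------------------------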
/project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/common/Partition.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.infra.spark.common
2 |
3 | object Partition {
4 | val PARTITION_KEY = "part"
5 | }
6 |
--------------------------------------------------------------------------------
/project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/DynamoSink.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.infra.spark.storage
2 |
3 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder
4 | import com.amazonaws.services.dynamodbv2.document.{DynamoDB, Item, Table}
5 | import mkt.udon.core.common.TimeUtil
6 | import org.apache.spark.sql.Dataset
7 | import org.json4s.jackson.JsonMethods.parse
8 | import org.json4s.jackson.Serialization.write
9 | import org.json4s.{DefaultFormats, Extraction, FieldSerializer, Formats, JLong, JObject}
10 |
11 | import java.time.Instant
12 |
13 | object DynamoSink {
14 |
15 | def writePartition[T](dynamoTable: String,
16 | dynamoRegion: String,
17 | expireDays: Int,
18 | dsTarget: Dataset[T],
19 | expireFieldName: String = "expireTtl",
20 | updateFieldName: String = "updatedAt"
21 | )(implicit m: Manifest[T]): Unit = {
22 |
23 | dsTarget.foreachPartition((iter: Iterator[T]) => {
24 | val dynamoClient = AmazonDynamoDBClientBuilder.standard().withRegion(dynamoRegion).build();
25 | val dynamoDB = new DynamoDB(dynamoClient)
26 | val client = dynamoDB.getTable(dynamoTable)
27 |
28 | while (iter.hasNext) {
29 | val cur = iter.next()
30 | implicit val default: Formats = DefaultFormats.preservingEmptyValues + FieldSerializer[T]()
31 |
32 | val updatedAt = Instant.now().toEpochMilli
33 | val expireTtl = TimeUtil.getExpireEpochSeconds(expireDays)
34 |
35 | val json = Extraction.decompose(cur)
36 | .merge(JObject(updateFieldName -> JLong(updatedAt)))
37 | .merge(JObject(expireFieldName -> JLong(expireTtl)))
38 | .snakizeKeys
39 |
40 | val stringified = write(json)
41 | val request = Item.fromJSON(stringified)
42 |
43 | client.putItem(request)
44 | }
45 | })
46 | }
47 |
48 | def putItem[A](dynamoClient: Table,
49 | item: A,
50 | expireDays: Int,
51 | expireFieldName: String = "expireTtl",
52 | updateFieldName: String = "updatedAt")(implicit m: Manifest[A]): Unit = {
53 |
54 |     // Note: when `private` fields are involved, FieldSerializer includes the package name in the field name, so use with care
55 |     // Converting Scala Enum values requires an EnumNameSerializer; case classes used for storage generally use String instead
56 | implicit val default: Formats = DefaultFormats.preservingEmptyValues + FieldSerializer[A]()
57 |
58 | val updatedAt = Instant.now().toEpochMilli
59 | val expireTtl = TimeUtil.getExpireEpochSeconds(expireDays)
60 |
61 | val json = Extraction.decompose(item)
62 |       .merge(JObject(updateFieldName -> JLong(updatedAt)))
63 |       .merge(JObject(expireFieldName -> JLong(expireTtl)))
64 | .camelizeKeys
65 |
66 | val stringified = write(json)
67 | val request = Item.fromJSON(stringified)
68 |
69 | dynamoClient.putItem(request)
70 | }
71 |
72 | def getItem[A](dynamoClient: Table,
73 | keyName: String, keyValue: String)(implicit m: Manifest[A]): Option[A] = {
74 |
75 | val responseItem = dynamoClient.getItem(keyName, keyValue)
76 |
77 | if (responseItem == null) None
78 | else {
79 | implicit val format = DefaultFormats.preservingEmptyValues
80 | val raw = responseItem.toJSON
81 | val parsed = parse(raw).camelizeKeys
82 | val converted = parsed.extract[A]
83 | Some(converted)
84 | }
85 | }
86 |
87 | def buildClient(dynamoTable: String, dynamoRegion: String): Table = {
88 | val dynamoClient = AmazonDynamoDBClientBuilder.standard().withRegion(dynamoRegion).build();
89 | val dynamoDB = new DynamoDB(dynamoClient)
90 | val client = dynamoDB.getTable(dynamoTable)
91 | return client
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
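
A sketch of single-item access with this sink, assuming an illustrative table name and region and the UserProfile entity from module-core (whose partition key is 'specifier').

    import mkt.udon.core.entity.UserProfile
    import mkt.udon.infra.spark.storage.DynamoSink

    // table name and region are illustrative
    val table = DynamoSink.buildClient(dynamoTable = "service-dev-user-profile", dynamoRegion = "ap-northeast-2")

    // write: keys are camel-cased and updatedAt / expireTtl attributes are appended
    DynamoSink.putItem(table, UserProfile.buildEmpty(userId = "U1"), expireDays = 10)

    // read: returns None when the key does not exist
    val restored: Option[UserProfile] = DynamoSink.getItem[UserProfile](table, keyName = "specifier", keyValue = "U1")

--------------------------------------------------------------------------------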
/project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/JdbcSink.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.infra.spark.storage
2 |
3 | import org.apache.spark.sql.{Dataset, Row, SparkSession}
4 |
5 | import java.sql.{Connection, DriverManager}
6 |
7 | object JdbcSink {
8 |
9 | val DRIVER = "com.mysql.cj.jdbc.Driver"
10 |
11 | def write(session: SparkSession, dfTarget: Dataset[Row],
12 | jdbcUrl: String, jdbcTable: String,
13 | jdbcUsername: String, jdbcPassword: String,
14 | ): Unit = {
15 |
16 | dfTarget
17 | .write
18 | .mode("append")
19 | .format("jdbc")
20 | .option("driver", DRIVER)
21 | .option("url", jdbcUrl)
22 | .option("user", jdbcUsername)
23 | .option("password", jdbcPassword)
24 | .option("dbtable", jdbcTable)
25 | .option("truncate", "false")
26 | .save()
27 | }
28 |
29 | def delete(jdbcUrl: String, jdbcTable: String,
30 | jdbcUsername: String, jdbcPassword: String,
31 | partitionColName: String, partitionColValue: java.sql.Timestamp): Unit = {
32 |
33 | var connection: Connection = null
34 |
35 | try {
36 | Class.forName(DRIVER)
37 | connection = DriverManager.getConnection(jdbcUrl, jdbcUsername, jdbcPassword)
38 |
39 | // remove rows which are already existing and having the same partition value
40 | val query = s"DELETE FROM ${jdbcTable} WHERE `${partitionColName}` = ?"
41 | val preparedStatement = connection.prepareStatement(query)
42 | preparedStatement.setTimestamp(1, partitionColValue)
43 | preparedStatement.execute()
44 |
45 | } catch {
46 | case e: Exception =>
47 | throw e;
48 |
49 | } finally {
50 | if (connection != null) connection.close()
51 | }
52 |
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
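
A sketch of the delete-then-append pattern this sink supports, targeting the pipeline.property_stat table defined earlier; the JDBC URL and credentials are illustrative, and `session` / `dfStat` are assumed to come from the surrounding Spark job.

    import mkt.udon.core.common.TimeUtil
    import mkt.udon.infra.spark.storage.JdbcSink

    val jdbcUrl = "jdbc:mysql://localhost:3306/pipeline"   // illustrative
    val (user, password) = ("root", "root")                // illustrative
    val partitionValue = TimeUtil.convertPartitionToSqlTimestamp("20200201")

    // 1. remove rows already written for this partition so re-runs stay idempotent
    JdbcSink.delete(jdbcUrl, "property_stat", user, password,
      partitionColName = "part", partitionColValue = partitionValue)

    // 2. append the freshly computed rows (session: SparkSession, dfStat: DataFrame from the batch job)
    JdbcSink.write(session, dfStat, jdbcUrl, "property_stat", user, password)

--------------------------------------------------------------------------------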
/project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/ParquetSink.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.infra.spark.storage
2 |
3 | import mkt.udon.core.common.TimeUtil
4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
5 |
6 | object ParquetSink {
7 |
8 |
9 | def write(session: SparkSession,
10 | dfTarget: DataFrame,
11 | parquetLocation: String,
12 | parquetSaveMode: SaveMode): Unit = {
13 |
14 | dfTarget
15 | .write
16 | .mode(parquetSaveMode)
17 | .options(Map(
18 | ("parquet.enable.dictionary", "true"),
19 | ("parquet.block.size", s"${32 * 1024 * 1024}"),
20 | ("parquet.page.size", s"${2 * 1024 * 1024}"),
21 | ("parquet.dictionary.page.size", s"${8 * 1024 * 1024}")
22 | ))
23 | .parquet(parquetLocation)
24 | }
25 |
26 |   /**
27 |    * Builds the Parquet location to write to from the partition value.
28 |    *
29 |    * @param prefix    storage prefix (e.g. an S3 path)
30 |    * @param partition 'yyyyMMdd' formatted String
31 |    */
32 | def buildLocation(prefix: String, partition: String): String = {
33 | val partitionPath = TimeUtil.convertPartitionToDateSlashString(partition)
34 | return s"${prefix}/${partitionPath}"
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
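
A short sketch of partition-scoped Parquet output, assuming an S3-style prefix and a 'yyyyMMdd' partition; `session` / `dfResult` are assumed to come from the surrounding job.

    import mkt.udon.infra.spark.storage.ParquetSink
    import org.apache.spark.sql.SaveMode

    val prefix = "s3a://practical-data-pipeline/udon-db/property_product_pool"  // illustrative
    val location = ParquetSink.buildLocation(prefix, "20200201")                 // ".../2020/02/01"

    // Overwrite replaces only this partition's directory since the location is already partition-scoped
    ParquetSink.write(session, dfResult, location, SaveMode.Overwrite)

--------------------------------------------------------------------------------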
/project-spark/service-batch-discovery/Makefile:
--------------------------------------------------------------------------------
1 | TAG = "Makefile"
2 |
3 | VERSION = $(shell cat ./VERSION)
4 | MODULE = service-batch-discovery
5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact
6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar
7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar
8 |
9 | .PHONY: test
10 | test:
11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)"
12 | @ echo ""
13 |
14 | @ ../gradlew :$(MODULE):test
15 |
16 | .PHONY: build
17 | build:
18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)"
19 | @ echo ""
20 |
21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar
22 |
23 | .PHONY: deploy
24 | deploy:
25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)"
26 | @ echo ""
27 |
28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT)
29 |
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.1-SNAPSHOT
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/build.gradle:
--------------------------------------------------------------------------------
1 | def versionValue = file("VERSION").text.trim()
2 | project.version = versionValue
3 |
4 | apply plugin: 'application'
5 | apply plugin: 'com.github.johnrengelman.shadow'
6 |
7 | dependencies {
8 | // shared
9 | implementation project(path: ':module-core')
10 | implementation project(path: ':module-infra-spark')
11 |
12 | // custom
13 | // TODO
14 | }
15 |
16 | mainClassName = 'mkt.udon.UdonProductPoolBatch'
17 | run.classpath = sourceSets.main.runtimeClasspath
18 |
19 | jar {
20 | manifest {
21 | attributes(
22 | "Implementation-Title": project.name,
23 | "Implementation-Version": project.version,
24 | "Build-Jdk": System.getProperty('java.version'),
25 | )
26 | }
27 |
28 | }
29 |
30 | shadowJar {
31 | zip64 = true
32 | exclude 'META-INF/**'
33 | baseName = project.name
34 | mergeServiceFiles()
35 | }
36 |
37 | assemble.dependsOn(shadowJar)
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/src/main/resources/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | LOCAL {
2 | dynamoTable = "service-dev-product-pool"
3 | dynamoTable = ${?DYNAMO_TABLE}
4 | dynamoRegion = "ap-northeast-2"
5 | dynamoRegion = ${?DYNAMO_REGION}
6 | dynamoPartitionCount = 3
7 | dynamoPartitionCount = ${?DYNAMO_PARTITION_COUNT}
8 |
9 | parquetPrefix = "s3://practical-data-pipeline/udon-data-lake/udon-db/property_product_pool"
10 | parquetPrefix = ${?PARQUET_PREFIX}
11 | parquetWriteMode = "Overwrite"
12 | parquetWriteMode = ${?PARQUET_WRITE_MODE}
13 | parquetPartitionCount = 2
14 | parquetPartitionCount = ${?PARQUET_PARTITION_COUNT}
15 |
16 | partitionSnapshot = "20200201"
17 | partitionSnapshot = ${?PARTITION_SNAPSHOT}
18 | partitionMetricStart = "20200201"
19 | partitionMetricStart = ${?PARTITION_METRIC_START}
20 | partitionMetricEnd = "20200201"
21 | partitionMetricEnd = ${?PARTITION_METRIC_END}
22 |
23 | maxElementCount = 20
24 | maxElementCount = ${?MAX_ELEMENT_COUNT}
25 | expireDays = 10
26 | expireDays = ${?EXPIRE_DAYS}
27 | }
28 |
29 |
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=INFO, console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n
24 |
25 | # Set the default spark-shell/spark-sql log level to WARN. When running the
26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite
27 | # the root logger's log level, so that the user can have different defaults
28 | # for the shell and regular Spark apps.
29 | log4j.logger.org.apache.spark.repl.Main=WARN
30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN
31 |
32 | # Settings to quiet third party logs that are too verbose
33 | log4j.logger.org.sparkproject.jetty=WARN
34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
37 | log4j.logger.org.apache.parquet=ERROR
38 | log4j.logger.parquet=ERROR
39 |
40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
43 |
44 | # For deploying Spark ThriftServer
45 | # SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
48 | log4j.appender.console.filter.1.AcceptOnMatch=false
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/src/main/scala/mkt/udon/UdonProductPoolBatch.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon
2 |
3 | import mkt.udon.config.UdonProductPoolBatchConfig
4 | import mkt.udon.core.common.Environment
5 | import mkt.udon.entity.UdonProductPoolEntity
6 | import mkt.udon.infra.spark.SparkBase
7 | import mkt.udon.infra.spark.storage.{DynamoSink, ParquetSink}
8 | import org.apache.log4j.LogManager
9 | import org.apache.spark.sql.functions.lit
10 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
11 | import pureconfig.generic.auto._
12 |
13 | object UdonProductPoolBatch extends SparkBase {
14 | override val logger = LogManager.getLogger(this.getClass.getName)
15 |
16 | override def driver(session: SparkSession): Unit = {
17 |
18 | /**
19 |      * Extract environment variables and load the configuration
20 | */
21 | implicit val configHint = Environment.buildConfigHint[UdonProductPoolBatchConfig]()
22 | val config = Environment.getConfigOrThrow[UdonProductPoolBatchConfig]
23 |
24 | /**
25 |      * Extract and transform the data
26 | */
27 | val partitionSnapshot = config.partitionSnapshot
28 | val partitionMetricStart = config.partitionMetricStart
29 | val partitionMetricEnd = config.partitionMetricEnd
30 | val dfUserEvent = readUserEvent(session = session,
31 | partitionMetricStart = partitionMetricStart, partitionMetricEnd = partitionMetricEnd)
32 |
33 | val dsResult = UdonProductPoolEntity.convert(
34 | session,
35 | dfUserEvent = dfUserEvent,
36 | maxElementCount = config.maxElementCount)
37 |
38 | /**
39 |      * Persist data: Parquet
40 |      *
41 |      * `part` is added as the partition column.
42 |      * With Hive Static Partitioning the partition column is attached automatically when SELECTing through Hive,
43 |      * but it does not exist when the Parquet files are read directly, so it is added here for those readers.
44 | */
45 | val dfPersistedParquet = dsResult.withColumn("part", lit(partitionSnapshot))
46 | .repartition(config.parquetPartitionCount)
47 | val parquetLocation = ParquetSink.buildLocation(config.parquetPrefix, partitionSnapshot)
48 | ParquetSink.write(session, dfPersistedParquet, parquetLocation, SaveMode.valueOf(config.parquetWriteMode))
49 |
50 | /**
51 |      * Persist data: Dynamo
52 | */
53 | DynamoSink.writePartition(config.dynamoTable, config.dynamoRegion, config.expireDays, dsResult)
54 | }
55 |
56 | def readUserEvent(session: SparkSession,
57 | partitionMetricStart: String, partitionMetricEnd: String): DataFrame = {
58 |
59 | if (Environment.isLocalMode()) {
60 | val resourcePath = getClass.getClassLoader.getResource("ecommerce.csv").getPath
61 |
62 | val df = session.read.format("csv")
63 | .option("inferSchema", "true")
64 | .option("header", "true")
65 | .load(resourcePath)
66 |
67 | return df
68 | }
69 |
70 | return session.sql(
71 | s"""
72 | |SELECT *
73 | |FROM airbnb_db.user_client_event
74 | |WHERE part BETWEEN ${partitionMetricStart} AND ${partitionMetricEnd}
75 | |""".stripMargin)
76 | }
77 | }
78 |
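A minimal sketch (not part of the repository) of why the `part` column above matters: a consumer that reads the Parquet output directly, bypassing the Hive table, still needs `part` to tell snapshots apart. The bucket path and partition value below are illustrative, and the per-snapshot directory layout produced by ParquetSink.buildLocation is an assumption.

import org.apache.spark.sql.SparkSession

object ProductPoolDirectReadSketch extends App {
  val session = SparkSession.builder().master("local[*]").appName("pool-direct-read").getOrCreate()

  // Hypothetical location; assumes ParquetSink.buildLocation(prefix, partition) yields one directory per snapshot
  val location = "s3://practical-data-pipeline/udon-data-lake/udon-db/product_pool/20211201"

  val df = session.read.parquet(location)
  // `part` exists as a real column because the batch added it before writing
  df.select("specifier", "elementCount", "part").show(10, truncate = false)

  session.stop()
}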
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/src/main/scala/mkt/udon/config/UdonProductPoolBatchConfig.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.config
2 |
3 | case class UdonProductPoolBatchConfig(dynamoTable: String, dynamoRegion: String, dynamoPartitionCount: String,
4 | parquetPrefix: String, parquetWriteMode: String, parquetPartitionCount: Int,
5 | partitionSnapshot: String,
6 | partitionMetricStart: String,
7 | partitionMetricEnd: String,
8 | maxElementCount: Int, expireDays: Int)
9 |
--------------------------------------------------------------------------------
/project-spark/service-batch-discovery/src/main/scala/mkt/udon/entity/UdonProductPoolEntity.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.entity
2 |
3 | import mkt.udon.core.entity.{ProductPool, ProductPoolElement}
4 | import org.apache.spark.sql.expressions.Window
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
7 |
8 | object UdonProductPoolEntity {
9 |
10 | val EVENT_VIEW = "view"
11 | val EVENT_CART = "cart"
12 | val EVENT_ORDER = "purchase"
13 |
14 | def convert(session: SparkSession, dfUserEvent: DataFrame,
15 | maxElementCount: Int): Dataset[ProductPool] = {
16 |
17 | import session.implicits._
18 |
19 | val dfFiltered = dfUserEvent.selectExpr("product_id", "user_id", "user_session")
20 | .where(col("event_type").isInCollection(List(EVENT_VIEW)))
21 |
22 | /**
23 |       * We use the user session to relate products to each other. The basic assumptions of this approach are:
24 |       * - within a single session, where the user browses products with a specific intent, the products viewed are 'related', and
25 |       * - products viewed together in one session are meaningfully similar from the user's point of view.
26 |       *
27 |       * Recommendations served in production use far more sophisticated models and many curated features,
28 |       * but here we use the approach above to build a product pool with the most basic data processing.
29 |       *
30 |       * Variations of this approach yield product pools such as Search Together, View Together, Cart Together, and Order Together.
31 |       * Alternatively, unique session IDs, unique user IDs, or plain counts can be used as features and combined statistically with different weights.
32 |       *
33 |       * Beyond behavioral signals, if the domain is lodging, metric similarity over product metadata (distance, price) can also be added.
34 | */
35 | val dfJoined = dfFiltered.alias("L")
36 | .join(
37 | dfFiltered.alias("R"),
38 | col("L.user_session") === col("R.user_session") &&
39 | col("L.product_id") =!= col("R.product_id"),
40 | "inner"
41 | )
42 | .selectExpr(
43 | "L.product_id as product_id",
44 | "R.product_id as product_id_other",
45 | "L.user_session"
46 | )
47 |
48 |     // Generate ranks and filter with maxElementCount
49 | val windowRank = Window.partitionBy(col("product_id")).orderBy(col("count_session_uniq").desc)
50 | val dfGrouped = dfJoined
51 | .groupBy("product_id", "product_id_other")
52 | .agg(countDistinct("user_session").as("count_session_uniq"))
53 | .withColumn("rank", row_number().over(windowRank))
54 | .where(col("rank") <= lit(maxElementCount))
55 |
56 |     // Convert to a case class via a UDF so the values can be collected into an array
57 |     // Caveat: Spark's 'collect_list' does not preserve order, so collecting into a list without the rank value may lose the product ordering (see the sketch after this file).
58 | val udfElementize = udf((id: String, rank: Long) =>
59 | ProductPoolElement(id = id, rank = rank))
60 | val dfConverted = dfGrouped
61 | .withColumn("element", udfElementize(col("product_id_other"), col("rank")))
62 | .groupBy("product_id")
63 | .agg(collect_list("element").as("elements"), count("*").as("element_count"))
64 |
65 |
66 | return dfConverted.selectExpr("product_id as specifier", "elements", "element_count as elementCount").as[ProductPool]
67 | }
68 |
69 | }
70 |
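Related to the collect_list ordering caveat above, here is a minimal sketch of the usual DataFrame-level workaround: collect (rank, id) structs, sort the array by rank, then project the ids back out. It reuses the column names produced by convert() and is an illustration, not code from the repository.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

// dfGrouped is the (product_id, product_id_other, count_session_uniq, rank) DataFrame built above
def collectOrderedElements(dfGrouped: DataFrame): DataFrame = {
  dfGrouped
    .withColumn("ranked", struct(col("rank"), col("product_id_other").as("id")))
    .groupBy("product_id")
    // sort_array orders structs by their first field, i.e. rank, so the ordering survives collection
    .agg(sort_array(collect_list(col("ranked"))).as("ranked"))
    .withColumn("elements", expr("transform(ranked, x -> x.id)"))
    .drop("ranked")
}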
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/Makefile:
--------------------------------------------------------------------------------
1 | TAG = "Makefile"
2 |
3 | VERSION = $(shell cat ./VERSION)
4 | MODULE = service-batch-statistics
5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact
6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar
7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar
8 |
9 | .PHONY: test
10 | test:
11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)"
12 | @ echo ""
13 |
14 | @ ../gradlew :$(MODULE):test
15 |
16 | .PHONY: build
17 | build:
18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)"
19 | @ echo ""
20 |
21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar
22 |
23 | .PHONY: deploy
24 | deploy:
25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)"
26 | @ echo ""
27 |
28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT)
29 |
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.1-SNAPSHOT
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/build.gradle:
--------------------------------------------------------------------------------
1 | def versionValue = file("VERSION").text.trim()
2 | project.version = versionValue
3 |
4 | apply plugin: 'application'
5 | apply plugin: 'com.github.johnrengelman.shadow'
6 |
7 | dependencies {
8 | // shared
9 | implementation project(path: ':module-core')
10 | implementation project(path: ':module-infra-spark')
11 | }
12 |
13 | mainClassName = 'test'
14 | run.classpath = sourceSets.main.runtimeClasspath
15 |
16 | jar {
17 | manifest {
18 | attributes(
19 | "Implementation-Title": project.name,
20 | "Implementation-Version": project.version,
21 | "Build-Jdk": System.getProperty('java.version'),
22 | )
23 | }
24 |
25 | }
26 |
27 | shadowJar {
28 | zip64 = true
29 | exclude 'META-INF/**'
30 | baseName = project.name
31 |     // Spark SQL discovers streaming data sources (e.g. Kafka) through META-INF service files, so an uber jar alone does not fix "Failed to find data source"; mergeServiceFiles() below merges those entries correctly.
32 | // - https://stackoverflow.com/questions/48011941/why-does-formatkafka-fail-with-failed-to-find-data-source-kafka-even-wi
33 | // - https://stackoverflow.com/questions/32887966/shadow-plugin-gradle-what-does-mergeservicefiles-do
34 | mergeServiceFiles()
35 | }
36 |
37 | assemble.dependsOn(shadowJar)
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/src/main/resources/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | LOCAL {
2 | jdbcHost = "localhost"
3 | jdbcHost = ${?JDBC_HOST}
4 | jdbcPort = 3306
5 | jdbcPort = ${?JDBC_PORT}
6 | jdbcUsername = "root"
7 | jdbcUsername = ${?JDBC_USERNAME}
8 | jdbcPassword = "root"
9 | jdbcPassword = ${?JDBC_PASSWORD}
10 | jdbcSchema = "pipeline"
11 | jdbcSchema = ${?JDBC_SCHEMA}
12 | jdbcTable = "property_stat"
13 | jdbcTable = ${?JDBC_TABLE}
14 | jdbcPartitionCount = 2
15 | jdbcPartitionCount = ${?JDBC_PARTITION_COUNT}
16 |
17 | parquetPrefix = "s3://practical-data-pipeline/udon-data-lake/udon-db/property_stat"
18 | parquetPrefix = ${?PARQUET_PREFIX}
19 | parquetWriteMode = "Overwrite"
20 | parquetWriteMode = ${?PARQUET_WRITE_MODE}
21 | parquetPartitionCount = 5
22 | parquetPartitionCount = ${?PARQUET_PARTITION_COUNT}
23 |
24 | partitionSnapshot = "20191129"
25 | partitionSnapshot = ${?PARTITION_SNAPSHOT}
26 | }
27 |
28 |
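The `key = default` followed by `key = ${?ENV_VAR}` pairs above are the standard HOCON pattern for optional environment overrides: the second assignment only takes effect when the variable is set. A minimal sketch using plain Typesafe Config (the project's Environment helper is assumed to behave similarly; that is not confirmed here):

import com.typesafe.config.ConfigFactory

object HoconOverrideSketch extends App {
  val config = ConfigFactory.parseString(
    """
      |LOCAL {
      |  jdbcHost = "localhost"
      |  jdbcHost = ${?JDBC_HOST}
      |}
      |""".stripMargin).resolve()

  // Prints "localhost" unless the JDBC_HOST environment variable is set
  println(config.getString("LOCAL.jdbcHost"))
}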
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=INFO, console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n
24 |
25 | # Set the default spark-shell/spark-sql log level to WARN. When running the
26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite
27 | # the root logger's log level, so that the user can have different defaults
28 | # for the shell and regular Spark apps.
29 | log4j.logger.org.apache.spark.repl.Main=WARN
30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN
31 |
32 | # Settings to quiet third party logs that are too verbose
33 | log4j.logger.org.sparkproject.jetty=WARN
34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
37 | log4j.logger.org.apache.parquet=ERROR
38 | log4j.logger.parquet=ERROR
39 |
40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
43 |
44 | # For deploying Spark ThriftServer
45 | # SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
48 | log4j.appender.console.filter.1.AcceptOnMatch=false
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/src/main/scala/mkt/udon/UdonStatBatch.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon
2 |
3 | import mkt.udon.config.UdonStatBatchConfig
4 | import mkt.udon.core.common.{Environment, TimeUtil}
5 | import mkt.udon.entity.UdonStatEntity
6 | import mkt.udon.infra.spark.SparkBase
7 | import mkt.udon.infra.spark.storage.{JdbcSink, ParquetSink}
8 | import org.apache.log4j.LogManager
9 | import org.apache.spark.sql.functions._
10 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
11 | import pureconfig.generic.auto._
12 |
13 | object UdonStatBatch extends SparkBase {
14 | override val logger = LogManager.getLogger(this.getClass.getName)
15 |
16 | override def driver(session: SparkSession): Unit = {
17 | /**
18 |       * Extract and apply configuration from environment variables
19 | */
20 | implicit val configHint = Environment.buildConfigHint[UdonStatBatchConfig]()
21 | val config = Environment.getConfigOrThrow[UdonStatBatchConfig]
22 |
23 | /**
24 |       * Extract and transform the data
25 | */
26 | val partition = config.partitionSnapshot
27 | val dfPropertyMeta = readPropertyMeta(partition, session)
28 | val dfPropertySales = readPropertySales(partition, session)
29 | val dfPropertyReview = readPropertyReview(partition, session)
30 |
31 | var dfResult = UdonStatEntity.convert(session, partition,
32 | dfPropertyMeta = dfPropertyMeta,
33 | dfPropertySales = dfPropertySales,
34 | dfPropertyReview = dfPropertyReview)
35 |
36 |     // If the result is small, it can be cached for the additional computations below.
37 | dfResult = dfResult.cache()
38 |
39 | /**
40 |       * Persisting data: Parquet
41 |       *
42 |       * We add `part` explicitly as the partition column.
43 |       * With Hive static partitioning the partition column is attached automatically when the data is read through Hive,
44 |       * but it does not exist when the Parquet files are read directly, so we add it for consumers who read the Parquet output directly.
45 | */
46 | val dfPersistedParquet = dfResult.withColumn("part", lit(partition))
47 | .repartition(config.parquetPartitionCount)
48 | val parquetLocation = ParquetSink.buildLocation(config.parquetPrefix, partition)
49 | ParquetSink.write(session, dfPersistedParquet, parquetLocation, SaveMode.valueOf(config.parquetWriteMode))
50 |
51 | /**
52 |       * Persisting data: JDBC
53 |       *
54 |       * We add `part` as the partition column here as well. Its format can differ from the Hive table's, so it is converted separately.
55 | */
56 | val connectionUrl = s"jdbc:mysql://${config.jdbcHost}:${config.jdbcPort}/${config.jdbcSchema}"
57 | val partitionColumns = List(col("property_id"))
58 |
59 | val jdbcPartitionValue = TimeUtil.convertPartitionToSqlTimestamp(partition)
60 | val dfPersistedJdbc = dfResult.withColumn("part", lit(jdbcPartitionValue))
61 | .repartition(config.jdbcPartitionCount, partitionColumns: _*)
62 |
63 | JdbcSink.delete(jdbcUrl = connectionUrl, jdbcTable = config.jdbcTable,
64 | jdbcUsername = config.jdbcUsername, jdbcPassword = config.jdbcPassword,
65 | partitionColName = "part", partitionColValue = jdbcPartitionValue
66 | )
67 |
68 | JdbcSink.write(session, dfPersistedJdbc,
69 | jdbcUrl = connectionUrl, jdbcTable = config.jdbcTable,
70 | jdbcUsername = config.jdbcUsername, jdbcPassword = config.jdbcPassword)
71 | }
72 |
73 | def readPropertyMeta(partition: String, session: SparkSession): DataFrame = {
74 |
75 | if (Environment.isLocalMode()) {
76 | val resourcePath = getClass.getClassLoader.getResource("airbnb_listings.csv").getPath
77 |
78 | val df = session.read.format("csv")
79 | .option("inferSchema", "true")
80 | .option("header", "true")
81 | .option("quote", "\"")
82 | .option("escape", "\"")
83 | .option("sep", ",")
84 | .option("multiline", "true")
85 | .load(resourcePath)
86 |
87 | return df
88 | }
89 |
90 | return session.sql(
91 | s"""
92 | |SELECT *
93 | |FROM airbnb_db.property_meta
94 | |WHERE part = ${partition}
95 | |""".stripMargin)
96 | }
97 |
98 | def readPropertySales(partition: String, session: SparkSession): DataFrame = {
99 |
100 | if (Environment.isLocalMode()) {
101 | val resourcePath = getClass.getClassLoader.getResource("airbnb_calendar.csv").getPath
102 |
103 | val df = session.read.format("csv")
104 | .option("inferSchema", "true")
105 | .option("header", "true")
106 | .option("quote", "\"")
107 | .option("escape", "\"")
108 | .option("sep", ",")
109 | .option("multiline", "true")
110 | .load(resourcePath)
111 |
112 | return df
113 | }
114 |
115 | return session.sql(
116 | s"""
117 | |SELECT *
118 | |FROM airbnb_db.property_sales
119 | |WHERE part = ${partition}
120 | |""".stripMargin)
121 | }
122 |
123 | def readPropertyReview(partition: String, session: SparkSession): DataFrame = {
124 |
125 | if (Environment.isLocalMode()) {
126 | val resourcePath = getClass.getClassLoader.getResource("airbnb_reviews.csv").getPath
127 |
128 | val df = session.read.format("csv")
129 | .option("inferSchema", "true")
130 | .option("header", "true")
131 | .option("quote", "\"")
132 | .option("escape", "\"")
133 | .option("sep", ",")
134 | .option("multiline", "true")
135 | .load(resourcePath)
136 |
137 | return df
138 | }
139 |
140 | return session.sql(
141 | s"""
142 | |SELECT *
143 | |FROM airbnb_db.property_review
144 | |WHERE part = ${partition}
145 | |""".stripMargin)
146 | }
147 |
148 | /**
149 |     * Exercise: run the Hive CREATE TABLE DDL through Spark.
150 |     * - To run it, bring up the Hive Metastore with Docker Compose, and
151 |     * - configure the Hive Metastore URI.
152 | */
153 | def createTable(config: UdonStatBatchConfig, session: SparkSession): Unit = {
154 | if (Environment.isLocalMode()) return
155 |
156 | // TODO: execute create table DDL
157 | }
158 |
159 | /**
160 |     * Exercise: run the Hive ADD PARTITION DDL through Spark.
161 |     * - To run it, bring up the Hive Metastore with Docker Compose, and
162 |     * - configure the Hive Metastore URI.
163 | */
164 | def createPartition(config: UdonStatBatchConfig, session: SparkSession): Unit = {
165 | if (Environment.isLocalMode()) return
166 |
167 | // TODO: execute create partition DDL
168 | }
169 | }
170 |
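A minimal sketch of what the createTable / createPartition exercises above could run through session.sql once the Hive Metastore URI is configured. The table name and the partition directory layout are assumptions for illustration; the column list mirrors the output of UdonStatEntity.convert.

import org.apache.spark.sql.SparkSession

def createTableSketch(session: SparkSession, parquetPrefix: String): Unit = {
  session.sql(
    s"""
       |CREATE EXTERNAL TABLE IF NOT EXISTS airbnb_db.property_stat (
       |  property_id BIGINT, property_type STRING, lat DOUBLE, lng DOUBLE,
       |  count_review_all BIGINT, score_review_all DOUBLE,
       |  count_review BIGINT, count_sales BIGINT, price_sales BIGINT
       |)
       |PARTITIONED BY (part STRING)
       |STORED AS PARQUET
       |LOCATION '$parquetPrefix'
       |""".stripMargin)
}

def createPartitionSketch(session: SparkSession, parquetPrefix: String, partition: String): Unit = {
  // Assumes one directory per snapshot under the prefix, matching how ParquetSink.buildLocation is used above
  session.sql(
    s"""
       |ALTER TABLE airbnb_db.property_stat
       |ADD IF NOT EXISTS PARTITION (part = '$partition')
       |LOCATION '$parquetPrefix/$partition'
       |""".stripMargin)
}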
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/src/main/scala/mkt/udon/config/UdonStatBatchConfig.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.config
2 |
3 | case class UdonStatBatchConfig(jdbcHost: String, jdbcPort: Int,
4 | jdbcUsername: String, jdbcPassword: String,
5 | jdbcSchema: String, jdbcTable: String,
6 | jdbcPartitionCount: Int,
7 | parquetPrefix: String, parquetWriteMode: String, parquetPartitionCount: Int,
8 | partitionSnapshot: String)
9 |
--------------------------------------------------------------------------------
/project-spark/service-batch-statistics/src/main/scala/mkt/udon/entity/UdonStatEntity.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.entity
2 |
3 | import mkt.udon.core.common.TimeUtil
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql.types._
7 |
8 | object UdonStatEntity {
9 |
10 | def convert(session: SparkSession, partition: String,
11 | dfPropertyMeta: DataFrame,
12 | dfPropertySales: DataFrame,
13 | dfPropertyReview: DataFrame): DataFrame = {
14 |
15 | val partitionDate = TimeUtil.convertPartitionToDateString(partition)
16 |
17 | /**
18 |       * Product metadata
19 | */
20 | val dfMeta = dfPropertyMeta
21 | .selectExpr("CAST(id AS BIGINT) as property_id", "property_type", "latitude", "longitude")
22 |
23 | /**
24 |       * Cumulative product metrics (reviews)
25 | */
26 | val dfMetricReviewTotal = dfPropertyMeta
27 | .selectExpr("CAST(id AS BIGINT) as property_id", "number_of_reviews as count_review_all", "review_scores_rating as score_review_all")
28 |
29 | /**
30 |       * Delta product metrics (reviews)
31 | */
32 | val dfMetricReviewDelta = dfPropertyReview
33 | .selectExpr("CAST(listing_id AS BIGINT) as property_id", "CAST(date as DATE) as date")
34 | .where(col("date") === lit(partitionDate).cast(DateType))
35 | .groupBy("property_id")
36 | .agg(count("*").as("count_review"))
37 |
38 | /**
39 |       * Delta product metrics (sales)
40 | */
41 | val dfMetricSalesDelta = dfPropertySales
42 | .selectExpr("CAST(listing_id AS BIGINT) as property_id", "CAST(date as DATE) as date", "price as price_raw")
43 | .where(col("date") === lit(partitionDate).cast(DateType))
44 | .where(col("available") === lit("f"))
45 | .withColumn("price", regexp_extract(col("price_raw"), "[0-9]+.[0-9]+", 0).cast(DoubleType))
46 | .drop("price_raw")
47 | .groupBy("property_id")
48 | .agg(
49 | count("*").as("count_sales"),
50 | sum("price").as("price_sales")
51 | )
52 |
53 | /**
54 |       * The resulting DataFrame mixes two kinds of data:
55 |       * - cumulative data (the latest value over the whole period)
56 |       * - daily data (the change for that particular day)
57 |       *
58 |       * Discuss whether it is better to keep them in a single result table, or to split them into separate Spark applications and tables.
59 | */
60 | val dfJoined = dfMeta.alias("PROPERTY_META")
61 | .join(dfMetricReviewTotal.alias("METRIC_REVIEW_TOTAL"),
62 | col("PROPERTY_META.property_id") === col("METRIC_REVIEW_TOTAL.property_id"), "left")
63 | .join(dfMetricReviewDelta.alias("METRIC_REVIEW_DELTA"),
64 | col("PROPERTY_META.property_id") === col("METRIC_REVIEW_DELTA.property_id"), "left")
65 | .join(dfMetricSalesDelta.alias("METRIC_SALES_DELTA"),
66 | col("PROPERTY_META.property_id") === col("METRIC_SALES_DELTA.property_id"), "left")
67 | .selectExpr(
68 | "PROPERTY_META.property_id as property_id",
69 | "PROPERTY_META.property_type as property_type",
70 | "PROPERTY_META.latitude as lat",
71 | "PROPERTY_META.longitude as lng",
72 |
73 | "coalesce(METRIC_REVIEW_TOTAL.count_review_all, 0) as count_review_all",
74 | "coalesce(METRIC_REVIEW_TOTAL.score_review_all, 0.0) as score_review_all",
75 |
76 | "coalesce(METRIC_REVIEW_DELTA.count_review, 0) as count_review",
77 |
78 | "coalesce(METRIC_SALES_DELTA.count_sales, 0) as count_sales",
79 | "CAST(coalesce(METRIC_SALES_DELTA.price_sales, 0) AS BIGINT) as price_sales"
80 | )
81 |
82 | return dfJoined
83 | }
84 |
85 | }
86 |
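A tiny standalone sketch (values are illustrative) of the price parsing used in dfMetricSalesDelta above: regexp_extract pulls the first "digits.digits" match out of the raw price string before the cast to Double.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

object PriceParseSketch extends App {
  val session = SparkSession.builder().master("local[*]").appName("price-parse").getOrCreate()
  import session.implicits._

  Seq("$85.00", "$120.50").toDF("price_raw")
    .withColumn("price", regexp_extract(col("price_raw"), "[0-9]+.[0-9]+", 0).cast(DoubleType))
    .show() // price becomes 85.0 and 120.5

  session.stop()
}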
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/Makefile:
--------------------------------------------------------------------------------
1 | TAG = "Makefile"
2 |
3 | VERSION = $(shell cat ./VERSION)
4 | MODULE = service-stream-profile
5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact
6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar
7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar
8 |
9 | .PHONY: test
10 | test:
11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)"
12 | @ echo ""
13 |
14 | @ ../gradlew :$(MODULE):test
15 |
16 | .PHONY: build
17 | build:
18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)"
19 | @ echo ""
20 |
21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar
22 |
23 | .PHONY: deploy
24 | deploy:
25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)"
26 | @ echo ""
27 |
28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT)
29 |
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.1-SNAPSHOT
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/build.gradle:
--------------------------------------------------------------------------------
1 | def versionValue = file("VERSION").text.trim()
2 | project.version = versionValue
3 |
4 | apply plugin: 'application'
5 | apply plugin: 'com.github.johnrengelman.shadow'
6 |
7 | dependencies {
8 | // shared
9 | implementation project(path: ':module-core')
10 | implementation project(path: ':module-infra-spark')
11 | }
12 |
13 | mainClassName = 'test'
14 | run.classpath = sourceSets.main.runtimeClasspath
15 |
16 | jar {
17 | manifest {
18 | attributes(
19 | "Implementation-Title": project.name,
20 | "Implementation-Version": project.version,
21 | "Build-Jdk": System.getProperty('java.version'),
22 | )
23 | }
24 |
25 | }
26 |
27 | shadowJar {
28 | zip64 = true
29 | exclude 'META-INF/**'
30 | baseName = project.name
31 |     // Spark SQL discovers streaming data sources (e.g. Kafka) through META-INF service files, so an uber jar alone does not fix "Failed to find data source"; mergeServiceFiles() below merges those entries correctly.
32 | // - https://stackoverflow.com/questions/48011941/why-does-formatkafka-fail-with-failed-to-find-data-source-kafka-even-wi
33 | // - https://stackoverflow.com/questions/32887966/shadow-plugin-gradle-what-does-mergeservicefiles-do
34 | mergeServiceFiles()
35 | }
36 |
37 | assemble.dependsOn(shadowJar)
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | LOCAL {
2 | UserProfileStream {
3 | checkpointLocation = "/tmp/spark-user-profile"
4 | dynamoTable = "service-dev-user-profile"
5 | dynamoTable = ${?DYNAMO_TABLE}
6 | dynamoRegion = "ap-northeast-2"
7 | dynamoRegion = ${?DYNAMO_REGION}
8 | dynamoExpireDays = 15
9 | dynamoExpireDays = ${?DYNAMO_EXPIRE_DAYS}
10 | dynamoPartitionCount = 3
11 | dynamoPartitionCount = ${?DYNAMO_PARTITION_COUNT}
12 | kafkaBroker = "localhost:9092"
13 | kafkaBroker = ${?KAFKA_BROKER}
14 | kafkaTopic = "user-event"
15 | kafkaTopic = ${?KAFKA_TOPIC}
16 | kafkaConsumerGroup= "user-profile"
17 | kafkaConsumerGroup = ${?KAFKA_CONSUMER_GROUP}
18 | kafkaOffsetStarting= "latest"
19 | kafkaOffsetStarting = ${?KAFKA_OFFSET_STARTING}
20 | maxCountView = 10
21 | maxCountView = ${?MAX_COUNT_VIEW}
22 | maxCountOrder = 10
23 | maxCountOrder = ${?MAX_COUNT_ORDER}
24 | }
25 |
26 | UserRelayStream {
27 | checkpointLocation = "/tmp/spark-user-relay"
28 | sourceKafkaBroker = "localhost:9092"
29 | sourceKafkaBroker = ${?SOURCE_KAFKA_BROKER}
30 | sourceKafkaTopic = "user-event"
31 | sourceKafkaTopic = ${?SOURCE_KAFKA_TOPIC}
32 | sourceKafkaConsumerGroup= "user-event-relay"
33 | sourceKafkaConsumerGroup = ${?SOURCE_KAFKA_CONSUMER_GROUP}
34 | sourceKafkaOffsetStarting= "latest"
35 | sourceKafkaOffsetStarting = ${?SOURCE_KAFKA_OFFSET_STARTING}
36 |
37 | sinkKafkaBroker = "localhost:9092"
38 | sinkKafkaBroker = ${?SINK_KAFKA_BROKER}
39 | sinkKafkaTopic = "user-event-relay"
40 | sinkKafkaTopic = ${?SINK_KAFKA_TOPIC}
41 | }
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the console
19 | log4j.rootCategory=INFO, console
20 | log4j.appender.console=org.apache.log4j.ConsoleAppender
21 | log4j.appender.console.target=System.err
22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n
24 |
25 | # Set the default spark-shell/spark-sql log level to WARN. When running the
26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite
27 | # the root logger's log level, so that the user can have different defaults
28 | # for the shell and regular Spark apps.
29 | log4j.logger.org.apache.spark.repl.Main=WARN
30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN
31 |
32 | # Settings to quiet third party logs that are too verbose
33 | log4j.logger.org.sparkproject.jetty=WARN
34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
37 | log4j.logger.org.apache.parquet=ERROR
38 | log4j.logger.parquet=ERROR
39 |
40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
43 |
44 | # For deploying Spark ThriftServer
45 | # SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
48 | log4j.appender.console.filter.1.AcceptOnMatch=false
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/src/main/scala/mkt/udon/UdonProfileStream.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon
2 |
3 | import mkt.udon.config.UdonProfileStreamConfig
4 | import mkt.udon.core.common.Environment
5 | import mkt.udon.core.entity.UserEvent
6 | import mkt.udon.entity.UdonProfileStateFunc
7 | import mkt.udon.infra.spark.SparkBase
8 | import org.apache.log4j.LogManager
9 | import org.apache.spark.sql.functions.col
10 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
11 | import org.apache.spark.sql.{Dataset, SparkSession}
12 | import pureconfig.generic.auto._
13 |
14 | object UdonProfileStream extends SparkBase {
15 | override val logger = LogManager.getLogger(this.getClass.getName)
16 |
17 | val APP = "UserProfileStream"
18 |
19 | override def driver(session: SparkSession): Unit = {
20 | import session.implicits._
21 |
22 | /**
23 |       * Extract and apply configuration from environment variables
24 | */
25 | implicit val configHint = Environment.buildConfigHint[UdonProfileStreamConfig]()
26 | val config = Environment.getConfigOrThrowForApp[UdonProfileStreamConfig](APP)
27 |
28 |
29 | /**
30 |       * Extract and transform the data
31 | */
32 | val dfRaw = session.readStream
33 | .format("kafka")
34 | .option("kafka.bootstrap.servers", config.kafkaBroker)
35 | .option("subscribe", config.kafkaTopic)
36 | .option("groupIdPrefix", config.kafkaConsumerGroup)
37 | .option("startingOffsets", config.kafkaOffsetStarting)
38 | .load()
39 |
40 |     // Convert the stringified JSON into a case class. With Avro this conversion step would not be needed.
41 | val dfConverted = dfRaw
42 | .selectExpr("CAST(value AS STRING)").as[String]
43 | .map(UserEvent.convertFromRaw)
44 |
45 | /**
46 |       * Persist the data
47 | */
48 | val dfWritten = dfConverted.writeStream
49 | .queryName(APP)
50 | .trigger(Trigger.ProcessingTime("1 seconds"))
51 | .outputMode(OutputMode.Append())
52 | .foreachBatch((dsUserEvent: Dataset[UserEvent], batchId: Long) => {
53 |         // Repartition by user so that all events of a given user are processed within a single partition
54 | val repartitioned = dsUserEvent.repartition(config.dynamoPartitionCount, col("userId"))
55 |
56 |         // Invoke the per-partition handler.
57 | repartitioned.foreachPartition((iter: Iterator[UserEvent]) => {
58 | UdonProfileStateFunc.handlePartition(config, iter)
59 | })
60 | })
61 | .option("checkpointLocation", config.checkpointLocation)
62 | .start()
63 |
64 | dfWritten.awaitTermination()
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/src/main/scala/mkt/udon/UdonRelayStream.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon
2 |
3 | import mkt.udon.config.UdonRelayStreamConfig
4 | import mkt.udon.core.common.Environment
5 | import mkt.udon.core.entity.UserEvent
6 | import mkt.udon.infra.spark.SparkBase
7 | import org.apache.log4j.LogManager
8 | import org.apache.spark.sql.SparkSession
9 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
10 | import pureconfig.generic.auto._
11 |
12 | object UdonRelayStream extends SparkBase {
13 | override val logger = LogManager.getLogger(this.getClass.getName)
14 |
15 | val APP = "UserRelayStream"
16 |
17 | override def driver(session: SparkSession): Unit = {
18 | import session.implicits._
19 |
20 | /**
21 |       * Extract and apply configuration from environment variables
22 | */
23 | implicit val configHint = Environment.buildConfigHint[UdonRelayStreamConfig]()
24 | val config = Environment.getConfigOrThrowForApp[UdonRelayStreamConfig](APP)
25 |
26 | /**
27 |       * Extract and transform the data
28 | */
29 | val dfRaw = session.readStream
30 | .format("kafka")
31 | .option("kafka.bootstrap.servers", config.sourceKafkaBroker)
32 | .option("subscribe", config.sourceKafkaTopic)
33 | .option("groupIdPrefix", config.sourceKafkaConsumerGroup)
34 | .option("startingOffsets", config.sourceKafkaOffsetStarting)
35 | .load()
36 |
37 |     // Convert the stringified JSON into a case class. With Avro this conversion step would not be needed.
38 | val dfConverted = dfRaw
39 | .selectExpr("CAST(value AS STRING)").as[String]
40 | .map(UserEvent.convertFromRaw)
41 |
42 | /**
43 |       * Persist the data
44 | */
45 |
46 |     // Use UserEvent.userId as the Kafka partition key.
47 | val dfJson = dfConverted.selectExpr("CAST(userId AS STRING) AS key", "to_json(struct(*)) AS value")
48 |
49 | val dfWritten = dfJson.writeStream
50 | .queryName(APP)
51 | .outputMode(OutputMode.Append())
52 | .trigger(Trigger.Continuous("1 seconds"))
53 | .format("kafka")
54 | .option("kafka.bootstrap.servers", config.sinkKafkaBroker)
55 | .option("topic", config.sinkKafkaTopic)
56 | .option("checkpointLocation", config.checkpointLocation)
57 | .start()
58 |
59 | dfWritten.awaitTermination()
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/src/main/scala/mkt/udon/config/UdonProfileStreamConfig.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.config
2 |
3 | case class UdonProfileStreamConfig(checkpointLocation: String,
4 | dynamoTable: String,
5 | dynamoRegion: String,
6 | dynamoExpireDays: Int,
7 | dynamoPartitionCount: Int,
8 | kafkaBroker: String,
9 | kafkaTopic: String,
10 | kafkaConsumerGroup: String,
11 | kafkaOffsetStarting: String,
12 | maxCountView: Int,
13 | maxCountOrder: Int
14 | )
15 |
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/src/main/scala/mkt/udon/config/UdonRelayStreamConfig.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.config
2 |
3 | case class UdonRelayStreamConfig(checkpointLocation: String,
4 | sourceKafkaBroker: String,
5 | sourceKafkaTopic: String,
6 | sourceKafkaConsumerGroup: String,
7 | sourceKafkaOffsetStarting: String,
8 | sinkKafkaBroker: String,
9 | sinkKafkaTopic: String)
10 |
--------------------------------------------------------------------------------
/project-spark/service-stream-profile/src/main/scala/mkt/udon/entity/UdonProfileStateFunc.scala:
--------------------------------------------------------------------------------
1 | package mkt.udon.entity
2 |
3 | import mkt.udon.config.UdonProfileStreamConfig
4 | import mkt.udon.core.entity.{UserEvent, UserProfile}
5 | import mkt.udon.infra.spark.storage.DynamoSink
6 |
7 | object UdonProfileStateFunc {
8 |
9 | def handlePartition(config: UdonProfileStreamConfig, iter: Iterator[UserEvent]): Unit = {
10 |     // Create the Dynamo client (@ThreadSafe)
11 | val dynamoClient = DynamoSink.buildClient(dynamoTable = config.dynamoTable, dynamoRegion = config.dynamoRegion)
12 |
13 |     // Group events by user so that each user's events can be sorted in event-time order.
14 | val groupedByUser = iter.toList.groupBy(u => u.userId)
15 | groupedByUser.foreach(kv => {
16 | val userId = kv._1
17 |       val userEvents = kv._2.sortBy(x => -x.eventTime) // sort by event time, descending
18 |
19 |       // Fetch the user profile from Dynamo, creating an empty one if it does not exist
20 | val existing = DynamoSink.getItem[UserProfile](dynamoClient, keyName = "specifier", userId)
21 | .getOrElse(UserProfile.buildEmpty(userId))
22 |
23 | /**
24 |         * A further optimization is to pre-filter the number of user events.
25 |         * For example, 100 user events can be filtered down up front using the configured max counts, limiting the number of existing.update calls.
26 |         * This requires some extra logic, though, such as branching on the event type in advance (see the sketch after this file).
27 | */
28 | userEvents.foreach(event => {
29 | existing.update(userEvent = event, maxCountView = config.maxCountView, maxCountOrder = config.maxCountOrder)
30 | })
31 |
32 | /**
33 |         * When there are multiple streams or batches, a growing number of Dynamo tables forces the serving API to make multiple Dynamo calls, which can become a problem.
34 |         * If the data has the same nature, the jobs can share a single Dynamo table and write to different columns.
35 |         *
36 |         * For example, a User Profile table can hold
37 |         * - a streaming user-profile column populated from the User Events pulled from Kafka, and
38 |         * - a batch user-profile column populated with the user's segment list produced by a batch job.
39 |         * - Note that a single Dynamo row has a size limit, so be careful that too many columns do not push the item over that limit.
40 |         *
41 |         * If different columns are updated by different streams or batches, Dynamo Update (upsert) can be used instead of Put.
42 | * - https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_UpdateItem.html
43 | */
44 | DynamoSink.putItem(dynamoClient, existing, config.dynamoExpireDays)
45 | })
46 |
47 | }
48 | }
49 |
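A minimal sketch of the pre-filtering optimization described in the comment above: cap the per-user event list before calling existing.update, so the number of update calls is bounded by the configured maxima. The eventType field and its "view" / "purchase" values are assumptions about UserEvent, not confirmed by this file.

import mkt.udon.core.entity.UserEvent

// userEvents is assumed to be sorted by eventTime descending, as in handlePartition above
def preFilterEvents(userEvents: List[UserEvent], maxCountView: Int, maxCountOrder: Int): List[UserEvent] = {
  val (views, others) = userEvents.partition(_.eventType == "view")   // hypothetical field and value
  val (orders, rest)  = others.partition(_.eventType == "purchase")   // hypothetical value
  views.take(maxCountView) ++ orders.take(maxCountOrder) ++ rest
}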
--------------------------------------------------------------------------------
/project-spark/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'project-spark'
2 |
3 | include ':module-core',
4 | ':module-infra-spark',
5 | ':service-stream-profile',
6 | ':service-batch-discovery',
7 | ':service-batch-statistics'
8 |
9 |
--------------------------------------------------------------------------------
/project-terraform-aws/.gitignore:
--------------------------------------------------------------------------------
1 | *.hcl
2 | .terraform/
3 | *.lock.info
4 | *.tfstate
5 | *.tfstate.backup
6 | __tf_state/
7 | .idea/
8 |
9 |
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/_aws-root-iam/.gitkeep
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/_local.tf:
--------------------------------------------------------------------------------
1 | locals {
2 | environment_common = "common"
3 | environment_development = "development"
4 | environment_production = "production"
5 |
6 | region_seoul = "ap-northeast-2"
7 |
8 | team_data = "data"
9 | }
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/_output.tf:
--------------------------------------------------------------------------------
1 | output "profile_id_bastion" {
2 | value = module.module-iam-common.profile_id_bastion
3 | }
4 |
5 | output "profile_arn_emr_instance" {
6 | value = module.module-iam-common.profile_arn_emr_instance
7 | }
8 |
9 | output "role_arn_emr_cluster" {
10 | value = module.module-iam-common.role_arn_emr_cluster
11 | }
12 |
13 | output "role_arn_emr_asg" {
14 | value = module.module-iam-common.role_arn_emr_asg
15 | }
16 |
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/_provider.tf:
--------------------------------------------------------------------------------
1 | provider "aws" {
2 | region = local.region_seoul
3 | }
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/_terraform.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.1.3"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = "~> 3.71.0"
8 | }
9 | }
10 |
11 | /**
12 |    * For testing purposes we do not use a remote Terraform backend
13 | */
14 |
15 | backend "local" {
16 | path = "../__tf_state/_aws-root-iam/terraform.tfstate"
17 | }
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/main_iam_common.tf:
--------------------------------------------------------------------------------
1 | module "module-iam-common" {
2 | source = "./module-iam-common"
3 |
4 | environment = local.environment_common
5 | }
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/module-iam-common/_data.tf:
--------------------------------------------------------------------------------
1 | data "aws_iam_policy" "managed_dynamo_full" {
2 | arn = "arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess"
3 | }
4 |
5 | data "aws_iam_policy" "managed_kinesis_stream_full" {
6 | arn = "arn:aws:iam::aws:policy/AmazonKinesisFullAccess"
7 | }
8 |
9 | data "aws_iam_policy" "managed_data_scientist" {
10 | arn = "arn:aws:iam::aws:policy/job-function/DataScientist"
11 | }
12 |
13 | data "aws_iam_policy" "managed_s3_full" {
14 | arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
15 | }
16 |
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/module-iam-common/_output.tf:
--------------------------------------------------------------------------------
1 | output "profile_id_bastion" {
2 | value = aws_iam_instance_profile.bastion.id
3 | }
4 |
5 | output "profile_arn_emr_instance" {
6 | value = aws_iam_instance_profile.emr_instance.arn
7 | }
8 |
9 | output "role_arn_emr_cluster" {
10 | value = aws_iam_role.emr_cluster.arn
11 | }
12 |
13 | output "role_arn_emr_asg" {
14 | value = aws_iam_role.emr_asg.arn
15 | }
16 |
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/module-iam-common/_variable.tf:
--------------------------------------------------------------------------------
1 | variable "environment" {}
--------------------------------------------------------------------------------
/project-terraform-aws/_aws-root-iam/module-iam-common/common.basic.iam.tf:
--------------------------------------------------------------------------------
1 | locals {
2 | instance_purpose_basic = "ec2-basic"
3 | }
4 |
5 | #
6 | # Role, Instance Profile
7 | #
8 |
9 | resource "aws_iam_role" "basic" {
10 | name = "${lower(var.environment)}-${local.instance_purpose_basic}"
11 |
12 | assume_role_policy = <> /var/spool/cron/${user}
17 | chown ${user}:${user} /var/spool/cron/${user}
18 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-bastion/_terraform.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.1.3"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = "~> 3.71.0"
8 | }
9 | }
10 |
11 | /**
12 |    * For testing purposes we do not use a remote Terraform backend
13 | */
14 |
15 | backend "local" {
16 | path = "../__tf_state/_aws-root-machine-bastion/terraform.tfstate"
17 | }
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-bastion/main_bastion_dev.tf:
--------------------------------------------------------------------------------
1 | module "module-bastion-data-dev" {
2 | source = "./module-bastion-data-dev"
3 |
4 | environment = local.environment_development
5 | team = local.team_data
6 |
7 | bastion_ami = data.aws_ami.amazon_linux_2.id
8 | bastion_profile = data.terraform_remote_state.root_iam.outputs.profile_id_bastion
9 | bastion_keypair = local.keypair_infra
10 |
11 | bastion_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_bastion_public_data_dev
12 |
13 | bastion_subnet_id = data.terraform_remote_state.root_vpc.outputs.subnet_id_public_az_a_data_dev
14 | }
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-bastion/module-bastion-data-dev/_data.bootstrap.tf:
--------------------------------------------------------------------------------
1 | data "template_file" "bastion_template_cloudwatch" {
2 | template = file("${path.root}/_template/template.cloudwatch.sh")
3 |
4 | vars = {
5 | user = "ec2-user"
6 | installer = "yum"
7 | agent_version = "1.2.2"
8 | }
9 | }
10 |
11 |
12 | data "template_cloudinit_config" "bastion_user_data" {
13 | gzip = false
14 | base64_encode = true
15 |
16 | # install patches for Amazon Linux
17 | part {
18 | content_type = "text/x-shellscript"
19 |
20 | content = <> /var/spool/cron/hadoop"
17 | sudo chown hadoop:hadoop /var/spool/cron/hadoop
18 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-instance-tag.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sleep 15s;
4 |
5 | ls -al /mnt/var/lib/info/
6 |
7 | echo -e ""
8 |
9 | export IS_MASTER=$(cat /mnt/var/lib/info/instance.json | jq -r ".isMaster")
10 | export INSTANCE_GROUP_ID=$(cat /mnt/var/lib/info/instance.json | jq -r ".instanceGroupId")
11 | export CLUSTER_ID=$(cat /mnt/var/lib/info/job-flow.json | jq -r ".jobFlowId")
12 | export INSTANCE_ID=$(wget -q -O - http://169.254.169.254/latest/meta-data/instance-id)
13 | export INSTANCE_GROUP_TYPE=$(cat /mnt/var/lib/info/job-flow.json | jq -r ".instanceGroups | .[] | select( .instanceGroupId == \"${INSTANCE_GROUP_ID}\") | .instanceRole" | tr a-z A-Z)
14 |
15 | echo -e "IS_MASTER: ${IS_MASTER}"
16 | echo -e "INSTANCE_GROUP_ID: ${INSTANCE_GROUP_ID}"
17 | echo -e "CLUSTER_ID: ${CLUSTER_ID}"
18 | echo -e "INSTANCE_ID: ${INSTANCE_ID}"
19 | echo -e "INSTANCE_GROUP_TYPE: ${INSTANCE_GROUP_TYPE}"
20 |
21 | export CURRENT_TAG_NAME=$(aws ec2 --region ap-northeast-2 describe-tags --filters Name=resource-id,Values=${INSTANCE_ID} | jq -r ".Tags | .[] | select( .Key == \"Name\") | .Value")
22 | export NEW_TAG_NAME="${CURRENT_TAG_NAME}-${INSTANCE_GROUP_TYPE}"
23 |
24 | echo -e "CURRENT_TAG_NAME: ${CURRENT_TAG_NAME}"
25 | echo -e "NEW_TAG_NAME: ${NEW_TAG_NAME}"
26 |
27 | echo -e "aws ec2 create-tags --region ap-northeast-2 --resources ${INSTANCE_ID} --tags Key=Name,Value=${NEW_TAG_NAME}"
28 |
29 | aws ec2 create-tags --region ap-northeast-2 --resources ${INSTANCE_ID} --tags Key=Name,Value=${NEW_TAG_NAME}
30 |
31 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-spark-batch.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "classification": "httpfs-env",
4 | "properties": {
5 | },
6 | "configurations": [
7 | {
8 | "classification": "export",
9 | "properties": {
10 | "TZ": "Asia/Seoul"
11 | },
12 | "configurations": [
13 | ]
14 | }
15 | ]
16 | },
17 | {
18 | "classification": "hadoop-kms-env",
19 | "properties": {
20 | },
21 | "configurations": [
22 | {
23 | "classification": "export",
24 | "properties": {
25 | "TZ": "Asia/Seoul"
26 | },
27 | "configurations": [
28 | ]
29 | }
30 | ]
31 | },
32 | {
33 | "classification": "livy-env",
34 | "properties": {
35 | },
36 | "configurations": [
37 | {
38 | "classification": "export",
39 | "properties": {
40 | "TZ": "Asia/Seoul"
41 | },
42 | "configurations": [
43 | ]
44 | }
45 | ]
46 | },
47 | {
48 | "classification": "zeppelin-env",
49 | "properties": {
50 | },
51 | "configurations": [
52 | {
53 | "classification": "export",
54 | "properties": {
55 | "TZ": "Asia/Seoul"
56 | },
57 | "configurations": [
58 | ]
59 | }
60 | ]
61 | },
62 | {
63 | "classification": "sqoop-env",
64 | "properties": {
65 | },
66 | "configurations": [
67 | {
68 | "classification": "export",
69 | "properties": {
70 | "TZ": "Asia/Seoul"
71 | },
72 | "configurations": [
73 | ]
74 | }
75 | ]
76 | },
77 | {
78 | "classification": "oozie-env",
79 | "properties": {
80 | },
81 | "configurations": [
82 | {
83 | "classification": "export",
84 | "properties": {
85 | "TZ": "Asia/Seoul"
86 | },
87 | "configurations": [
88 | ]
89 | }
90 | ]
91 | },
92 | {
93 | "classification": "presto-env",
94 | "properties": {
95 | },
96 | "configurations": [
97 | {
98 | "classification": "export",
99 | "properties": {
100 | "TZ": "Asia/Seoul"
101 | },
102 | "configurations": [
103 | ]
104 | }
105 | ]
106 | },
107 | {
108 | "classification": "hcatalog-env",
109 | "properties": {
110 | },
111 | "configurations": [
112 | {
113 | "classification": "export",
114 | "properties": {
115 | "TZ": "Asia/Seoul"
116 | },
117 | "configurations": [
118 | ]
119 | }
120 | ]
121 | },
122 | {
123 | "classification": "hcatalog-webhcat-env",
124 | "properties": {
125 | },
126 | "configurations": [
127 | {
128 | "classification": "export",
129 | "properties": {
130 | "TZ": "Asia/Seoul"
131 | },
132 | "configurations": [
133 | ]
134 | }
135 | ]
136 | },
137 | {
138 | "classification": "hive-env",
139 | "properties": {
140 | },
141 | "configurations": [
142 | {
143 | "classification": "export",
144 | "properties": {
145 | "TZ": "Asia/Seoul"
146 | },
147 | "configurations": [
148 | ]
149 | }
150 | ]
151 | },
152 | {
153 | "classification": "mapred-env",
154 | "properties": {
155 | },
156 | "configurations": [
157 | {
158 | "classification": "export",
159 | "properties": {
160 | "TZ": "Asia/Seoul"
161 | },
162 | "configurations": [
163 | ]
164 | }
165 | ]
166 | },
167 | {
168 | "classification": "hadoop-env",
169 | "properties": {
170 | },
171 | "configurations": [
172 | {
173 | "classification": "export",
174 | "properties": {
175 | "TZ": "Asia/Seoul"
176 | },
177 | "configurations": [
178 | ]
179 | }
180 | ]
181 | },
182 | {
183 | "classification": "hbase-env",
184 | "properties": {
185 | },
186 | "configurations": [
187 | {
188 | "classification": "export",
189 | "properties": {
190 | "TZ": "Asia/Seoul"
191 | },
192 | "configurations": [
193 | ]
194 | }
195 | ]
196 | },
197 | {
198 | "classification": "spark-env",
199 | "properties": {
200 | },
201 | "configurations": [
202 | {
203 | "classification": "export",
204 | "properties": {
205 | "TZ": "Asia/Seoul"
206 | },
207 | "configurations": [
208 | ]
209 | }
210 | ]
211 | },
212 | {
213 | "Classification": "hive-site",
214 | "Properties": {
215 | "javax.jdo.option.ConnectionURL": "jdbc:mysql:\/\/endpoint:3306\/hive_metastore?createDatabaseIfNotExist=true",
216 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver",
217 | "javax.jdo.option.ConnectionUserName": "root",
218 | "javax.jdo.option.ConnectionPassword": "admin1234"
219 | }
220 | },
221 | {
222 | "Classification": "spark-hive-site",
223 | "Properties": {
224 | "javax.jdo.option.ConnectionURL": "jdbc:mysql:\/\/endpoint:3306\/hive_metastore?createDatabaseIfNotExist=true",
225 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver",
226 | "javax.jdo.option.ConnectionUserName": "root",
227 | "javax.jdo.option.ConnectionPassword": "admin1234"
228 | }
229 | },
230 | {
231 | "Classification": "capacity-scheduler",
232 | "Properties": {
233 | "yarn.scheduler.capacity.resource-calculator": "org.apache.hadoop.yarn.util.resource.DominantResourceCalculator",
234 | "yarn.scheduler.capacity.maximum-am-resource-percent": "0.8"
235 | },
236 | "configurations": [
237 | ]
238 | },
239 | {
240 | "Classification": "yarn-site",
241 | "Properties": {
242 | "yarn.scheduler.minimum-allocation-vcores": "1",
243 | "yarn.scheduler.maximum-allocation-vcores": "8",
244 | "yarn.node-labels.enabled": "true",
245 | "yarn.node-labels.am.default-node-label-expression": "CORE"
246 | },
247 | "configurations": [
248 | ]
249 | },
250 | {
251 | "classification": "yarn-env",
252 | "properties": {
253 | },
254 | "configurations": [
255 | {
256 | "classification": "export",
257 | "properties": {
258 | "TZ": "Asia/Seoul"
259 | },
260 | "configurations": [
261 | ]
262 | }
263 | ]
264 | }
265 | ]
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-system-config.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo yum -y update
4 | sudo yum -y upgrade
5 |
6 | sudo timedatectl set-timezone Asia/Seoul
7 |
8 | sudo yum -y groupinstall development
9 | sudo yum -y install curl wget jq htop
10 |
11 | sudo sh -c 'echo "fs.inotify.max_user_instances = 8192" > /etc/sysctl.d/98-inotifyfix.conf'
12 | sudo sh -c 'echo "fs.inotify.max_user_watches = 524288" >> /etc/sysctl.d/98-inotifyfix.conf'
13 | sudo sysctl --system
14 |
15 | sudo sh -c 'echo "* soft nofile 65536" > /etc/security/limits.d/50-custom.conf'
16 | sudo sh -c 'echo "* hard nofile 65536" >> /etc/security/limits.d/50-custom.conf'
17 | sudo sh -c 'echo "* soft nproc 200000" >> /etc/security/limits.d/50-custom.conf'
18 | sudo sh -c 'echo "* hard nproc 200000" >> /etc/security/limits.d/50-custom.conf'
19 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/_terraform.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.1.3"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = "~> 3.71.0"
8 | }
9 | }
10 |
11 | /**
12 |    * For testing purposes we do not use a remote Terraform backend
13 | */
14 | backend "local" {
15 | path = "../__tf_state/_aws-root-machine-emr/terraform.tfstate"
16 | }
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/main_emr_data_dev.tf:
--------------------------------------------------------------------------------
1 | module "module-emr-data-dev" {
2 | source = "./module-emr-data-dev"
3 |
4 | environment = local.environment_development
5 | team = local.team_data
6 |
7 | vpc_id = data.terraform_remote_state.root_vpc.outputs.vpc_id_data_dev
8 | emr_subnet = data.terraform_remote_state.root_vpc.outputs.subnet_id_private_az_c_data_dev /** AZ-c */
9 |
10 | emr_keypair = local.keypair_infra
11 | emr_profile_arn_instance = data.terraform_remote_state.root_iam.outputs.profile_arn_emr_instance
12 | emr_role_arn_cluster = data.terraform_remote_state.root_iam.outputs.role_arn_emr_cluster
13 | emr_role_arn_asg = data.terraform_remote_state.root_iam.outputs.role_arn_emr_asg
14 |
15 | emr_master_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_master_managed_data_dev
16 | emr_master_additional_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_master_additional_data_dev
17 | emr_slave_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_slave_managed_data_dev
18 | emr_slave_additional_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_slave_additional_data_dev
19 | emr_service_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_service_managed_data_dev
20 | }
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/_local.tf:
--------------------------------------------------------------------------------
1 | locals {
2 | emr_cluster_spark_batch = "spark-batch"
3 |
4 | emr_release_5_34_0 = "emr-5.34.0"
5 | emr_release_6_5_0 = "emr-6.5.0"
6 | }
7 |
8 | locals {
9 | spot_default_factor = 0.8
10 |
11 | spot_on_demand_price_r5xlarge = 0.304
12 | spot_bid_price_r5xlarge = format("%.2f", tonumber(local.spot_on_demand_price_r5xlarge) * tonumber(local.spot_default_factor))
13 |
14 | spot_on_demand_price_m5xlarge = 0.236
15 | spot_bid_price_m5_xlarge = format("%.2f", tonumber(local.spot_on_demand_price_m5xlarge) * tonumber(local.spot_default_factor))
16 | }
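As a worked example, spot_bid_price_r5xlarge evaluates to format("%.2f", 0.304 * 0.8) = format("%.2f", 0.2432) = "0.24" (USD per hour). A sketch of how such a bid price is typically consumed by a spot task instance group; the cluster resource name is an assumption.

resource "aws_emr_instance_group" "task_spot_r5xlarge" {
  name           = "task-spot-r5xlarge"
  cluster_id     = aws_emr_cluster.spark_batch.id   # hypothetical cluster resource
  instance_type  = "r5.xlarge"
  instance_count = 2
  bid_price      = local.spot_bid_price_r5xlarge    # "0.24", i.e. 80% of on-demand
}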
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/_variable.tf:
--------------------------------------------------------------------------------
1 | variable "environment" {}
2 | variable "team" {}
3 |
4 |
5 | variable "emr_keypair" {}
6 |
7 | variable "emr_profile_arn_instance" {}
8 | variable "emr_role_arn_cluster" {}
9 | variable "emr_role_arn_asg" {}
10 |
11 | variable "vpc_id" {}
12 | variable "emr_subnet" {}
13 |
14 | variable "emr_master_managed_sg_id" {}
15 | variable "emr_master_additional_sg_id" {}
16 | variable "emr_slave_managed_sg_id" {}
17 | variable "emr_slave_additional_sg_id" {}
18 | variable "emr_service_managed_sg_id" {}
19 |
20 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/dev.spark-batch-01.cw.tf:
--------------------------------------------------------------------------------
1 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-CPUUtilization" {
2 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_CPUUtil"
3 | # comparison_operator = "GreaterThanOrEqualToThreshold"
4 | #
5 | # period = "300"
6 | # evaluation_periods = "2"
7 | # datapoints_to_alarm = 2
8 | #
9 | #  # period is in seconds
10 | # statistic = "Average"
11 | # threshold = "80"
12 | # alarm_description = ""
13 | #
14 | # metric_name = "CPUUtilization"
15 | # namespace = "AWS/EC2"
16 | #
17 | # dimensions = {
18 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id
19 | # }
20 | #
21 | # actions_enabled = true
22 | # insufficient_data_actions = []
23 | # ok_actions = []
24 | #
25 | # alarm_actions = [
26 | # var.sns_topic_arn_cloudwatch_alarm,
27 | # ]
28 | #}
29 | #
30 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-MemUtil" {
31 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_MemUtil"
32 | # comparison_operator = "GreaterThanOrEqualToThreshold"
33 | #
34 | # period = "300"
35 | # evaluation_periods = "2"
36 | # datapoints_to_alarm = 2
37 | #
38 | #  # period is in seconds
39 | # statistic = "Maximum"
40 | # threshold = "80"
41 | # alarm_description = ""
42 | #
43 | # metric_name = "MemoryUtilization"
44 | # namespace = "System/Linux"
45 | #
46 | # dimensions = {
47 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id
48 | # }
49 | #
50 | # actions_enabled = true
51 | #
52 | # insufficient_data_actions = [
53 | # var.sns_topic_arn_cloudwatch_alarm,
54 | # ]
55 | #
56 | # ok_actions = []
57 | #
58 | # alarm_actions = [
59 | # var.sns_topic_arn_cloudwatch_alarm,
60 | # ]
61 | #}
62 | #
63 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_Has-SystemCheckFailure" {
64 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-Has_SysCheckFailure"
65 | # comparison_operator = "GreaterThanOrEqualToThreshold"
66 | #
67 | # period = "300"
68 | # evaluation_periods = "1"
69 | # datapoints_to_alarm = 1
70 | #
71 | #  # period is in seconds
72 | # statistic = "Sum"
73 | # threshold = "1"
74 | # alarm_description = ""
75 | #
76 | # metric_name = "StatusCheckFailed"
77 | # namespace = "AWS/EC2"
78 | #
79 | # dimensions = {
80 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id
81 | # }
82 | #
83 | # actions_enabled = true
84 | # insufficient_data_actions = []
85 | # ok_actions = []
86 | #
87 | # alarm_actions = [
88 | # var.sns_topic_arn_cloudwatch_alarm,
89 | # ]
90 | #}
91 | #
92 | ## EC2 Custom Metric (Disk, Memory)
93 | #
94 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-RootDiskUtil" {
95 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_RootDiskUtil"
96 | # comparison_operator = "GreaterThanOrEqualToThreshold"
97 | #
98 | # period = "300"
99 | # evaluation_periods = "2"
100 | # datapoints_to_alarm = 2
101 | #
102 | #  # period is in seconds
103 | # statistic = "Maximum"
104 | # threshold = "80"
105 | # alarm_description = ""
106 | #
107 | # metric_name = "DiskSpaceUtilization"
108 | # namespace = "System/Linux"
109 | #
110 | # dimensions = {
111 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id
112 | # MountPath = local.emr_cw_root_disk_mount_path
113 | # Filesystem = local.emr_cw_root_disk_mount_fs
114 | # }
115 | #
116 | # actions_enabled = true
117 | #
118 | # insufficient_data_actions = [
119 | # var.sns_topic_arn_cloudwatch_alarm,
120 | # ]
121 | #
122 | # ok_actions = []
123 | #
124 | # alarm_actions = [
125 | # var.sns_topic_arn_cloudwatch_alarm,
126 | # ]
127 | #}
128 | #
129 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-DataDiskUtil" {
130 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_DataDiskUtil"
131 | # comparison_operator = "GreaterThanOrEqualToThreshold"
132 | #
133 | # period = "300"
134 | # evaluation_periods = "2"
135 | # datapoints_to_alarm = 2
136 | #
137 | #  # period is in seconds
138 | # statistic = "Maximum"
139 | # threshold = "80"
140 | # alarm_description = ""
141 | #
142 | # metric_name = "DiskSpaceUtilization"
143 | # namespace = "System/Linux"
144 | #
145 | # dimensions = {
146 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id
147 | # MountPath = local.emr_cw_data_disk_mount_path
148 | # Filesystem = local.emr_cw_data_disk_mount_fs
149 | # }
150 | #
151 | # actions_enabled = true
152 | #
153 | # insufficient_data_actions = [
154 | # var.sns_topic_arn_cloudwatch_alarm,
155 | # ]
156 | #
157 | # ok_actions = []
158 | #
159 | # alarm_actions = [
160 | # var.sns_topic_arn_cloudwatch_alarm,
161 | # ]
162 | #}
163 |
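The commented-out alarms above reference data.aws_instance.emr_master_spark_batch_cluster_01 and var.sns_topic_arn_cloudwatch_alarm, neither of which appears in this module as shown. A sketch of how that data source is typically resolved, using the tags EMR attaches to the master EC2 instance; the cluster resource name is an assumption.

data "aws_instance" "emr_master_spark_batch_cluster_01" {
  filter {
    name   = "tag:aws:elasticmapreduce:job-flow-id"
    values = [aws_emr_cluster.spark_batch_01.id]   # hypothetical cluster resource
  }

  filter {
    name   = "tag:aws:elasticmapreduce:instance-group-role"
    values = ["MASTER"]
  }
}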
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-presto/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-machine-emr-presto/.gitkeep
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-machine-emr-stream/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-machine-emr-stream/.gitkeep
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-storage-rds/.gitkeep
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/_data.state.tf:
--------------------------------------------------------------------------------
1 | data "terraform_remote_state" "root_iam" {
2 | backend = "local"
3 |
4 | config = {
5 | path = "../__tf_state/_aws-root-iam/terraform.tfstate"
6 | }
7 | }
8 |
9 | data "terraform_remote_state" "root_vpc" {
10 | backend = "local"
11 |
12 | config = {
13 | path = "../__tf_state/_aws-root-vpc/terraform.tfstate"
14 | }
15 | }
16 |
17 | data "terraform_remote_state" "root_sg" {
18 | backend = "local"
19 |
20 | config = {
21 | path = "../__tf_state/_aws-root-sg/terraform.tfstate"
22 | }
23 | }
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/_local.tf:
--------------------------------------------------------------------------------
1 | locals {
2 | environment_common = "common"
3 | environment_development = "development"
4 | environment_production = "production"
5 |
6 | region_seoul = "ap-northeast-2"
7 |
8 | team_data = "data"
9 | }
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/_provider.tf:
--------------------------------------------------------------------------------
1 | provider "aws" {
2 | region = local.region_seoul
3 | }
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/_terraform.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.1.3"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = "~> 3.71.0"
8 | }
9 | }
10 |
11 | /**
12 | * A remote Terraform backend is not used here, for testing purposes.
13 | */
14 | backend "local" {
15 | path = "../__tf_state/_aws-root-storage-rds/terraform.tfstate"
16 | }
17 | }
18 |
19 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/main_rds_data_dev.tf:
--------------------------------------------------------------------------------
1 | module "module-rds-data-dev" {
2 | source = "./module-rds-data-dev"
3 |
4 | environment = local.environment_development
5 | team = local.team_data
6 |
7 | vpc_id = data.terraform_remote_state.root_vpc.outputs.vpc_id_data_dev
8 | rds_hive_metastore_subnet_list = data.terraform_remote_state.root_vpc.outputs.subnet_list_database_data_dev
9 | rds_hive_metastore_subnet_group = data.terraform_remote_state.root_vpc.outputs.subnet_name_database_data_dev
10 | rds_hive_metastore_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_rds_hive_metastore_data_dev
11 | }
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/_variable.tf:
--------------------------------------------------------------------------------
1 | variable "environment" {}
2 | variable "team" {}
3 |
4 | variable "vpc_id" {}
5 | variable "rds_hive_metastore_sg_id" {}
6 | variable "rds_hive_metastore_subnet_group" {}
7 | variable "rds_hive_metastore_subnet_list" {}
8 |
9 |
--------------------------------------------------------------------------------
/project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/dev.hive-metastore.rds.tf:
--------------------------------------------------------------------------------
1 | module "rds-hive-metastore-data-development" {
2 | source = "terraform-aws-modules/rds-aurora/aws"
3 | version = "6.1.4"
4 |
5 | name = "hive-metastore-${var.environment}"
6 | engine = "aurora-mysql"
7 | engine_version = "5.7.12"
8 | instance_class = "db.t3.medium"
9 | instances = {
10 | 01 = {}
11 | 02 = {}
12 | }
13 |
14 | storage_encrypted = true
15 | apply_immediately = true
16 | skip_final_snapshot = true
17 | create_monitoring_role = false
18 |
19 | vpc_id = var.vpc_id
20 | db_subnet_group_name = var.rds_hive_metastore_subnet_group
21 | vpc_security_group_ids = [var.rds_hive_metastore_sg_id]
22 | create_db_subnet_group = false
23 | create_security_group = false
24 |
25 |   # For convenience in the hands-on exercises that follow, a fixed password is used.
26 |   # Note that a password specified in Terraform is stored in the state file, so handle it with care.
27 | master_password = "admin1234"
28 | # master_password = random_password.hive-metastore.result
29 | create_random_password = false
30 |
31 | db_parameter_group_name = aws_db_parameter_group.hive-metastore.name
32 | db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.hive-metastore.name
33 |
34 | enabled_cloudwatch_logs_exports = []
35 |
36 | tags = {
37 | Environment = var.environment
38 | Team = var.team
39 | }
40 | }
41 |
42 | resource "random_password" "hive-metastore" {
43 | length = 10
44 | }
45 |
46 | resource "aws_db_parameter_group" "hive-metastore" {
47 | name = "hive-metastore-aurora-db-57-parameter-group"
48 | family = "aurora-mysql5.7"
49 | description = "hive-metastore-aurora-db-57-parameter-group"
50 | tags = {
51 | Environment = var.environment
52 | Team = var.team
53 | }
54 | }
55 |
56 | resource "aws_rds_cluster_parameter_group" "hive-metastore" {
57 | name = "hive-metastore-aurora-57-cluster-parameter-group"
58 | family = "aurora-mysql5.7"
59 | description = "hive-metastore-aurora-57-cluster-parameter-group"
60 | tags = {
61 | Environment = var.environment
62 | Team = var.team
63 | }
64 | }
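The hive-site / spark-hive-site configurations earlier in this repository point their JDBC ConnectionURL at this database. A sketch (output name is an assumption) of exposing the Aurora writer endpoint from this module so downstream stacks can substitute it for the "endpoint" placeholder; in this layout it would also need to be re-exported from the root stack's outputs.

output "rds_endpoint_hive_metastore_data_dev" {
  description = "Writer endpoint of the Hive Metastore Aurora cluster"
  value       = module.rds-hive-metastore-data-development.cluster_endpoint
}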
--------------------------------------------------------------------------------
/project-terraform-gcp/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-gcp/.gitkeep
--------------------------------------------------------------------------------