├── .gitignore ├── Makefile ├── README.md ├── _datasets ├── .gitkeep ├── airbnb │ └── .gitkeep └── ecommerce │ └── .gitkeep ├── _dockerfile ├── docker-metastore │ ├── Dockerfile │ ├── conf │ │ └── hive-site.xml │ └── scripts │ │ └── entrypoint.sh └── docker-presto │ ├── Dockerfile │ ├── etc │ ├── catalog │ │ ├── hive.properties │ │ ├── iceberg.properties │ │ └── tpch.properties │ ├── config.properties │ ├── jvm.config │ ├── log.properties │ └── node.properties │ └── scripts │ └── entrypoint.sh ├── _notebook ├── kafka-basic.ipynb ├── spark-jdbc-basic.ipynb ├── spark-metastore-local.ipynb ├── spark-metastore-remote.ipynb └── spark-streaming-data.ipynb ├── _script ├── docker-mysql │ ├── conf │ │ └── my.cnf │ └── sql │ │ ├── 001_create_database.sql │ │ └── 002_create_table.sql └── docker-spark │ ├── apps │ ├── main.py │ └── postgresql-42.2.22.jar │ ├── conf │ └── spark-defaults.conf │ └── data │ └── .gitignore ├── _slide └── .gitignore ├── _volume └── .gitignore ├── docker-compose.aws.yml ├── docker-compose.kafka.yml ├── docker-compose.metastore.yml ├── docker-compose.presto.yml ├── docker-compose.spark.yml ├── docker-compose.storage.yml ├── project-flink └── .gitignore ├── project-kafka └── .gitignore ├── project-spark ├── .gitignore ├── _scripts │ └── mysql-ddl │ │ └── table_property_stat.sql ├── build.gradle ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── module-core │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ └── scala │ │ └── mkt │ │ └── udon │ │ └── core │ │ ├── common │ │ ├── Environment.scala │ │ └── TimeUtil.scala │ │ └── entity │ │ ├── ProductPool.scala │ │ ├── UserEvent.scala │ │ └── UserProfile.scala ├── module-infra-spark │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ └── scala │ │ └── mkt │ │ └── udon │ │ └── infra │ │ └── spark │ │ ├── SparkBase.scala │ │ ├── common │ │ └── Partition.scala │ │ └── storage │ │ ├── DynamoSink.scala │ │ ├── JdbcSink.scala │ │ └── ParquetSink.scala ├── service-batch-discovery │ ├── Makefile │ ├── VERSION │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ ├── resources │ │ ├── .gitignore │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── mkt │ │ └── udon │ │ ├── UdonProductPoolBatch.scala │ │ ├── config │ │ └── UdonProductPoolBatchConfig.scala │ │ └── entity │ │ └── UdonProductPoolEntity.scala ├── service-batch-statistics │ ├── Makefile │ ├── VERSION │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ └── src │ │ └── main │ │ ├── resources │ │ ├── .gitignore │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── mkt │ │ └── udon │ │ ├── UdonStatBatch.scala │ │ ├── config │ │ └── UdonStatBatchConfig.scala │ │ └── entity │ │ └── UdonStatEntity.scala ├── service-stream-profile │ ├── Makefile │ ├── VERSION │ ├── build.gradle │ └── src │ │ └── main │ │ ├── resources │ │ ├── application.conf │ │ └── log4j.properties │ │ └── scala │ │ └── mkt │ │ └── udon │ │ ├── UdonProfileStream.scala │ │ ├── UdonRelayStream.scala │ │ ├── config │ │ ├── UdonProfileStreamConfig.scala │ │ └── 
UdonRelayStreamConfig.scala │ │ └── entity │ │ └── UdonProfileStateFunc.scala └── settings.gradle ├── project-terraform-aws ├── .gitignore ├── _aws-root-iam │ ├── .gitkeep │ ├── _local.tf │ ├── _output.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_iam_common.tf │ └── module-iam-common │ │ ├── _data.tf │ │ ├── _output.tf │ │ ├── _variable.tf │ │ ├── common.basic.iam.tf │ │ ├── common.ec2.iam.tf │ │ ├── common.ec2.profile.tf │ │ ├── common.emr.iam.tf │ │ └── common.emr.profile.tf ├── _aws-root-sg │ ├── .gitkeep │ ├── _data.tf │ ├── _local.tf │ ├── _output.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_sg_data_dev.tf │ └── module-sg-data-dev │ │ ├── _output.tf │ │ ├── _variable.tf │ │ ├── dev.bastion-public.sg.tf │ │ ├── dev.emr-master.sg.tf │ │ ├── dev.emr-serivce.sg.tf │ │ ├── dev.emr-slave.sg.tf │ │ └── dev.rds.sg.tf ├── _aws-root-vpc │ ├── _local.tf │ ├── _output.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_vpc_data_dev.tf │ └── module-vpc-data-dev │ │ ├── _output.tf │ │ ├── _variable.tf │ │ └── dev.data.vpc.tf ├── aws-root-machine-bastion │ ├── _data.ami.tf │ ├── _data.state.tf │ ├── _local.tf │ ├── _provider.tf │ ├── _template │ │ └── template.cloudwatch.sh │ ├── _terraform.tf │ ├── main_bastion_dev.tf │ └── module-bastion-data-dev │ │ ├── _data.bootstrap.tf │ │ ├── _local.tf │ │ ├── _variable.tf │ │ ├── dev.bastion-public-01.cw.tf │ │ └── dev.bastion-public-01.ec2.tf ├── aws-root-machine-eks │ ├── .gitkeep │ ├── _local.tf │ ├── _provider.tf │ └── _terraform.tf ├── aws-root-machine-emr-batch │ ├── .gitkeep │ ├── _data.state.tf │ ├── _local.tf │ ├── _provider.tf │ ├── _template │ │ ├── template.emr-cloudwatch-collect.sh │ │ ├── template.emr-instance-tag.sh │ │ ├── template.emr-spark-batch.json │ │ └── template.emr-system-config.sh │ ├── _terraform.tf │ ├── main_emr_data_dev.tf │ └── module-emr-data-dev │ │ ├── _local.tf │ │ ├── _variable.tf │ │ ├── dev.spark-batch-01.cw.tf │ │ └── dev.spark-batch-01.emr.tf ├── aws-root-machine-emr-presto │ └── .gitkeep ├── aws-root-machine-emr-stream │ └── .gitkeep └── aws-root-storage-rds │ ├── .gitkeep │ ├── _data.state.tf │ ├── _local.tf │ ├── _provider.tf │ ├── _terraform.tf │ ├── main_rds_data_dev.tf │ └── module-rds-data-dev │ ├── _variable.tf │ └── dev.hive-metastore.rds.tf └── project-terraform-gcp └── .gitkeep /.gitignore: -------------------------------------------------------------------------------- 1 | ./idea 2 | .DS_Store 3 | _assets/ 4 | !.gitkeep 5 | _datasets/airbnb/*.csv 6 | _datasets/ecommerce/*.csv 7 | 8 | */.ipynb_checkpoints/ 9 | 10 | derby.log 11 | metastore_db -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | MYSQLCLIENT = mycli 4 | DOCKER_HOST_IP := $(shell ipconfig getifaddr en0) 5 | 6 | ## 7 | ## Jupyter 8 | ## 9 | 10 | .PHONY: jupyter 11 | jupyter: 12 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Preparing docker-compose" 13 | @ echo "-----------------------------------------\n" 14 | @ jupyter lab --ip=127.0.0.1 --port=8080 15 | 16 | ## 17 | ## Compose 18 | ## 19 | 20 | .PHONY: compose.prepare 21 | compose.prepare: 22 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Preparing docker-compose" 23 | @ echo "-----------------------------------------\n" 24 | @ echo "export DOCKER_HOST_IP=$(DOCKER_HOST_IP)" 25 | @ echo "\n-----------------------------------------" 26 | @ echo "" 27 | 28 | .PHONY: compose.storage 29 | compose.storage: 
compose.prepare 30 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 31 | @ docker stop `docker ps -a -q` || true 32 | @ docker rm -f `docker ps -a -q` || true 33 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 34 | @ docker compose -f docker-compose.storage.yml rm -fsv || true 35 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 36 | -f docker-compose.storage.yml \ 37 | up 38 | 39 | .PHONY: compose.spark 40 | compose.spark: compose.prepare 41 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 42 | @ docker stop `docker ps -a -q` || true 43 | @ docker rm -f `docker ps -a -q` || true 44 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 45 | @ docker compose -f docker-compose.spark.yml rm -fsv || true 46 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 47 | -f docker-compose.spark.yml \ 48 | up 49 | 50 | .PHONY: compose.kafka 51 | compose.kafka: compose.prepare 52 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 53 | @ docker stop `docker ps -a -q` || true 54 | @ docker rm -f `docker ps -a -q` || true 55 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 56 | @ docker compose -f docker-compose.kafka.yml rm -fsv || true 57 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 58 | -f docker-compose.kafka.yml \ 59 | up 60 | 61 | .PHONY: compose.metastore 62 | compose.metastore: compose.prepare 63 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 64 | @ docker stop `docker ps -a -q` || true 65 | @ docker rm -f `docker ps -a -q` || true 66 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 67 | @ docker compose -f docker-compose.metastore.yml rm -fsv || true 68 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 69 | -f docker-compose.metastore.yml \ 70 | up --build 71 | 72 | .PHONY: compose.presto 73 | compose.presto: compose.prepare 74 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 75 | @ docker stop `docker ps -a -q` || true 76 | @ docker rm -f `docker ps -a -q` || true 77 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 78 | @ docker compose -f docker-compose.presto.yml rm -fsv || true 79 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 80 | -f docker-compose.presto.yml \ 81 | up --build 82 | 83 | .PHONY: compose.aws 84 | compose.aws: compose.prepare 85 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 86 | @ docker stop `docker ps -a -q` || true 87 | @ docker rm -f `docker ps -a -q` || true 88 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 89 | @ docker compose -f docker-compose.aws.yml rm -fsv || true 90 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 91 | -f docker-compose.aws.yml \ 92 | up --build 93 | 94 | .PHONY: compose.clean 95 | compose.clean: 96 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Starting: Cleaning docker resources" 97 | @ echo "-----------------------------------------\n" 98 | @ docker stop `docker ps -a -q` || true 99 | @ docker rm -f `docker ps -a -q` || true 100 | @ docker rmi -f `docker images --quiet --filter "dangling=true"` || true 101 | @ docker volume rm `docker volume ls -f dangling=true -q` || true 102 | @ rm -rf ./docker-volumes 103 | @ docker network rm `docker network ls -q` || true 104 | @ echo "" 105 | @ rm -rf metastore_db 106 | @ echo "\n-----------------------------------------" 107 | @ echo
"[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Finished: Cleaning docker resources" 108 | 109 | .PHONY: compose.storage-all 110 | compose.storage-all: compose.storage-all 111 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Running docker-compose" 112 | @ docker stop $(docker ps -a -q) || true 113 | @ docker rm -f $(docker ps -a -q) || true 114 | @ docker volume rm $(docker volume ls -f dangling=true -q) || true 115 | @ docker compose -f docker-compose.aws.yml rm -fsv || true 116 | @ DOCKER_HOST_IP=$(DOCKER_HOST_IP) docker compose \ 117 | -f docker-compose.storage.yml \ 118 | -f docker-compose.aws.yml \ 119 | -f docker-compose.kafka.yml \ 120 | up --build 121 | 122 | ## 123 | ## Storage CLIs 124 | ## 125 | 126 | .PHONY: mysql 127 | mysql: 128 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Connecting to mysql" 129 | @ $(MYSQLCLIENT) -u root -h localhost ad_stat -p root 130 | 131 | .PHONY: redis 132 | redis: 133 | @ echo "[$(TAG)] ($(shell TZ=UTC date -u '+%H:%M:%S')) - Connecting to redis" 134 | @ redis-cli -a credential 135 | 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Practical Data Pipeline (Code) 2 | 3 | -------------------------------------------------------------------------------- /_datasets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/.gitkeep -------------------------------------------------------------------------------- /_datasets/airbnb/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/airbnb/.gitkeep -------------------------------------------------------------------------------- /_datasets/ecommerce/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_datasets/ecommerce/.gitkeep -------------------------------------------------------------------------------- /_dockerfile/docker-metastore/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8u242-jre 2 | 3 | WORKDIR /opt 4 | 5 | ENV HADOOP_VERSION=2.10.1 6 | ENV METASTORE_VERSION=2.3.9 7 | ENV AWS_SDK_VERSION=1.11.271 8 | 9 | ENV HADOOP_HOME=/opt/hadoop-${HADOOP_VERSION} 10 | ENV HIVE_HOME=/opt/apache-hive-${METASTORE_VERSION}-bin 11 | ENV HADOOP_CLASSPATH=/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar:/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar 12 | 13 | # BIN 14 | RUN apt-get update && \ 15 | apt-get upgrade -y && \ 16 | apt-get -qqy install curl && \ 17 | curl -L https://dlcdn.apache.org/hive/hive-${METASTORE_VERSION}/apache-hive-${METASTORE_VERSION}-bin.tar.gz | tar zxf - && \ 18 | curl -L https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \ 19 | apt-get install --only-upgrade openssl libssl1.1 && \ 20 | apt-get install -y libk5crypto3 libkrb5-3 libsqlite3-0 21 | 22 | # DEPENDENCY 23 | RUN rm ${HIVE_HOME}/lib/postgresql-9.4.1208.jre7.jar 24 | RUN curl -o 
${HIVE_HOME}/lib/postgresql-9.4.1212.jre7.jar -L https://jdbc.postgresql.org/download/postgresql-9.4.1212.jre7.jar 25 | RUN curl -L https://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-8.0.19.tar.gz | tar zxf - && \ 26 | cp mysql-connector-java-8.0.19/mysql-connector-java-8.0.19.jar ${HIVE_HOME}/lib/ && \ 27 | rm -rf mysql-connector-java-8.0.19 28 | 29 | # CONFIG 30 | COPY conf/hive-site.xml ${HIVE_HOME}/conf/hive-site.xml 31 | RUN ls -alh ${HADOOP_HOME}/etc/hadoop/ 32 | RUN ls -alh ${HIVE_HOME}/conf/ 33 | COPY scripts/entrypoint.sh /entrypoint.sh 34 | 35 | # UTILS 36 | ENV TINI_VERSION v0.19.0 37 | RUN apt-get -q update && apt-get -qy install netcat wget 38 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini 39 | RUN chmod +x /tini 40 | 41 | # ENV 42 | ENV TZ=Asia/Seoul 43 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 44 | 45 | # USER 46 | RUN groupadd -r hadoop --gid=1001 && \ 47 | useradd -r -g hadoop --uid=1001 -d ${HIVE_HOME} hadoop && \ 48 | chown hadoop:hadoop -R ${HIVE_HOME} 49 | 50 | USER hadoop 51 | WORKDIR $HIVE_HOME 52 | EXPOSE 9083 53 | 54 | ENTRYPOINT ["/tini", "--"] 55 | CMD ["/entrypoint.sh"] -------------------------------------------------------------------------------- /_dockerfile/docker-metastore/conf/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | hive.metastore.schema.verification 4 | false 5 | 6 | 7 | metastore.warehouse.dir 8 | s3a://spark/warehouse/ 9 | 10 | 11 | javax.jdo.option.ConnectionDriverName 12 | com.mysql.cj.jdbc.Driver 13 | 14 | 15 | 16 | javax.jdo.option.ConnectionURL 17 | jdbc:mysql://mysql:3306/metastore_db?createDatabaseIfNotExist=true 18 | 19 | 20 | 21 | javax.jdo.option.ConnectionUserName 22 | root 23 | 24 | 25 | 26 | javax.jdo.option.ConnectionPassword 27 | root 28 | 29 | 30 | 31 | fs.s3a.access.key 32 | accesskey 33 | 34 | 35 | fs.s3a.secret.key 36 | secretkey 37 | 38 | 39 | fs.s3a.endpoint 40 | http://minio:9000 41 | 42 | 43 | fs.s3a.path.style.access 44 | true 45 | 46 | 47 | fs.s3a.connection.ssl.enabled 48 | false 49 | Enables or disables SSL connections to S3. 
50 | 51 | 52 | fs.s3a.impl 53 | org.apache.hadoop.fs.s3a.S3AFileSystem 54 | The implementation class of the S3A Filesystem 55 | 56 | 57 | -------------------------------------------------------------------------------- /_dockerfile/docker-metastore/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export HADOOP_VERSION=2.10.1 4 | export METASTORE_VERSION=2.3.9 5 | export AWS_SDK_VERSION=1.11.271 6 | 7 | export JAVA_HOME=/usr/local/openjdk-8 8 | export HADOOP_CLASSPATH=/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar:/opt/hadoop-${HADOOP_VERSION}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar 9 | 10 | sleep 10; 11 | 12 | /opt/apache-hive-${METASTORE_VERSION}-bin/bin/schematool -initSchema -dbType mysql || true; 13 | /opt/apache-hive-${METASTORE_VERSION}-bin/bin/hive --service metastore -------------------------------------------------------------------------------- /_dockerfile/docker-presto/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre 2 | 3 | 4 | ARG _PRESTO_HOME=/opt/presto 5 | ARG _PRESTO_VERSION=0.265.1 6 | ENV PRESTO_VERSION=${_PRESTO_VERSION} 7 | 8 | RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz 9 | RUN mkdir -p /opt || true 10 | RUN tar -xf presto-server-${PRESTO_VERSION}.tar.gz -C /opt 11 | RUN rm presto-server-${PRESTO_VERSION}.tar.gz 12 | RUN ln -s /opt/presto-server-${PRESTO_VERSION} ${_PRESTO_HOME} 13 | 14 | RUN wget --quiet https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar 15 | RUN mv presto-cli-${PRESTO_VERSION}-executable.jar /usr/local/bin/presto 16 | RUN chmod +x /usr/local/bin/presto 17 | 18 | # UTILS 19 | ENV TINI_VERSION v0.19.0 20 | RUN apt-get update && apt-get install -y wget python less telnet vim zsh netcat 21 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini 22 | RUN chmod +x /tini 23 | 24 | # ENV 25 | ENV TZ=Asia/Seoul 26 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 27 | 28 | # CONFIG 29 | COPY scripts/entrypoint.sh /entrypoint.sh 30 | COPY etc/jvm.config ${_PRESTO_HOME}/etc/jvm.config 31 | 32 | # USER 33 | RUN groupadd -r hadoop --gid=1001 && \ 34 | useradd -r -g hadoop --uid=1001 -d ${_PRESTO_HOME} hadoop && \ 35 | chown hadoop:hadoop -R ${_PRESTO_HOME} 36 | 37 | RUN mkdir -p /var/presto && \ 38 | chown hadoop:hadoop -R /var/presto && \ 39 | chown hadoop:hadoop -R /opt/presto-server-${PRESTO_VERSION} && \ 40 | chown hadoop:hadoop -R ${_PRESTO_HOME}/etc 41 | 42 | USER hadoop 43 | WORKDIR ${_PRESTO_HOME} 44 | EXPOSE 8080 45 | 46 | ENTRYPOINT ["/tini", "--"] 47 | CMD ["/entrypoint.sh"] 48 | -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/catalog/hive.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore.uri=thrift://hive-metastore:9083 -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/catalog/iceberg.properties: -------------------------------------------------------------------------------- 1 | connector.name=iceberg 2 | hive.metastore.uri=thrift://hive-metastore:9083 3 | iceberg.file-format=PARQUET 4 | 
iceberg.compression-codec=SNAPPY 5 | -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | query.max-memory=1GB 5 | query.max-memory-per-node=1GB 6 | query.max-total-memory-per-node=2GB 7 | discovery-server.enabled=true 8 | discovery.uri=http://localhost:8080 -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx4G 3 | -XX:+UseG1GC 4 | -XX:G1HeapRegionSize=32M 5 | -XX:ReservedCodeCacheSize=150M 6 | -XX:+UseGCOverheadLimit 7 | -XX:+ExplicitGCInvokesConcurrent 8 | -XX:+HeapDumpOnOutOfMemoryError 9 | -XX:+ExitOnOutOfMemoryError -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/log.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_dockerfile/docker-presto/etc/log.properties -------------------------------------------------------------------------------- /_dockerfile/docker-presto/etc/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=production 2 | node.id=$(NODE_ID) 3 | node.data-dir=/var/presto/data -------------------------------------------------------------------------------- /_dockerfile/docker-presto/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | PRESTO_HOME=${PRESTO_HOME:-/opt/presto} 2 | 3 | PRESTO_COORDINATOR=${PRESTO_COORDINATOR:-} 4 | PRESTO_NODE_ID=${PRESTO_NODE_ID:-} 5 | PRESTO_LOG_LEVEL=${PRESTO_LOG_LEVEL:-INFO} 6 | 7 | PRESTO_HTTP_SERVER_PORT=${PRESTO_HTTP_SERVER_PORT:-8080} 8 | 9 | PRESTO_MAX_MEMORY=${PRESTO_MAX_MEMORY:-20} 10 | PRESTO_MAX_MEMORY_PER_NODE=${PRESTO_MAX_MEMORY_PER_NODE:-1} 11 | PRESTO_MAX_TOTAL_MEMORY_PER_NODE=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE:-2} 12 | PRESTO_HEAP_HEADROOM_PER_NODE=${PRESTO_HEAP_HEADROOM_PER_NODE:-1} 13 | PRESTO_JVM_HEAP_SIZE=${PRESTO_JVM_HEAP_SIZE:-4} 14 | 15 | create_config_node() { 16 | ( 17 | echo "node.environment=production" 18 | echo "node.id=${PRESTO_NODE_ID}" 19 | echo "node.data-dir=/var/presto/data" 20 | ) >${PRESTO_HOME}/etc/node.properties 21 | } 22 | 23 | change_config_jvm() { 24 | sed -i "s/-Xmx.*G/-Xmx${PRESTO_JVM_HEAP_SIZE}G/" ${PRESTO_HOME}/etc/jvm.config 25 | } 26 | 27 | create_config_log() { 28 | ( 29 | echo "com.facebook.presto=${PRESTO_LOG_LEVEL}" 30 | ) >${PRESTO_HOME}/etc/log.config 31 | } 32 | 33 | create_config_coordinator() { 34 | ( 35 | echo "coordinator=true" 36 | echo "node-scheduler.include-coordinator=false" 37 | echo "http-server.http.port=${PRESTO_HTTP_SERVER_PORT}" 38 | echo "query.max-memory=${PRESTO_MAX_MEMORY}GB" 39 | echo "query.max-memory-per-node=${PRESTO_MAX_MEMORY_PER_NODE}GB" 40 | echo "query.max-total-memory-per-node=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE}GB" 41 | echo 
"memory.heap-headroom-per-node=${PRESTO_HEAP_HEADROOM_PER_NODE}GB" 42 | echo "discovery-server.enabled=true" 43 | echo "discovery.uri=http://localhost:${PRESTO_HTTP_SERVER_PORT}" 44 | ) >${PRESTO_HOME}/etc/config.properties 45 | } 46 | 47 | create_config_worker() { 48 | ( 49 | echo "coordinator=false" 50 | echo "http-server.http.port=${PRESTO_HTTP_SERVER_PORT}" 51 | echo "query.max-memory=${PRESTO_MAX_MEMORY}GB" 52 | echo "query.max-memory-per-node=${PRESTO_MAX_MEMORY_PER_NODE}GB" 53 | echo "query.max-total-memory-per-node=${PRESTO_MAX_TOTAL_MEMORY_PER_NODE}GB" 54 | echo "memory.heap-headroom-per-node=${PRESTO_HEAP_HEADROOM_PER_NODE}GB" 55 | echo "discovery.uri=http://${PRESTO_COORDINATOR}:${PRESTO_HTTP_SERVER_PORT}" 56 | ) >${PRESTO_HOME}/etc/config.properties 57 | } 58 | 59 | create_config_node 60 | create_config_log 61 | change_config_jvm 62 | if [ -z "${PRESTO_COORDINATOR}" ] 63 | then 64 | create_config_coordinator; 65 | else 66 | create_config_worker; 67 | fi 68 | 69 | env 70 | 71 | cat ${PRESTO_HOME}/etc/node.properties 72 | cat ${PRESTO_HOME}/etc/config.properties 73 | cat ${PRESTO_HOME}/etc/jvm.config 74 | 75 | 76 | /opt/presto/bin/launcher run 77 | -------------------------------------------------------------------------------- /_notebook/spark-metastore-remote.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "397cef09-bd27-4769-9f70-7ad80803cbd7", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "3.8.10 (default, Nov 14 2021, 21:32:59) \n", 14 | "[Clang 12.0.5 (clang-1205.0.22.9)]\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import sys\n", 20 | "\n", 21 | "print(sys.version)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "2bae673f-f186-4e08-b7ee-23c236771a35", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "SPARK_HOME = \"/Users/kun/github/spark/spark-3.1.2-bin-hadoop-3.2.2\"" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "id": "8491d5dd-2c63-4fd0-9bd6-cdeba9b970d9", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import findspark\n", 42 | "\n", 43 | "findspark.init(SPARK_HOME)\n", 44 | "#findspark.add_packages([\"org.apache.hadoop:hadoop-aws:3.2.2\", \"com.amazonaws:aws-java-sdk-bundle:1.11.375\"])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "7167ea4d-2fb3-450c-82b2-c6362b454820", 50 | "metadata": {}, 51 | "source": [ 52 | "### Spark Session 생성\n", 53 | "\n", 54 | "로컬모드에서 실행할 Spark Session 을 만듭니다. (`.master(\"local[*]\")`)\n", 55 | "- 일반적인 Spark 설정은 `$SPARK_HOME/conf/spark-defaults.conf` 내에서 세팅해 공통환경으로 사용합니다. 다만 이 예제에서는 보여주기 위해 SparkConf 를 이용해 설정합니다.\n", 56 | "- Hive Metastore URI 등 HMS 관련 설정은 `$SPARK_HOME/conf/hive-site.conf` 내에서 세팅해 공통 환경으로 사용합니다.\n", 57 | "- 이 예제에서는 Minio 를 사용하므로 Access Key, Secret Key 를 사용합니다. AWS 위에서 실행된다면 [AWS Instance Profile](https://docs.aws.amazon.com/ko_kr/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles.html) 을 이용할 수 있으므로 키를 세팅하지 않습니다." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "id": "27587697-2e5c-4301-bc98-82389915b35c", 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stderr", 68 | "output_type": "stream", 69 | "text": [ 70 | "21/11/29 15:41:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", 71 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 72 | "Setting default log level to \"WARN\".\n", 73 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 74 | "21/11/29 15:41:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "from pyspark.sql import SparkSession\n", 80 | "\n", 81 | "\n", 82 | "spark = SparkSession \\\n", 83 | " .builder \\\n", 84 | " .master(\"local[*]\") \\\n", 85 | " .appName(\"example-app\") \\\n", 86 | " .config(\"spark.hadoop.fs.s3a.access.key\", \"accesskey\")\\\n", 87 | " .config(\"spark.hadoop.fs.s3a.secret.key\", \"secretkey\")\\\n", 88 | " .config(\"spark.hadoop.fs.s3a.endpoint\", \"http://localhost:9000\")\\\n", 89 | " .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\")\\\n", 90 | " .config(\"spark.hadoop.fs.s3a.connection.ssl.enabled\",\"false\")\\\n", 91 | " .config(\"spark.hadoop.fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\\\n", 92 | " .enableHiveSupport() \\\n", 93 | " .getOrCreate()\n", 94 | " \n", 95 | "spark.sparkContext.setSystemProperty(\"com.amazonaws.services.s3.enableV4\", \"true\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "52107cb8-9741-422e-b50a-d9cd830f5ab0", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "spark.sparkContext.getConf().getAll()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "id": "b5c1bd63-5298-44fc-8681-974f2b9e7d50", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "21/11/26 01:44:27 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.\n" 119 | ] 120 | }, 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "DataFrame[]" 125 | ] 126 | }, 127 | "execution_count": 7, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "spark.sql(\"\"\"\n", 134 | "CREATE TABLE student (\n", 135 | " id INT, \n", 136 | " name STRING, \n", 137 | " age INT\n", 138 | ") \n", 139 | "STORED AS PARQUET\n", 140 | "LOCATION 's3a://udon-data/lake/student/'\n", 141 | "\"\"\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 17, 147 | "id": "0f83e0e6-8337-4922-ab2f-9e13d7b7089f", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "df = spark.read.format(\"csv\").load(\"s3a://udon-data-lake/marketing_campaign.csv\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "6ccea55e-6ba0-4414-8588-e1968bddd92b", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "pyspark", 166 | "language": "python", 167 | "name": "pyspark" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.8.10" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 5 184 | } 185 | -------------------------------------------------------------------------------- /_script/docker-mysql/conf/my.cnf: 
-------------------------------------------------------------------------------- 1 | [client] 2 | default-character-set = utf8mb4 3 | 4 | [mysql] 5 | default-character-set = utf8mb4 6 | 7 | [mysqld] 8 | character-set-client-handshake = FALSE 9 | character-set-server = utf8mb4 10 | collation-server = utf8mb4_unicode_ci 11 | default-storage-engine=InnoDB 12 | default-time-zone = '+09:00' 13 | -------------------------------------------------------------------------------- /_script/docker-mysql/sql/001_create_database.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE pipeline; 2 | -------------------------------------------------------------------------------- /_script/docker-mysql/sql/002_create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `ListingMeta` 2 | ( 3 | -- primary key 4 | `listing_id` BIGINT UNSIGNED NOT NULL PRIMARY KEY, 5 | `listing_name` VARCHAR(240) NULL, 6 | `listing_desc` TEXT NULL, 7 | `listing_summary` TEXT NULL, 8 | `listing_url` TEXT NULL, 9 | 10 | -- FK columns 11 | 12 | -- common 13 | `created_at` datetime DEFAULT CURRENT_TIMESTAMP NOT NULL 14 | 15 | ) ENGINE = InnoDB 16 | DEFAULT CHARSET = utf8mb4 17 | COLLATE = utf8mb4_unicode_ci; 18 | -------------------------------------------------------------------------------- /_script/docker-spark/apps/main.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import col,date_format 3 | 4 | def init_spark(): 5 | sql = SparkSession.builder\ 6 | .appName("trip-app")\ 7 | .config("spark.jars", "/opt/spark-apps/postgresql-42.2.22.jar")\ 8 | .getOrCreate() 9 | sc = sql.sparkContext 10 | return sql,sc 11 | 12 | def main(): 13 | url = "jdbc:postgresql://storage-postgres:5432/postgres" 14 | properties = { 15 | "user": "postgres", 16 | "password": "root", 17 | "driver": "org.postgresql.Driver" 18 | } 19 | file = "/opt/spark-data/MTA_2014_08_01.csv" 20 | sql,sc = init_spark() 21 | 22 | df = sql.read.load(file,format = "csv", inferSchema="true", sep="\t", header="true") \ 23 | .withColumn("report_hour",date_format(col("time_received"),"yyyy-MM-dd HH:00:00")) \ 24 | .withColumn("report_date",date_format(col("time_received"),"yyyy-MM-dd")) 25 | 26 | # Filter invalid coordinates 27 | df.where("latitude <= 90 AND latitude >= -90 AND longitude <= 180 AND longitude >= -180") \ 28 | .where("latitude != 0.000000 OR longitude != 0.000000 ") \ 29 | .write \ 30 | .jdbc(url=url, table="mta_reports", mode='append', properties=properties) 31 | # DataFrameWriter.jdbc() performs the write itself, so no additional save() call is needed 32 | 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /_script/docker-spark/apps/postgresql-42.2.22.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/_script/docker-spark/apps/postgresql-42.2.22.jar -------------------------------------------------------------------------------- /_script/docker-spark/conf/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.eventLog.dir file:/tmp/spark-events 2 | spark.eventLog.enabled true 3 | spark.history.fs.logDirectory file:/tmp/spark-events -------------------------------------------------------------------------------- /_script/docker-spark/data/.gitignore:
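
The docker-spark pieces above are wired together by docker-compose.spark.yml, which mounts _script/docker-spark/apps and _script/docker-spark/data into the containers as /opt/spark-apps and /opt/spark-data; main.py expects /opt/spark-data/MTA_2014_08_01.csv and writes to the storage-postgres service. As a minimal sketch (not part of the repository scripts), the job could be submitted after running make compose.spark roughly as follows; the /spark install path is an assumption based on the bde2020 images, and the CSV must first be placed in _script/docker-spark/data, which is gitignored:

# hypothetical submission command; adjust the spark-submit path if your image differs
docker exec -it spark-master /spark/bin/spark-submit \
  --master spark://spark-master:7077 \
  --jars /opt/spark-apps/postgresql-42.2.22.jar \
  /opt/spark-apps/main.py
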
-------------------------------------------------------------------------------- 1 | *.csv 2 | -------------------------------------------------------------------------------- /_slide/.gitignore: -------------------------------------------------------------------------------- 1 | practical-aws-pipeline/ 2 | practical-spark -------------------------------------------------------------------------------- /_volume/.gitignore: -------------------------------------------------------------------------------- 1 | docker-minio/ -------------------------------------------------------------------------------- /docker-compose.aws.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | dynamodb-local: 4 | image: amazon/dynamodb-local:latest 5 | container_name: dynamodb-local 6 | ports: 7 | - "8000:8000" 8 | 9 | dynamodb-admin: 10 | image: aaronshaf/dynamodb-admin 11 | ports: 12 | - "8001:8001" 13 | environment: 14 | DYNAMO_ENDPOINT: "http://dynamodb-local:8000" 15 | AWS_REGION: "ap-northeast-2" 16 | AWS_ACCESS_KEY_ID: accesskey 17 | AWS_SECRET_ACCESS_KEY: secretkey 18 | depends_on: 19 | - dynamodb-local 20 | 21 | minio: 22 | image: minio/minio:latest 23 | container_name: minio 24 | environment: 25 | - MINIO_ACCESS_KEY=accesskey 26 | - MINIO_SECRET_KEY=secretkey 27 | - MINIO_ROOT_USER=admin 28 | - MINIO_ROOT_PASSWORD=admin12345 29 | volumes: 30 | - ./_volume/docker-minio:/data 31 | ports: 32 | - "9000:9000" 33 | - "9001:9001" 34 | command: server /data --console-address ":9001" 35 | 36 | minio-script: 37 | image: minio/mc 38 | container_name: minio-script 39 | depends_on: 40 | - minio 41 | entrypoint: > 42 | /bin/sh -c " 43 | sleep 10s; 44 | /usr/bin/mc alias set myminio http://minio:9000 admin admin12345; 45 | /usr/bin/mc mb myminio/udon-data-lake || true; 46 | /usr/bin/mc admin user add myminio accesskey accesskey || true; 47 | /usr/bin/mc admin policy set myminio readwrite user=accesskey || true; 48 | exit 0; 49 | " 50 | -------------------------------------------------------------------------------- /docker-compose.kafka.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | zookeeper: 4 | image: confluentinc/cp-zookeeper:6.2.1 5 | hostname: zookeeper 6 | container_name: zookeeper 7 | ports: 8 | - "2181:2181" 9 | environment: 10 | ZOOKEEPER_CLIENT_PORT: 2181 11 | ZOOKEEPER_TICK_TIME: 2000 12 | 13 | broker: 14 | image: confluentinc/cp-kafka:6.2.1 15 | hostname: broker 16 | container_name: broker 17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - "29092:29092" 21 | - "9092:9092" 22 | - "9101:9101" 23 | environment: 24 | KAFKA_BROKER_ID: 1 25 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 26 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 29 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 30 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 31 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 32 | KAFKA_JMX_PORT: 9101 33 | KAFKA_JMX_OPTS: -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Djava.rmi.server.hostname=kafka0 -Dcom.sun.management.jmxremote.rmi.port=9101 34 | KAFKA_JMX_HOSTNAME: localhost 35 | 36 | schema-registry: 37 | image: confluentinc/cp-schema-registry:6.2.1 38 | hostname: schema-registry 39 | container_name: 
schema-registry 40 | depends_on: 41 | - broker 42 | ports: 43 | - "8081:8081" 44 | environment: 45 | SCHEMA_REGISTRY_HOST_NAME: schema-registry 46 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'broker:29092' 47 | SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081 48 | 49 | kafka-ui: 50 | image: provectuslabs/kafka-ui:latest 51 | container_name: kafka-ui 52 | depends_on: 53 | - broker 54 | - zookeeper 55 | - schema-registry 56 | ports: 57 | - "8080:8080" 58 | restart: always 59 | environment: 60 | - KAFKA_CLUSTERS_0_NAME=local 61 | - KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=broker:29092 62 | - KAFKA_CLUSTERS_0_ZOOKEEPER=zookeeper:2181 63 | - KAFKA_CLUSTERS_0_SCHEMAREGISTRY=schema-registry:8081 64 | - KAFKA_CLUSTERS_0_JMXPORT=9101 -------------------------------------------------------------------------------- /docker-compose.metastore.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | mysql: 4 | image: mysql:8 5 | container_name: mysql 6 | restart: always 7 | ports: 8 | - "3306:3306" 9 | environment: 10 | - MYSQL_DATABASE=metastore_db 11 | - MYSQL_ROOT_PASSWORD=root 12 | - LANG=C.UTF-8 13 | volumes: 14 | - ./_script/docker-mysql/conf/:/etc/mysql/conf.d 15 | - ./_script/docker-mysql/sql/:/docker-entrypoint-initdb.d 16 | command: --sql_mode='' 17 | security_opt: 18 | - seccomp:unconfined 19 | 20 | minio: 21 | image: minio/minio:latest 22 | container_name: minio 23 | environment: 24 | - MINIO_ACCESS_KEY=accesskey 25 | - MINIO_SECRET_KEY=secretkey 26 | - MINIO_ROOT_USER=admin 27 | - MINIO_ROOT_PASSWORD=admin12345 28 | volumes: 29 | - ./_volume/docker-minio:/data 30 | ports: 31 | - "9000:9000" 32 | - "9001:9001" 33 | command: server /data --console-address ":9001" 34 | 35 | minio-script: 36 | image: minio/mc 37 | container_name: minio-script 38 | depends_on: 39 | - minio 40 | entrypoint: > 41 | /bin/sh -c " 42 | /usr/bin/mc alias set myminio http://minio:9000 admin admin12345; 43 | /usr/bin/mc mb myminio/udon-data-lake || true; 44 | # /usr/bin/mc admin user add myminio accesskey secretkey || true; 45 | # /usr/bin/mc admin policy set myminio readwrite user=accesskey || true; 46 | exit 0; 47 | " 48 | 49 | hive-metastore: 50 | container_name: hive-metastore 51 | build: 52 | context: _dockerfile/docker-metastore 53 | dockerfile: Dockerfile 54 | command: 55 | - /bin/sh 56 | - -c 57 | - | 58 | sleep 10; 59 | /entrypoint.sh 60 | ports: 61 | - "9083:9083" 62 | depends_on: 63 | - mysql 64 | - minio 65 | -------------------------------------------------------------------------------- /docker-compose.presto.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | presto-coordinator: 4 | container_name: presto-coordinator 5 | build: 6 | context: _dockerfile/docker-presto 7 | dockerfile: Dockerfile 8 | environment: 9 | - PRESTO_NODE_ID=presto-coordinator 10 | ports: 11 | - "8889:8080" 12 | volumes: 13 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog 14 | 15 | presto-worker-01: 16 | container_name: presto-worker-01 17 | build: 18 | context: _dockerfile/docker-presto 19 | dockerfile: Dockerfile 20 | environment: 21 | - PRESTO_COORDINATOR=presto-coordinator 22 | - PRESTO_NODE_ID=presto-worker-01 23 | volumes: 24 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog 25 | depends_on: 26 | - presto-coordinator 27 | command: 28 | - /bin/sh 29 | - -c 30 | - | 31 | sleep 20; 32 | /entrypoint.sh 33 | 34 | presto-worker-02: 35 | container_name: 
presto-worker-02 36 | build: 37 | context: _dockerfile/docker-presto 38 | dockerfile: Dockerfile 39 | environment: 40 | - PRESTO_COORDINATOR=presto-coordinator 41 | - PRESTO_NODE_ID=presto-worker-02 42 | volumes: 43 | - ./_dockerfile/docker-presto/etc/catalog:/opt/presto/etc/catalog 44 | depends_on: 45 | - presto-coordinator 46 | command: 47 | - /bin/sh 48 | - -c 49 | - | 50 | sleep 20; 51 | /entrypoint.sh 52 | -------------------------------------------------------------------------------- /docker-compose.spark.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | spark-master: 4 | image: bde2020/spark-master:3.1.1-hadoop3.2 5 | container_name: spark-master 6 | ports: 7 | - "8080:8080" 8 | - "7077:7077" 9 | - "4040:4040" 10 | volumes: 11 | - ./_script/docker-spark/apps:/opt/spark-apps 12 | - ./_script/docker-spark/data:/opt/spark-data 13 | - ./_script/docker-spark/conf:/spark/conf 14 | - /tmp/spark-events-local:/tmp/spark-events 15 | environment: 16 | - INIT_DAEMON_STEP=setup_spark 17 | 18 | spark-worker-1: 19 | image: bde2020/spark-worker:3.1.1-hadoop3.2 20 | container_name: spark-worker-1 21 | depends_on: 22 | - spark-master 23 | ports: 24 | - "8081:8081" 25 | volumes: 26 | - ./_script/docker-spark/apps:/opt/spark-apps 27 | - ./_script/docker-spark/data:/opt/spark-data 28 | - ./_script/docker-spark/conf:/spark/conf 29 | - /tmp/spark-events-local:/tmp/spark-events 30 | environment: 31 | - "SPARK_MASTER=spark://spark-master:7077" 32 | spark-worker-2: 33 | 34 | image: bde2020/spark-worker:3.1.1-hadoop3.2 35 | container_name: spark-worker-2 36 | depends_on: 37 | - spark-master 38 | ports: 39 | - "8082:8081" 40 | volumes: 41 | - ./_script/docker-spark/apps:/opt/spark-apps 42 | - ./_script/docker-spark/data:/opt/spark-data 43 | - ./_script/docker-spark/conf:/spark/conf 44 | - /tmp/spark-events-local:/tmp/spark-events 45 | environment: 46 | - "SPARK_MASTER=spark://spark-master:7077" 47 | 48 | spark-history-server: 49 | image: bde2020/spark-history-server:3.1.1-hadoop3.2 50 | container_name: spark-history-server 51 | depends_on: 52 | - spark-master 53 | ports: 54 | - "18081:18081" 55 | volumes: 56 | - ./_script/docker-spark/apps:/opt/spark-apps 57 | - ./_script/docker-spark/data:/opt/spark-data 58 | - ./_script/docker-spark/conf:/spark/conf 59 | - /tmp/spark-events-local:/tmp/spark-events 60 | storage-postgres: 61 | image: postgres:11.7-alpine 62 | container_name: storage-postgres 63 | depends_on: 64 | - spark-master 65 | ports: 66 | - "5432:5432" 67 | environment: 68 | - POSTGRES_PASSWORD=root -------------------------------------------------------------------------------- /docker-compose.storage.yml: -------------------------------------------------------------------------------- 1 | version: '3.6' 2 | services: 3 | mysql: 4 | image: mysql:8 5 | container_name: mysql 6 | restart: always 7 | ports: 8 | - 3306:3306 9 | environment: 10 | - MYSQL_DATABASE=pipeline 11 | - MYSQL_ROOT_PASSWORD=root 12 | - LANG=C.UTF-8 13 | volumes: 14 | - ./_script/docker-mysql/conf/:/etc/mysql/conf.d 15 | - ./_script/docker-mysql/sql/:/docker-entrypoint-initdb.d 16 | command: --sql_mode='' 17 | 18 | redis: 19 | image: redis:5 20 | container_name: redis 21 | restart: always 22 | command: redis-server # --requirepass credential 23 | ports: 24 | - 6379:6379 25 | 26 | -------------------------------------------------------------------------------- /project-flink/.gitignore:
-------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,scala,java,intellij+iml 4 | 5 | ### Intellij+iml ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # AWS User-specific 17 | .idea/**/aws.xml 18 | 19 | # Generated files 20 | .idea/**/contentModel.xml 21 | 22 | # Sensitive or high-churn files 23 | .idea/**/dataSources/ 24 | .idea/**/dataSources.ids 25 | .idea/**/dataSources.local.xml 26 | .idea/**/sqlDataSources.xml 27 | .idea/**/dynamic.xml 28 | .idea/**/uiDesigner.xml 29 | .idea/**/dbnavigator.xml 30 | 31 | # Gradle 32 | .idea/**/gradle.xml 33 | .idea/**/libraries 34 | 35 | # Gradle and Maven with auto-import 36 | # When using Gradle or Maven with auto-import, you should exclude module files, 37 | # since they will be recreated, and may cause churn. Uncomment if using 38 | # auto-import. 39 | # .idea/artifacts 40 | # .idea/compiler.xml 41 | # .idea/jarRepositories.xml 42 | # .idea/modules.xml 43 | # .idea/*.iml 44 | # .idea/modules 45 | # *.iml 46 | # *.ipr 47 | 48 | # CMake 49 | cmake-build-*/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | 78 | # Android studio 3.1+ serialized cache file 79 | .idea/caches/build_file_checksums.ser 80 | 81 | ### Intellij+iml Patch ### 82 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 83 | 84 | *.iml 85 | modules.xml 86 | .idea/misc.xml 87 | *.ipr 88 | 89 | ### Java ### 90 | # Compiled class file 91 | *.class 92 | 93 | # Log file 94 | *.log 95 | 96 | # BlueJ files 97 | *.ctxt 98 | 99 | # Mobile Tools for Java (J2ME) 100 | .mtj.tmp/ 101 | 102 | # Package Files # 103 | *.jar 104 | *.war 105 | *.nar 106 | *.ear 107 | *.zip 108 | *.tar.gz 109 | *.rar 110 | 111 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 112 | hs_err_pid* 113 | 114 | ### Scala ### 115 | 116 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 117 | 118 | ### Gradle ### 119 | .gradle 120 | build/ 121 | 122 | # Ignore Gradle GUI config 123 | gradle-app.setting 124 | 125 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 126 | !gradle-wrapper.jar 127 | 128 | # Cache of project 129 | .gradletasknamecache 130 | 131 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 132 | # gradle/wrapper/gradle-wrapper.properties 133 | 134 | ### Gradle Patch ### 135 | **/build/ 136 | 137 | # Eclipse Gradle plugin generated files 138 | # Eclipse Core 139 | .project 140 | # JDT-specific (Eclipse Java Development 
Tools) 141 | .classpath 142 | 143 | # End of https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml -------------------------------------------------------------------------------- /project-kafka/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,kotlin,java,intellij+iml,scala 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,kotlin,java,intellij+iml,scala 4 | 5 | ### Intellij+iml ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # AWS User-specific 17 | .idea/**/aws.xml 18 | 19 | # Generated files 20 | .idea/**/contentModel.xml 21 | 22 | # Sensitive or high-churn files 23 | .idea/**/dataSources/ 24 | .idea/**/dataSources.ids 25 | .idea/**/dataSources.local.xml 26 | .idea/**/sqlDataSources.xml 27 | .idea/**/dynamic.xml 28 | .idea/**/uiDesigner.xml 29 | .idea/**/dbnavigator.xml 30 | 31 | # Gradle 32 | .idea/**/gradle.xml 33 | .idea/**/libraries 34 | 35 | # Gradle and Maven with auto-import 36 | # When using Gradle or Maven with auto-import, you should exclude module files, 37 | # since they will be recreated, and may cause churn. Uncomment if using 38 | # auto-import. 39 | # .idea/artifacts 40 | # .idea/compiler.xml 41 | # .idea/jarRepositories.xml 42 | # .idea/modules.xml 43 | # .idea/*.iml 44 | # .idea/modules 45 | # *.iml 46 | # *.ipr 47 | 48 | # CMake 49 | cmake-build-*/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | 78 | # Android studio 3.1+ serialized cache file 79 | .idea/caches/build_file_checksums.ser 80 | 81 | ### Intellij+iml Patch ### 82 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 83 | 84 | *.iml 85 | modules.xml 86 | .idea/misc.xml 87 | *.ipr 88 | 89 | ### Java ### 90 | # Compiled class file 91 | *.class 92 | 93 | # Log file 94 | *.log 95 | 96 | # BlueJ files 97 | *.ctxt 98 | 99 | # Mobile Tools for Java (J2ME) 100 | .mtj.tmp/ 101 | 102 | # Package Files # 103 | *.jar 104 | *.war 105 | *.nar 106 | *.ear 107 | *.zip 108 | *.tar.gz 109 | *.rar 110 | 111 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 112 | hs_err_pid* 113 | 114 | ### Kotlin ### 115 | # Compiled class file 116 | 117 | # Log file 118 | 119 | # BlueJ files 120 | 121 | # Mobile Tools for Java (J2ME) 122 | 123 | # Package Files # 124 | 125 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 126 | 127 | ### Scala ### 128 | 129 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 130 | 131 | ### Gradle ### 132 | .gradle 133 | build/ 134 | 135 | # Ignore 
Gradle GUI config 136 | gradle-app.setting 137 | 138 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 139 | !gradle-wrapper.jar 140 | 141 | # Cache of project 142 | .gradletasknamecache 143 | 144 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 145 | # gradle/wrapper/gradle-wrapper.properties 146 | 147 | ### Gradle Patch ### 148 | **/build/ 149 | 150 | # Eclipse Gradle plugin generated files 151 | # Eclipse Core 152 | .project 153 | # JDT-specific (Eclipse Java Development Tools) 154 | .classpath 155 | 156 | # End of https://www.toptal.com/developers/gitignore/api/gradle,kotlin,java,intellij+iml,scala -------------------------------------------------------------------------------- /project-spark/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=gradle,scala,java,intellij+iml 3 | 4 | _volumes/** 5 | 6 | ### Intellij+iml ### 7 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 8 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 9 | 10 | # User-specific stuff 11 | .idea/**/workspace.xml 12 | .idea/**/tasks.xml 13 | .idea/**/usage.statistics.xml 14 | .idea/**/dictionaries 15 | .idea/**/shelf 16 | 17 | # AWS User-specific 18 | .idea/**/aws.xml 19 | 20 | # Generated files 21 | .idea/**/contentModel.xml 22 | 23 | # Sensitive or high-churn files 24 | .idea/**/dataSources/ 25 | .idea/**/dataSources.ids 26 | .idea/**/dataSources.local.xml 27 | .idea/**/sqlDataSources.xml 28 | .idea/**/dynamic.xml 29 | .idea/**/uiDesigner.xml 30 | .idea/**/dbnavigator.xml 31 | 32 | # Gradle 33 | .idea/**/gradle.xml 34 | .idea/**/libraries 35 | 36 | # Gradle and Maven with auto-import 37 | # When using Gradle or Maven with auto-import, you should exclude module files, 38 | # since they will be recreated, and may cause churn. Uncomment if using 39 | # auto-import. 
40 | # .idea/artifacts 41 | # .idea/compiler.xml 42 | # .idea/jarRepositories.xml 43 | # .idea/modules.xml 44 | # .idea/*.iml 45 | # .idea/modules 46 | # *.iml 47 | # *.ipr 48 | 49 | # CMake 50 | cmake-build-*/ 51 | 52 | # Mongo Explorer plugin 53 | .idea/**/mongoSettings.xml 54 | 55 | # File-based project format 56 | *.iws 57 | 58 | # IntelliJ 59 | out/ 60 | 61 | # mpeltonen/sbt-idea plugin 62 | .idea_modules/ 63 | 64 | # JIRA plugin 65 | atlassian-ide-plugin.xml 66 | 67 | # Cursive Clojure plugin 68 | .idea/replstate.xml 69 | 70 | # Crashlytics plugin (for Android Studio and IntelliJ) 71 | com_crashlytics_export_strings.xml 72 | crashlytics.properties 73 | crashlytics-build.properties 74 | fabric.properties 75 | 76 | # Editor-based Rest Client 77 | .idea/httpRequests 78 | 79 | # Android studio 3.1+ serialized cache file 80 | .idea/caches/build_file_checksums.ser 81 | 82 | ### Intellij+iml Patch ### 83 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 84 | 85 | *.iml 86 | modules.xml 87 | .idea/misc.xml 88 | *.ipr 89 | 90 | ### Java ### 91 | # Compiled class file 92 | *.class 93 | 94 | # Log file 95 | *.log 96 | 97 | # BlueJ files 98 | *.ctxt 99 | 100 | # Mobile Tools for Java (J2ME) 101 | .mtj.tmp/ 102 | 103 | # Package Files # 104 | *.jar 105 | *.war 106 | *.nar 107 | *.ear 108 | *.zip 109 | *.tar.gz 110 | *.rar 111 | 112 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 113 | hs_err_pid* 114 | 115 | ### Scala ### 116 | 117 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 118 | 119 | ### Gradle ### 120 | .gradle 121 | build/ 122 | 123 | # Ignore Gradle GUI config 124 | gradle-app.setting 125 | 126 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 127 | !gradle-wrapper.jar 128 | 129 | # Cache of project 130 | .gradletasknamecache 131 | 132 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 133 | # gradle/wrapper/gradle-wrapper.properties 134 | 135 | ### Gradle Patch ### 136 | **/build/ 137 | 138 | # Eclipse Gradle plugin generated files 139 | # Eclipse Core 140 | .project 141 | # JDT-specific (Eclipse Java Development Tools) 142 | .classpath 143 | 144 | # End of https://www.toptal.com/developers/gitignore/api/gradle,scala,java,intellij+iml -------------------------------------------------------------------------------- /project-spark/_scripts/mysql-ddl/table_property_stat.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE pipeline.property_stat 2 | ( 3 | property_id BIGINT UNSIGNED NOT NULL, 4 | property_type VARCHAR(30) NOT NULL, 5 | lat DOUBLE(40, 10) NOT NULL, 6 | lng DOUBLE(40, 10) NOT NULL, 7 | 8 | count_review_all BIGINT UNSIGNED NOT NULL, 9 | score_review_all DOUBLE(10, 5) NOT NULL, 10 | 11 | count_review BIGINT UNSIGNED NOT NULL, 12 | count_sales BIGINT UNSIGNED NOT NULL, 13 | price_sales BIGINT UNSIGNED NOT NULL, 14 | 15 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL, 16 | updated_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL, 17 | 18 | part TIMESTAMP NOT NULL COMMENT '데이터 파티션', 19 | 20 | PRIMARY KEY (property_id, part), 21 | INDEX idx_property_stat_combined (part, property_id) 22 | 23 | ) ENGINE = InnoDB 24 | DEFAULT CHARSET = utf8mb4 25 | COLLATE = utf8mb4_unicode_ci; 26 | 27 | -------------------------------------------------------------------------------- /project-spark/build.gradle: 
-------------------------------------------------------------------------------- 1 | buildscript { 2 | ext { 3 | gradleShadowVersion = '6.1.0' 4 | gradleTestLoggerVersion = '2.1.0' 5 | gradleScalaTestVersion = '0.30' 6 | gradleVersioningPluginVersion = '2.8.2' 7 | gradleAvroPluginVersion = '1.2.0' 8 | } 9 | 10 | repositories { 11 | mavenCentral() 12 | jcenter() 13 | 14 | maven { url "https://plugins.gradle.org/m2/" } 15 | maven { url 'https://repo.spring.io/plugins-release' } 16 | maven { 17 | name "typesafe-maven-release" 18 | url "https://repo.typesafe.com/typesafe/maven-releases" 19 | } 20 | maven { 21 | name "Spark Packages Repo" 22 | url "https://dl.bintray.com/spark-packages/maven" 23 | } 24 | maven { 25 | name "Confluent" 26 | url "https://packages.confluent.io/maven/" 27 | } 28 | maven { 29 | name "jitpack" 30 | url 'https://jitpack.io' 31 | } 32 | ivy { 33 | name "typesafe-ivy-release" 34 | url "https://repo.typesafe.com/typesafe/ivy-releases" 35 | layout "ivy" 36 | } 37 | } 38 | 39 | dependencies { 40 | classpath "com.github.jengelman.gradle.plugins:shadow:${gradleShadowVersion}" 41 | classpath "gradle.plugin.net.nemerosa:versioning:${gradleVersioningPluginVersion}" 42 | classpath "com.github.davidmc24.gradle.plugin:gradle-avro-plugin:${gradleAvroPluginVersion}" 43 | 44 | // classpath "gradle.plugin.com.github.maiflai:gradle-scalatest:${gradleScalaTestVersion}" 45 | // classpath "com.adarshr:gradle-test-logger-plugin:${gradleTestLoggerVersion}" 46 | } 47 | } 48 | 49 | allprojects { 50 | apply plugin: 'idea' 51 | apply plugin: 'java' 52 | apply plugin: 'java-library' 53 | apply plugin: 'scala' 54 | 55 | // apply plugin: 'com.adarshr.test-logger' 56 | // apply plugin: "com.github.maiflai.scalatest" 57 | 58 | repositories { 59 | mavenCentral() 60 | maven { url "https://jcenter.bintray.com" } 61 | maven { 62 | name "Confluent" 63 | url "https://packages.confluent.io/maven/" 64 | } 65 | } 66 | 67 | ext { 68 | // Scala 69 | scalaVersionRevision = "12" 70 | 71 | // Spark 72 | scalaSparkVersion = "2.12" 73 | sparkVersion = "3.2.0" 74 | confluentVersion = "5.3.4" 75 | 76 | // Flink 77 | kafkaClientVersion = "2.6.2" 78 | 79 | // MySQL 80 | mysqlDriverVersion = "8.0.27" 81 | 82 | // AWS 83 | awsSdkVersion = "1.11.901" 84 | awsHadoopVersion = "3.3.1" 85 | 86 | // Utility 87 | typesafeConfigVersion = "1.3.3" 88 | shapelessVersion = "2.3.3" 89 | pureconfigVersion = "0.17.0" 90 | json4sVersion = '3.6.5' 91 | avroVersion = '1.10.2' 92 | semverVresion = '2.2.0' 93 | scalaHttpVersion = "2.0.0-RC6" 94 | 95 | // Logging 96 | slf4jVersion = "1.7.30" 97 | log4jVersion = "2.16.0" 98 | 99 | // Test 100 | scalaTestVersion = "3.2.5" 101 | junit5Version = "5.5.2" 102 | } 103 | 104 | dependencies { 105 | implementation("org.apache.commons:commons-lang3:3.12.0") 106 | 107 | implementation("com.typesafe:config:${typesafeConfigVersion}") 108 | implementation("com.github.pureconfig:pureconfig_${scalaSparkVersion}:${pureconfigVersion}") 109 | implementation("com.vdurmont:semver4j:${semverVresion}") 110 | 111 | // test 112 | testImplementation("org.scalatest:scalatest_${scalaSparkVersion}:${scalaTestVersion}") 113 | testImplementation "org.junit.platform:junit-platform-launcher:1.7.1" 114 | testRuntimeOnly "org.junit.platform:junit-platform-engine:1.7.1" 115 | testImplementation("org.junit.jupiter:junit-jupiter-api:${junit5Version}") 116 | testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:${junit5Version}") 117 | testRuntimeOnly "co.helmethair:scalatest-junit-runner:0.1.8" 118 | } 119 | } 120 | 121 | 
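// The ext { } block in allprojects above acts as a shared version catalog: each module interpolates
// these properties into its own dependency coordinates (see module-infra-spark/build.gradle further
// down, e.g. api("org.apache.spark:spark-sql_${scalaSparkVersion}:${sparkVersion}")), so bumping a
// Spark or Kafka version happens in exactly one place.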
subprojects { 122 | targetCompatibility = 1.8 123 | sourceCompatibility = 1.8 124 | [compileJava, compileTestJava]*.options.collect { 125 | options -> options.encoding = 'UTF-8' 126 | } 127 | 128 | task wrapper(type: Wrapper) { 129 | gradleVersion = '6.8.1' 130 | } 131 | 132 | tasks.withType(ScalaCompile) { 133 | configure(scalaCompileOptions.forkOptions) { 134 | memoryMaximumSize = '2g' 135 | jvmArgs = ['-XX:MaxMetaspaceSize=512m'] 136 | } 137 | } 138 | 139 | compileScala { 140 | targetCompatibility = "1.8" 141 | sourceCompatibility = "1.8" 142 | scalaCompileOptions.additionalParameters = [""] // -opt:l:method 143 | } 144 | compileTestScala { 145 | scalaCompileOptions.additionalParameters = ["-Yrangepos"] 146 | } 147 | 148 | test { 149 | useJUnitPlatform { 150 | includeEngines 'scalatest' 151 | testLogging { 152 | events("passed", "skipped", "failed") 153 | } 154 | } 155 | 156 | filter { 157 | includeTestsMatching "*Spec" 158 | } 159 | } 160 | 161 | configurations { 162 | localCompile { 163 | transitive = true 164 | } 165 | } 166 | 167 | } 168 | -------------------------------------------------------------------------------- /project-spark/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0.2-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # 4 | # Copyright 2015 the original author or authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | ## 21 | ## Gradle start up script for UN*X 22 | ## 23 | ############################################################################## 24 | 25 | # Attempt to set APP_HOME 26 | # Resolve links: $0 may be a link 27 | PRG="$0" 28 | # Need this for relative symlinks. 29 | while [ -h "$PRG" ] ; do 30 | ls=`ls -ld "$PRG"` 31 | link=`expr "$ls" : '.*-> \(.*\)$'` 32 | if expr "$link" : '/.*' > /dev/null; then 33 | PRG="$link" 34 | else 35 | PRG=`dirname "$PRG"`"/$link" 36 | fi 37 | done 38 | SAVED="`pwd`" 39 | cd "`dirname \"$PRG\"`/" >/dev/null 40 | APP_HOME="`pwd -P`" 41 | cd "$SAVED" >/dev/null 42 | 43 | APP_NAME="Gradle" 44 | APP_BASE_NAME=`basename "$0"` 45 | 46 | # Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 48 | 49 | # Use the maximum available, or set MAX_FD != -1 to use that value. 50 | MAX_FD="maximum" 51 | 52 | warn () { 53 | echo "$*" 54 | } 55 | 56 | die () { 57 | echo 58 | echo "$*" 59 | echo 60 | exit 1 61 | } 62 | 63 | # OS specific support (must be 'true' or 'false'). 64 | cygwin=false 65 | msys=false 66 | darwin=false 67 | nonstop=false 68 | case "`uname`" in 69 | CYGWIN* ) 70 | cygwin=true 71 | ;; 72 | Darwin* ) 73 | darwin=true 74 | ;; 75 | MINGW* ) 76 | msys=true 77 | ;; 78 | NONSTOP* ) 79 | nonstop=true 80 | ;; 81 | esac 82 | 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 84 | 85 | 86 | # Determine the Java command to use to start the JVM. 87 | if [ -n "$JAVA_HOME" ] ; then 88 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 89 | # IBM's JDK on AIX uses strange locations for the executables 90 | JAVACMD="$JAVA_HOME/jre/sh/java" 91 | else 92 | JAVACMD="$JAVA_HOME/bin/java" 93 | fi 94 | if [ ! -x "$JAVACMD" ] ; then 95 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 96 | 97 | Please set the JAVA_HOME variable in your environment to match the 98 | location of your Java installation." 99 | fi 100 | else 101 | JAVACMD="java" 102 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 103 | 104 | Please set the JAVA_HOME variable in your environment to match the 105 | location of your Java installation." 106 | fi 107 | 108 | # Increase the maximum file descriptors if we can. 109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 110 | MAX_FD_LIMIT=`ulimit -H -n` 111 | if [ $? -eq 0 ] ; then 112 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 113 | MAX_FD="$MAX_FD_LIMIT" 114 | fi 115 | ulimit -n $MAX_FD 116 | if [ $? 
-ne 0 ] ; then 117 | warn "Could not set maximum file descriptor limit: $MAX_FD" 118 | fi 119 | else 120 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 121 | fi 122 | fi 123 | 124 | # For Darwin, add options to specify how the application appears in the dock 125 | if $darwin; then 126 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 127 | fi 128 | 129 | # For Cygwin or MSYS, switch paths to Windows format before running java 130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then 131 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 132 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 133 | 134 | JAVACMD=`cygpath --unix "$JAVACMD"` 135 | 136 | # We build the pattern for arguments to be converted via cygpath 137 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 138 | SEP="" 139 | for dir in $ROOTDIRSRAW ; do 140 | ROOTDIRS="$ROOTDIRS$SEP$dir" 141 | SEP="|" 142 | done 143 | OURCYGPATTERN="(^($ROOTDIRS))" 144 | # Add a user-defined pattern to the cygpath arguments 145 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 146 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 147 | fi 148 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 149 | i=0 150 | for arg in "$@" ; do 151 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 152 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 153 | 154 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 155 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 156 | else 157 | eval `echo args$i`="\"$arg\"" 158 | fi 159 | i=`expr $i + 1` 160 | done 161 | case $i in 162 | 0) set -- ;; 163 | 1) set -- "$args0" ;; 164 | 2) set -- "$args0" "$args1" ;; 165 | 3) set -- "$args0" "$args1" "$args2" ;; 166 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;; 167 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 168 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 169 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 170 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 171 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 172 | esac 173 | fi 174 | 175 | # Escape application args 176 | save () { 177 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 178 | echo " " 179 | } 180 | APP_ARGS=`save "$@"` 181 | 182 | # Collect all arguments for the java command, following the shell quoting and substitution rules 183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 184 | 185 | exec "$JAVACMD" "$@" 186 | -------------------------------------------------------------------------------- /project-spark/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 
6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/module-core/build.gradle: -------------------------------------------------------------------------------- 1 | dependencies { 2 | // custom 3 | // https://mvnrepository.com/artifact/org.apache.flink/flink-avro-confluent-registry 4 | // https://mvnrepository.com/artifact/org.apache.avro/avro 5 | api("org.apache.avro:avro:${avroVersion}") 6 | 7 | api("org.json4s:json4s-jackson_${scalaSparkVersion}:${json4sVersion}") 8 | api("org.json4s:json4s-ext_${scalaSparkVersion}:${json4sVersion}") 9 | 10 | // logging 11 | api("org.apache.logging.log4j:log4j-api:${log4jVersion}") 12 | api("org.apache.logging.log4j:log4j-core:${log4jVersion}") 13 | api("org.apache.logging.log4j:log4j-slf4j-impl:${log4jVersion}") 14 | api("org.slf4j:slf4j-log4j12:${slf4jVersion}") 15 | } 16 | -------------------------------------------------------------------------------- /project-spark/module-core/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/module-core/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/module-core/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/module-core/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/common/Environment.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.common 2 | 3 | import pureconfig.generic.ProductHint 4 | import pureconfig.{CamelCase, ConfigFieldMapping, ConfigReader, ConfigSource} 5 | 6 | import scala.reflect.ClassTag 7 | 8 | object Environment { 9 | /** deployment */ 10 | private val LOCAL = "LOCAL" 11 | private val DEVELOPMENT = "DEV" 12 | private val STAGING = "STAGE" 13 | private val PRODUCTION = "PROD" 14 | 15 | /** testing */ 16 | private val UNIT = "UNIT" 17 | private val INTEGRATION = "INTEGRATION" 18 | 19 | private val mode = { 20 | var env: String = LOCAL 21 | 22 | val extractedEnv = System.getenv("PIPELINE_MODE") 23 | if (extractedEnv != null) { 24 | env = extractedEnv.toLowerCase() 25 | } 26 | 27 | env 28 | } 29 | 30 | def isLocalMode(): Boolean = { 31 | mode == LOCAL 32 | } 33 | 34 | /** 35 | * pureconfig 내에서 camel-case 사용을 위한 implicit 변수 생성 36 | * - https://pureconfig.github.io/docs/overriding-behavior-for-case-classes.html#field-mappings 37 | */ 38 | def buildConfigHint[T](): ProductHint[T] = { 39 | return ProductHint[T](ConfigFieldMapping(CamelCase, CamelCase)) 40 | } 41 | 42 | /** 43 | * 모드에 따라 다른 설정값 로딩하기 위한 함수 44 | */ 45 | def getConfigOrThrow[T: ClassTag : ConfigReader]()(implicit productHint: ProductHint[T]): T = { 46 | val config = ConfigSource.default.at(mode).loadOrThrow[T] 47 | config 48 | } 49 | 50 | def getConfigOrThrowForApp[T: ClassTag : ConfigReader](app: String)(implicit productHint: ProductHint[T]): T = { 51 | val config = ConfigSource.default.at(mode).at(app).loadOrThrow[T] 52 | config 53 | } 54 | } 55 | -------------------------------------------------------------------------------- 
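Environment above is the single configuration entry point for the Spark jobs further down (for example UdonProductPoolBatch): PIPELINE_MODE selects the top-level HOCON block (LOCAL when unset), buildConfigHint keeps the keys camelCase, and getConfigOrThrow materializes the active block into a case class through pureconfig. The following is a minimal sketch of that wiring; the mkt.udon.example package, MyJobConfig, and its two fields are illustrative placeholders rather than classes from this repository, and the active block in application.conf is assumed to define matching keys.

package mkt.udon.example

import mkt.udon.core.common.Environment
import pureconfig.generic.auto._

// Illustrative config class: field names must match the HOCON keys of the active block (camelCase)
case class MyJobConfig(dynamoTable: String, expireDays: Int)

object MyJobConfigExample {
  def main(args: Array[String]): Unit = {
    // camelCase field mapping, needed because pureconfig defaults to kebab-case keys
    implicit val configHint = Environment.buildConfigHint[MyJobConfig]()

    // Loads the block selected by PIPELINE_MODE (the LOCAL block when the variable is unset)
    val config = Environment.getConfigOrThrow[MyJobConfig]()
    println(config)
  }
}

Because the application.conf files further down declare each key twice, once with a literal default and once with a ${?ENV_VAR} override, a deployment can adjust individual fields through environment variables without editing the file.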
/project-spark/module-core/src/main/scala/mkt/udon/core/common/TimeUtil.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.common 2 | 3 | import java.time.format.DateTimeFormatter 4 | import java.time.{Instant, LocalDate, LocalDateTime, ZoneOffset} 5 | 6 | object TimeUtil { 7 | 8 | /** 9 | * @param partition 'yyyyMMdd' formatted String 10 | */ 11 | def convertPartitionToDateString(partition: String): String = { 12 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd") 13 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy-MM-dd") 14 | val parsed = LocalDate.parse(partition, formatterInput) 15 | 16 | return parsed.format(formatterOutput) 17 | } 18 | 19 | /** 20 | * @param partition 'yyyyMMdd' formatted String 21 | */ 22 | def convertPartitionToDateSlashString(partition: String): String = { 23 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd") 24 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy/MM/dd") 25 | val parsed = LocalDate.parse(partition, formatterInput) 26 | 27 | return parsed.format(formatterOutput) 28 | } 29 | 30 | /** 31 | * @param partition 'yyyyMMdd' formatted String 32 | */ 33 | def convertPartitionToSqlTimestamp(partition: String): java.sql.Timestamp = { 34 | val formatterInput = DateTimeFormatter.ofPattern("yyyyMMdd") 35 | val formatterOutput = DateTimeFormatter.ofPattern("yyyy/MM/dd") 36 | val parsed = LocalDate.parse(partition, formatterInput).atStartOfDay() 37 | 38 | return java.sql.Timestamp.valueOf(parsed) 39 | } 40 | 41 | /** 42 | * @param raw Assume the passed parameter has UTC timezone 43 | */ 44 | def convertStringToEpochMillis(raw: String): Long = { 45 | val formatterInput = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") 46 | val parsed = LocalDateTime.parse(raw.substring(0, 19), formatterInput) 47 | 48 | return parsed.atZone(ZoneOffset.UTC).toInstant.toEpochMilli 49 | } 50 | 51 | def getExpireEpochSeconds(expireDays: Int): Long = { 52 | val updatedAt = Instant.now().toEpochMilli 53 | val expireTtl = (updatedAt + (expireDays * 86400 * 1000)) / 1000 54 | return expireTtl 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/entity/ProductPool.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.entity 2 | 3 | case class ProductPoolElement(id: String, rank: Long) 4 | 5 | case class ProductPool(specifier: String, elements: List[ProductPoolElement], elementCount: Long) 6 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/entity/UserEvent.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.entity 2 | 3 | import mkt.udon.core.common.TimeUtil 4 | import org.json4s.{DefaultFormats, Formats} 5 | import org.json4s.jackson.Serialization 6 | 7 | case class UserEvent(eventTime: Long, eventType: String, userId: String, productId: String, price: Double) { 8 | def convertToUserEventView(): UserEventView = { 9 | UserEventView(eventTime, productId) 10 | } 11 | 12 | def convertToUserEventOrder(): UserEventOrder = { 13 | UserEventOrder(eventTime, productId, price) 14 | } 15 | } 16 | 17 | case class UserEventRaw(event_time: String, event_type: String, product_id: String, price: Double, user_id: String) { 18 | def convert(): UserEvent = { 19 | val eventTime = 
TimeUtil.convertStringToEpochMillis(event_time) 20 | UserEvent(eventTime = eventTime, eventType = event_type, userId = user_id, productId = product_id, price = price) 21 | } 22 | } 23 | 24 | object UserEvent { 25 | def convertFromRaw(raw: String): UserEvent = { 26 | implicit val default: Formats = DefaultFormats.preservingEmptyValues 27 | val parsed = Serialization.read[UserEventRaw](raw) 28 | parsed.convert() 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /project-spark/module-core/src/main/scala/mkt/udon/core/entity/UserProfile.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.core.entity 2 | 3 | import mkt.udon.core.entity.UserProfile.{EVENT_ORDER, EVENT_VIEW} 4 | 5 | /** 6 | * User Profile 에 저장될 View Event 입니다. 7 | */ 8 | case class UserEventView(eventTime: Long, productId: String) 9 | /** 10 | * User Profile 에 저장될 Order Event 입니다. 11 | */ 12 | case class UserEventOrder(eventTime: Long, productId: String, price: Double) 13 | 14 | /** 15 | * Dynamo 등의 Storage 에 저장될 수 있는 User Profile 입니다. 16 | * 17 | * totalOrderPrice 와 같이 사용자에 대한 전체 이벤트에 집계를 수행할수도 있습니다. 18 | * eventOrder 등의 경우에는 List 타입이고 무한히 늘어날 수 없으므로 최근 N 개만 저장합니다. 19 | * 20 | * @param specifier 사용자 ID 21 | * @param eventView 최근 상품 방문 이벤트 목록 22 | * @param eventOrder 최근 상품 주문 이벤트 목록 23 | */ 24 | case class UserProfile(specifier: String, 25 | 26 | var eventView: List[UserEventView] = List(), 27 | var eventOrder: List[UserEventOrder] = List()) { 28 | 29 | def update(userEvent: UserEvent, 30 | maxCountView: Int, maxCountOrder: Int): UserProfile = { 31 | 32 | if (userEvent.eventType == EVENT_VIEW) handleView(userEvent.convertToUserEventView(), maxCountView) 33 | else if (userEvent.eventType == EVENT_ORDER) handleOrder(userEvent.convertToUserEventOrder(), maxCountOrder) 34 | 35 | return this 36 | } 37 | 38 | def handleView(eventRecent: UserEventView, maxCount: Int) = { 39 | val merged = (eventView :+ eventRecent) 40 | val sorted = merged.sortBy(x => -x.eventTime).take(maxCount) 41 | 42 | eventView = sorted 43 | } 44 | 45 | def handleOrder(eventRecent: UserEventOrder, maxCount: Int) = { 46 | val merged = (eventOrder :+ eventRecent) 47 | val sorted = merged.sortBy(x => -x.eventTime).take(maxCount) 48 | 49 | eventOrder = sorted 50 | } 51 | 52 | } 53 | 54 | object UserProfile { 55 | val EVENT_VIEW = "view" 56 | val EVENT_ORDER = "order" 57 | 58 | def buildEmpty(userId: String): UserProfile = { 59 | UserProfile(specifier = userId, eventView = List(), eventOrder = List()) 60 | } 61 | } -------------------------------------------------------------------------------- /project-spark/module-infra-spark/build.gradle: -------------------------------------------------------------------------------- 1 | dependencies { 2 | 3 | // shared 4 | implementation project(path: ':module-core') 5 | 6 | // TODO: 클러스터모드에서 동작시에는 기본 라이브러리는 Jar 에 포함될 필요가 없습니다. 7 | // spark: 8 | // - use provided dependencies when building in CI. 9 | // - use compile dependencies for local testing 10 | // if (System.env.PIPELINE_BRANCH) { 11 | // ... 
12 | // } 13 | 14 | api("org.apache.spark:spark-core_${scalaSparkVersion}:${sparkVersion}") { 15 | } 16 | api("org.apache.spark:spark-sql_${scalaSparkVersion}:${sparkVersion}") 17 | api("org.apache.spark:spark-hive_${scalaSparkVersion}:${sparkVersion}") 18 | api("org.apache.spark:spark-mllib_${scalaSparkVersion}:${sparkVersion}") 19 | api("org.apache.spark:spark-streaming_${scalaSparkVersion}:${sparkVersion}") 20 | api("org.apache.spark:spark-avro_${scalaSparkVersion}:${sparkVersion}") 21 | api("org.apache.spark:spark-sql-kafka-0-10_${scalaSparkVersion}:${sparkVersion}") 22 | 23 | api("org.apache.hadoop:hadoop-aws:${awsHadoopVersion}") 24 | api("org.apache.spark:spark-hadoop-cloud_${scalaSparkVersion}:${sparkVersion}") 25 | api("com.amazonaws:aws-java-sdk:${awsSdkVersion}") 26 | 27 | api("mysql:mysql-connector-java:${mysqlDriverVersion}") 28 | 29 | api("org.apache.avro:avro:$avroVersion") 30 | api("org.apache.kafka:kafka-clients:${kafkaClientVersion}") 31 | api("za.co.absa:abris_${scalaSparkVersion}:4.2.0") 32 | api("io.confluent:kafka-avro-serializer:$confluentVersion") { 33 | exclude group: "org.apache.kafka", module: "kafka-clients" 34 | } 35 | } -------------------------------------------------------------------------------- /project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/module-infra-spark/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/SparkBase.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark 2 | 3 | import mkt.udon.core.common.Environment 4 | import org.apache.log4j.LogManager 5 | import org.apache.spark.sql.SparkSession 6 | 7 | trait SparkBase { 8 | 9 | val logger = LogManager.getRootLogger 10 | var session: SparkSession = null 11 | 12 | def driver(session: SparkSession): Unit 13 | 14 | def buildSession(): SparkSession = { 15 | var sessionBuilder = SparkSession.builder().enableHiveSupport() 16 | 17 | if (Environment.isLocalMode()) { 18 | sessionBuilder = sessionBuilder.master("local[*]") 19 | sessionBuilder = sessionBuilder.config("spark.sql.crossJoin.enabled", true) 20 | 21 | } 22 | 23 | session = sessionBuilder.getOrCreate() 24 | setupHadoopEnvironment(session) 25 | 26 | session 27 | } 28 | 29 | /** 30 | * 실제 Production 환경에서는 31 | * - 설정은 Cluster 의 spark-defaults.conf 환경을 따릅니다. 32 | * - AWS Key 는 Machine 의 IAM Role 을 이용합니다. 33 | * 34 | * 아래 코드에서는 로컬 테스팅을 위해 해당 설정들을 직접 세팅합니다. 
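   *
   * In short: on a real cluster these settings come from spark-defaults.conf and the machine's
   * IAM role, so this method returns immediately outside local mode; the commented-out
   * fs.s3a.access.key / fs.s3a.secret.key lines below are only a convenience for local runs.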
35 | */ 36 | def setupHadoopEnvironment(session: SparkSession): Unit = { 37 | if (!Environment.isLocalMode()) return 38 | 39 | val hadoopConf = session.sparkContext.hadoopConfiguration 40 | 41 | hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 42 | hadoopConf.set("fs.s3.canned.acl", "BucketOwnerFullControl") 43 | // hadoopConf.set("fs.s3a.access.key", accessKey) 44 | // hadoopConf.set("fs.s3a.secret.key", secretKey) 45 | } 46 | 47 | def main(args: Array[String]): Unit = { 48 | session = buildSession() 49 | 50 | try { 51 | driver(session) 52 | } catch { 53 | case t: Throwable => 54 | logger.error("Application failed due to", t) 55 | session.stop() 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/common/Partition.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.common 2 | 3 | object Partition { 4 | val PARTITION_KEY = "part" 5 | } 6 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/DynamoSink.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.storage 2 | 3 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder 4 | import com.amazonaws.services.dynamodbv2.document.{DynamoDB, Item, Table} 5 | import mkt.udon.core.common.TimeUtil 6 | import org.apache.spark.sql.Dataset 7 | import org.json4s.jackson.JsonMethods.parse 8 | import org.json4s.jackson.Serialization.write 9 | import org.json4s.{DefaultFormats, Extraction, FieldSerializer, Formats, JLong, JObject} 10 | 11 | import java.time.Instant 12 | 13 | object DynamoSink { 14 | 15 | def writePartition[T](dynamoTable: String, 16 | dynamoRegion: String, 17 | expireDays: Int, 18 | dsTarget: Dataset[T], 19 | expireFieldName: String = "expireTtl", 20 | updateFieldName: String = "updatedAt" 21 | )(implicit m: Manifest[T]): Unit = { 22 | 23 | dsTarget.foreachPartition((iter: Iterator[T]) => { 24 | val dynamoClient = AmazonDynamoDBClientBuilder.standard().withRegion(dynamoRegion).build(); 25 | val dynamoDB = new DynamoDB(dynamoClient) 26 | val client = dynamoDB.getTable(dynamoTable) 27 | 28 | while (iter.hasNext) { 29 | val cur = iter.next() 30 | implicit val default: Formats = DefaultFormats.preservingEmptyValues + FieldSerializer[T]() 31 | 32 | val updatedAt = Instant.now().toEpochMilli 33 | val expireTtl = TimeUtil.getExpireEpochSeconds(expireDays) 34 | 35 | val json = Extraction.decompose(cur) 36 | .merge(JObject(updateFieldName -> JLong(updatedAt))) 37 | .merge(JObject(expireFieldName -> JLong(expireTtl))) 38 | .snakizeKeys 39 | 40 | val stringified = write(json) 41 | val request = Item.fromJSON(stringified) 42 | 43 | client.putItem(request) 44 | } 45 | }) 46 | } 47 | 48 | def putItem[A](dynamoClient: Table, 49 | item: A, 50 | expireDays: Int, 51 | expireFieldName: String = "expireTtl", 52 | updateFieldName: String = "updatedAt")(implicit m: Manifest[A]): Unit = { 53 | 54 | // FieldSerializer 는 `private` 필드 사용시 패키지 명 까지 필드 이름에 포함되므로 사용에 유의 55 | // Scala Enum 값 변환을 위해서는 EnumNameSerializer 가 필요하나 저장용 Case Class 에서 일반적으로 String 으로 사용 56 | implicit val default: Formats = DefaultFormats.preservingEmptyValues + FieldSerializer[A]() 57 | 58 | val updatedAt = Instant.now().toEpochMilli 59 | val expireTtl = 
TimeUtil.getExpireEpochSeconds(expireDays) 60 | 61 | val json = Extraction.decompose(item) 62 | .merge(JObject("updatedAt" -> JLong(updatedAt))) 63 | .merge(JObject("expireTtl" -> JLong(expireTtl))) 64 | .camelizeKeys 65 | 66 | val stringified = write(json) 67 | val request = Item.fromJSON(stringified) 68 | 69 | dynamoClient.putItem(request) 70 | } 71 | 72 | def getItem[A](dynamoClient: Table, 73 | keyName: String, keyValue: String)(implicit m: Manifest[A]): Option[A] = { 74 | 75 | val responseItem = dynamoClient.getItem(keyName, keyValue) 76 | 77 | if (responseItem == null) None 78 | else { 79 | implicit val format = DefaultFormats.preservingEmptyValues 80 | val raw = responseItem.toJSON 81 | val parsed = parse(raw).camelizeKeys 82 | val converted = parsed.extract[A] 83 | Some(converted) 84 | } 85 | } 86 | 87 | def buildClient(dynamoTable: String, dynamoRegion: String): Table = { 88 | val dynamoClient = AmazonDynamoDBClientBuilder.standard().withRegion(dynamoRegion).build(); 89 | val dynamoDB = new DynamoDB(dynamoClient) 90 | val client = dynamoDB.getTable(dynamoTable) 91 | return client 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/JdbcSink.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.storage 2 | 3 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 4 | 5 | import java.sql.{Connection, DriverManager} 6 | 7 | object JdbcSink { 8 | 9 | val DRIVER = "com.mysql.cj.jdbc.Driver" 10 | 11 | def write(session: SparkSession, dfTarget: Dataset[Row], 12 | jdbcUrl: String, jdbcTable: String, 13 | jdbcUsername: String, jdbcPassword: String, 14 | ): Unit = { 15 | 16 | dfTarget 17 | .write 18 | .mode("append") 19 | .format("jdbc") 20 | .option("driver", DRIVER) 21 | .option("url", jdbcUrl) 22 | .option("user", jdbcUsername) 23 | .option("password", jdbcPassword) 24 | .option("dbtable", jdbcTable) 25 | .option("truncate", "false") 26 | .save() 27 | } 28 | 29 | def delete(jdbcUrl: String, jdbcTable: String, 30 | jdbcUsername: String, jdbcPassword: String, 31 | partitionColName: String, partitionColValue: java.sql.Timestamp): Unit = { 32 | 33 | var connection: Connection = null 34 | 35 | try { 36 | Class.forName(DRIVER) 37 | connection = DriverManager.getConnection(jdbcUrl, jdbcUsername, jdbcPassword) 38 | 39 | // remove rows which are already existing and having the same partition value 40 | val query = s"DELETE FROM ${jdbcTable} WHERE `${partitionColName}` = ?" 
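      // Binding the partition value through a PreparedStatement (below) avoids hand-formatting the
      // timestamp into the SQL text; combined with the append-mode write() above, this
      // delete-then-insert sequence makes re-running the same partition idempotent.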
41 | val preparedStatement = connection.prepareStatement(query) 42 | preparedStatement.setTimestamp(1, partitionColValue) 43 | preparedStatement.execute() 44 | 45 | } catch { 46 | case e: Exception => 47 | throw e; 48 | 49 | } finally { 50 | if (connection != null) connection.close() 51 | } 52 | 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /project-spark/module-infra-spark/src/main/scala/mkt/udon/infra/spark/storage/ParquetSink.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.infra.spark.storage 2 | 3 | import mkt.udon.core.common.TimeUtil 4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 5 | 6 | object ParquetSink { 7 | 8 | 9 | def write(session: SparkSession, 10 | dfTarget: DataFrame, 11 | parquetLocation: String, 12 | parquetSaveMode: SaveMode): Unit = { 13 | 14 | dfTarget 15 | .write 16 | .mode(parquetSaveMode) 17 | .options(Map( 18 | ("parquet.enable.dictionary", "true"), 19 | ("parquet.block.size", s"${32 * 1024 * 1024}"), 20 | ("parquet.page.size", s"${2 * 1024 * 1024}"), 21 | ("parquet.dictionary.page.size", s"${8 * 1024 * 1024}") 22 | )) 23 | .parquet(parquetLocation) 24 | } 25 | 26 | /** * 27 | * Partition Value 로 부터 저장할 Parquet Location 을 빌드합니다. 28 | * 29 | * @param s3Prefix 30 | * @param partitionValue yyyyMMdd 를 가정 31 | */ 32 | def buildLocation(prefix: String, partition: String): String = { 33 | val partitionPath = TimeUtil.convertPartitionToDateSlashString(partition) 34 | return s"${prefix}/${partitionPath}" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | VERSION = $(shell cat ./VERSION) 4 | MODULE = service-batch-discovery 5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact 6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar 7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar 8 | 9 | .PHONY: test 10 | test: 11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 12 | @ echo "" 13 | 14 | @ ../gradlew :$(MODULE):test 15 | 16 | .PHONY: build 17 | build: 18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 19 | @ echo "" 20 | 21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar 22 | 23 | .PHONY: deploy 24 | deploy: 25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)" 26 | @ echo "" 27 | 28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT) 29 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1-SNAPSHOT -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/build.gradle: -------------------------------------------------------------------------------- 1 | def versionValue = file("VERSION").text.trim() 2 | project.version = versionValue 3 | 4 | apply plugin: 'application' 5 | apply plugin: 'com.github.johnrengelman.shadow' 6 | 7 | dependencies { 8 | // shared 9 | implementation project(path: ':module-core') 10 | implementation project(path: ':module-infra-spark') 11 | 12 | // custom 13 | // TODO 14 | } 15 | 16 | mainClassName = 'test' 17 | run.classpath = sourceSets.main.runtimeClasspath 18 | 19 | jar { 20 | manifest { 21 | 
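        // Written into META-INF/MANIFEST.MF so a deployed jar can be traced back to its module name,
        // the version read from the VERSION file above, and the JDK used to build it.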
attributes( 22 | "Implementation-Title": project.name, 23 | "Implementation-Version": project.version, 24 | "Build-Jdk": System.getProperty('java.version'), 25 | ) 26 | } 27 | 28 | } 29 | 30 | shadowJar { 31 | zip64 = true 32 | exclude 'META-INF/**' 33 | baseName = project.name 34 | mergeServiceFiles() 35 | } 36 | 37 | assemble.dependsOn(shadowJar) -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 
50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/resources/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | LOCAL { 2 | dynamoTable = "service-dev-product-pool" 3 | dynamoTable = ${?DYNAMO_TABLE} 4 | dynamoRegion = "ap-northeast-2" 5 | dynamoRegion = ${?DYNAMO_REGION} 6 | dynamoPartitionCount = 3 7 | dynamoPartitionCount = ${?DYNAMO_PARTITION_COUNT} 8 | 9 | parquetPrefix = "s3://practical-data-pipeline/udon-data-lake/udon-db/property_product_pool" 10 | parquetPrefix = ${?PARQUET_PREFIX} 11 | parquetWriteMode = "Overwrite" 12 | parquetWriteMode = ${?PARQUET_WRITE_MODE} 13 | parquetPartitionCount = 2 14 | parquetPartitionCount = ${?PARQUET_PARTITION_COUNT} 15 | 16 | partitionSnapshot = "20200201" 17 | partitionSnapshot = ${?PARTITION_SNAPSHOT} 18 | partitionMetricStart = "20200201" 19 | partitionMetricStart = ${?PARTITION_METRIC_START} 20 | partitionMetricEnd = "20200201" 21 | partitionMetricEnd = ${?PARTITION_METRIC_END} 22 | 23 | maxElementCount = 20 24 | maxElementCount = ${?MAX_ELEMENT_COUNT} 25 | expireDays = 10 26 | expireDays = ${?EXPIRE_DAYS} 27 | } 28 | 29 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=INFO, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n 24 | 25 | # Set the default spark-shell/spark-sql log level to WARN. When running the 26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite 27 | # the root logger's log level, so that the user can have different defaults 28 | # for the shell and regular Spark apps. 29 | log4j.logger.org.apache.spark.repl.Main=WARN 30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN 31 | 32 | # Settings to quiet third party logs that are too verbose 33 | log4j.logger.org.sparkproject.jetty=WARN 34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 37 | log4j.logger.org.apache.parquet=ERROR 38 | log4j.logger.parquet=ERROR 39 | 40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 43 | 44 | # For deploying Spark ThriftServer 45 | # SPARK-34128?Suppress undesirable TTransportException warnings involved in THRIFT-4805 46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter 47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message 48 | log4j.appender.console.filter.1.AcceptOnMatch=false -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/scala/mkt/udon/UdonProductPoolBatch.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonProductPoolBatchConfig 4 | import mkt.udon.core.common.Environment 5 | import mkt.udon.entity.UdonProductPoolEntity 6 | import mkt.udon.infra.spark.SparkBase 7 | import mkt.udon.infra.spark.storage.{DynamoSink, ParquetSink} 8 | import org.apache.log4j.LogManager 9 | import org.apache.spark.sql.functions.lit 10 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 11 | import pureconfig.generic.auto._ 12 | 13 | object UdonProductPoolBatch extends SparkBase { 14 | override val logger = LogManager.getLogger(this.getClass.getName) 15 | 16 | override def driver(session: SparkSession): Unit = { 17 | 18 | /** 19 | * 환경변수 추출 및 설정 20 | */ 21 | implicit val configHint = Environment.buildConfigHint[UdonProductPoolBatchConfig]() 22 | val config = Environment.getConfigOrThrow[UdonProductPoolBatchConfig] 23 | 24 | /** 25 | * 데이터 추출 및 가공 26 | */ 27 | val partitionSnapshot = config.partitionSnapshot 28 | val 
partitionMetricStart = config.partitionMetricStart 29 | val partitionMetricEnd = config.partitionMetricEnd 30 | val dfUserEvent = readUserEvent(session = session, 31 | partitionMetricStart = partitionMetricStart, partitionMetricEnd = partitionMetricEnd) 32 | 33 | val dsResult = UdonProductPoolEntity.convert( 34 | session, 35 | dfUserEvent = dfUserEvent, 36 | maxElementCount = config.maxElementCount) 37 | 38 | /** 39 | * 데이터 저장: Parquet 40 | * 41 | * `part` 를 파티션 컬럼으로 지정해 추가합니다. 42 | * Hive Static Partitioning 을 이용하면 Hive 로 읽을 경우엔 파티셔닝 컬럼이 자동으로 SELECT 시에 붙지만, 43 | * Parquet 를 직접 읽을 경우엔 존재하지 않으므로 Parquet 를 직접 읽는 사용자를 위해 추가합니다. 44 | */ 45 | val dfPersistedParquet = dsResult.withColumn("part", lit(partitionSnapshot)) 46 | .repartition(config.parquetPartitionCount) 47 | val parquetLocation = ParquetSink.buildLocation(config.parquetPrefix, partitionSnapshot) 48 | ParquetSink.write(session, dfPersistedParquet, parquetLocation, SaveMode.valueOf(config.parquetWriteMode)) 49 | 50 | /** 51 | * 데이터 저장: Dynamo 52 | */ 53 | DynamoSink.writePartition(config.dynamoTable, config.dynamoRegion, config.expireDays, dsResult) 54 | } 55 | 56 | def readUserEvent(session: SparkSession, 57 | partitionMetricStart: String, partitionMetricEnd: String): DataFrame = { 58 | 59 | if (Environment.isLocalMode()) { 60 | val resourcePath = getClass.getClassLoader.getResource("ecommerce.csv").getPath 61 | 62 | val df = session.read.format("csv") 63 | .option("inferSchema", "true") 64 | .option("header", "true") 65 | .load(resourcePath) 66 | 67 | return df 68 | } 69 | 70 | return session.sql( 71 | s""" 72 | |SELECT * 73 | |FROM airbnb_db.user_client_event 74 | |WHERE part BETWEEN ${partitionMetricStart} AND ${partitionMetricEnd} 75 | |""".stripMargin) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/scala/mkt/udon/config/UdonProductPoolBatchConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonProductPoolBatchConfig(dynamoTable: String, dynamoRegion: String, dynamoPartitionCount: String, 4 | parquetPrefix: String, parquetWriteMode: String, parquetPartitionCount: Int, 5 | partitionSnapshot: String, 6 | partitionMetricStart: String, 7 | partitionMetricEnd: String, 8 | maxElementCount: Int, expireDays: Int) 9 | -------------------------------------------------------------------------------- /project-spark/service-batch-discovery/src/main/scala/mkt/udon/entity/UdonProductPoolEntity.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.entity 2 | 3 | import mkt.udon.core.entity.{ProductPool, ProductPoolElement} 4 | import org.apache.spark.sql.expressions.Window 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 7 | 8 | object UdonProductPoolEntity { 9 | 10 | val EVENT_VIEW = "view" 11 | val EVENT_CART = "cart" 12 | val EVENT_ORDER = "purchase" 13 | 14 | def convert(session: SparkSession, dfUserEvent: DataFrame, 15 | maxElementCount: Int): Dataset[ProductPool] = { 16 | 17 | import session.implicits._ 18 | 19 | val dfFiltered = dfUserEvent.selectExpr("product_id", "user_id", "user_session") 20 | .where(col("event_type").isInCollection(List(EVENT_VIEW))) 21 | 22 | /** 23 | * 상품과 상품들을 연관짓기 위해 사용자 Session 을 사용합니다. 
이 방법의 기본적인 가정은 24 | * - 사용자가 의도를 가진 채로 상품을 탐색하는 하나의 Session 동안에는 '연관된 상품' 을 보았을 거라 가정하고 25 | * - 하나의 세션 내에서 같이 본 상품은 사용자 관점에서 유의미 하게 비슷할거라는 가설을 가지고 있습니다. 26 | * 27 | * 서비스에 나가는 추천들은 실제로는 더 복잡한 모델을 이용하고 정제되어 있는 많은 Feature 를 이용하지만, 28 | * 여기에서는 가장 기본적인 데이터 가공을 통해 상품 Pool 을 구성하기 위해 위에서 언급한 방법을 이용합니다. 29 | * 30 | * 이 방법을 응용하면 Search Together, View Together, Cart Together, Order Together 와 같은 상품 Pool 을 만들 수 있습니다. 31 | * 혹은 각각의 Unique Session ID 혹은 Unique User ID, 단순 Count 등을 Feature 로 삼아 통계적으로 각 Feature 의 비율을 조합해 내보낼 수도 있습니다. 32 | * 33 | * 사용자의 행위 기반 외에도 도메인이 숙박이라면 상품 메타 정보 (거리, 가격) 등의 메트릭 유사도를 추가할 수 있습니다. 34 | */ 35 | val dfJoined = dfFiltered.alias("L") 36 | .join( 37 | dfFiltered.alias("R"), 38 | col("L.user_session") === col("R.user_session") && 39 | col("L.product_id") =!= col("R.product_id"), 40 | "inner" 41 | ) 42 | .selectExpr( 43 | "L.product_id as product_id", 44 | "R.product_id as product_id_other", 45 | "L.user_session" 46 | ) 47 | 48 | // 순위 생성 및 maxElementCount 를 이용해 필터링 49 | val windowRank = Window.partitionBy(col("product_id")).orderBy(col("count_session_uniq").desc) 50 | val dfGrouped = dfJoined 51 | .groupBy("product_id", "product_id_other") 52 | .agg(countDistinct("user_session").as("count_session_uniq")) 53 | .withColumn("rank", row_number().over(windowRank)) 54 | .where(col("rank") <= lit(maxElementCount)) 55 | 56 | // 배열로 만들기 위해 UDF 를 통해 Case Class 로 변경 57 | // 주의사항: Spark 의 'collect_list' 는 순서를 보존하지 않으므로 Rank 값 없이 리스트화 하면 상품의 순서가 보존되지 않을 수 있습니다. 58 | val udfElementize = udf((id: String, rank: Long) => 59 | ProductPoolElement(id = id, rank = rank)) 60 | val dfConverted = dfGrouped 61 | .withColumn("element", udfElementize(col("product_id_other"), col("rank"))) 62 | .groupBy("product_id") 63 | .agg(collect_list("element").as("elements"), count("*").as("element_count")) 64 | 65 | 66 | return dfConverted.selectExpr("product_id as specifier", "elements", "element_count as elementCount").as[ProductPool] 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | VERSION = $(shell cat ./VERSION) 4 | MODULE = service-batch-statistics 5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact 6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar 7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar 8 | 9 | .PHONY: test 10 | test: 11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 12 | @ echo "" 13 | 14 | @ ../gradlew :$(MODULE):test 15 | 16 | .PHONY: build 17 | build: 18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 19 | @ echo "" 20 | 21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar 22 | 23 | .PHONY: deploy 24 | deploy: 25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)" 26 | @ echo "" 27 | 28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT) 29 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1-SNAPSHOT -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/build.gradle: -------------------------------------------------------------------------------- 1 | def versionValue = file("VERSION").text.trim() 2 | project.version = versionValue 3 | 4 | apply plugin: 
'application' 5 | apply plugin: 'com.github.johnrengelman.shadow' 6 | 7 | dependencies { 8 | // shared 9 | implementation project(path: ':module-core') 10 | implementation project(path: ':module-infra-spark') 11 | } 12 | 13 | mainClassName = 'test' 14 | run.classpath = sourceSets.main.runtimeClasspath 15 | 16 | jar { 17 | manifest { 18 | attributes( 19 | "Implementation-Title": project.name, 20 | "Implementation-Version": project.version, 21 | "Build-Jdk": System.getProperty('java.version'), 22 | ) 23 | } 24 | 25 | } 26 | 27 | shadowJar { 28 | zip64 = true 29 | exclude 'META-INF/**' 30 | baseName = project.name 31 | // Spark SQL Streaming 은 META-INF 를 조합해 Datasource 여부를 판별하므로 Uber Jar 로는 해결이 불가능하고, 32 | // - https://stackoverflow.com/questions/48011941/why-does-formatkafka-fail-with-failed-to-find-data-source-kafka-even-wi 33 | // - https://stackoverflow.com/questions/32887966/shadow-plugin-gradle-what-does-mergeservicefiles-do 34 | mergeServiceFiles() 35 | } 36 | 37 | assemble.dependsOn(shadowJar) -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/resources/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | LOCAL { 2 | jdbcHost = "localhost" 3 | jdbcHost = ${?JDBC_HOST} 4 | jdbcPort = 3306 5 | jdbcPort = ${?JDBC_PORT} 6 | jdbcUsername = "root" 7 | jdbcUsername = ${?JDBC_USERNAME} 8 | jdbcPassword = "root" 9 | jdbcPassword = ${?JDBC_PASSWORD} 10 | jdbcSchema = "pipeline" 11 | jdbcSchema = ${?JDBC_SCHEMA} 12 | jdbcTable = "property_stat" 13 | jdbcTable = ${?JDBC_TABLE} 14 | jdbcPartitionCount = 2 15 | jdbcPartitionCount = ${?JDBC_PARTITION_COUNT} 16 | 17 | parquetPrefix = "s3://practical-data-pipeline/udon-data-lake/udon-db/property_stat" 18 | parquetPrefix = ${?PARQUET_PREFIX} 19 | parquetWriteMode = "Overwrite" 20 | parquetWriteMode = ${?PARQUET_WRITE_MODE} 21 | parquetPartitionCount = 5 22 | parquetPartitionCount = ${?PARQUET_PARTITION_COUNT} 23 | 24 | partitionSnapshot = "20191129" 25 | partitionSnapshot = ${?PARTITION_SNAPSHOT} 26 | } 27 | 28 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 
5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=INFO, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n 24 | 25 | # Set the default spark-shell/spark-sql log level to WARN. When running the 26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite 27 | # the root logger's log level, so that the user can have different defaults 28 | # for the shell and regular Spark apps. 29 | log4j.logger.org.apache.spark.repl.Main=WARN 30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN 31 | 32 | # Settings to quiet third party logs that are too verbose 33 | log4j.logger.org.sparkproject.jetty=WARN 34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 37 | log4j.logger.org.apache.parquet=ERROR 38 | log4j.logger.parquet=ERROR 39 | 40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 43 | 44 | # For deploying Spark ThriftServer 45 | # SPARK-34128?Suppress undesirable TTransportException warnings involved in THRIFT-4805 46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter 47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message 48 | log4j.appender.console.filter.1.AcceptOnMatch=false -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/scala/mkt/udon/UdonStatBatch.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonStatBatchConfig 4 | import mkt.udon.core.common.{Environment, TimeUtil} 5 | import mkt.udon.entity.UdonStatEntity 6 | import mkt.udon.infra.spark.SparkBase 7 | import mkt.udon.infra.spark.storage.{JdbcSink, ParquetSink} 8 | import org.apache.log4j.LogManager 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 11 | import pureconfig.generic.auto._ 12 | 13 | object UdonStatBatch extends SparkBase { 14 | override val logger = LogManager.getLogger(this.getClass.getName) 15 | 16 | override def driver(session: SparkSession): Unit = { 17 | /** 18 | * 환경변수 추출 및 설정 19 | */ 20 | implicit val configHint = Environment.buildConfigHint[UdonStatBatchConfig]() 21 | val config = 
Environment.getConfigOrThrow[UdonStatBatchConfig] 22 | 23 | /** 24 | * 데이터 추출 및 가공 25 | */ 26 | val partition = config.partitionSnapshot 27 | val dfPropertyMeta = readPropertyMeta(partition, session) 28 | val dfPropertySales = readPropertySales(partition, session) 29 | val dfPropertyReview = readPropertyReview(partition, session) 30 | 31 | var dfResult = UdonStatEntity.convert(session, partition, 32 | dfPropertyMeta = dfPropertyMeta, 33 | dfPropertySales = dfPropertySales, 34 | dfPropertyReview = dfPropertyReview) 35 | 36 | // 사이즈가 작을 경우 추가적인 연산을 위해 캐싱할 수 있습니다. 37 | dfResult = dfResult.cache() 38 | 39 | /** 40 | * 데이터 저장: Parquet 41 | * 42 | * `part` 를 파티션 컬럼으로 지정해 추가합니다. 43 | * Hive Static Partitioning 을 이용하면 Hive 로 읽을 경우엔 파티셔닝 컬럼이 자동으로 SELECT 시에 붙지만, 44 | * Parquet 를 직접 읽을 경우엔 존재하지 않으므로 Parquet 를 직접 읽는 사용자를 위해 추가합니다. 45 | */ 46 | val dfPersistedParquet = dfResult.withColumn("part", lit(partition)) 47 | .repartition(config.parquetPartitionCount) 48 | val parquetLocation = ParquetSink.buildLocation(config.parquetPrefix, partition) 49 | ParquetSink.write(session, dfPersistedParquet, parquetLocation, SaveMode.valueOf(config.parquetWriteMode)) 50 | 51 | /** 52 | * 데이터 저장: JDBC 53 | * 54 | * `part` 를 파티션 컬럼으로 지정해 추가합니다. Hive 테이블과 달라질 수 있기 때문에 별도 가공을 수행합니다. 55 | */ 56 | val connectionUrl = s"jdbc:mysql://${config.jdbcHost}:${config.jdbcPort}/${config.jdbcSchema}" 57 | val partitionColumns = List(col("property_id")) 58 | 59 | val jdbcPartitionValue = TimeUtil.convertPartitionToSqlTimestamp(partition) 60 | val dfPersistedJdbc = dfResult.withColumn("part", lit(jdbcPartitionValue)) 61 | .repartition(config.jdbcPartitionCount, partitionColumns: _*) 62 | 63 | JdbcSink.delete(jdbcUrl = connectionUrl, jdbcTable = config.jdbcTable, 64 | jdbcUsername = config.jdbcUsername, jdbcPassword = config.jdbcPassword, 65 | partitionColName = "part", partitionColValue = jdbcPartitionValue 66 | ) 67 | 68 | JdbcSink.write(session, dfPersistedJdbc, 69 | jdbcUrl = connectionUrl, jdbcTable = config.jdbcTable, 70 | jdbcUsername = config.jdbcUsername, jdbcPassword = config.jdbcPassword) 71 | } 72 | 73 | def readPropertyMeta(partition: String, session: SparkSession): DataFrame = { 74 | 75 | if (Environment.isLocalMode()) { 76 | val resourcePath = getClass.getClassLoader.getResource("airbnb_listings.csv").getPath 77 | 78 | val df = session.read.format("csv") 79 | .option("inferSchema", "true") 80 | .option("header", "true") 81 | .option("quote", "\"") 82 | .option("escape", "\"") 83 | .option("sep", ",") 84 | .option("multiline", "true") 85 | .load(resourcePath) 86 | 87 | return df 88 | } 89 | 90 | return session.sql( 91 | s""" 92 | |SELECT * 93 | |FROM airbnb_db.property_meta 94 | |WHERE part = ${partition} 95 | |""".stripMargin) 96 | } 97 | 98 | def readPropertySales(partition: String, session: SparkSession): DataFrame = { 99 | 100 | if (Environment.isLocalMode()) { 101 | val resourcePath = getClass.getClassLoader.getResource("airbnb_calendar.csv").getPath 102 | 103 | val df = session.read.format("csv") 104 | .option("inferSchema", "true") 105 | .option("header", "true") 106 | .option("quote", "\"") 107 | .option("escape", "\"") 108 | .option("sep", ",") 109 | .option("multiline", "true") 110 | .load(resourcePath) 111 | 112 | return df 113 | } 114 | 115 | return session.sql( 116 | s""" 117 | |SELECT * 118 | |FROM airbnb_db.property_sales 119 | |WHERE part = ${partition} 120 | |""".stripMargin) 121 | } 122 | 123 | def readPropertyReview(partition: String, session: SparkSession): DataFrame = { 124 | 125 | if 
(Environment.isLocalMode()) { 126 | val resourcePath = getClass.getClassLoader.getResource("airbnb_reviews.csv").getPath 127 | 128 | val df = session.read.format("csv") 129 | .option("inferSchema", "true") 130 | .option("header", "true") 131 | .option("quote", "\"") 132 | .option("escape", "\"") 133 | .option("sep", ",") 134 | .option("multiline", "true") 135 | .load(resourcePath) 136 | 137 | return df 138 | } 139 | 140 | return session.sql( 141 | s""" 142 | |SELECT * 143 | |FROM airbnb_db.property_review 144 | |WHERE part = ${partition} 145 | |""".stripMargin) 146 | } 147 | 148 | /** 149 | * 과제: Hive Create Table DDL 을 Spark 를 이용해 실행해봅니다. 150 | * - 실행하기 위해 Hive Metastore 를 Docker Compose 로 띄우고 151 | * - Hive Metastore URI 를 설정해야 합니다. 152 | */ 153 | def createTable(config: UdonStatBatchConfig, session: SparkSession): Unit = { 154 | if (Environment.isLocalMode()) return 155 | 156 | // TODO: execute create table DDL 157 | } 158 | 159 | /** 160 | * 과제: Hive Create Table DDL 을 Spark 를 이용해 실행해봅니다. 161 | * - 실행하기 위해 Hive Metastore 를 Docker Compose 로 띄우고 162 | * - Hive Metastore URI 를 설정해야 합니다. 163 | */ 164 | def createPartition(config: UdonStatBatchConfig, session: SparkSession): Unit = { 165 | if (Environment.isLocalMode()) return 166 | 167 | // TODO: execute create partition DDL 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/scala/mkt/udon/config/UdonStatBatchConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonStatBatchConfig(jdbcHost: String, jdbcPort: Int, 4 | jdbcUsername: String, jdbcPassword: String, 5 | jdbcSchema: String, jdbcTable: String, 6 | jdbcPartitionCount: Int, 7 | parquetPrefix: String, parquetWriteMode: String, parquetPartitionCount: Int, 8 | partitionSnapshot: String) 9 | -------------------------------------------------------------------------------- /project-spark/service-batch-statistics/src/main/scala/mkt/udon/entity/UdonStatEntity.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.entity 2 | 3 | import mkt.udon.core.common.TimeUtil 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | 8 | object UdonStatEntity { 9 | 10 | def convert(session: SparkSession, partition: String, 11 | dfPropertyMeta: DataFrame, 12 | dfPropertySales: DataFrame, 13 | dfPropertyReview: DataFrame): DataFrame = { 14 | 15 | val partitionDate = TimeUtil.convertPartitionToDateString(partition) 16 | 17 | /** 18 | * 상품 메타 19 | */ 20 | val dfMeta = dfPropertyMeta 21 | .selectExpr("CAST(id AS BIGINT) as property_id", "property_type", "latitude", "longitude") 22 | 23 | /** 24 | * 상품 메트릭 누적 (리뷰) 25 | */ 26 | val dfMetricReviewTotal = dfPropertyMeta 27 | .selectExpr("CAST(id AS BIGINT) as property_id", "number_of_reviews as count_review_all", "review_scores_rating as score_review_all") 28 | 29 | /** 30 | * 상품 메트릭 델타 (리뷰) 31 | */ 32 | val dfMetricReviewDelta = dfPropertyReview 33 | .selectExpr("CAST(listing_id AS BIGINT) as property_id", "CAST(date as DATE) as date") 34 | .where(col("date") === lit(partitionDate).cast(DateType)) 35 | .groupBy("property_id") 36 | .agg(count("*").as("count_review")) 37 | 38 | /** 39 | * 상품 메트릭 델타 (판매) 40 | */ 41 | val dfMetricSalesDelta = dfPropertySales 42 | .selectExpr("CAST(listing_id AS BIGINT) as property_id", "CAST(date as DATE) as date", "price as 
price_raw") 43 | .where(col("date") === lit(partitionDate).cast(DateType)) 44 | .where(col("available") === lit("f")) 45 | .withColumn("price", regexp_extract(col("price_raw"), "[0-9]+.[0-9]+", 0).cast(DoubleType)) 46 | .drop("price_raw") 47 | .groupBy("property_id") 48 | .agg( 49 | count("*").as("count_sales"), 50 | sum("price").as("price_sales") 51 | ) 52 | 53 | /** 54 | * 결과 데이터 프레임 내 2 가지 성격의 데이터가 섞여 있습니다. 55 | * - 누적 데이터 (전체 기간 내 최신 값) 56 | * - 일별 데이터 (해당 일에 대한 변동 값) 57 | * 58 | * 이 데이터를 하나의 결과 테이블로 만드는게 맞을지 / 아니면 Spark Application 과 테이블을 분리하는게 맞을지 논의해 봅시다. 59 | */ 60 | val dfJoined = dfMeta.alias("PROPERTY_META") 61 | .join(dfMetricReviewTotal.alias("METRIC_REVIEW_TOTAL"), 62 | col("PROPERTY_META.property_id") === col("METRIC_REVIEW_TOTAL.property_id"), "left") 63 | .join(dfMetricReviewDelta.alias("METRIC_REVIEW_DELTA"), 64 | col("PROPERTY_META.property_id") === col("METRIC_REVIEW_DELTA.property_id"), "left") 65 | .join(dfMetricSalesDelta.alias("METRIC_SALES_DELTA"), 66 | col("PROPERTY_META.property_id") === col("METRIC_SALES_DELTA.property_id"), "left") 67 | .selectExpr( 68 | "PROPERTY_META.property_id as property_id", 69 | "PROPERTY_META.property_type as property_type", 70 | "PROPERTY_META.latitude as lat", 71 | "PROPERTY_META.longitude as lng", 72 | 73 | "coalesce(METRIC_REVIEW_TOTAL.count_review_all, 0) as count_review_all", 74 | "coalesce(METRIC_REVIEW_TOTAL.score_review_all, 0.0) as score_review_all", 75 | 76 | "coalesce(METRIC_REVIEW_DELTA.count_review, 0) as count_review", 77 | 78 | "coalesce(METRIC_SALES_DELTA.count_sales, 0) as count_sales", 79 | "CAST(coalesce(METRIC_SALES_DELTA.price_sales, 0) AS BIGINT) as price_sales" 80 | ) 81 | 82 | return dfJoined 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/Makefile: -------------------------------------------------------------------------------- 1 | TAG = "Makefile" 2 | 3 | VERSION = $(shell cat ./VERSION) 4 | MODULE = service-stream-profile 5 | DIST_BUCKET = s3://udon-infra/codebuild-artifact 6 | BUILT_ARTIFACT = $(MODULE)-$(VERSION)-all.jar 7 | DIST_ARTIFACT = $(MODULE)-$(VERSION).jar 8 | 9 | .PHONY: test 10 | test: 11 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 12 | @ echo "" 13 | 14 | @ ../gradlew :$(MODULE):test 15 | 16 | .PHONY: build 17 | build: 18 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Building : $(MODULE)" 19 | @ echo "" 20 | 21 | @ ../gradlew :$(MODULE):clean :$(MODULE):shadowJar 22 | 23 | .PHONY: deploy 24 | deploy: 25 | @ echo "[$(TAG)] ($$(date -u '+%H:%M:%S')) - Deploying: $(MODULE)" 26 | @ echo "" 27 | 28 | @ aws s3 cp build/libs/$(BUILT_ARTIFACT) $(DIST_BUCKET)/$(MODULE)/$(DIST_ARTIFACT) 29 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1-SNAPSHOT -------------------------------------------------------------------------------- /project-spark/service-stream-profile/build.gradle: -------------------------------------------------------------------------------- 1 | def versionValue = file("VERSION").text.trim() 2 | project.version = versionValue 3 | 4 | apply plugin: 'application' 5 | apply plugin: 'com.github.johnrengelman.shadow' 6 | 7 | dependencies { 8 | // shared 9 | implementation project(path: ':module-core') 10 | implementation project(path: ':module-infra-spark') 11 | } 12 | 13 | mainClassName = 'test' 14 | 
run.classpath = sourceSets.main.runtimeClasspath 15 | 16 | jar { 17 | manifest { 18 | attributes( 19 | "Implementation-Title": project.name, 20 | "Implementation-Version": project.version, 21 | "Build-Jdk": System.getProperty('java.version'), 22 | ) 23 | } 24 | 25 | } 26 | 27 | shadowJar { 28 | zip64 = true 29 | exclude 'META-INF/**' 30 | baseName = project.name 31 | // Spark SQL Streaming 은 META-INF 를 조합해 Datasource 여부를 판별하므로 Uber Jar 로는 해결이 불가능하고, 32 | // - https://stackoverflow.com/questions/48011941/why-does-formatkafka-fail-with-failed-to-find-data-source-kafka-even-wi 33 | // - https://stackoverflow.com/questions/32887966/shadow-plugin-gradle-what-does-mergeservicefiles-do 34 | mergeServiceFiles() 35 | } 36 | 37 | assemble.dependsOn(shadowJar) -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | LOCAL { 2 | UserProfileStream { 3 | checkpointLocation = "/tmp/spark-user-profile" 4 | dynamoTable = "service-dev-user-profile" 5 | dynamoTable = ${?DYNAMO_TABLE} 6 | dynamoRegion = "ap-northeast-2" 7 | dynamoRegion = ${?DYNAMO_REGION} 8 | dynamoExpireDays = 15 9 | dynamoExpireDays = ${?DYNAMO_EXPIRE_DAYS} 10 | dynamoPartitionCount = 3 11 | dynamoPartitionCount = ${?DYNAMO_PARTITION_COUNT} 12 | kafkaBroker = "localhost:9092" 13 | kafkaBroker = ${?KAFKA_BROKER} 14 | kafkaTopic = "user-event" 15 | kafkaTopic = ${?KAFKA_TOPIC} 16 | kafkaConsumerGroup= "user-profile" 17 | kafkaConsumerGroup = ${?KAFKA_CONSUMER_GROUP} 18 | kafkaOffsetStarting= "latest" 19 | kafkaOffsetStarting = ${?KAFKA_OFFSET_STARTING} 20 | maxCountView = 10 21 | maxCountView = ${?MAX_COUNT_VIEW} 22 | maxCountOrder = 10 23 | maxCountOrder = ${?MAX_COUNT_ORDER} 24 | } 25 | 26 | UserRelayStream { 27 | checkpointLocation = "/tmp/spark-user-relay" 28 | sourceKafkaBroker = "localhost:9092" 29 | sourceKafkaBroker = ${?SOURCE_KAFKA_BROKER} 30 | sourceKafkaTopic = "user-event" 31 | sourceKafkaTopic = ${?SOURCE_KAFKA_TOPIC} 32 | sourceKafkaConsumerGroup= "user-event-relay" 33 | sourceKafkaConsumerGroup = ${?SOURCE_KAFKA_CONSUMER_GROUP} 34 | sourceKafkaOffsetStarting= "latest" 35 | sourceKafkaOffsetStarting = ${?SOURCE_KAFKA_OFFSET_STARTING} 36 | 37 | sinkKafkaBroker = "localhost:9092" 38 | sinkKafkaBroker = ${?SINK_KAFKA_BROKER} 39 | sinkKafkaTopic = "user-event-relay" 40 | sinkKafkaTopic = ${?SINK_KAFKA_TOPIC} 41 | } 42 | } 43 | 44 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=INFO, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss.SSS} LINE:%4L --- [%15.15t] %-40.40C : %m%n 24 | 25 | # Set the default spark-shell/spark-sql log level to WARN. When running the 26 | # spark-shell/spark-sql, the log level for these classes is used to overwrite 27 | # the root logger's log level, so that the user can have different defaults 28 | # for the shell and regular Spark apps. 29 | log4j.logger.org.apache.spark.repl.Main=WARN 30 | log4j.logger.org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver=WARN 31 | 32 | # Settings to quiet third party logs that are too verbose 33 | log4j.logger.org.sparkproject.jetty=WARN 34 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 35 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 36 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 37 | log4j.logger.org.apache.parquet=ERROR 38 | log4j.logger.parquet=ERROR 39 | 40 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 41 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 42 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 43 | 44 | # For deploying Spark ThriftServer 45 | # SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 46 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter 47 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message 48 | log4j.appender.console.filter.1.AcceptOnMatch=false -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/UdonProfileStream.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonProfileStreamConfig 4 | import mkt.udon.core.common.Environment 5 | import mkt.udon.core.entity.UserEvent 6 | import mkt.udon.entity.UdonProfileStateFunc 7 | import mkt.udon.infra.spark.SparkBase 8 | import org.apache.log4j.LogManager 9 | import org.apache.spark.sql.functions.col 10 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 11 | import org.apache.spark.sql.{Dataset, SparkSession} 12 | import pureconfig.generic.auto._ 13 | 14 | object UdonProfileStream extends SparkBase { 15 | override val logger = LogManager.getLogger(this.getClass.getName) 16 | 17 | val APP = "UserProfileStream" 18 | 19 | override def driver(session: SparkSession): Unit = { 20 | import session.implicits._ 21 | 22 | /** 23 | * Extract and apply environment variables 24 | */ 25 | implicit val configHint = Environment.buildConfigHint[UdonProfileStreamConfig]() 26 | val config = Environment.getConfigOrThrowForApp[UdonProfileStreamConfig](APP) 27 | 28 | 29 | /** 30 | * Extract and transform data 31 | */ 32 | val dfRaw = session.readStream 33 | .format("kafka") 34 | .option("kafka.bootstrap.servers", config.kafkaBroker) 35 | .option("subscribe", config.kafkaTopic) 36 | .option("groupIdPrefix", config.kafkaConsumerGroup) 37 | .option("startingOffsets", config.kafkaOffsetStarting) 38 | .load()
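// The Kafka source above exposes a fixed schema regardless of the payload: `key` and `value` arrive as
// binary columns alongside `topic`, `partition`, `offset`, `timestamp` and `timestampType`, which is why
// the next step casts `value` to STRING before mapping it onto the case class.
// A minimal sketch of how the raw stream could be inspected locally, assuming the same broker/topic
// settings and a console sink (the `probe` query is illustrative only and not part of this job):
//
//   val probe = dfRaw
//     .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "topic", "partition", "offset")
//     .writeStream
//     .format("console")
//     .outputMode("append")
//     .start()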
39 | 40 | // Converts the stringified JSON into a case class. With Avro this conversion step would not be necessary. 41 | val dfConverted = dfRaw 42 | .selectExpr("CAST(value AS STRING)").as[String] 43 | .map(UserEvent.convertFromRaw) 44 | 45 | /** 46 | * Persist data 47 | */ 48 | val dfWritten = dfConverted.writeStream 49 | .queryName(APP) 50 | .trigger(Trigger.ProcessingTime("1 seconds")) 51 | .outputMode(OutputMode.Append()) 52 | .foreachBatch((dsUserEvent: Dataset[UserEvent], batchId: Long) => { 53 | // Repartition by user so that all events of a given user are processed within a single partition 54 | val repartitioned = dsUserEvent.repartition(config.dynamoPartitionCount, col("userId")) 55 | 56 | // Invoke the per-partition handler. 57 | repartitioned.foreachPartition((iter: Iterator[UserEvent]) => { 58 | UdonProfileStateFunc.handlePartition(config, iter) 59 | }) 60 | }) 61 | .option("checkpointLocation", config.checkpointLocation) 62 | .start() 63 | 64 | dfWritten.awaitTermination() 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/UdonRelayStream.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon 2 | 3 | import mkt.udon.config.UdonRelayStreamConfig 4 | import mkt.udon.core.common.Environment 5 | import mkt.udon.core.entity.UserEvent 6 | import mkt.udon.infra.spark.SparkBase 7 | import org.apache.log4j.LogManager 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 10 | import pureconfig.generic.auto._ 11 | 12 | object UdonRelayStream extends SparkBase { 13 | override val logger = LogManager.getLogger(this.getClass.getName) 14 | 15 | val APP = "UserRelayStream" 16 | 17 | override def driver(session: SparkSession): Unit = { 18 | import session.implicits._ 19 | 20 | /** 21 | * Extract and apply environment variables 22 | */ 23 | implicit val configHint = Environment.buildConfigHint[UdonRelayStreamConfig]() 24 | val config = Environment.getConfigOrThrowForApp[UdonRelayStreamConfig](APP) 25 | 26 | /** 27 | * Extract and transform data 28 | */ 29 | val dfRaw = session.readStream 30 | .format("kafka") 31 | .option("kafka.bootstrap.servers", config.sourceKafkaBroker) 32 | .option("subscribe", config.sourceKafkaTopic) 33 | .option("groupIdPrefix", config.sourceKafkaConsumerGroup) 34 | .option("startingOffsets", config.sourceKafkaOffsetStarting) 35 | .load() 36 | 37 | // Converts the stringified JSON into a case class. With Avro this conversion step would not be necessary. 38 | val dfConverted = dfRaw 39 | .selectExpr("CAST(value AS STRING)").as[String] 40 | .map(UserEvent.convertFromRaw) 41 | 42 | /** 43 | * Persist data 44 | */ 45 | 46 | // Uses UserEvent.userId as the Kafka partition key.
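// Keying the output records by userId means Kafka's default partitioner sends all events of a given
// user to the same partition of the sink topic, so downstream consumers observe each user's events in order.
// `to_json(struct(*))` re-serializes every column of the converted event back into a JSON string for the
// `value` field, which is the shape the Kafka sink expects (a `key` column and a `value` column).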
47 | val dfJson = dfConverted.selectExpr("CAST(userId AS STRING) AS key", "to_json(struct(*)) AS value") 48 | 49 | val dfWritten = dfJson.writeStream 50 | .queryName(APP) 51 | .outputMode(OutputMode.Append()) 52 | .trigger(Trigger.Continuous("1 seconds")) 53 | .format("kafka") 54 | .option("kafka.bootstrap.servers", config.sinkKafkaBroker) 55 | .option("topic", config.sinkKafkaTopic) 56 | .option("checkpointLocation", config.checkpointLocation) 57 | .start() 58 | 59 | dfWritten.awaitTermination() 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/config/UdonProfileStreamConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonProfileStreamConfig(checkpointLocation: String, 4 | dynamoTable: String, 5 | dynamoRegion: String, 6 | dynamoExpireDays: Int, 7 | dynamoPartitionCount: Int, 8 | kafkaBroker: String, 9 | kafkaTopic: String, 10 | kafkaConsumerGroup: String, 11 | kafkaOffsetStarting: String, 12 | maxCountView: Int, 13 | maxCountOrder: Int 14 | ) 15 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/config/UdonRelayStreamConfig.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.config 2 | 3 | case class UdonRelayStreamConfig(checkpointLocation: String, 4 | sourceKafkaBroker: String, 5 | sourceKafkaTopic: String, 6 | sourceKafkaConsumerGroup: String, 7 | sourceKafkaOffsetStarting: String, 8 | sinkKafkaBroker: String, 9 | sinkKafkaTopic: String) 10 | -------------------------------------------------------------------------------- /project-spark/service-stream-profile/src/main/scala/mkt/udon/entity/UdonProfileStateFunc.scala: -------------------------------------------------------------------------------- 1 | package mkt.udon.entity 2 | 3 | import mkt.udon.config.UdonProfileStreamConfig 4 | import mkt.udon.core.entity.{UserEvent, UserProfile} 5 | import mkt.udon.infra.spark.storage.DynamoSink 6 | 7 | object UdonProfileStateFunc { 8 | 9 | def handlePartition(config: UdonProfileStreamConfig, iter: Iterator[UserEvent]): Unit = { 10 | // Dynamo Client 생성 (@ThreadSafe) 11 | val dynamoClient = DynamoSink.buildClient(dynamoTable = config.dynamoTable, dynamoRegion = config.dynamoRegion) 12 | 13 | // 사용자 마다 그룹화 해 사용자별로 이벤트 시간순 정렬을 할 수 있도록 합니다. 14 | val groupedByUser = iter.toList.groupBy(u => u.userId) 15 | groupedByUser.foreach(kv => { 16 | val userId = kv._1 17 | val userEvents = kv._2.sortBy(x => -x.eventTime) // 시간순 내림차순 정렬 18 | 19 | // 사용자 Profile 을 Dynamo 에서 가져오고 없을 경우 만듭니다 20 | val existing = DynamoSink.getItem[UserProfile](dynamoClient, keyName = "specifier", userId) 21 | .getOrElse(UserProfile.buildEmpty(userId)) 22 | 23 | /** 24 | * 추가적으로 더 해볼 수 있는 최적화는, 사용자 이벤트 숫자를 미리 필터링 하는 것입니다. 25 | * 사용자 이벤트 100개 -> config.maxCount 에 의해 미리 필터링해 existing.update 호출 수를 제한할 수 있습니다. 26 | * 다만 사용자 이벤트에 따른 분기가 미리 일어나는 등 관련 로직을 작성해야 합니다 27 | */ 28 | userEvents.foreach(event => { 29 | existing.update(userEvent = event, maxCountView = config.maxCountView, maxCountOrder = config.maxCountOrder) 30 | }) 31 | 32 | /** 33 | * Stream 이나 Batch 가 여러개일 경우 Dynamo 테이블이 많아지면 API 입장에서 Dynamo Call 을 여러번해야 해 문제가 될 수 있습니다. 34 | * 이 때, 같은 성격의 데이터라면 Dynamo Table 을 공유하고 컬럼을 다르게 적재할 수 있습니다. 
35 | * 36 | * 예를 들어, User Profile Table 내에는 37 | * - Kafka 에서 당겨오는 User Event 를 바탕으로 적재하는 Stream User Profile 컬럼과 38 | * - 배치 기반으로 Segment 를 만들어 사용자의 Segment List 를 적재하는 Batch 용 User Profile 컬럼을 만들 수 있습니다. 39 | * - 이 때, Dynamo 1 개의 Row 사이즈에는 제한이 있으므로 너무 많은 컬럼으로 인해 데이터 사이즈가 넘치지 않도록 주의해야 합니다. 40 | * 41 | * 만약 다른 컬럼이 다른 스트림이나 배치에서 업데이트 된다면 Put 대신에 Dynamo Update (Upsert) 를 이용할 수 있습니다. 42 | * - https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_UpdateItem.html 43 | */ 44 | DynamoSink.putItem(dynamoClient, existing, config.dynamoExpireDays) 45 | }) 46 | 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /project-spark/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'project-spark' 2 | 3 | include ':module-core', 4 | ':module-infra-spark', 5 | ':service-stream-profile', 6 | ':service-batch-discovery', 7 | ':service-batch-statistics' 8 | 9 | -------------------------------------------------------------------------------- /project-terraform-aws/.gitignore: -------------------------------------------------------------------------------- 1 | *.hcl 2 | .terraform/ 3 | *.lock.info 4 | *.tfstate 5 | *.tfstate.backup 6 | __tf_state/ 7 | .idea/ 8 | 9 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/_aws-root-iam/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_local.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | environment_common = "common" 3 | environment_development = "development" 4 | environment_production = "production" 5 | 6 | region_seoul = "ap-northeast-2" 7 | 8 | team_data = "data" 9 | } -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_output.tf: -------------------------------------------------------------------------------- 1 | output "profile_id_bastion" { 2 | value = module.module-iam-common.profile_id_bastion 3 | } 4 | 5 | output "profile_arn_emr_instance" { 6 | value = module.module-iam-common.profile_arn_emr_instance 7 | } 8 | 9 | output "role_arn_emr_cluster" { 10 | value = module.module-iam-common.role_arn_emr_cluster 11 | } 12 | 13 | output "role_arn_emr_asg" { 14 | value = module.module-iam-common.role_arn_emr_asg 15 | } 16 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_provider.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = local.region_seoul 3 | } -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 | /** 12 | * 테스팅 목적으로 Terraform Backend 를 사용하지 않습니다 13 | */ 14 | 15 | backend "local" { 16 | path = "../__tf_state/_aws-root-iam/terraform.tfstate" 17 | } 18 | } 19 | 20 | 
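# Outside of local testing, a remote backend would normally be used so that state is shared and locked
# across operators. The block below is an illustrative sketch only; the bucket, key and lock-table names
# are placeholders and are not part of this repository.
#
# backend "s3" {
#   bucket         = "example-terraform-state"
#   key            = "aws-root-iam/terraform.tfstate"
#   region         = "ap-northeast-2"
#   dynamodb_table = "example-terraform-lock"
#   encrypt        = true
# }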
-------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/main_iam_common.tf: -------------------------------------------------------------------------------- 1 | module "module-iam-common" { 2 | source = "./module-iam-common" 3 | 4 | environment = local.environment_common 5 | } -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/_data.tf: -------------------------------------------------------------------------------- 1 | data "aws_iam_policy" "managed_dynamo_full" { 2 | arn = "arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess" 3 | } 4 | 5 | data "aws_iam_policy" "managed_kinesis_stream_full" { 6 | arn = "arn:aws:iam::aws:policy/AmazonKinesisFullAccess" 7 | } 8 | 9 | data "aws_iam_policy" "managed_data_scientist" { 10 | arn = "arn:aws:iam::aws:policy/job-function/DataScientist" 11 | } 12 | 13 | data "aws_iam_policy" "managed_s3_full" { 14 | arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess" 15 | } 16 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/_output.tf: -------------------------------------------------------------------------------- 1 | output "profile_id_bastion" { 2 | value = aws_iam_instance_profile.bastion.id 3 | } 4 | 5 | output "profile_arn_emr_instance" { 6 | value = aws_iam_instance_profile.emr_instance.arn 7 | } 8 | 9 | output "role_arn_emr_cluster" { 10 | value = aws_iam_role.emr_cluster.arn 11 | } 12 | 13 | output "role_arn_emr_asg" { 14 | value = aws_iam_role.emr_asg.arn 15 | } 16 | -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/_variable.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} -------------------------------------------------------------------------------- /project-terraform-aws/_aws-root-iam/module-iam-common/common.basic.iam.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | instance_purpose_basic = "ec2-basic" 3 | } 4 | 5 | # 6 | # Role, Instance Profile 7 | # 8 | 9 | resource "aws_iam_role" "basic" { 10 | name = "${lower(var.environment)}-${local.instance_purpose_basic}" 11 | 12 | assume_role_policy = <> /var/spool/cron/${user} 17 | chown ${user}:${user} /var/spool/cron/${user} 18 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-bastion/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 | /** 12 | * 테스팅 목적으로 Terraform Backend 를 사용하지 않습니다 13 | */ 14 | 15 | backend "local" { 16 | path = "../__tf_state/_aws-root-machine-bastion/terraform.tfstate" 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-bastion/main_bastion_dev.tf: -------------------------------------------------------------------------------- 1 | module "module-bastion-data-dev" { 2 | source = "./module-bastion-data-dev" 3 | 4 | environment = local.environment_development 5 | team = local.team_data 6 | 7 | bastion_ami = data.aws_ami.amazon_linux_2.id 8 | bastion_profile = 
data.terraform_remote_state.root_iam.outputs.profile_id_bastion 9 | bastion_keypair = local.keypair_infra 10 | 11 | bastion_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_bastion_public_data_dev 12 | 13 | bastion_subnet_id = data.terraform_remote_state.root_vpc.outputs.subnet_id_public_az_a_data_dev 14 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-bastion/module-bastion-data-dev/_data.bootstrap.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "bastion_template_cloudwatch" { 2 | template = file("${path.root}/_template/template.cloudwatch.sh") 3 | 4 | vars = { 5 | user = "ec2-user" 6 | installer = "yum" 7 | agent_version = "1.2.2" 8 | } 9 | } 10 | 11 | 12 | data "template_cloudinit_config" "bastion_user_data" { 13 | gzip = false 14 | base64_encode = true 15 | 16 | # install patches for Amazon Linux 17 | part { 18 | content_type = "text/x-shellscript" 19 | 20 | content = <> /var/spool/cron/hadoop" 17 | sudo chown hadoop:hadoop /var/spool/cron/hadoop 18 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-instance-tag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sleep 15s; 4 | 5 | ls -al /mnt/var/lib/info/ 6 | 7 | echo -e "" 8 | 9 | export IS_MASTER=$(cat /mnt/var/lib/info/instance.json | jq -r ".isMaster") 10 | export INSTANCE_GROUP_ID=$(cat /mnt/var/lib/info/instance.json | jq -r ".instanceGroupId") 11 | export CLUSTER_ID=$(cat /mnt/var/lib/info/job-flow.json | jq -r ".jobFlowId") 12 | export INSTANCE_ID=$(wget -q -O - http://169.254.169.254/latest/meta-data/instance-id) 13 | export INSTANCE_GROUP_TYPE=$(cat /mnt/var/lib/info/job-flow.json | jq -r ".instanceGroups | .[] | select( .instanceGroupId == \"${INSTANCE_GROUP_ID}\") | .instanceRole" | tr a-z A-Z) 14 | 15 | echo -e "IS_MASTER: ${IS_MASTER}" 16 | echo -e "INSTANCE_GROUP_ID: ${INSTANCE_GROUP_ID}" 17 | echo -e "CLUSTER_ID: ${CLUSTER_ID}" 18 | echo -e "INSTANCE_ID: ${INSTANCE_ID}" 19 | echo -e "INSTANCE_GROUP_TYPE: ${INSTANCE_GROUP_TYPE}" 20 | 21 | export CURRENT_TAG_NAME=$(aws ec2 --region ap-northeast-2 describe-tags --filters Name=resource-id,Values=${INSTANCE_ID} | jq -r ".Tags | .[] | select( .Key == \"Name\") | .Value") 22 | export NEW_TAG_NAME="${CURRENT_TAG_NAME}-${INSTANCE_GROUP_TYPE}" 23 | 24 | echo -e "CURRENT_TAG_NAME: ${CURRENT_TAG_NAME}" 25 | echo -e "NEW_TAG_NAME: ${NEW_TAG_NAME}" 26 | 27 | echo -e "aws ec2 create-tags --region ap-northeast-2 --resources ${INSTANCE_ID} --tags Key=Name,Value=${NEW_TAG_NAME}" 28 | 29 | aws ec2 create-tags --region ap-northeast-2 --resources ${INSTANCE_ID} --tags Key=Name,Value=${NEW_TAG_NAME} 30 | 31 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-spark-batch.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "classification": "httpfs-env", 4 | "properties": { 5 | }, 6 | "configurations": [ 7 | { 8 | "classification": "export", 9 | "properties": { 10 | "TZ": "Asia/Seoul" 11 | }, 12 | "configurations": [ 13 | ] 14 | } 15 | ] 16 | }, 17 | { 18 | "classification": "hadoop-kms-env", 19 | "properties": { 20 | }, 21 | "configurations": [ 22 | { 23 | "classification": "export", 24 | "properties": { 25 | "TZ": "Asia/Seoul" 26 | }, 
27 | "configurations": [ 28 | ] 29 | } 30 | ] 31 | }, 32 | { 33 | "classification": "livy-env", 34 | "properties": { 35 | }, 36 | "configurations": [ 37 | { 38 | "classification": "export", 39 | "properties": { 40 | "TZ": "Asia/Seoul" 41 | }, 42 | "configurations": [ 43 | ] 44 | } 45 | ] 46 | }, 47 | { 48 | "classification": "zeppelin-env", 49 | "properties": { 50 | }, 51 | "configurations": [ 52 | { 53 | "classification": "export", 54 | "properties": { 55 | "TZ": "Asia/Seoul" 56 | }, 57 | "configurations": [ 58 | ] 59 | } 60 | ] 61 | }, 62 | { 63 | "classification": "sqoop-env", 64 | "properties": { 65 | }, 66 | "configurations": [ 67 | { 68 | "classification": "export", 69 | "properties": { 70 | "TZ": "Asia/Seoul" 71 | }, 72 | "configurations": [ 73 | ] 74 | } 75 | ] 76 | }, 77 | { 78 | "classification": "oozie-env", 79 | "properties": { 80 | }, 81 | "configurations": [ 82 | { 83 | "classification": "export", 84 | "properties": { 85 | "TZ": "Asia/Seoul" 86 | }, 87 | "configurations": [ 88 | ] 89 | } 90 | ] 91 | }, 92 | { 93 | "classification": "presto-env", 94 | "properties": { 95 | }, 96 | "configurations": [ 97 | { 98 | "classification": "export", 99 | "properties": { 100 | "TZ": "Asia/Seoul" 101 | }, 102 | "configurations": [ 103 | ] 104 | } 105 | ] 106 | }, 107 | { 108 | "classification": "hcatalog-env", 109 | "properties": { 110 | }, 111 | "configurations": [ 112 | { 113 | "classification": "export", 114 | "properties": { 115 | "TZ": "Asia/Seoul" 116 | }, 117 | "configurations": [ 118 | ] 119 | } 120 | ] 121 | }, 122 | { 123 | "classification": "hcatalog-webhcat-env", 124 | "properties": { 125 | }, 126 | "configurations": [ 127 | { 128 | "classification": "export", 129 | "properties": { 130 | "TZ": "Asia/Seoul" 131 | }, 132 | "configurations": [ 133 | ] 134 | } 135 | ] 136 | }, 137 | { 138 | "classification": "hive-env", 139 | "properties": { 140 | }, 141 | "configurations": [ 142 | { 143 | "classification": "export", 144 | "properties": { 145 | "TZ": "Asia/Seoul" 146 | }, 147 | "configurations": [ 148 | ] 149 | } 150 | ] 151 | }, 152 | { 153 | "classification": "mapred-env", 154 | "properties": { 155 | }, 156 | "configurations": [ 157 | { 158 | "classification": "export", 159 | "properties": { 160 | "TZ": "Asia/Seoul" 161 | }, 162 | "configurations": [ 163 | ] 164 | } 165 | ] 166 | }, 167 | { 168 | "classification": "hadoop-env", 169 | "properties": { 170 | }, 171 | "configurations": [ 172 | { 173 | "classification": "export", 174 | "properties": { 175 | "TZ": "Asia/Seoul" 176 | }, 177 | "configurations": [ 178 | ] 179 | } 180 | ] 181 | }, 182 | { 183 | "classification": "hbase-env", 184 | "properties": { 185 | }, 186 | "configurations": [ 187 | { 188 | "classification": "export", 189 | "properties": { 190 | "TZ": "Asia/Seoul" 191 | }, 192 | "configurations": [ 193 | ] 194 | } 195 | ] 196 | }, 197 | { 198 | "classification": "spark-env", 199 | "properties": { 200 | }, 201 | "configurations": [ 202 | { 203 | "classification": "export", 204 | "properties": { 205 | "TZ": "Asia/Seoul" 206 | }, 207 | "configurations": [ 208 | ] 209 | } 210 | ] 211 | }, 212 | { 213 | "Classification": "hive-site", 214 | "Properties": { 215 | "javax.jdo.option.ConnectionURL": "jdbc:mysql:\/\/endpoint:3306\/hive_metastore?createDatabaseIfNotExist=true", 216 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver", 217 | "javax.jdo.option.ConnectionUserName": "root", 218 | "javax.jdo.option.ConnectionPassword": "admin1234" 219 | } 220 | }, 221 | { 222 | "Classification": "spark-hive-site", 223 | 
"Properties": { 224 | "javax.jdo.option.ConnectionURL": "jdbc:mysql:\/\/endpoint:3306\/hive_metastore?createDatabaseIfNotExist=true", 225 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver", 226 | "javax.jdo.option.ConnectionUserName": "root", 227 | "javax.jdo.option.ConnectionPassword": "admin1234" 228 | } 229 | }, 230 | { 231 | "Classification": "capacity-scheduler", 232 | "Properties": { 233 | "yarn.scheduler.capacity.resource-calculator": "org.apache.hadoop.yarn.util.resource.DominantResourceCalculator", 234 | "yarn.scheduler.capacity.maximum-am-resource-percent": "0.8" 235 | }, 236 | "configurations": [ 237 | ] 238 | }, 239 | { 240 | "Classification": "yarn-site", 241 | "Properties": { 242 | "yarn.scheduler.minimum-allocation-vcores": "1", 243 | "yarn.scheduler.maximum-allocation-vcores": "8", 244 | "yarn.node-labels.enabled": "true", 245 | "yarn.node-labels.am.default-node-label-expression": "CORE" 246 | }, 247 | "configurations": [ 248 | ] 249 | }, 250 | { 251 | "classification": "yarn-env", 252 | "properties": { 253 | }, 254 | "configurations": [ 255 | { 256 | "classification": "export", 257 | "properties": { 258 | "TZ": "Asia/Seoul" 259 | }, 260 | "configurations": [ 261 | ] 262 | } 263 | ] 264 | } 265 | ] -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_template/template.emr-system-config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo yum -y update 4 | sudo yum -y upgrade 5 | 6 | sudo timedatectl set-timezone Asia/Seoul 7 | 8 | sudo yum -y groupinstall development 9 | sudo yum -y install curl wget jq htop 10 | 11 | sudo sh -c 'echo "fs.inotify.max_user_instances = 8192" > /etc/sysctl.d/98-inotifyfix.conf' 12 | sudo sh -c 'echo "fs.inotify.max_user_watches = 524288" >> /etc/sysctl.d/98-inotifyfix.conf' 13 | sudo sysctl --system 14 | 15 | sudo sh -c 'echo "* soft nofile 65536" > /etc/security/limits.d/50-custom.conf' 16 | sudo sh -c 'echo "* hard nofile 65536" >> /etc/security/limits.d/50-custom.conf' 17 | sudo sh -c 'echo "* soft nproc 200000" >> /etc/security/limits.d/50-custom.conf' 18 | sudo sh -c 'echo "* hard nproc 200000" >> /etc/security/limits.d/50-custom.conf' 19 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 | /** 12 | * 테스팅 목적으로 Terraform Backend 를 사용하지 않습니다 13 | */ 14 | backend "local" { 15 | path = "../__tf_state/_aws-root-machine-emr/terraform.tfstate" 16 | } 17 | } 18 | 19 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/main_emr_data_dev.tf: -------------------------------------------------------------------------------- 1 | module "module-emr-data-dev" { 2 | source = "./module-emr-data-dev" 3 | 4 | environment = local.environment_development 5 | team = local.team_data 6 | 7 | vpc_id = data.terraform_remote_state.root_vpc.outputs.vpc_id_data_dev 8 | emr_subnet = data.terraform_remote_state.root_vpc.outputs.subnet_id_private_az_c_data_dev /** AZ-c */ 9 | 10 | emr_keypair = local.keypair_infra 11 | emr_profile_arn_instance = 
data.terraform_remote_state.root_iam.outputs.profile_arn_emr_instance 12 | emr_role_arn_cluster = data.terraform_remote_state.root_iam.outputs.role_arn_emr_cluster 13 | emr_role_arn_asg = data.terraform_remote_state.root_iam.outputs.role_arn_emr_asg 14 | 15 | emr_master_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_master_managed_data_dev 16 | emr_master_additional_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_master_additional_data_dev 17 | emr_slave_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_slave_managed_data_dev 18 | emr_slave_additional_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_slave_additional_data_dev 19 | emr_service_managed_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_emr_service_managed_data_dev 20 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/_local.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | emr_cluster_spark_batch = "spark-batch" 3 | 4 | emr_release_5_34_0 = "emr-5.34.0" 5 | emr_release_6_5_0 = "emr-6.5.0" 6 | } 7 | 8 | locals { 9 | spot_default_factor = 0.8 10 | 11 | spot_on_demand_price_r5xlarge = 0.304 12 | spot_bid_price_r5xlarge = format("%.2f", tonumber(local.spot_on_demand_price_r5xlarge) * tonumber(local.spot_default_factor)) 13 | 14 | spot_on_demand_price_m5xlarge = 0.236 15 | spot_bid_price_m5_xlarge = format("%.2f", tonumber(local.spot_on_demand_price_m5xlarge) * tonumber(local.spot_default_factor)) 16 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/_variable.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} 2 | variable "team" {} 3 | 4 | 5 | variable "emr_keypair" {} 6 | 7 | variable "emr_profile_arn_instance" {} 8 | variable "emr_role_arn_cluster" {} 9 | variable "emr_role_arn_asg" {} 10 | 11 | variable "vpc_id" {} 12 | variable "emr_subnet" {} 13 | 14 | variable "emr_master_managed_sg_id" {} 15 | variable "emr_master_additional_sg_id" {} 16 | variable "emr_slave_managed_sg_id" {} 17 | variable "emr_slave_additional_sg_id" {} 18 | variable "emr_service_managed_sg_id" {} 19 | 20 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-batch/module-emr-data-dev/dev.spark-batch-01.cw.tf: -------------------------------------------------------------------------------- 1 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-CPUUtilization" { 2 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_CPUUtil" 3 | # comparison_operator = "GreaterThanOrEqualToThreshold" 4 | # 5 | # period = "300" 6 | # evaluation_periods = "2" 7 | # datapoints_to_alarm = 2 8 | # 9 | # # second 10 | # statistic = "Average" 11 | # threshold = "80" 12 | # alarm_description = "" 13 | # 14 | # metric_name = "CPUUtilization" 15 | # namespace = "AWS/EC2" 16 | # 17 | # dimensions = { 18 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 19 | # } 20 | # 21 | # actions_enabled = true 22 | # insufficient_data_actions = [] 23 | # ok_actions = [] 24 | # 25 | # alarm_actions = [ 26 | # var.sns_topic_arn_cloudwatch_alarm, 27 | # ] 28 | #} 29 | # 30 | #resource 
"aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-MemUtil" { 31 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_MemUtil" 32 | # comparison_operator = "GreaterThanOrEqualToThreshold" 33 | # 34 | # period = "300" 35 | # evaluation_periods = "2" 36 | # datapoints_to_alarm = 2 37 | # 38 | # # second 39 | # statistic = "Maximum" 40 | # threshold = "80" 41 | # alarm_description = "" 42 | # 43 | # metric_name = "MemoryUtilization" 44 | # namespace = "System/Linux" 45 | # 46 | # dimensions = { 47 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 48 | # } 49 | # 50 | # actions_enabled = true 51 | # 52 | # insufficient_data_actions = [ 53 | # var.sns_topic_arn_cloudwatch_alarm, 54 | # ] 55 | # 56 | # ok_actions = [] 57 | # 58 | # alarm_actions = [ 59 | # var.sns_topic_arn_cloudwatch_alarm, 60 | # ] 61 | #} 62 | # 63 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_Has-SystemCheckFailure" { 64 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-Has_SysCheckFailure" 65 | # comparison_operator = "GreaterThanOrEqualToThreshold" 66 | # 67 | # period = "300" 68 | # evaluation_periods = "1" 69 | # datapoints_to_alarm = 1 70 | # 71 | # # second 72 | # statistic = "Sum" 73 | # threshold = "1" 74 | # alarm_description = "" 75 | # 76 | # metric_name = "StatusCheckFailed" 77 | # namespace = "AWS/EC2" 78 | # 79 | # dimensions = { 80 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 81 | # } 82 | # 83 | # actions_enabled = true 84 | # insufficient_data_actions = [] 85 | # ok_actions = [] 86 | # 87 | # alarm_actions = [ 88 | # var.sns_topic_arn_cloudwatch_alarm, 89 | # ] 90 | #} 91 | # 92 | ## EC2 Custom Metric (Disk, Memory) 93 | # 94 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-RootDiskUtil" { 95 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_RootDiskUtil" 96 | # comparison_operator = "GreaterThanOrEqualToThreshold" 97 | # 98 | # period = "300" 99 | # evaluation_periods = "2" 100 | # datapoints_to_alarm = 2 101 | # 102 | # # second 103 | # statistic = "Maximum" 104 | # threshold = "80" 105 | # alarm_description = "" 106 | # 107 | # metric_name = "DiskSpaceUtilization" 108 | # namespace = "System/Linux" 109 | # 110 | # dimensions = { 111 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 112 | # MountPath = local.emr_cw_root_disk_mount_path 113 | # Filesystem = local.emr_cw_root_disk_mount_fs 114 | # } 115 | # 116 | # actions_enabled = true 117 | # 118 | # insufficient_data_actions = [ 119 | # var.sns_topic_arn_cloudwatch_alarm, 120 | # ] 121 | # 122 | # ok_actions = [] 123 | # 124 | # alarm_actions = [ 125 | # var.sns_topic_arn_cloudwatch_alarm, 126 | # ] 127 | #} 128 | # 129 | #resource "aws_cloudwatch_metric_alarm" "emr_spark_batch_01_High-DataDiskUtil" { 130 | # alarm_name = "EMR-Master-${lookup(local.spark_batch_cluster_01, "name_prefix")}-${lookup(local.spark_batch_cluster_01, "index")}/${var.environment}-High_DataDiskUtil" 131 | # comparison_operator = "GreaterThanOrEqualToThreshold" 132 | # 133 | # period = "300" 134 | # evaluation_periods = "2" 135 | # datapoints_to_alarm = 2 136 | # 137 | # # second 138 | # statistic = "Maximum" 139 | # threshold = "80" 140 | # alarm_description = "" 141 
| # 142 | # metric_name = "DiskSpaceUtilization" 143 | # namespace = "System/Linux" 144 | # 145 | # dimensions = { 146 | # InstanceId = data.aws_instance.emr_master_spark_batch_cluster_01.id 147 | # MountPath = local.emr_cw_data_disk_mount_path 148 | # Filesystem = local.emr_cw_data_disk_mount_fs 149 | # } 150 | # 151 | # actions_enabled = true 152 | # 153 | # insufficient_data_actions = [ 154 | # var.sns_topic_arn_cloudwatch_alarm, 155 | # ] 156 | # 157 | # ok_actions = [] 158 | # 159 | # alarm_actions = [ 160 | # var.sns_topic_arn_cloudwatch_alarm, 161 | # ] 162 | #} 163 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-presto/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-machine-emr-presto/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-machine-emr-stream/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-machine-emr-stream/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-aws/aws-root-storage-rds/.gitkeep -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_data.state.tf: -------------------------------------------------------------------------------- 1 | data "terraform_remote_state" "root_iam" { 2 | backend = "local" 3 | 4 | config = { 5 | path = "../__tf_state/_aws-root-iam/terraform.tfstate" 6 | } 7 | } 8 | 9 | data "terraform_remote_state" "root_vpc" { 10 | backend = "local" 11 | 12 | config = { 13 | path = "../__tf_state/_aws-root-vpc/terraform.tfstate" 14 | } 15 | } 16 | 17 | data "terraform_remote_state" "root_sg" { 18 | backend = "local" 19 | 20 | config = { 21 | path = "../__tf_state/_aws-root-sg/terraform.tfstate" 22 | } 23 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_local.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | environment_common = "common" 3 | environment_development = "development" 4 | environment_production = "production" 5 | 6 | region_seoul = "ap-northeast-2" 7 | 8 | team_data = "data" 9 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_provider.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = local.region_seoul 3 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/_terraform.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.1.3" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 3.71.0" 8 | } 9 | } 10 | 11 
| /** 12 | * For testing purposes, a remote Terraform backend is not used here 13 | */ 14 | backend "local" { 15 | path = "../__tf_state/_aws-root-storage-rds/terraform.tfstate" 16 | } 17 | } 18 | 19 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/main_rds_data_dev.tf: -------------------------------------------------------------------------------- 1 | module "module-rds-data-dev" { 2 | source = "./module-rds-data-dev" 3 | 4 | environment = local.environment_development 5 | team = local.team_data 6 | 7 | vpc_id = data.terraform_remote_state.root_vpc.outputs.vpc_id_data_dev 8 | rds_hive_metastore_subnet_list = data.terraform_remote_state.root_vpc.outputs.subnet_list_database_data_dev 9 | rds_hive_metastore_subnet_group = data.terraform_remote_state.root_vpc.outputs.subnet_name_database_data_dev 10 | rds_hive_metastore_sg_id = data.terraform_remote_state.root_sg.outputs.sg_id_rds_hive_metastore_data_dev 11 | } -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/_variable.tf: -------------------------------------------------------------------------------- 1 | variable "environment" {} 2 | variable "team" {} 3 | 4 | variable "vpc_id" {} 5 | variable "rds_hive_metastore_sg_id" {} 6 | variable "rds_hive_metastore_subnet_group" {} 7 | variable "rds_hive_metastore_subnet_list" {} 8 | 9 | -------------------------------------------------------------------------------- /project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/dev.hive-metastore.rds.tf: -------------------------------------------------------------------------------- 1 | module "rds-hive-metastore-data-development" { 2 | source = "terraform-aws-modules/rds-aurora/aws" 3 | version = "6.1.4" 4 | 5 | name = "hive-metastore-${var.environment}" 6 | engine = "aurora-mysql" 7 | engine_version = "5.7.12" 8 | instance_class = "db.t3.medium" 9 | instances = { 10 | 01 = {} 11 | 02 = {} 12 | } 13 | 14 | storage_encrypted = true 15 | apply_immediately = true 16 | skip_final_snapshot = true 17 | create_monitoring_role = false 18 | 19 | vpc_id = var.vpc_id 20 | db_subnet_group_name = var.rds_hive_metastore_subnet_group 21 | vpc_security_group_ids = [var.rds_hive_metastore_sg_id] 22 | create_db_subnet_group = false 23 | create_security_group = false 24 | 25 | # A fixed value is used here for convenience in the later exercises. 26 | # Be careful: a password specified in Terraform is stored in the Terraform state.
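# Keep this password in sync with "javax.jdo.option.ConnectionPassword" in
# aws-root-machine-emr-batch/_template/template.emr-spark-batch.json, which the EMR
# cluster's Hive Metastore (javax.jdo) connection settings use.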
27 | master_password = "admin1234" 28 | # master_password = random_password.hive-metastore.result 29 | create_random_password = false 30 | 31 | db_parameter_group_name = aws_db_parameter_group.hive-metastore.name 32 | db_cluster_parameter_group_name = aws_rds_cluster_parameter_group.hive-metastore.name 33 | 34 | enabled_cloudwatch_logs_exports = [] 35 | 36 | tags = { 37 | Environment = var.environment 38 | Team = var.team 39 | } 40 | } 41 | 42 | resource "random_password" "hive-metastore" { 43 | length = 10 44 | } 45 | 46 | resource "aws_db_parameter_group" "hive-metastore" { 47 | name = "hive-metastore-aurora-db-57-parameter-group" 48 | family = "aurora-mysql5.7" 49 | description = "hive-metastore-aurora-db-57-parameter-group" 50 | tags = { 51 | Environment = var.environment 52 | Team = var.team 53 | } 54 | } 55 | 56 | resource "aws_rds_cluster_parameter_group" "hive-metastore" { 57 | name = "hive-metastore-aurora-57-cluster-parameter-group" 58 | family = "aurora-mysql5.7" 59 | description = "hive-metastore-aurora-57-cluster-parameter-group" 60 | tags = { 61 | Environment = var.environment 62 | Team = var.team 63 | } 64 | } -------------------------------------------------------------------------------- /project-terraform-gcp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1ambda/practical-data-pipeline-code/8df98341fb4db44d34684c124d179e9e0b94f62d/project-terraform-gcp/.gitkeep --------------------------------------------------------------------------------
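Note: the "endpoint" placeholder in the javax.jdo.option.ConnectionURL of template.emr-spark-batch.json above refers to the writer endpoint of this Aurora Hive Metastore cluster. Below is a minimal sketch, assuming the rds-aurora module's cluster_endpoint output, of how that endpoint could be exported and consumed through the repository's existing terraform_remote_state pattern; the _output.tf files and output names are hypothetical and do not exist in the repository.

# project-terraform-aws/aws-root-storage-rds/module-rds-data-dev/_output.tf (hypothetical)
output "hive_metastore_endpoint" {
  # Writer endpoint of the Aurora cluster defined in dev.hive-metastore.rds.tf
  value = module.rds-hive-metastore-data-development.cluster_endpoint
}

# project-terraform-aws/aws-root-storage-rds/_output.tf (hypothetical)
output "rds_hive_metastore_endpoint_data_dev" {
  value = module.module-rds-data-dev.hive_metastore_endpoint
}

Other roots such as aws-root-machine-emr-batch could then read this value through a data "terraform_remote_state" block pointed at ../__tf_state/_aws-root-storage-rds/terraform.tfstate and substitute it for the hard-coded "endpoint" in the hive-site ConnectionURL.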