├── .dockerignore
├── .gitignore
├── .scalafmt.conf
├── Dockerfile
├── LICENSE
├── README.md
├── benchmarks
│   ├── arm64
│   │   ├── README.md
│   │   ├── charts
│   │   │   ├── tpcds-benchmark
│   │   │   │   ├── Chart.yaml
│   │   │   │   ├── templates
│   │   │   │   │   ├── _helpers.tpl
│   │   │   │   │   └── sparkapplication.yaml
│   │   │   │   └── values.yaml
│   │   │   └── tpcds-data-generation
│   │   │       ├── Chart.yaml
│   │   │       ├── templates
│   │   │       │   ├── _helpers.tpl
│   │   │       │   └── sparkapplication.yaml
│   │   │       └── values.yaml
│   │   └── terraform
│   │       └── alicloud
│   │           ├── main.tf
│   │           ├── provider.tf
│   │           ├── terraform.tfvars
│   │           ├── user_data.sh
│   │           └── variables.tf
│   ├── hadoop-aliyun
│   │   ├── README.md
│   │   ├── charts
│   │   │   └── tpcds-benchmark
│   │   │       ├── Chart.yaml
│   │   │       ├── templates
│   │   │       │   ├── _helpers.tpl
│   │   │       │   └── sparkapplication.yaml
│   │   │       └── values.yaml
│   │   ├── result.csv
│   │   └── terraform
│   │       └── alicloud
│   │           ├── main.tf
│   │           ├── provider.tf
│   │           ├── terraform.tfvars
│   │           ├── user_data.sh
│   │           └── variables.tf
│   └── jindosdk
│       ├── README.md
│       ├── charts
│       │   ├── tpcds-benchmark
│       │   │   ├── Chart.yaml
│       │   │   ├── templates
│       │   │   │   ├── _helpers.tpl
│       │   │   │   └── sparkapplication.yaml
│       │   │   └── values.yaml
│       │   └── tpcds-data-generation
│       │       ├── Chart.yaml
│       │       ├── templates
│       │       │   ├── _helpers.tpl
│       │       │   └── sparkapplication.yaml
│       │       └── values.yaml
│       ├── result.csv
│       └── terraform
│           └── alicloud
│               ├── main.tf
│               ├── provider.tf
│               ├── terraform.tfvars
│               ├── user_data.sh
│               └── variables.tf
├── build.sbt
├── charts
│   ├── tpcds-benchmark
│   │   ├── Chart.yaml
│   │   ├── templates
│   │   │   ├── _helpers.tpl
│   │   │   └── sparkapplication.yaml
│   │   └── values.yaml
│   └── tpcds-data-generation
│       ├── Chart.yaml
│       ├── templates
│       │   ├── _helpers.tpl
│       │   └── sparkapplication.yaml
│       └── values.yaml
├── docs
│   ├── benchmark
│   │   ├── apache-spark-vs-emr-spark.md
│   │   ├── hadoop-aliyun
│   │   │   ├── cloud-disk-read-write-bps.png
│   │   │   ├── cloud-disk-read-write-iops.png
│   │   │   ├── cpu-usage.png
│   │   │   ├── index.md
│   │   │   ├── memory-usage.png
│   │   │   ├── network-bandwidth.png
│   │   │   ├── oss-bandwidth-usage.png
│   │   │   └── system-load.png
│   │   ├── setup.md
│   │   ├── spark-on-ack-arm64
│   │   │   ├── cloud-disk-read-write-bps.png
│   │   │   ├── cloud-disk-read-write-iops.png
│   │   │   ├── cpu-usage.png
│   │   │   ├── index.md
│   │   │   ├── memory-usage.png
│   │   │   ├── network-bandwidth.png
│   │   │   ├── oss-bandwidth-usage.png
│   │   │   ├── result.csv
│   │   │   └── system-load.png
│   │   ├── spark-on-ack
│   │   │   ├── cloud-disk-read-write-bps.png
│   │   │   ├── cloud-disk-read-write-iops.png
│   │   │   ├── cpu-usage.png
│   │   │   ├── index.md
│   │   │   ├── memory-usage.png
│   │   │   ├── network-bandwidth.png
│   │   │   ├── oss-bandwidth-usage.png
│   │   │   ├── result.csv
│   │   │   └── system-load.png
│   │   ├── spark-on-ecs-vs-on-ack
│   │   │   └── index.md
│   │   ├── spark-on-x86-vs-on-arm64
│   │   │   ├── cost_comparison.png
│   │   │   ├── index.md
│   │   │   └── performance_comparison.png
│   │   ├── tpcds-benchmark.md
│   │   └── tpcds-data-generation.md
│   ├── bestpractice
│   │   ├── emrspark-ess-jindofs.md
│   │   ├── emrspark-ess.md
│   │   ├── emrspark-jindofs.md
│   │   ├── emrspark.md
│   │   └── generate-data.md
│   ├── img
│   │   ├── alluxio-overview.jpg
│   │   ├── alluxio_capacity.png
│   │   ├── apache-spark-per-10t.jpg
│   │   ├── apache-spark-total-10t.jpg
│   │   ├── create_ack_cluster.jpeg
│   │   ├── emr-spark-ess-jindofs-per-1t.jpg
│   │   ├── emr-spark-ess-jindofs-total-1t.jpg
│   │   ├── emr-spark-jindofs-per-1t.jpg
│   │   ├── emr-spark-jindofs-total-1t.jpg
│   │   ├── emr-spark-rss-per-10t.jpg
│   │   ├── emr-spark-rss-total-10t.jpg
│   │   ├── get_spark_history_svc.jpeg
│   │   ├── get_sparkapplication_id.jpeg
│   │   ├── install_spark_history.jpeg
│   │   ├── install_spark_operator.jpeg
│   │   ├── jindofs.png
│   │   ├── localhost_spark_ui.jpeg
│   │   ├── mount_disk.jpeg
│   │   ├── port-forward_svc.jpg
│   │   ├── spark_vs_alluxio.jpg
│   │   ├── sparkapplication_svc.jpg
│   │   └── tpcds_per_query.jpeg
│   ├── performance
│ ├── emr-spark.md │ ├── jindofs.md │ ├── oss.md │ ├── remote-shuffle-service.md │ ├── serverless-spark │ │ ├── index.md │ │ └── serverless-spark.jpeg │ └── spark-operator.md └── quickstart │ ├── benchmark_code.md │ ├── benchmark_env.md │ ├── benchmark_result.md │ ├── benchmark_steps.md │ └── debugging_guide.md ├── kubernetes ├── alluxio │ └── config.yaml ├── emr │ ├── jar │ │ └── spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar │ ├── tpcds-benchmark-with-emrspark-ess-jindofs.yaml │ ├── tpcds-benchmark-with-emrspark-ess.yaml │ ├── tpcds-benchmark-with-emrspark-jindofs.yaml │ ├── tpcds-benchmark-with-emrspark.yaml │ └── tpcds-data-generation.yaml └── spark │ ├── tpcds-benchmark-with-alluxio.yaml │ ├── tpcds-benchmark.yaml │ └── tpcds-data-generation.yaml ├── lib └── spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar ├── project ├── build.properties └── plugins.sbt ├── src └── main │ └── scala │ └── com │ └── aliyun │ └── ack │ └── spark │ └── tpcds │ ├── Benchmark.scala │ └── DataGeneration.scala └── terraform └── alicloud ├── datasources.tf ├── graph.svg ├── main.tf ├── modules ├── celeborn │ ├── main.tf │ ├── master_user_data.sh │ ├── variables.tf │ └── worker_user_data.sh ├── cs │ ├── main.tf │ ├── outputs.tf │ └── variables.tf ├── ecs │ ├── main.tf │ ├── outputs.tf │ └── variables.tf ├── fluid │ ├── main.tf │ └── variables.tf ├── oss │ ├── main.tf │ ├── outputs.tf │ └── variables.tf ├── resource-manager │ ├── main.tf │ ├── outputs.tf │ └── variables.tf ├── spark │ ├── main.tf │ ├── user_data.sh │ ├── user_data_arm64.sh │ └── variables.tf └── vpc │ ├── main.tf │ ├── outputs.tf │ └── variables.tf ├── provider.tf ├── root.tf ├── terraform.tfvars └── variables.tf /.dockerignore: -------------------------------------------------------------------------------- 1 | .bsp/ 2 | .git/ 3 | .metals/ 4 | .vscode/ 5 | benchmarks/ 6 | charts/ 7 | docs/ 8 | kubernetes/ 9 | target/ 10 | terraform/ 11 | .dockerignore 12 | .gitignore 13 | .scalafmt.conf 14 | Dockerfile 15 | LICENSE 16 | README.md 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Class files 2 | target/ 3 | 4 | # SBT 5 | .metals/ 6 | .bsp/ 7 | 8 | # Benchmark 9 | benchmark/*/values.yaml 10 | config/benchmark.properties 11 | config/values.yaml 12 | 13 | # Terraform 14 | .terraform/ 15 | .terraform.lock.hcl 16 | .terraform.tfstate.lock.info 17 | terraform.tfstate 18 | terraform.tfstate.backup 19 | 20 | # Various IDEs 21 | .idea/ 22 | .vscode/ 23 | 24 | # Mac OS 25 | .DS_Store 26 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "3.7.15" 2 | runner.dialect = scala212 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG SPARK_IMAGE=spark:3.5.5 2 | 3 | ARG SBT_IMAGE=sbtscala/scala-sbt:eclipse-temurin-jammy-17.0.10_7_1.10.4_2.12.20 4 | 5 | FROM debian:bullseye-slim AS tpcds-kit-builder 6 | 7 | ENV GIT_COMMIT_ID=1b7fb7529edae091684201fab142d956d6afd881 8 | 9 | WORKDIR /app 10 | 11 | RUN set -eux && \ 12 | apt-get update && \ 13 | apt-get install -y gcc make flex bison byacc git 14 | 15 | RUN set -eux && \ 16 | git clone https://github.com/databricks/tpcds-kit.git && \ 17 | cd tpcds-kit && \ 18 | git checkout ${GIT_COMMIT_ID} && \ 19 | cd tools && \ 20 | make 
OS=LINUX 21 | 22 | FROM ${SBT_IMAGE} AS benchmark-builder 23 | 24 | WORKDIR /app 25 | 26 | COPY . . 27 | 28 | RUN set -eux && \ 29 | sbt assembly 30 | 31 | FROM ${SPARK_IMAGE} 32 | 33 | COPY --from=tpcds-kit-builder /app/tpcds-kit/tools /opt/tpcds-kit/tools 34 | 35 | COPY --from=benchmark-builder /app/target/scala-2.12/*.jar /opt/spark/jars/ 36 | 37 | COPY lib /opt/spark/jars/ 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Aliyun (Alibaba Cloud) Container Service 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /benchmarks/arm64/charts/tpcds-benchmark/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: tpcds-benchmark 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: 0.1.0 6 | appVersion: 0.1.0 7 | -------------------------------------------------------------------------------- /benchmarks/arm64/charts/tpcds-benchmark/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "tpcds-benchmark.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "tpcds-benchmark.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 
28 | */}} 29 | {{- define "tpcds-benchmark.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "tpcds-benchmark.labels" -}} 37 | helm.sh/chart: {{ include "tpcds-benchmark.chart" . }} 38 | {{ include "tpcds-benchmark.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "tpcds-benchmark.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "tpcds-benchmark.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "tpcds-benchmark.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "tpcds-benchmark.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /benchmarks/arm64/charts/tpcds-benchmark/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for tpcds-benchmark. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | image: 6 | # -- Image registry 7 | registry: registry-cn-beijing-vpc.ack.aliyuncs.com 8 | # -- Image repository 9 | repository: ack-demo/spark-tpcds-benchmark 10 | # -- Image tag 11 | tag: 3.5.3-0.1 12 | # -- Image pull policy 13 | pullPolicy: IfNotPresent 14 | # -- Image pull secrets 15 | pullSecrets: [] 16 | # - name: pull-secret 17 | 18 | oss: 19 | # -- OSS bucket 20 | bucket: example-bucket 21 | # -- OSS endpoint 22 | endpoint: oss-cn-beijing-internal.aliyuncs.com 23 | 24 | benchmark: 25 | # -- Scale factor 26 | scaleFactor: 3072 27 | # -- Number of iterations 28 | numIterations: 1 29 | # -- Whether to optimize queries 30 | optimizeQueries: false 31 | # -- Filter queries, will run all if empty 32 | queries: [] 33 | # - q70-v2.4 34 | # - q82-v2.4 35 | # - q64-v2.4 36 | -------------------------------------------------------------------------------- /benchmarks/arm64/charts/tpcds-data-generation/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: tpcds-data-generation 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: 0.1.0 6 | appVersion: 0.1.0 7 | -------------------------------------------------------------------------------- /benchmarks/arm64/charts/tpcds-data-generation/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "tpcds-data-generation.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}}
13 | {{- define "tpcds-data-generation.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 | 
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "tpcds-data-generation.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 | 
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "tpcds-data-generation.labels" -}}
37 | helm.sh/chart: {{ include "tpcds-data-generation.chart" . }}
38 | {{ include "tpcds-data-generation.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 | 
45 | {{/*
46 | Selector labels
47 | */}}
48 | {{- define "tpcds-data-generation.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "tpcds-data-generation.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 | 
53 | {{/*
54 | Create the name of the service account to use
55 | */}}
56 | {{- define "tpcds-data-generation.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "tpcds-data-generation.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 | 
--------------------------------------------------------------------------------
/benchmarks/arm64/charts/tpcds-data-generation/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for tpcds-data-generation.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 | 5 | image: 6 | # -- Image registry 7 | registry: registry-cn-beijing-vpc.ack.aliyuncs.com 8 | # -- Image repository 9 | repository: ack-demo/spark-tpcds-benchmark 10 | # -- Image tag 11 | tag: 3.5.3-0.1 12 | # -- Image pull policy 13 | pullPolicy: IfNotPresent 14 | # -- Image pull secrets 15 | pullSecrets: [] 16 | # - name: pull-secret 17 | 18 | oss: 19 | # -- OSS bucket 20 | bucket: example-bucket 21 | # -- OSS endpoint 22 | endpoint: oss-cn-beijing-internal.aliyuncs.com 23 | 24 | benchmark: 25 | # -- Scale factor 26 | scaleFactor: 3072 27 | # -- Number of partitions 28 | numPartitions: 640 29 | -------------------------------------------------------------------------------- /benchmarks/arm64/terraform/alicloud/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | alicloud = { 4 | source = "hashicorp/alicloud" 5 | version = "1.223.2" 6 | } 7 | } 8 | 9 | required_version = ">= 1.8.0" 10 | } 11 | 12 | resource "random_string" "suffix" { 13 | length = 16 14 | lower = true 15 | upper = false 16 | special = false 17 | } 18 | 19 | resource "alicloud_resource_manager_resource_group" "default" { 20 | resource_group_name = "rg-${random_string.suffix.result}" 21 | display_name = "rg-${random_string.suffix.result}" 22 | } 23 | 24 | resource "alicloud_vpc" "default" { 25 | vpc_name = "vpc-${random_string.suffix.result}" 26 | cidr_block = "192.168.0.0/16" 27 | resource_group_id = alicloud_resource_manager_resource_group.default.id 28 | } 29 | 30 | resource "alicloud_vswitch" "default" { 31 | vswitch_name = "vsw-${random_string.suffix.result}" 32 | cidr_block = "192.168.0.0/24" 33 | vpc_id = alicloud_vpc.default.id 34 | zone_id = var.zone_id 35 | } 36 | 37 | resource "alicloud_security_group" "default" { 38 | name = "sg-${random_string.suffix.result}" 39 | vpc_id = alicloud_vpc.default.id 40 | resource_group_id = alicloud_resource_manager_resource_group.default.id 41 | security_group_type = "normal" 42 | } 43 | 44 | resource "alicloud_security_group_rule" "default" { 45 | type = "ingress" 46 | ip_protocol = "all" 47 | port_range = "-1/-1" 48 | cidr_ip = "192.168.0.0/16" 49 | security_group_id = alicloud_security_group.default.id 50 | priority = 1 51 | } 52 | 53 | resource "alicloud_security_group_rule" "icmp" { 54 | type = "ingress" 55 | ip_protocol = "icmp" 56 | port_range = "-1/-1" 57 | cidr_ip = "0.0.0.0/0" 58 | security_group_id = alicloud_security_group.default.id 59 | priority = 1 60 | } 61 | 62 | resource "alicloud_cs_managed_kubernetes" "default" { 63 | name = "ack-${random_string.suffix.result}" 64 | timezone = "Asia/Shanghai" 65 | version = "1.32.1-aliyun.1" 66 | 67 | worker_vswitch_ids = [alicloud_vswitch.default.id] 68 | pod_vswitch_ids = [alicloud_vswitch.default.id] 69 | service_cidr = "172.16.0.0/16" 70 | 71 | addons { 72 | name = "terway-eniip" 73 | } 74 | 75 | proxy_mode = "ipvs" 76 | cluster_domain = "cluster.local" 77 | deletion_protection = false 78 | cluster_spec = "ack.pro.small" 79 | load_balancer_spec = "slb.s1.small" 80 | new_nat_gateway = true 81 | slb_internet_enabled = true 82 | resource_group_id = alicloud_resource_manager_resource_group.default.id 83 | security_group_id = alicloud_security_group.default.id 84 | } 85 | 86 | resource "alicloud_cs_kubernetes_node_pool" "spark-master" { 87 | node_pool_name = "spark-master" 88 | cluster_id = alicloud_cs_managed_kubernetes.default.id 89 | vswitch_ids = [alicloud_vswitch.default.id] 90 | desired_size = var.spark.master.instance_count 91 | 
instance_types = [var.spark.master.instance_type] 92 | image_type = "AliyunLinux3" 93 | system_disk_category = "cloud_essd" 94 | system_disk_size = 40 95 | system_disk_performance_level = "PL1" 96 | 97 | labels { 98 | key = "spark.tpcds.benchmark/role" 99 | value = "spark-master" 100 | } 101 | 102 | resource_group_id = alicloud_resource_manager_resource_group.default.id 103 | security_group_ids = [alicloud_security_group.default.id] 104 | } 105 | 106 | resource "alicloud_cs_kubernetes_node_pool" "spark-worker" { 107 | node_pool_name = "spark-worker" 108 | cluster_id = alicloud_cs_managed_kubernetes.default.id 109 | vswitch_ids = [alicloud_vswitch.default.id] 110 | desired_size = var.spark.worker.instance_count 111 | instance_types = [var.spark.worker.instance_type] 112 | image_type = "AliyunLinux3Arm64" 113 | system_disk_category = "cloud_essd" 114 | system_disk_size = 40 115 | system_disk_performance_level = "PL1" 116 | data_disks { 117 | category = "cloud_essd" 118 | size = 300 119 | performance_level = "PL1" 120 | device = "/dev/vdb" 121 | } 122 | data_disks { 123 | category = "cloud_essd" 124 | size = 300 125 | performance_level = "PL1" 126 | device = "/dev/vdc" 127 | } 128 | data_disks { 129 | category = "cloud_essd" 130 | size = 300 131 | performance_level = "PL1" 132 | device = "/dev/vdd" 133 | } 134 | data_disks { 135 | category = "cloud_essd" 136 | size = 300 137 | performance_level = "PL1" 138 | device = "/dev/vde" 139 | } 140 | data_disks { 141 | category = "cloud_essd" 142 | size = 300 143 | performance_level = "PL1" 144 | device = "/dev/vdf" 145 | } 146 | data_disks { 147 | category = "cloud_essd" 148 | size = 300 149 | performance_level = "PL1" 150 | device = "/dev/vdg" 151 | } 152 | data_disks { 153 | category = "cloud_essd" 154 | size = 40 155 | performance_level = "PL1" 156 | device = "/dev/vdh" 157 | } 158 | 159 | labels { 160 | key = "spark.tpcds.benchmark/role" 161 | value = "spark-worker" 162 | } 163 | 164 | taints { 165 | key = "spark.tpcds.benchmark/role" 166 | value = "spark-worker" 167 | effect = "NoSchedule" 168 | } 169 | 170 | kubelet_configuration { 171 | eviction_hard = { 172 | "imagefs.available" = "5%" 173 | "memory.available" = "100Mi" 174 | "nodefs.available" = "5%" 175 | "nodefs.inodesFree" = "5%" 176 | } 177 | system_reserved = { 178 | cpu = "300m" 179 | memory = "600Mi" 180 | pid = "1000" 181 | } 182 | kube_reserved = { 183 | cpu = "300m" 184 | memory = "600Mi" 185 | pid = "1000" 186 | } 187 | } 188 | 189 | user_data = base64encode(file("user_data.sh")) 190 | 191 | resource_group_id = alicloud_resource_manager_resource_group.default.id 192 | security_group_ids = [alicloud_security_group.default.id] 193 | } 194 | -------------------------------------------------------------------------------- /benchmarks/arm64/terraform/alicloud/provider.tf: -------------------------------------------------------------------------------- 1 | provider "alicloud" { 2 | region = var.region 3 | profile = var.profile 4 | } 5 | -------------------------------------------------------------------------------- /benchmarks/arm64/terraform/alicloud/terraform.tfvars: -------------------------------------------------------------------------------- 1 | region = "cn-beijing" 2 | 3 | zone_id = "cn-beijing-i" 4 | 5 | profile = "default" 6 | 7 | spark = { 8 | master = { 9 | instance_count = 1 10 | instance_type = "ecs.g7.2xlarge" 11 | } 12 | worker = { 13 | instance_count = 6 14 | instance_type = "ecs.g7.8xlarge" 15 | } 16 | } 17 | 
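The main.tf, provider.tf, and terraform.tfvars above fully describe the arm64 benchmark environment. A minimal sketch of how they would typically be applied, assuming the "default" Alibaba Cloud CLI profile referenced in terraform.tfvars is already configured locally:

    # Sketch only: provision the ACK cluster and node pools defined above.
    cd benchmarks/arm64/terraform/alicloud
    terraform init       # downloads the alicloud provider pinned at 1.223.2
    terraform plan       # review the VPC, ACK cluster, and two node pools
    terraform apply      # node counts and instance types come from var.spark
    terraform destroy    # tear everything down once the benchmark is done

Note that desired_size for both node pools defaults to 0 in variables.tf; the counts actually used are the ones set in terraform.tfvars.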
--------------------------------------------------------------------------------
/benchmarks/arm64/terraform/alicloud/user_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Print commands as they are executed
4 | set -ex
5 | 
6 | # Install parted
7 | yum install -y parted e2fsprogs
8 | 
9 | # Create a new partition on each data disk
10 | disks=(/dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1 /dev/nvme6n1)
11 | for disk in ${disks[@]}; do
12 |     parted ${disk} mklabel gpt
13 |     parted ${disk} mkpart primary 1 100%
14 |     parted ${disk} align-check optimal 1
15 | done
16 | partprobe
17 | 
18 | # Create a file system on each partition
19 | for disk in ${disks[@]}; do
20 |     mkfs -t xfs ${disk}p1
21 | done
22 | 
23 | # Mount the partitions
24 | cp /etc/fstab /etc/fstab.bak
25 | n=${#disks[@]}
26 | for ((i = 0; i < n; i++)); do
27 |     dir="/mnt/disk$(($i + 1))"
28 |     mkdir -p ${dir}
29 |     echo "$(blkid ${disks[i]}p1 | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab
30 | done
31 | mount -a
32 | 
33 | # Make the mount points writable
34 | chmod a+w /mnt/disk*
35 | 
--------------------------------------------------------------------------------
/benchmarks/arm64/terraform/alicloud/variables.tf:
--------------------------------------------------------------------------------
1 | variable "region" {
2 |   type    = string
3 |   default = "cn-beijing"
4 | }
5 | 
6 | variable "zone_id" {
7 |   type    = string
8 |   default = "cn-beijing-i"
9 | }
10 | 
11 | variable "profile" {
12 |   type    = string
13 |   default = "default"
14 | }
15 | 
16 | variable "spark" {
17 |   type = object({
18 |     master = object({
19 |       instance_count = number
20 |       instance_type  = string
21 |     })
22 |     worker = object({
23 |       instance_count = number
24 |       instance_type  = string
25 |     })
26 |   })
27 |   default = {
28 |     master = {
29 |       instance_count = 0
30 |       instance_type  = "ecs.g7.2xlarge"
31 |     }
32 |     worker = {
33 |       instance_count = 0
34 |       instance_type  = "ecs.g7.8xlarge"
35 |     }
36 |   }
37 | }
38 | 
--------------------------------------------------------------------------------
/benchmarks/hadoop-aliyun/charts/tpcds-benchmark/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: tpcds-benchmark
3 | description: A Helm chart for Kubernetes
4 | type: application
5 | version: 0.1.0
6 | appVersion: 0.1.0
7 | 
--------------------------------------------------------------------------------
/benchmarks/hadoop-aliyun/charts/tpcds-benchmark/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "tpcds-benchmark.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 | 
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "tpcds-benchmark.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 | 
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}} 29 | {{- define "tpcds-benchmark.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "tpcds-benchmark.labels" -}} 37 | helm.sh/chart: {{ include "tpcds-benchmark.chart" . }} 38 | {{ include "tpcds-benchmark.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "tpcds-benchmark.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "tpcds-benchmark.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "tpcds-benchmark.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "tpcds-benchmark.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /benchmarks/hadoop-aliyun/charts/tpcds-benchmark/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for tpcds-benchmark. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | image: 6 | # -- Image registry 7 | registry: registry-cn-beijing-vpc.ack.aliyuncs.com 8 | # -- Image repository 9 | repository: ack-demo/spark-tpcds-benchmark 10 | # -- Image tag 11 | tag: 3.5.3-0.1 12 | # -- Image pull policy 13 | pullPolicy: IfNotPresent 14 | # -- Image pull secrets 15 | pullSecrets: [] 16 | # - name: pull-secret 17 | 18 | oss: 19 | # -- OSS bucket 20 | bucket: example-bucket 21 | # -- OSS endpoint 22 | endpoint: oss-cn-beijing-internal.aliyuncs.com 23 | 24 | benchmark: 25 | # -- Scale factor 26 | scaleFactor: 3072 27 | # -- Number of iterations 28 | numIterations: 1 29 | # -- Whether to optimize queries 30 | optimizeQueries: false 31 | # -- Filter queries, will run all if empty 32 | queries: [] 33 | # - q70-v2.4 34 | # - q82-v2.4 35 | # - q64-v2.4 36 | -------------------------------------------------------------------------------- /benchmarks/hadoop-aliyun/result.csv: -------------------------------------------------------------------------------- 1 | q1-v2.4,7.555827733,14.645017924,9.525067462200001,2.5904191714618894 2 | q10-v2.4,10.139219717,12.947744009,10.972181106399999,1.0039544386190118 3 | q11-v2.4,55.874544235,57.542876171,56.6287968262,0.5430662413947229 4 | q12-v2.4,4.579125717,5.402954092,4.9307222384,0.32301686392230966 5 | q13-v2.4,15.9073942,18.265080114,17.072347103000002,0.9933099727270132 6 | q14a-v2.4,119.177384819,129.495269281,123.5429690906,3.786778786711343 7 | q14b-v2.4,103.46409887899999,115.289697081,109.85192228479998,4.662932511628337 8 | q15-v2.4,13.842170409,17.986607644,15.4219717892,1.3734498605155039 9 | q16-v2.4,43.282444291,55.276480839,47.590108713199996,4.59185208768297 10 | q17-v2.4,14.042981717,14.980429148999999,14.637328397000001,0.3634001139334673 11 | q18-v2.4,28.403695799,41.949688296,32.238505677199996,4.921754787967268 12 | q19-v2.4,7.363309527,8.245975126,7.656795179400001,0.30514480884743084 13 | q2-v2.4,25.816357012,32.470428279,28.547582622199997,2.451879133915654 14 | 
q20-v2.4,6.086647,6.4193623209999995,6.273996961,0.12623830909269768 15 | q21-v2.4,2.378381887,3.4156040450000003,2.8758012694,0.39138118136789785 16 | q22-v2.4,16.086610273,17.14673522,16.5761784312,0.3809763520523912 17 | q23a-v2.4,258.076754195,269.193093559,265.27780513740004,3.973645169102721 18 | q23b-v2.4,368.528432661,388.671088098,380.94798291300003,7.663851230049627 19 | q24a-v2.4,189.671067115,209.15216272299998,196.59965459739996,7.0735138307511125 20 | q24b-v2.4,169.089763371,182.15292864999998,175.2570714872,4.5769447104419365 21 | q25-v2.4,11.874169405,13.685109596,12.6089382824,0.6396215210419964 22 | q26-v2.4,9.310247432,12.234403261999999,10.9519195662,1.167272378992625 23 | q27-v2.4,11.289865549,15.073064023,13.2927417792,1.2331188089959368 24 | q28-v2.4,96.97509236500001,125.295863472,112.94800023280001,9.869950063745005 25 | q29-v2.4,38.803619543,41.97551136,40.068913475,1.2826696130533022 26 | q3-v2.4,5.364783305,7.824145384,6.2223974736,0.8462572739417858 27 | q30-v2.4,20.625805219999997,21.593344244,20.9982609472,0.3287445731381358 28 | q31-v2.4,17.829719669,25.059305644,21.779922139,2.3771383205792267 29 | q32-v2.4,2.720048041,4.683839247,3.3433007876000005,0.6961096711707798 30 | q33-v2.4,9.704455745999999,12.813910995,11.487099950600001,1.0967935667484785 31 | q34-v2.4,7.898450826,9.110205018,8.256833764800001,0.4484369296701056 32 | q35-v2.4,24.445621609000003,28.419166817,26.066038887200005,1.3988650902774291 33 | q36-v2.4,10.442542739,14.228895264999998,12.410845556599998,1.2762512908986128 34 | q37-v2.4,20.072886561,25.434238496000003,22.857843468,1.9864712650677976 35 | q38-v2.4,30.999390854,31.964420292,31.2859972066,0.34561495021578303 36 | q39a-v2.4,6.8627511519999995,7.2298869009999995,7.0608150454,0.11908449974127251 37 | q39b-v2.4,5.610674997,6.385491304,5.9222366724,0.26585784635081794 38 | q4-v2.4,367.93584179799996,374.496266095,371.539061803,2.3286806342391855 39 | q40-v2.4,16.463025893999998,21.682886973000002,19.2418352114,1.6764054241600101 40 | q41-v2.4,1.05442466,1.5361814,1.2747248748,0.19715515022243038 41 | q42-v2.4,2.554909318,3.899367612,3.4005177458000007,0.4700603955042605 42 | q43-v2.4,8.078254149000001,9.896694641,8.957477204000002,0.6635069337947319 43 | q44-v2.4,41.550018738,54.162684629000005,48.663437501,4.2148260123451955 44 | q45-v2.4,14.805159845,20.566858976,16.8483266144,1.9615425240330668 45 | q46-v2.4,13.708602385,15.891152766,14.930230506799997,0.7300563130353975 46 | q47-v2.4,14.537253275,17.721562365999997,15.533307980199998,1.1445041779908376 47 | q48-v2.4,14.502060713,17.395740532,15.9916758498,0.9875399045826255 48 | q49-v2.4,29.651472364,42.489712072,37.601138269,4.444309640017966 49 | q5-v2.4,37.334653149,40.200471311,39.2507282382,0.9886498881274202 50 | q50-v2.4,101.391678285,105.781127081,102.69716413560002,1.5935547273864317 51 | q51-v2.4,18.693100599,22.018999121,20.032070217199998,1.2816532821197753 52 | q52-v2.4,2.28741606,4.112365464000001,3.1125799768,0.5962140676132831 53 | q53-v2.4,6.969523639,10.692510120000001,8.7085242732,1.199973805766159 54 | q54-v2.4,11.412873501,13.927249072,12.6033706528,0.8285475418649881 55 | q55-v2.4,2.476024336,3.89929411,3.2573972826000004,0.46950323781500314 56 | q56-v2.4,11.167401115,12.48719738,12.0896413296,0.49039838179588596 57 | q57-v2.4,10.011080384000001,11.792459323,11.037184577,0.5917755073183687 58 | q58-v2.4,4.376534702,7.203354707000001,5.953215469,1.0254708580280585 59 | q59-v2.4,23.156396418,34.160966369,28.4156941152,3.49225350518341 60 | 
q6-v2.4,13.070614896999999,15.60744374,14.014995768599999,0.9282019042221119 61 | q60-v2.4,14.731191304,19.66390691,16.5136571402,1.7881210242320205 62 | q61-v2.4,8.821308386,11.460828377,9.676813126599999,0.9203215490588001 63 | q62-v2.4,10.415500262,17.370040453,13.605939325200001,2.32137023353903 64 | q63-v2.4,7.415368265,10.95750388,8.6614520724,1.3092332760032435 65 | q64-v2.4,131.346003199,164.305281001,142.6887317056,11.592601856013589 66 | q65-v2.4,38.243325244,39.455820337,38.797347355599996,0.4272797273899433 67 | q66-v2.4,12.808035632,17.160159465,15.0615725896,1.4475748377297115 68 | q67-v2.4,769.775210296,781.8728767829999,775.7869474142,4.714861010478136 69 | q68-v2.4,8.771130701,10.705161347999999,9.3756038822,0.7047911441610056 70 | q69-v2.4,9.775576563000001,11.114773268999999,10.5136436122,0.5489375903128211 71 | q7-v2.4,13.023328125,16.290491273,14.659974144200001,1.144137952058827 72 | q70-v2.4,18.568223587000002,22.762795158,20.132875812800002,1.5265775159110402 73 | q71-v2.4,6.317815402,9.284696618,7.462921685,1.0245048254410207 74 | q72-v2.4,34.545024402,37.855211897000004,35.791231272000005,1.1375600062074522 75 | q73-v2.4,4.625113768,4.803249523000001,4.7231101598,0.06935726775623359 76 | q74-v2.4,47.742458582,48.627326292999996,48.020538134800006,0.32562188219485755 77 | q75-v2.4,83.773148871,87.936735725,85.45101943979999,1.5109531589349858 78 | q76-v2.4,47.902195574,53.092793132,50.0153055606,1.8751372767480663 79 | q77-v2.4,4.625710815000001,5.525386221,5.0158455474,0.3263215865681226 80 | q78-v2.4,140.072784853,148.793414521,144.359776035,3.7877318101211377 81 | q79-v2.4,8.043859199,10.584506912,8.750632112400002,0.9458295548172388 82 | q8-v2.4,7.488463216,8.756565388999999,8.202961499,0.4802299213424071 83 | q80-v2.4,37.678019479,43.77925282,39.8464992736,2.242299127605053 84 | q81-v2.4,17.389926685,19.717641134,18.400887212199997,0.810545298607923 85 | q82-v2.4,37.872316255,39.366411694,38.380020683000005,0.5355719013008041 86 | q83-v2.4,2.23556385,3.01051841,2.6742351806,0.2563610271711477 87 | q84-v2.4,11.58943321,16.406073601,13.5190641438,1.712521432316087 88 | q85-v2.4,17.730514398,25.712928753,19.6707546448,3.040351179347042 89 | q86-v2.4,4.818967488999999,8.152198697,5.7807977128,1.2028729164093217 90 | q87-v2.4,30.755230952999998,34.140598065999995,31.881264702599992,1.201093326422785 91 | q88-v2.4,90.14547673999999,114.130656328,96.31127402519999,9.030468283157418 92 | q89-v2.4,8.612939995000001,11.532649208,9.692841781800002,0.9873480720382047 93 | q9-v2.4,79.609407603,87.886195604,83.26658918460001,3.1943463401434364 94 | q90-v2.4,11.343873734999999,15.817066508,12.580913439799998,1.635053035123793 95 | q91-v2.4,4.004272509,5.583233218999999,4.7409098592,0.5312560634570823 96 | q92-v2.4,2.679482601,3.689624003,3.0633189884,0.34324438124740586 97 | q93-v2.4,169.17162040300002,178.823478963,176.0718832048,3.604740942204125 98 | q94-v2.4,32.831807897,41.108575898000005,36.379850151599996,3.053098719484592 99 | q95-v2.4,91.738810352,95.363992544,93.6360985476,1.4433886352403469 100 | q96-v2.4,14.032909249000001,19.409014901,16.278186813599998,2.1187887248160058 101 | q97-v2.4,40.717438293,42.967118715999995,42.0139412096,0.8044572221752353 102 | q98-v2.4,7.213829552,9.159006702000001,7.983071198199999,0.682698666912065 103 | q99-v2.4,14.984824776,18.425158197000002,16.483918421400002,1.268318536867176 104 | ss_max-v2.4,28.861745663,35.437128162,31.9152317112,2.2989497172157667 105 | 
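The result.csv above has no header row. Judging by the values, each line appears to hold a query name followed by four per-query statistics over the benchmark iterations, plausibly the minimum, maximum, and mean runtime in seconds plus the standard deviation; this column reading is an assumption, since the file itself does not label them. A small sketch for summarizing the results under that assumption:

    # Assumption: columns are query,min,max,mean,stddev (seconds).
    sort -t, -k4 -g result.csv | tail -n 5   # five slowest queries by mean runtime
    awk -F, '{ total += $4 } END { printf "sum of mean runtimes: %.1f s\n", total }' result.csv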
-------------------------------------------------------------------------------- /benchmarks/hadoop-aliyun/terraform/alicloud/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | alicloud = { 4 | source = "hashicorp/alicloud" 5 | version = "1.223.2" 6 | } 7 | } 8 | 9 | required_version = ">= 1.8.0" 10 | } 11 | 12 | resource "random_string" "suffix" { 13 | length = 16 14 | lower = true 15 | upper = false 16 | special = false 17 | } 18 | 19 | resource "alicloud_resource_manager_resource_group" "default" { 20 | resource_group_name = "rg-${random_string.suffix.result}" 21 | display_name = "rg-${random_string.suffix.result}" 22 | } 23 | 24 | resource "alicloud_vpc" "default" { 25 | vpc_name = "vpc-${random_string.suffix.result}" 26 | cidr_block = "192.168.0.0/16" 27 | resource_group_id = alicloud_resource_manager_resource_group.default.id 28 | } 29 | 30 | resource "alicloud_vswitch" "default" { 31 | vswitch_name = "vsw-${random_string.suffix.result}" 32 | cidr_block = "192.168.0.0/24" 33 | vpc_id = alicloud_vpc.default.id 34 | zone_id = var.zone_id 35 | } 36 | 37 | resource "alicloud_security_group" "default" { 38 | name = "sg-${random_string.suffix.result}" 39 | vpc_id = alicloud_vpc.default.id 40 | resource_group_id = alicloud_resource_manager_resource_group.default.id 41 | security_group_type = "normal" 42 | } 43 | 44 | resource "alicloud_security_group_rule" "default" { 45 | type = "ingress" 46 | ip_protocol = "all" 47 | port_range = "-1/-1" 48 | cidr_ip = "192.168.0.0/16" 49 | security_group_id = alicloud_security_group.default.id 50 | priority = 1 51 | } 52 | 53 | resource "alicloud_security_group_rule" "icmp" { 54 | type = "ingress" 55 | ip_protocol = "icmp" 56 | port_range = "-1/-1" 57 | cidr_ip = "0.0.0.0/0" 58 | security_group_id = alicloud_security_group.default.id 59 | priority = 1 60 | } 61 | 62 | resource "alicloud_cs_managed_kubernetes" "default" { 63 | name = "ack-${random_string.suffix.result}" 64 | timezone = "Asia/Shanghai" 65 | version = "1.32.1-aliyun.1" 66 | 67 | worker_vswitch_ids = [alicloud_vswitch.default.id] 68 | pod_vswitch_ids = [alicloud_vswitch.default.id] 69 | service_cidr = "172.16.0.0/16" 70 | 71 | addons { 72 | name = "terway-eniip" 73 | } 74 | 75 | proxy_mode = "ipvs" 76 | cluster_domain = "cluster.local" 77 | deletion_protection = false 78 | cluster_spec = "ack.pro.small" 79 | load_balancer_spec = "slb.s1.small" 80 | new_nat_gateway = true 81 | slb_internet_enabled = true 82 | resource_group_id = alicloud_resource_manager_resource_group.default.id 83 | security_group_id = alicloud_security_group.default.id 84 | } 85 | 86 | resource "alicloud_cs_kubernetes_node_pool" "spark-master" { 87 | node_pool_name = "spark-master" 88 | cluster_id = alicloud_cs_managed_kubernetes.default.id 89 | vswitch_ids = [alicloud_vswitch.default.id] 90 | desired_size = var.spark.master.instance_count 91 | instance_types = [var.spark.master.instance_type] 92 | image_type = "AliyunLinux3" 93 | system_disk_category = "cloud_essd" 94 | system_disk_size = 40 95 | system_disk_performance_level = "PL1" 96 | 97 | labels { 98 | key = "spark.tpcds.benchmark/role" 99 | value = "spark-master" 100 | } 101 | 102 | resource_group_id = alicloud_resource_manager_resource_group.default.id 103 | security_group_ids = [alicloud_security_group.default.id] 104 | } 105 | 106 | resource "alicloud_cs_kubernetes_node_pool" "spark-worker" { 107 | node_pool_name = "spark-worker" 108 | cluster_id = 
alicloud_cs_managed_kubernetes.default.id
109 |   vswitch_ids    = [alicloud_vswitch.default.id]
110 |   desired_size   = var.spark.worker.instance_count
111 |   instance_types = [var.spark.worker.instance_type]
112 |   image_type     = "AliyunLinux3"
113 |   system_disk_category          = "cloud_essd"
114 |   system_disk_size              = 40
115 |   system_disk_performance_level = "PL1"
116 |   data_disks {
117 |     category          = "cloud_essd"
118 |     size              = 300
119 |     performance_level = "PL1"
120 |     device            = "/dev/vdb"
121 |   }
122 |   data_disks {
123 |     category          = "cloud_essd"
124 |     size              = 300
125 |     performance_level = "PL1"
126 |     device            = "/dev/vdc"
127 |   }
128 |   data_disks {
129 |     category          = "cloud_essd"
130 |     size              = 300
131 |     performance_level = "PL1"
132 |     device            = "/dev/vdd"
133 |   }
134 |   data_disks {
135 |     category          = "cloud_essd"
136 |     size              = 300
137 |     performance_level = "PL1"
138 |     device            = "/dev/vde"
139 |   }
140 |   data_disks {
141 |     category          = "cloud_essd"
142 |     size              = 300
143 |     performance_level = "PL1"
144 |     device            = "/dev/vdf"
145 |   }
146 |   data_disks {
147 |     category          = "cloud_essd"
148 |     size              = 300
149 |     performance_level = "PL1"
150 |     device            = "/dev/vdg"
151 |   }
152 |   data_disks {
153 |     category          = "cloud_essd"
154 |     size              = 40
155 |     performance_level = "PL1"
156 |     device            = "/dev/vdh"
157 |   }
158 | 
159 |   labels {
160 |     key   = "spark.tpcds.benchmark/role"
161 |     value = "spark-worker"
162 |   }
163 | 
164 |   taints {
165 |     key    = "spark.tpcds.benchmark/role"
166 |     value  = "spark-worker"
167 |     effect = "NoSchedule"
168 |   }
169 | 
170 |   kubelet_configuration {
171 |     eviction_hard = {
172 |       "imagefs.available" = "5%"
173 |       "memory.available"  = "100Mi"
174 |       "nodefs.available"  = "5%"
175 |       "nodefs.inodesFree" = "5%"
176 |     }
177 |     system_reserved = {
178 |       cpu    = "300m"
179 |       memory = "600Mi"
180 |       pid    = "1000"
181 |     }
182 |     kube_reserved = {
183 |       cpu    = "300m"
184 |       memory = "600Mi"
185 |       pid    = "1000"
186 |     }
187 |   }
188 | 
189 |   user_data = base64encode(file("user_data.sh"))
190 | 
191 |   resource_group_id  = alicloud_resource_manager_resource_group.default.id
192 |   security_group_ids = [alicloud_security_group.default.id]
193 | }
194 | 
--------------------------------------------------------------------------------
/benchmarks/hadoop-aliyun/terraform/alicloud/provider.tf:
--------------------------------------------------------------------------------
1 | provider "alicloud" {
2 |   region  = var.region
3 |   profile = var.profile
4 | }
5 | 
--------------------------------------------------------------------------------
/benchmarks/hadoop-aliyun/terraform/alicloud/terraform.tfvars:
--------------------------------------------------------------------------------
1 | region = "cn-beijing"
2 | 
3 | zone_id = "cn-beijing-i"
4 | 
5 | profile = "default"
6 | 
7 | spark = {
8 |   master = {
9 |     instance_count = 1
10 |     instance_type  = "ecs.g7.2xlarge"
11 |   }
12 |   worker = {
13 |     instance_count = 6
14 |     instance_type  = "ecs.g7.8xlarge"
15 |   }
16 | }
17 | 
--------------------------------------------------------------------------------
/benchmarks/hadoop-aliyun/terraform/alicloud/user_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Print commands as they are executed
4 | set -ex
5 | 
6 | # Install parted
7 | yum install -y parted e2fsprogs
8 | 
9 | # Create a new partition on each data disk
10 | disks=(/dev/vdb /dev/vdc /dev/vdd /dev/vde /dev/vdf /dev/vdg)
11 | for disk in ${disks[@]}; do
12 |     parted ${disk} mklabel gpt
13 |     parted ${disk} mkpart primary 1 100%
14 |     parted ${disk} align-check optimal 1
15 | done
16 | partprobe
17 | 
18 | # Create a file system on each partition
19 | for disk in ${disks[@]}; do
20 |     mkfs -t xfs ${disk}1
21 | done
22 | 
23 | # Mount the partitions
24 | cp /etc/fstab /etc/fstab.bak
25 | n=${#disks[@]}
26 | for ((i = 0; i < n; i++)); do
27 |     dir="/mnt/disk$(($i + 1))"
28 |     mkdir -p ${dir}
29 |     echo "$(blkid ${disks[i]}1 | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab
30 |     chmod g+w ${dir}
31 | done
32 | mount -a
33 | 
34 | # Make the mount points writable
35 | chmod a+w /mnt/disk*
36 | 
--------------------------------------------------------------------------------
/benchmarks/hadoop-aliyun/terraform/alicloud/variables.tf:
--------------------------------------------------------------------------------
1 | variable "region" {
2 |   type    = string
3 |   default = "cn-beijing"
4 | }
5 | 
6 | variable "zone_id" {
7 |   type    = string
8 |   default = "cn-beijing-i"
9 | }
10 | 
11 | variable "profile" {
12 |   type    = string
13 |   default = "default"
14 | }
15 | 
16 | variable "spark" {
17 |   type = object({
18 |     master = object({
19 |       instance_count = number
20 |       instance_type  = string
21 |     })
22 |     worker = object({
23 |       instance_count = number
24 |       instance_type  = string
25 |     })
26 |   })
27 |   default = {
28 |     master = {
29 |       instance_count = 0
30 |       instance_type  = "ecs.g7.2xlarge"
31 |     }
32 |     worker = {
33 |       instance_count = 0
34 |       instance_type  = "ecs.g7.8xlarge"
35 |     }
36 |   }
37 | }
38 | 
--------------------------------------------------------------------------------
/benchmarks/jindosdk/charts/tpcds-benchmark/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: tpcds-benchmark
3 | description: A Helm chart for Kubernetes
4 | type: application
5 | version: 0.1.0
6 | appVersion: 0.1.0
7 | 
--------------------------------------------------------------------------------
/benchmarks/jindosdk/charts/tpcds-benchmark/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "tpcds-benchmark.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 | 
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "tpcds-benchmark.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 | 
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "tpcds-benchmark.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 | 
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "tpcds-benchmark.labels" -}}
37 | helm.sh/chart: {{ include "tpcds-benchmark.chart" . }}
38 | {{ include "tpcds-benchmark.selectorLabels" .
}} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "tpcds-benchmark.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "tpcds-benchmark.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "tpcds-benchmark.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "tpcds-benchmark.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /benchmarks/jindosdk/charts/tpcds-benchmark/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for tpcds-benchmark. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | image: 6 | # -- Image registry 7 | registry: registry-cn-beijing-vpc.ack.aliyuncs.com 8 | # -- Image repository 9 | repository: ack-demo/spark-tpcds-benchmark 10 | # -- Image tag 11 | tag: 3.5.3-0.1 12 | # -- Image pull policy 13 | pullPolicy: IfNotPresent 14 | # -- Image pull secrets 15 | pullSecrets: [] 16 | # - name: pull-secret 17 | 18 | oss: 19 | # -- OSS bucket 20 | bucket: example-bucket 21 | # -- OSS endpoint 22 | endpoint: oss-cn-beijing-internal.aliyuncs.com 23 | 24 | benchmark: 25 | # -- Scale factor 26 | scaleFactor: 3072 27 | # -- Number of iterations 28 | numIterations: 1 29 | # -- Whether to optimize queries 30 | optimizeQueries: false 31 | # -- Filter queries, will run all if empty 32 | queries: [] 33 | # - q70-v2.4 34 | # - q82-v2.4 35 | # - q64-v2.4 36 | -------------------------------------------------------------------------------- /benchmarks/jindosdk/charts/tpcds-data-generation/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: tpcds-data-generation 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: 0.1.0 6 | appVersion: 0.1.0 7 | -------------------------------------------------------------------------------- /benchmarks/jindosdk/charts/tpcds-data-generation/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "tpcds-data-generation.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}}
13 | {{- define "tpcds-data-generation.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 | 
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "tpcds-data-generation.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 | 
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "tpcds-data-generation.labels" -}}
37 | helm.sh/chart: {{ include "tpcds-data-generation.chart" . }}
38 | {{ include "tpcds-data-generation.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 | 
45 | {{/*
46 | Selector labels
47 | */}}
48 | {{- define "tpcds-data-generation.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "tpcds-data-generation.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 | 
53 | {{/*
54 | Create the name of the service account to use
55 | */}}
56 | {{- define "tpcds-data-generation.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "tpcds-data-generation.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 | 
--------------------------------------------------------------------------------
/benchmarks/jindosdk/charts/tpcds-data-generation/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for tpcds-data-generation.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 | 5 | image: 6 | # -- Image registry 7 | registry: registry-cn-beijing-vpc.ack.aliyuncs.com 8 | # -- Image repository 9 | repository: ack-demo/spark-tpcds-benchmark 10 | # -- Image tag 11 | tag: 3.5.3-0.1 12 | # -- Image pull policy 13 | pullPolicy: IfNotPresent 14 | # -- Image pull secrets 15 | pullSecrets: [] 16 | # - name: pull-secret 17 | 18 | oss: 19 | # -- OSS bucket 20 | bucket: example-bucket 21 | # -- OSS endpoint 22 | endpoint: oss-cn-beijing-internal.aliyuncs.com 23 | 24 | benchmark: 25 | # -- Scale factor 26 | scaleFactor: 3072 27 | # -- Number of partitions 28 | numPartitions: 640 29 | -------------------------------------------------------------------------------- /benchmarks/jindosdk/result.csv: -------------------------------------------------------------------------------- 1 | q1-v2.4,5.82740906,12.178573564,7.1879224572,2.4960589575459804 2 | q10-v2.4,7.573033074,8.666177659,8.1316966384,0.3648029760376505 3 | q11-v2.4,53.436323165000005,53.921298953,53.75794683080001,0.17454812541879453 4 | q12-v2.4,3.947420554,5.6403560420000005,4.431623394400001,0.6216755464731617 5 | q13-v2.4,12.143909611,13.038156586,12.6490554062,0.3300995714054239 6 | q14a-v2.4,107.46807310999999,115.669366161,111.85752393800001,2.8042210551273055 7 | q14b-v2.4,105.816832083,110.095723619,107.19382418820001,1.5379012340740084 8 | q15-v2.4,13.701433589,14.150182401,13.954124861,0.14472680646740804 9 | q16-v2.4,36.780885457,40.265582759,37.7290644368,1.2985744150482166 10 | q17-v2.4,12.995115227,13.452064576000001,13.1343180432,0.16627461371977903 11 | q18-v2.4,26.913184947,28.227103931000002,27.487764137,0.521758556010464 12 | q19-v2.4,6.53442605,6.785478434,6.6804468676,0.09669073284863788 13 | q2-v2.4,21.132480734,22.656578223,21.6629419568,0.5803333423209112 14 | q20-v2.4,5.325541791,5.768802061000001,5.512311510799999,0.17759409301795656 15 | q21-v2.4,1.870814209,2.354133757,2.0752153428,0.16786013784580675 16 | q22-v2.4,15.361660048,15.842826313,15.610420318,0.16316416909917944 17 | q23a-v2.4,252.781698925,267.126780039,263.6303611626,5.445611471905185 18 | q23b-v2.4,362.414191738,396.677759852,380.3411332984,11.85351761507152 19 | q24a-v2.4,168.934801095,195.761364316,179.9287417506,9.510202007542013 20 | q24b-v2.4,156.003102491,186.871001702,165.02367737039998,11.386752754897792 21 | q25-v2.4,9.959931248,10.710937722,10.389834249200002,0.2773901160922852 22 | q26-v2.4,6.968146132,9.120252821,7.9289348678,0.7467140720959528 23 | q27-v2.4,8.532492410000001,10.062598792000001,9.309868940800001,0.5251773595287177 24 | q28-v2.4,79.790235462,86.565475439,83.5998490682,2.7366682823821984 25 | q29-v2.4,35.326786917,37.729033465,36.606352364399996,0.8259770016569948 26 | q3-v2.4,3.6432516919999998,6.147688761,4.305347787600001,0.9331086197419158 27 | q30-v2.4,18.220981156999997,18.633722778,18.437182831999998,0.14174993480267611 28 | q31-v2.4,15.942112084,21.284957720999998,18.4043419582,1.9913891609695626 29 | q32-v2.4,2.345385271,2.577304551,2.4563204285999998,0.07893259517224921 30 | q33-v2.4,8.450874475,12.325880425,10.5759360036,1.4506385441266296 31 | q34-v2.4,5.186600594,5.957797243,5.4886637674,0.2539189080514718 32 | q35-v2.4,20.837288122999997,24.231901855,22.1871265838,1.1473412689581974 33 | q36-v2.4,7.836719654,9.303440971,8.411142502,0.5375508429923208 34 | q37-v2.4,18.839476587,23.085603574,20.425648218,1.6288495424610643 35 | q38-v2.4,29.27162852,29.888590269,29.704503160799998,0.22100625676104002 36 | q39a-v2.4,5.6715128990000006,5.987813289,5.8983753884,0.11670704181381068 37 | 
q39b-v2.4,5.124887764,5.544479965000001,5.3199175924,0.13795567629304478 38 | q4-v2.4,351.110730348,356.019829643,354.574727248,1.803913576819102 39 | q40-v2.4,11.522964602,11.855520104,11.665169434000001,0.12009998943452786 40 | q41-v2.4,0.630769625,0.734893715,0.690501993,0.045762016697730806 41 | q42-v2.4,1.5774040230000002,2.108183575,1.8526063143999998,0.1974445726816084 42 | q43-v2.4,5.243902481,5.536814871,5.4029682634,0.12019622763754292 43 | q44-v2.4,28.837908760999998,34.309312426999995,31.282211627200002,1.8384245102982972 44 | q45-v2.4,13.153637891,13.628053518,13.339359959800001,0.17041278652280728 45 | q46-v2.4,11.287654004,12.271047093,11.758188709399999,0.3187610867133229 46 | q47-v2.4,13.463470857,14.189145694,13.7540722738,0.2650707791268198 47 | q48-v2.4,11.241675956,12.103887507,11.707699290599999,0.325604968552439 48 | q49-v2.4,17.834184225,20.899016822,19.4238399256,1.0754022056717856 49 | q5-v2.4,34.621159285000005,36.314129427,35.1836529778,0.626921312924483 50 | q50-v2.4,95.74525486499999,103.6536799,100.2474700934,2.555714600741113 51 | q51-v2.4,15.991921506999999,16.623763728999997,16.232791363,0.2499447379507674 52 | q52-v2.4,1.7025136520000002,1.887806007,1.7745731660000001,0.06949580697355572 53 | q53-v2.4,5.1523663619999995,5.8181278579999995,5.4547989575999996,0.23320098695816197 54 | q54-v2.4,9.884381209999999,10.559274303,10.2496401932,0.23028904786036253 55 | q55-v2.4,1.722851433,1.854796877,1.8043138242,0.04516697302902219 56 | q56-v2.4,9.618347347,12.203963006,11.2766603764,0.9722058751923929 57 | q57-v2.4,8.914394118,9.394239197000001,9.094486610199999,0.18040418586569554 58 | q58-v2.4,3.500587732,3.7713736840000003,3.6416817162,0.09703387195726251 59 | q59-v2.4,20.758752511999997,21.683708199,21.1298770402,0.32775226611819375 60 | q6-v2.4,12.440352381999999,15.804344041,13.647992317200002,1.1629592307863936 61 | q60-v2.4,13.255234117,13.908884819999999,13.479432537200001,0.23210091359158994 62 | q61-v2.4,6.61329201,7.086961272,6.7748774466,0.17045367659258243 63 | q62-v2.4,7.156209132,7.642842674000001,7.4162772407999995,0.1618788158931069 64 | q63-v2.4,4.832625473,5.236620013,5.0496996798,0.15297990775471795 65 | q64-v2.4,118.185557968,122.85405472,120.3819681836,1.6459305459978475 66 | q65-v2.4,37.528430785999994,38.122415229,37.7123882112,0.20992940071828223 67 | q66-v2.4,11.574929771999999,12.738605019000001,11.8822441496,0.4340046024026489 68 | q67-v2.4,754.5656415010001,785.364204246,771.5511458138001,10.811958985534648 69 | q68-v2.4,7.227422299,7.7298664509999995,7.4512599586,0.2250363828920852 70 | q69-v2.4,7.1655399509999995,7.585426471,7.4151779448,0.1463373047409165 71 | q7-v2.4,9.218510941,30.921400187,14.3854956262,8.305414189617764 72 | q70-v2.4,13.038384813,14.662040791999999,13.803610042399999,0.6790784435658018 73 | q71-v2.4,4.49366801,5.534065178,4.958342104,0.3485496889032831 74 | q72-v2.4,30.596248559,32.154659074,31.222147883800005,0.5634306317924787 75 | q73-v2.4,4.058459936,4.237710261999999,4.1268822094,0.06837608437373675 76 | q74-v2.4,43.92400146,45.089497551,44.507426736,0.40051477172070366 77 | q75-v2.4,70.99690424,77.740359476,73.7622416966,2.5833038102991885 78 | q76-v2.4,37.639858732,44.782194468,41.615109209799996,3.054873881024291 79 | q77-v2.4,2.924383521,3.658697241,3.2924847838000004,0.24097283466005903 80 | q78-v2.4,135.53521893,138.923424414,137.4993615154,1.5130299533763367 81 | q79-v2.4,6.585374795,7.010601788,6.7634720192,0.1431610726607435 82 | q8-v2.4,7.054331802,8.213357267,7.4826677602,0.40186409283060637 
83 | q80-v2.4,30.762710629,33.956214714000005,31.763478078600002,1.1364042478378251
84 | q81-v2.4,17.026213071999997,30.736642845,24.8111300872,6.311979062312886
85 | q82-v2.4,36.560933301,38.963618909,38.080525601199994,1.0161681076399023
86 | q83-v2.4,1.556373215,2.012291026,1.8411018434,0.16810816431476194
87 | q84-v2.4,7.54376091,8.449147638000001,7.9804698894,0.32813136466517406
88 | q85-v2.4,11.305392544,12.246462002,11.6826919822,0.3126755402485902
89 | q86-v2.4,4.166293958,7.4550414609999995,4.846635666799999,1.3044220575400889
90 | q87-v2.4,29.857144283,30.197245811000002,30.0568481086,0.12204064209977733
91 | q88-v2.4,68.946333353,73.469319152,70.9938738528,1.50355082387917
92 | q89-v2.4,6.465980558999999,7.482841177999999,7.048671743599999,0.37705910005996873
93 | q9-v2.4,65.27047729499999,70.363230309,68.0142375148,1.7138701157064395
94 | q90-v2.4,7.551024351,7.847182861,7.724840075,0.14168198677742588
95 | q91-v2.4,2.9543571280000003,3.161031375,3.0627617616,0.08722093797597069
96 | q92-v2.4,1.734802363,2.452249051,1.9558326864000002,0.25509095562300277
97 | q93-v2.4,165.97376033700002,175.536953304,170.9042379306,3.7117345343598487
98 | q94-v2.4,27.420068029,29.638806560000003,28.471682324800003,0.8848546965648024
99 | q95-v2.4,85.688196242,88.987488021,87.09022427260001,1.2108144847560522
100 | q96-v2.4,10.45255504,11.140827458,10.739148343600002,0.2511779065303994
101 | q97-v2.4,39.057097029999994,39.838978937,39.6167914878,0.28567950252043145
102 | q98-v2.4,6.654428135,7.631984391,7.311427204199999,0.3560569231980692
103 | q99-v2.4,10.458687404,11.871515145,11.1029979198,0.49911583952624683
104 | ss_max-v2.4,24.555110707,26.187189824,25.5329044538,0.6290656580040886
105 |
-------------------------------------------------------------------------------- /benchmarks/jindosdk/terraform/alicloud/main.tf: --------------------------------------------------------------------------------
1 | terraform {
2 |   required_providers {
3 |     alicloud = {
4 |       source  = "aliyun/alicloud"
5 |       version = "1.223.2"
6 |     }
7 |   }
8 |
9 |   required_version = ">= 1.8.0"
10 | }
11 |
12 | resource "random_string" "suffix" {
13 |   length  = 16
14 |   lower   = true
15 |   upper   = false
16 |   special = false
17 | }
18 |
19 | resource "alicloud_resource_manager_resource_group" "default" {
20 |   resource_group_name = "rg-${random_string.suffix.result}"
21 |   display_name        = "rg-${random_string.suffix.result}"
22 | }
23 |
24 | resource "alicloud_vpc" "default" {
25 |   vpc_name          = "vpc-${random_string.suffix.result}"
26 |   cidr_block        = "192.168.0.0/16"
27 |   resource_group_id = alicloud_resource_manager_resource_group.default.id
28 | }
29 |
30 | resource "alicloud_vswitch" "default" {
31 |   vswitch_name = "vsw-${random_string.suffix.result}"
32 |   cidr_block   = "192.168.0.0/24"
33 |   vpc_id       = alicloud_vpc.default.id
34 |   zone_id      = var.zone_id
35 | }
36 |
37 | resource "alicloud_security_group" "default" {
38 |   name                = "sg-${random_string.suffix.result}"
39 |   vpc_id              = alicloud_vpc.default.id
40 |   resource_group_id   = alicloud_resource_manager_resource_group.default.id
41 |   security_group_type = "normal"
42 | }
43 |
44 | resource "alicloud_security_group_rule" "default" {
45 |   type              = "ingress"
46 |   ip_protocol       = "all"
47 |   port_range        = "-1/-1"
48 |   cidr_ip           = "192.168.0.0/16"
49 |   security_group_id = alicloud_security_group.default.id
50 |   priority          = 1
51 | }
52 |
53 | resource "alicloud_security_group_rule" "icmp" {
54 |   type              = "ingress"
55 |   ip_protocol       = "icmp"
56 |   port_range        = "-1/-1"
57 |   cidr_ip           = "0.0.0.0/0"
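  # Allow ICMP from any source so instances answer ping during connectivity checks.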
security_group_id = alicloud_security_group.default.id 59 | priority = 1 60 | } 61 | 62 | resource "alicloud_cs_managed_kubernetes" "default" { 63 | name = "ack-${random_string.suffix.result}" 64 | timezone = "Asia/Shanghai" 65 | version = "1.32.1-aliyun.1" 66 | 67 | worker_vswitch_ids = [alicloud_vswitch.default.id] 68 | pod_vswitch_ids = [alicloud_vswitch.default.id] 69 | service_cidr = "172.16.0.0/16" 70 | 71 | addons { 72 | name = "terway-eniip" 73 | } 74 | 75 | proxy_mode = "ipvs" 76 | cluster_domain = "cluster.local" 77 | deletion_protection = false 78 | cluster_spec = "ack.pro.small" 79 | load_balancer_spec = "slb.s1.small" 80 | new_nat_gateway = true 81 | slb_internet_enabled = true 82 | resource_group_id = alicloud_resource_manager_resource_group.default.id 83 | security_group_id = alicloud_security_group.default.id 84 | } 85 | 86 | resource "alicloud_cs_kubernetes_node_pool" "spark-master" { 87 | node_pool_name = "spark-master" 88 | cluster_id = alicloud_cs_managed_kubernetes.default.id 89 | vswitch_ids = [alicloud_vswitch.default.id] 90 | desired_size = var.spark.master.instance_count 91 | instance_types = [var.spark.master.instance_type] 92 | image_type = "AliyunLinux3" 93 | system_disk_category = "cloud_essd" 94 | system_disk_size = 40 95 | system_disk_performance_level = "PL1" 96 | 97 | labels { 98 | key = "spark.tpcds.benchmark/role" 99 | value = "spark-master" 100 | } 101 | 102 | resource_group_id = alicloud_resource_manager_resource_group.default.id 103 | security_group_ids = [alicloud_security_group.default.id] 104 | } 105 | 106 | resource "alicloud_cs_kubernetes_node_pool" "spark-worker" { 107 | node_pool_name = "spark-worker" 108 | cluster_id = alicloud_cs_managed_kubernetes.default.id 109 | vswitch_ids = [alicloud_vswitch.default.id] 110 | desired_size = var.spark.worker.instance_count 111 | instance_types = [var.spark.worker.instance_type] 112 | image_type = "AliyunLinux3" 113 | system_disk_category = "cloud_essd" 114 | system_disk_size = 40 115 | system_disk_performance_level = "PL1" 116 | data_disks { 117 | category = "cloud_essd" 118 | size = 300 119 | performance_level = "PL1" 120 | device = "/dev/vdb" 121 | } 122 | data_disks { 123 | category = "cloud_essd" 124 | size = 300 125 | performance_level = "PL1" 126 | device = "/dev/vdc" 127 | } 128 | data_disks { 129 | category = "cloud_essd" 130 | size = 300 131 | performance_level = "PL1" 132 | device = "/dev/vdd" 133 | } 134 | data_disks { 135 | category = "cloud_essd" 136 | size = 300 137 | performance_level = "PL1" 138 | device = "/dev/vde" 139 | } 140 | data_disks { 141 | category = "cloud_essd" 142 | size = 300 143 | performance_level = "PL1" 144 | device = "/dev/vdf" 145 | } 146 | data_disks { 147 | category = "cloud_essd" 148 | size = 300 149 | performance_level = "PL1" 150 | device = "/dev/vdg" 151 | } 152 | data_disks { 153 | category = "cloud_essd" 154 | size = 40 155 | performance_level = "PL1" 156 | device = "/dev/vdh" 157 | } 158 | 159 | labels { 160 | key = "spark.tpcds.benchmark/role" 161 | value = "spark-worker" 162 | } 163 | 164 | taints { 165 | key = "spark.tpcds.benchmark/role" 166 | value = "spark-worker" 167 | effect = "NoSchedule" 168 | } 169 | 170 | kubelet_configuration { 171 | eviction_hard = { 172 | "imagefs.available" = "5%" 173 | "memory.available" = "100Mi" 174 | "nodefs.available" = "5%" 175 | "nodefs.inodesFree" = "5%" 176 | } 177 | system_reserved = { 178 | cpu = "300m" 179 | memory = "600Mi" 180 | pid = "1000" 181 | } 182 | kube_reserved = { 183 | cpu = "300m" 184 | memory = "600Mi" 
185 |       pid    = "1000"
186 |     }
187 |   }
188 |
189 |   user_data = base64encode(file("user_data.sh"))
190 |
191 |   resource_group_id  = alicloud_resource_manager_resource_group.default.id
192 |   security_group_ids = [alicloud_security_group.default.id]
193 | }
194 |
-------------------------------------------------------------------------------- /benchmarks/jindosdk/terraform/alicloud/provider.tf: --------------------------------------------------------------------------------
1 | provider "alicloud" {
2 |   region  = var.region
3 |   profile = var.profile
4 | }
5 |
-------------------------------------------------------------------------------- /benchmarks/jindosdk/terraform/alicloud/terraform.tfvars: --------------------------------------------------------------------------------
1 | region = "cn-beijing"
2 |
3 | zone_id = "cn-beijing-i"
4 |
5 | profile = "default"
6 |
7 | spark = {
8 |   master = {
9 |     instance_count = 1
10 |     instance_type  = "ecs.g7.2xlarge"
11 |   }
12 |   worker = {
13 |     instance_count = 6
14 |     instance_type  = "ecs.g7.8xlarge"
15 |   }
16 | }
17 |
-------------------------------------------------------------------------------- /benchmarks/jindosdk/terraform/alicloud/user_data.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit on error and print each command as it runs
4 | set -ex
5 |
6 | # Install partitioning and filesystem tools
7 | yum install -y parted e2fsprogs
8 |
9 | # Create a new GPT partition on each data disk
10 | disks=(/dev/vdb /dev/vdc /dev/vdd /dev/vde /dev/vdf /dev/vdg)
11 | for disk in ${disks[@]}; do
12 |     parted ${disk} mklabel gpt
13 |     parted ${disk} mkpart primary 1 100%
14 |     parted ${disk} align-check optimal 1
15 | done
16 | partprobe
17 |
18 | # Create an XFS filesystem on each partition
19 | for disk in ${disks[@]}; do
20 |     mkfs -t xfs ${disk}1
21 | done
22 |
23 | # Mount the partitions and persist them in /etc/fstab
24 | cp /etc/fstab /etc/fstab.bak
25 | n=${#disks[@]}
26 | for ((i = 0; i < n; i++)); do
27 |     dir="/mnt/disk$(($i + 1))"
28 |     mkdir -p ${dir}
29 |     echo "$(blkid ${disks[i]}1 | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab
30 |     chmod g+w ${dir}
31 | done
32 | mount -a
33 |
34 | # Make the mount points writable for all users
35 | chmod a+w /mnt/disk*
36 |
-------------------------------------------------------------------------------- /benchmarks/jindosdk/terraform/alicloud/variables.tf: --------------------------------------------------------------------------------
1 | variable "region" {
2 |   type    = string
3 |   default = "cn-beijing"
4 | }
5 |
6 | variable "zone_id" {
7 |   type    = string
8 |   default = "cn-beijing-i"
9 | }
10 |
11 | variable "profile" {
12 |   type    = string
13 |   default = "default"
14 | }
15 |
16 | variable "spark" {
17 |   type = object({
18 |     master = object({
19 |       instance_count = number
20 |       instance_type  = string
21 |     })
22 |     worker = object({
23 |       instance_count = number
24 |       instance_type  = string
25 |     })
26 |   })
27 |   default = {
28 |     master = {
29 |       instance_count = 0
30 |       instance_type  = "ecs.g7.2xlarge"
31 |     }
32 |     worker = {
33 |       instance_count = 0
34 |       instance_type  = "ecs.g7.8xlarge"
35 |     }
36 |   }
37 | }
38 |
-------------------------------------------------------------------------------- /build.sbt: --------------------------------------------------------------------------------
1 | ThisBuild / organization := "com.aliyun.ack"
2 | ThisBuild / version := "0.1"
3 | ThisBuild / scalaVersion := "2.12.20"
4 |
5 | val sparkVersion = "3.5.5"
6 |
7 | lazy val benchmark = (project in file("."))
8 |   .settings(
9 |     name := "spark-tpcds-benchmark",
10 |     libraryDependencies ++= Seq(
11 |       "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
12 |       "org.apache.spark" %% 
"spark-sql" % sparkVersion % "provided", 13 | "com.github.scopt" %% "scopt" % "4.1.0" 14 | ), 15 | javacOptions ++= Seq("-source", "1.8", "-target", "1.8") 16 | ) 17 | -------------------------------------------------------------------------------- /charts/tpcds-benchmark/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: tpcds-benchmark 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: 0.1.0 6 | appVersion: 0.1.0 7 | -------------------------------------------------------------------------------- /charts/tpcds-benchmark/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "tpcds-benchmark.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "tpcds-benchmark.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "tpcds-benchmark.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "tpcds-benchmark.labels" -}} 37 | helm.sh/chart: {{ include "tpcds-benchmark.chart" . }} 38 | {{ include "tpcds-benchmark.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "tpcds-benchmark.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "tpcds-benchmark.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "tpcds-benchmark.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "tpcds-benchmark.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /charts/tpcds-benchmark/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for tpcds-benchmark. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | image: 6 | # -- Image registry 7 | registry: registry-cn-beijing-vpc.ack.aliyuncs.com 8 | # -- Image repository 9 | repository: ack-demo/spark-tpcds-benchmark 10 | # -- Image tag 11 | tag: 3.5.3-0.1 12 | # -- Image pull policy 13 | pullPolicy: IfNotPresent 14 | # -- Image pull secrets 15 | pullSecrets: [] 16 | # - name: pull-secret 17 | 18 | oss: 19 | # -- OSS bucket 20 | bucket: example-bucket 21 | # -- OSS region 22 | region: cn-beijing 23 | # -- OSS endpoint 24 | endpoint: oss-cn-beijing-internal.aliyuncs.com 25 | 26 | benchmark: 27 | # -- Scale factor 28 | scaleFactor: 3072 29 | # -- Number of iterations 30 | numIterations: 1 31 | # -- Whether to optimize queries 32 | optimizeQueries: false 33 | # -- Filter queries, will run all if empty 34 | queries: [] 35 | # - q70-v2.4 36 | # - q82-v2.4 37 | # - q64-v2.4 38 | 39 | # -- Specifies which SDK to use when accessing OSS. 40 | # Available options are: 41 | # 1. hadoop-aliyun (Ref: https://apache.github.io/hadoop/hadoop-aliyun/tools/hadoop-aliyun/index.html) 42 | # 2. hadoop-aws (Ref: https://apache.github.io/hadoop/hadoop-aws/tools/hadoop-aws/index.html) 43 | # 3. jindoSDK (Ref: https://github.com/aliyun/alibabacloud-jindodata) 44 | sdk: hadoop-aliyun 45 | -------------------------------------------------------------------------------- /charts/tpcds-data-generation/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: tpcds-data-generation 3 | description: A Helm chart for Kubernetes 4 | type: application 5 | version: 0.1.0 6 | appVersion: 0.1.0 7 | -------------------------------------------------------------------------------- /charts/tpcds-data-generation/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "tpcds-data-generation.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "tpcds-data-generation.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "tpcds-data-generation.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "tpcds-data-generation.labels" -}} 37 | helm.sh/chart: {{ include "tpcds-data-generation.chart" . }} 38 | {{ include "tpcds-data-generation.selectorLabels" . 
}}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 |
45 | {{/*
46 | Selector labels
47 | */}}
48 | {{- define "tpcds-data-generation.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "tpcds-data-generation.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 |
53 | {{/*
54 | Create the name of the service account to use
55 | */}}
56 | {{- define "tpcds-data-generation.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "tpcds-data-generation.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 |
-------------------------------------------------------------------------------- /charts/tpcds-data-generation/values.yaml: --------------------------------------------------------------------------------
1 | # Default values for tpcds-data-generation.
2 | # This is a YAML-formatted file.
3 | # Declare variables to be passed into your templates.
4 |
5 | image:
6 |   # -- Image registry
7 |   registry: registry-cn-beijing-vpc.ack.aliyuncs.com
8 |   # -- Image repository
9 |   repository: ack-demo/spark-tpcds-benchmark
10 |   # -- Image tag
11 |   tag: 3.5.3-0.1
12 |   # -- Image pull policy
13 |   pullPolicy: IfNotPresent
14 |   # -- Image pull secrets
15 |   pullSecrets: []
16 |   # - name: pull-secret
17 |
18 | oss:
19 |   # -- OSS bucket
20 |   bucket: example-bucket
21 |   # -- OSS region
22 |   region: cn-beijing
23 |   # -- OSS endpoint
24 |   endpoint: oss-cn-beijing-internal.aliyuncs.com
25 |
26 | benchmark:
27 |   # -- Scale factor
28 |   scaleFactor: 3072
29 |   # -- Number of partitions
30 |   numPartitions: 640
31 |
32 | # -- Specifies which SDK to use when accessing OSS.
33 | # Available options are:
34 | # 1. hadoop-aliyun (Ref: https://apache.github.io/hadoop/hadoop-aliyun/tools/hadoop-aliyun/index.html)
35 | # 2. hadoop-aws (Ref: https://apache.github.io/hadoop/hadoop-aws/tools/hadoop-aws/index.html)
36 | # 3. jindoSDK (Ref: https://github.com/aliyun/alibabacloud-jindodata)
37 | sdk: hadoop-aliyun
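# For example, to generate data through JindoSDK instead of hadoop-aliyun
# (this assumes the image bundles the JindoSDK jars, and the exact value
# spelling must match what the chart's sparkapplication.yaml template expects),
# override the value at install time: helm install ... --set sdk=jindosdk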
38 |
-------------------------------------------------------------------------------- /docs/benchmark/apache-spark-vs-emr-spark.md: --------------------------------------------------------------------------------
1 | # Apache Spark vs. EMR Spark on ACK: Performance Comparison
2 |
3 | ## Overview
4 |
5 | This article runs TPC-DS benchmarks of the same scale with Apache Spark and Alibaba Cloud EMR Spark in the same ACK cluster environment and compares the performance of the two.
6 |
7 | ## Cluster Environment
8 |
9 | | Cluster configuration | Value |
10 | | --------------- | ------------------------------------------------------------ |
11 | | Cluster type | ACK standard cluster |
12 | | Cluster version | 1.16.9-aliyun.1 |
13 | | ECS instances | Instance type: ecs.d1ne.6xlarge<br>OS: CentOS 7.7 64-bit<br>CPU: 24 cores<br>Memory: 96 GB<br>Data disks: 12 x 5500 GB HDD |
14 | | Worker node count | 20 |
15 |
16 | ## Comparison Results
17 |
18 | ### Apache Spark vs. EMR Spark
19 |
20 | Test data: 10 TB
21 |
22 | ![apache-spark-per-10t](../img/apache-spark-per-10t.jpg)
23 |
24 | ![apache-spark-total-10t](../img/apache-spark-total-10t.jpg)
25 |
26 | On the 10 TB dataset, EMR Spark shows roughly a 57% performance improvement over open-source Apache Spark. For the detailed test procedure, see [Running Spark workloads with EMR Spark](../bestpractice/emrspark.md).
27 |
28 | ### EMR Spark vs. EMR Spark + Remote Shuffle Service
29 |
30 | Test data: 10 TB
31 |
32 | ![emr-spark-rss-per-10t](../img/emr-spark-rss-per-10t.jpg)
33 |
34 | ![emr-spark-rss-total-10t](../img/emr-spark-rss-total-10t.jpg)
35 |
36 | On the 10 TB dataset, adding the Remote Shuffle Service brings roughly a 16% performance improvement over EMR Spark alone. For the detailed test procedure, see [Running Spark workloads with EMR Spark + Remote Shuffle Service](../bestpractice/emrspark-ess.md).
37 |
38 | ### EMR Spark vs. EMR Spark + JindoFS
39 |
40 | Test data: 1 TB
41 |
42 | ![emr-spark-jindofs-per-1t](../img/emr-spark-jindofs-per-1t.jpg)
43 |
44 | ![emr-spark-jindofs-total-1t](../img/emr-spark-jindofs-total-1t.jpg)
45 |
46 | On the 1 TB dataset, using JindoFS as a distributed data cache brings roughly a 15% performance improvement over EMR Spark alone. For the detailed test procedure, see [Running Spark workloads with EMR Spark + JindoFS](../bestpractice/emrspark-jindofs.md).
47 |
-------------------------------------------------------------------------------- /docs/benchmark/hadoop-aliyun/cloud-disk-read-write-bps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/hadoop-aliyun/cloud-disk-read-write-bps.png
-------------------------------------------------------------------------------- /docs/benchmark/hadoop-aliyun/cloud-disk-read-write-iops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/hadoop-aliyun/cloud-disk-read-write-iops.png
-------------------------------------------------------------------------------- /docs/benchmark/hadoop-aliyun/cpu-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/hadoop-aliyun/cpu-usage.png
-------------------------------------------------------------------------------- /docs/benchmark/hadoop-aliyun/memory-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/hadoop-aliyun/memory-usage.png
-------------------------------------------------------------------------------- /docs/benchmark/hadoop-aliyun/network-bandwidth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/hadoop-aliyun/network-bandwidth.png
-------------------------------------------------------------------------------- /docs/benchmark/hadoop-aliyun/oss-bandwidth-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/hadoop-aliyun/oss-bandwidth-usage.png
--------------------------------------------------------------------------------
/docs/benchmark/hadoop-aliyun/system-load.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/hadoop-aliyun/system-load.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/cloud-disk-read-write-bps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack-arm64/cloud-disk-read-write-bps.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/cloud-disk-read-write-iops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack-arm64/cloud-disk-read-write-iops.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/cpu-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack-arm64/cpu-usage.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/memory-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack-arm64/memory-usage.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/network-bandwidth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack-arm64/network-bandwidth.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/oss-bandwidth-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack-arm64/oss-bandwidth-usage.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/result.csv: -------------------------------------------------------------------------------- 1 | q1-v2.4,6.403341799,14.113337988,8.110346962000001,3.0072773026535176 2 | q10-v2.4,7.925076748,9.735050749,8.859851016599999,0.7062512866520814 3 | q11-v2.4,44.772593332999996,46.987956649,46.0210590318,0.7955303035720936 4 | q12-v2.4,4.779693126,5.655751446,5.0701750946,0.31502307952753555 5 | q13-v2.4,13.565382688,16.944541015,15.043409411999999,1.1478876927498043 6 | q14a-v2.4,127.972405122,133.39763662899998,130.08566996379997,1.9644635099712715 7 | q14b-v2.4,111.962040462,123.435222735,117.2949632782,4.122875578276983 8 | q15-v2.4,13.654991439,14.299390541000001,14.026876698399999,0.25479388092175814 9 | 
q16-v2.4,38.160023408,51.282346682000004,44.041532853,4.39200820233433 10 | q17-v2.4,12.382950515000001,13.585545535,12.922711180200002,0.3867699635053326 11 | q18-v2.4,29.764483285999997,32.580894582,30.8709355578,0.9551782311484887 12 | q19-v2.4,7.575141244,8.487050711,8.0599446862,0.3015586914436512 13 | q2-v2.4,24.847467029,42.722474430000005,30.238407568400003,6.455989615682667 14 | q20-v2.4,6.07467122,6.521158145999999,6.3502311806,0.16749299927607597 15 | q21-v2.4,2.334248386,3.041132433,2.624342529,0.30168316814964863 16 | q22-v2.4,15.791765379000001,17.470257275,16.3519068954,0.6050697722586454 17 | q23a-v2.4,247.19072309700002,269.194406393,256.7901344234,9.14143306577228 18 | q23b-v2.4,347.095614555,376.91161961499995,359.68113510374997,10.835390097514555 19 | q24a-v2.4,170.73195806,222.394821357,186.81856552999997,18.850993884899363 20 | q24b-v2.4,154.569191914,181.776721361,161.57589051999997,10.191832092747783 21 | q25-v2.4,10.168618281999999,11.310371117999999,10.797841795,0.418828419556337 22 | q26-v2.4,8.32073711,10.909086046,9.5687395584,0.868934197904754 23 | q27-v2.4,9.160393658,11.338790632,10.12739899,0.7975963914623747 24 | q28-v2.4,86.43398751999999,100.024091722,93.70992886719998,4.530918546869185 25 | q29-v2.4,34.694480544,37.394205241,35.6780936814,1.102404252659388 26 | q3-v2.4,4.425652285,10.658408525999999,6.0468644542,2.3246590138935876 27 | q30-v2.4,22.221355283,24.362323517,23.5632527726,0.7897931942834879 28 | q31-v2.4,15.535605863,22.435048117999997,19.1734882166,2.7754359387002783 29 | q32-v2.4,2.7240765799999997,4.294419565,3.4110461104,0.5180495245529745 30 | q33-v2.4,9.486729407999999,14.0507404,10.8410765628,1.6684470542531107 31 | q34-v2.4,5.754013076000001,6.858750014,6.3262469768,0.36247159301883486 32 | q35-v2.4,22.054084172,28.678958477000002,25.5516858866,2.131843967920238 33 | q36-v2.4,9.201601921,11.342347661,10.1660546476,0.8033960435433924 34 | q37-v2.4,20.614719791,25.339465841000003,22.7586753862,1.6267379678394953 35 | q38-v2.4,24.363691602,25.894881488,25.013908631,0.56493746506244 36 | q39a-v2.4,7.2125916100000005,7.898016800000001,7.573020051199999,0.23697983984591808 37 | q39b-v2.4,6.18982408,6.457127251,6.3307544906,0.11155595117662326 38 | q4-v2.4,411.27097534399996,416.429732382,413.1030275622,1.7518852154912812 39 | q40-v2.4,12.701440823,15.116922094,13.6072663976,0.9370664149170012 40 | q41-v2.4,0.735660971,1.064742505,0.8516507424,0.11769377387532427 41 | q42-v2.4,2.1424351539999997,2.566336106,2.3445844576,0.1730397418357677 42 | q43-v2.4,5.221034044,5.895489853,5.578208038800001,0.2675375069381337 43 | q44-v2.4,35.767253492,43.123901767,39.088050853000006,2.7321370388020187 44 | q45-v2.4,14.676387136999999,16.24020423,15.256181076999999,0.6489453313215297 45 | q46-v2.4,14.427242047,15.689932881,15.280348465200001,0.45794322281169697 46 | q47-v2.4,15.574771598,16.561673385,16.024442170999997,0.34720748229917753 47 | q48-v2.4,13.14376774,14.165969597,13.582910882799998,0.3534627105889629 48 | q49-v2.4,24.847529014,27.807120036,25.949745603400004,1.0226374444989852 49 | q5-v2.4,37.217741593,40.841633301,38.1758336198,1.3637240880115 50 | q50-v2.4,88.902628371,106.73569835,97.013035085,6.729219006565264 51 | q51-v2.4,17.650206145,18.522226133,18.0863460476,0.3173514020221161 52 | q52-v2.4,2.1186154459999997,2.22236456,2.1760548741999997,0.04471532999239851 53 | q53-v2.4,5.541361597,6.945895214,6.1308230004,0.47765772015250924 54 | q54-v2.4,10.480562544,11.998553545,11.145047851,0.5785241999873934 55 | 
q55-v2.4,2.282213192,2.596218903,2.4718316440000003,0.11371205184911885 56 | q56-v2.4,9.013889321,13.735663051,10.562323225399998,1.6578829837835554 57 | q57-v2.4,10.846739192,11.813498036999999,11.3235967424,0.30606111764504235 58 | q58-v2.4,5.285776542,6.052705576,5.7267195286,0.2901917190211392 59 | q59-v2.4,24.296451883000003,26.350496043,24.998931687200002,0.7095267391199042 60 | q6-v2.4,15.253336406,16.982870718999997,16.129227191,0.7343692217040048 61 | q60-v2.4,13.171913549000001,18.007604105,15.6198705214,1.7316221019179017 62 | q61-v2.4,7.9207158259999995,8.870970827999999,8.4566962112,0.3511242043446138 63 | q62-v2.4,8.165779923,11.188606775,9.520968088999998,1.0797726459511297 64 | q63-v2.4,5.2532056009999994,6.687694218,6.1321477966,0.49987278890075076 65 | q64-v2.4,131.18216769100002,139.66165989400002,136.3903266718,3.6949907245897045 66 | q65-v2.4,39.527864126,41.438105649,40.6470150722,0.73790726794108 67 | q66-v2.4,11.944895418,16.266170765000002,13.356976803,1.5262260356762227 68 | q67-v2.4,824.834365725,953.592219898,876.5785078716001,43.60044480825489 69 | q68-v2.4,8.018602137,9.789883804999999,8.833794428000001,0.6619565055625116 70 | q69-v2.4,8.086222853,9.677931814999999,8.6095748212,0.5901133398471722 71 | q7-v2.4,10.655323532,17.72042745,12.908784262200001,2.53689258899848 72 | q70-v2.4,12.524132598,14.411998091,13.339655970799999,0.6203014490996702 73 | q71-v2.4,4.796424706,6.902401469,6.1352114306,0.8666732619665505 74 | q72-v2.4,30.185360558,33.343284462,31.847113491400002,1.2618025257467707 75 | q73-v2.4,4.07596665,4.758130047,4.394250324,0.23437617102591404 76 | q74-v2.4,39.609209111,42.034777242,40.538453946800004,0.9273946012601438 77 | q75-v2.4,76.175883508,84.964818563,79.1853127072,3.661016550087643 78 | q76-v2.4,48.110222857000004,60.20403243,52.6974528608,4.4656675887183015 79 | q77-v2.4,3.657823893,4.6782597500000005,4.0992353078,0.35978757135058226 80 | q78-v2.4,135.273596329,144.133638319,138.5148938698,3.2484652059360704 81 | q79-v2.4,7.339861827,8.515751003,7.8078936262,0.42491863604184127 82 | q8-v2.4,7.966001344,10.23591579,8.684960683,0.8252855195166873 83 | q80-v2.4,33.401593257,38.877170373,35.237511009,1.9686386175709432 84 | q81-v2.4,19.417714241000002,46.429227286,36.519808876,9.4715331274212 85 | q82-v2.4,37.396584625,41.270269618,39.6436047382,1.6097228363399385 86 | q83-v2.4,2.184723509,3.33329588,2.665223494,0.372673475182942 87 | q84-v2.4,8.53208389,9.846927667,9.2276907376,0.518592084106568 88 | q85-v2.4,13.002436506,16.702057365,14.891505778200003,1.1800328961216877 89 | q86-v2.4,4.904867805,5.439357235,5.148247554599999,0.17843809022695464 90 | q87-v2.4,24.393585114,25.127197267,24.558069942,0.28488626684885726 91 | q88-v2.4,78.7644847,91.614683828,84.54430226180001,4.242549769900448 92 | q89-v2.4,7.45227157,9.86460246,8.311342793200001,0.8722823283576866 93 | q9-v2.4,78.2763318,91.73873975,84.7820893802,5.341610603371891 94 | q90-v2.4,9.018264886,12.035711667,9.8776809608,1.1016814318457697 95 | q91-v2.4,3.309663625,5.368012049,3.8723665273999996,0.7635696335475226 96 | q92-v2.4,2.275226487,3.165737054,2.548989435,0.3205299610638202 97 | q93-v2.4,189.061058163,193.990356962,191.68693537579998,2.023929208422184 98 | q94-v2.4,29.167269231,34.975495122,31.3409928716,2.01367221037295 99 | q95-v2.4,72.104501149,76.888610978,74.05870401920001,1.7079831665403682 100 | q96-v2.4,11.44562275,14.913310049,12.8495213302,1.313787446313863 101 | q97-v2.4,38.64428042,53.627609378,42.0062587314,5.819342651112627 102 | 
q98-v2.4,6.97448467,8.024952473,7.5863038148,0.3467894862651928 103 | q99-v2.4,12.60409641,15.364135885,13.7726032862,0.9690467806801478 104 | ss_max-v2.4,23.568913382999998,26.611204045999997,24.978771977199997,1.0616156616432846 105 | -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack-arm64/system-load.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack-arm64/system-load.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/cloud-disk-read-write-bps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack/cloud-disk-read-write-bps.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/cloud-disk-read-write-iops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack/cloud-disk-read-write-iops.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/cpu-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack/cpu-usage.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/memory-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack/memory-usage.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/network-bandwidth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack/network-bandwidth.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/oss-bandwidth-usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack/oss-bandwidth-usage.png -------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/result.csv: -------------------------------------------------------------------------------- 1 | q1-v2.4,5.82740906,12.178573564,7.1879224572,2.4960589575459804 2 | q10-v2.4,7.573033074,8.666177659,8.1316966384,0.3648029760376505 3 | q11-v2.4,53.436323165000005,53.921298953,53.75794683080001,0.17454812541879453 4 | q12-v2.4,3.947420554,5.6403560420000005,4.431623394400001,0.6216755464731617 5 | q13-v2.4,12.143909611,13.038156586,12.6490554062,0.3300995714054239 6 | 
q14a-v2.4,107.46807310999999,115.669366161,111.85752393800001,2.8042210551273055 7 | q14b-v2.4,105.816832083,110.095723619,107.19382418820001,1.5379012340740084 8 | q15-v2.4,13.701433589,14.150182401,13.954124861,0.14472680646740804 9 | q16-v2.4,36.780885457,40.265582759,37.7290644368,1.2985744150482166 10 | q17-v2.4,12.995115227,13.452064576000001,13.1343180432,0.16627461371977903 11 | q18-v2.4,26.913184947,28.227103931000002,27.487764137,0.521758556010464 12 | q19-v2.4,6.53442605,6.785478434,6.6804468676,0.09669073284863788 13 | q2-v2.4,21.132480734,22.656578223,21.6629419568,0.5803333423209112 14 | q20-v2.4,5.325541791,5.768802061000001,5.512311510799999,0.17759409301795656 15 | q21-v2.4,1.870814209,2.354133757,2.0752153428,0.16786013784580675 16 | q22-v2.4,15.361660048,15.842826313,15.610420318,0.16316416909917944 17 | q23a-v2.4,252.781698925,267.126780039,263.6303611626,5.445611471905185 18 | q23b-v2.4,362.414191738,396.677759852,380.3411332984,11.85351761507152 19 | q24a-v2.4,168.934801095,195.761364316,179.9287417506,9.510202007542013 20 | q24b-v2.4,156.003102491,186.871001702,165.02367737039998,11.386752754897792 21 | q25-v2.4,9.959931248,10.710937722,10.389834249200002,0.2773901160922852 22 | q26-v2.4,6.968146132,9.120252821,7.9289348678,0.7467140720959528 23 | q27-v2.4,8.532492410000001,10.062598792000001,9.309868940800001,0.5251773595287177 24 | q28-v2.4,79.790235462,86.565475439,83.5998490682,2.7366682823821984 25 | q29-v2.4,35.326786917,37.729033465,36.606352364399996,0.8259770016569948 26 | q3-v2.4,3.6432516919999998,6.147688761,4.305347787600001,0.9331086197419158 27 | q30-v2.4,18.220981156999997,18.633722778,18.437182831999998,0.14174993480267611 28 | q31-v2.4,15.942112084,21.284957720999998,18.4043419582,1.9913891609695626 29 | q32-v2.4,2.345385271,2.577304551,2.4563204285999998,0.07893259517224921 30 | q33-v2.4,8.450874475,12.325880425,10.5759360036,1.4506385441266296 31 | q34-v2.4,5.186600594,5.957797243,5.4886637674,0.2539189080514718 32 | q35-v2.4,20.837288122999997,24.231901855,22.1871265838,1.1473412689581974 33 | q36-v2.4,7.836719654,9.303440971,8.411142502,0.5375508429923208 34 | q37-v2.4,18.839476587,23.085603574,20.425648218,1.6288495424610643 35 | q38-v2.4,29.27162852,29.888590269,29.704503160799998,0.22100625676104002 36 | q39a-v2.4,5.6715128990000006,5.987813289,5.8983753884,0.11670704181381068 37 | q39b-v2.4,5.124887764,5.544479965000001,5.3199175924,0.13795567629304478 38 | q4-v2.4,351.110730348,356.019829643,354.574727248,1.803913576819102 39 | q40-v2.4,11.522964602,11.855520104,11.665169434000001,0.12009998943452786 40 | q41-v2.4,0.630769625,0.734893715,0.690501993,0.045762016697730806 41 | q42-v2.4,1.5774040230000002,2.108183575,1.8526063143999998,0.1974445726816084 42 | q43-v2.4,5.243902481,5.536814871,5.4029682634,0.12019622763754292 43 | q44-v2.4,28.837908760999998,34.309312426999995,31.282211627200002,1.8384245102982972 44 | q45-v2.4,13.153637891,13.628053518,13.339359959800001,0.17041278652280728 45 | q46-v2.4,11.287654004,12.271047093,11.758188709399999,0.3187610867133229 46 | q47-v2.4,13.463470857,14.189145694,13.7540722738,0.2650707791268198 47 | q48-v2.4,11.241675956,12.103887507,11.707699290599999,0.325604968552439 48 | q49-v2.4,17.834184225,20.899016822,19.4238399256,1.0754022056717856 49 | q5-v2.4,34.621159285000005,36.314129427,35.1836529778,0.626921312924483 50 | q50-v2.4,95.74525486499999,103.6536799,100.2474700934,2.555714600741113 51 | q51-v2.4,15.991921506999999,16.623763728999997,16.232791363,0.2499447379507674 52 | 
q52-v2.4,1.7025136520000002,1.887806007,1.7745731660000001,0.06949580697355572 53 | q53-v2.4,5.1523663619999995,5.8181278579999995,5.4547989575999996,0.23320098695816197 54 | q54-v2.4,9.884381209999999,10.559274303,10.2496401932,0.23028904786036253 55 | q55-v2.4,1.722851433,1.854796877,1.8043138242,0.04516697302902219 56 | q56-v2.4,9.618347347,12.203963006,11.2766603764,0.9722058751923929 57 | q57-v2.4,8.914394118,9.394239197000001,9.094486610199999,0.18040418586569554 58 | q58-v2.4,3.500587732,3.7713736840000003,3.6416817162,0.09703387195726251 59 | q59-v2.4,20.758752511999997,21.683708199,21.1298770402,0.32775226611819375 60 | q6-v2.4,12.440352381999999,15.804344041,13.647992317200002,1.1629592307863936 61 | q60-v2.4,13.255234117,13.908884819999999,13.479432537200001,0.23210091359158994 62 | q61-v2.4,6.61329201,7.086961272,6.7748774466,0.17045367659258243 63 | q62-v2.4,7.156209132,7.642842674000001,7.4162772407999995,0.1618788158931069 64 | q63-v2.4,4.832625473,5.236620013,5.0496996798,0.15297990775471795 65 | q64-v2.4,118.185557968,122.85405472,120.3819681836,1.6459305459978475 66 | q65-v2.4,37.528430785999994,38.122415229,37.7123882112,0.20992940071828223 67 | q66-v2.4,11.574929771999999,12.738605019000001,11.8822441496,0.4340046024026489 68 | q67-v2.4,754.5656415010001,785.364204246,771.5511458138001,10.811958985534648 69 | q68-v2.4,7.227422299,7.7298664509999995,7.4512599586,0.2250363828920852 70 | q69-v2.4,7.1655399509999995,7.585426471,7.4151779448,0.1463373047409165 71 | q7-v2.4,9.218510941,30.921400187,14.3854956262,8.305414189617764 72 | q70-v2.4,13.038384813,14.662040791999999,13.803610042399999,0.6790784435658018 73 | q71-v2.4,4.49366801,5.534065178,4.958342104,0.3485496889032831 74 | q72-v2.4,30.596248559,32.154659074,31.222147883800005,0.5634306317924787 75 | q73-v2.4,4.058459936,4.237710261999999,4.1268822094,0.06837608437373675 76 | q74-v2.4,43.92400146,45.089497551,44.507426736,0.40051477172070366 77 | q75-v2.4,70.99690424,77.740359476,73.7622416966,2.5833038102991885 78 | q76-v2.4,37.639858732,44.782194468,41.615109209799996,3.054873881024291 79 | q77-v2.4,2.924383521,3.658697241,3.2924847838000004,0.24097283466005903 80 | q78-v2.4,135.53521893,138.923424414,137.4993615154,1.5130299533763367 81 | q79-v2.4,6.585374795,7.010601788,6.7634720192,0.1431610726607435 82 | q8-v2.4,7.054331802,8.213357267,7.4826677602,0.40186409283060637 83 | q80-v2.4,30.762710629,33.956214714000005,31.763478078600002,1.1364042478378251 84 | q81-v2.4,17.026213071999997,30.736642845,24.8111300872,6.311979062312886 85 | q82-v2.4,36.560933301,38.963618909,38.080525601199994,1.0161681076399023 86 | q83-v2.4,1.556373215,2.012291026,1.8411018434,0.16810816431476194 87 | q84-v2.4,7.54376091,8.449147638000001,7.9804698894,0.32813136466517406 88 | q85-v2.4,11.305392544,12.246462002,11.6826919822,0.3126755402485902 89 | q86-v2.4,4.166293958,7.4550414609999995,4.846635666799999,1.3044220575400889 90 | q87-v2.4,29.857144283,30.197245811000002,30.0568481086,0.12204064209977733 91 | q88-v2.4,68.946333353,73.469319152,70.9938738528,1.50355082387917 92 | q89-v2.4,6.465980558999999,7.482841177999999,7.048671743599999,0.37705910005996873 93 | q9-v2.4,65.27047729499999,70.363230309,68.0142375148,1.7138701157064395 94 | q90-v2.4,7.551024351,7.847182861,7.724840075,0.14168198677742588 95 | q91-v2.4,2.9543571280000003,3.161031375,3.0627617616,0.08722093797597069 96 | q92-v2.4,1.734802363,2.452249051,1.9558326864000002,0.25509095562300277 97 | q93-v2.4,165.97376033700002,175.536953304,170.9042379306,3.7117345343598487 
98 | q94-v2.4,27.420068029,29.638806560000003,28.471682324800003,0.8848546965648024
99 | q95-v2.4,85.688196242,88.987488021,87.09022427260001,1.2108144847560522
100 | q96-v2.4,10.45255504,11.140827458,10.739148343600002,0.2511779065303994
101 | q97-v2.4,39.057097029999994,39.838978937,39.6167914878,0.28567950252043145
102 | q98-v2.4,6.654428135,7.631984391,7.311427204199999,0.3560569231980692
103 | q99-v2.4,10.458687404,11.871515145,11.1029979198,0.49911583952624683
104 | ss_max-v2.4,24.555110707,26.187189824,25.5329044538,0.6290656580040886
105 |
-------------------------------------------------------------------------------- /docs/benchmark/spark-on-ack/system-load.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-ack/system-load.png
-------------------------------------------------------------------------------- /docs/benchmark/spark-on-ecs-vs-on-ack/index.md: --------------------------------------------------------------------------------
1 | # Spark on ACK Benchmark: ECS vs. ACK
2 |
3 | This article runs a TPC-DS benchmark with Scale Factor 3072 on ECS and ACK clusters of the same scale.
4 |
5 | ## Benchmark Environment
6 |
7 | The benchmark environment used in this article:
8 |
9 | | **Cluster type** | ACK Pro |
10 | | ------------ | ---------------------------------------------- |
11 | | **K8s version** | 1.26.3-aliyun.1 |
12 | | **Region** | China North 2 (Beijing) |
13 | | **Instance type** | ecs.g8y.8xlarge (32 vCPU + 128 GB) |
14 | | **Node count** | 1 master node + 6 worker nodes |
15 | | **OS** | Alibaba Cloud Linux 3.2104 LTS 64-bit, ARM edition |
16 | | **Local storage** | 6 x 300 GB ESSD PL1 cloud disks mounted per worker node |
17 |
18 | Notes:
19 |
20 | - The master node is used only to schedule the driver pod; no worker pods are scheduled on it.
21 | - The worker nodes are used to schedule the executor pods.
22 |
23 | ## Benchmark Procedure
24 |
25 | 1. Create the benchmark environment; see [Setting up the benchmark environment](../setup-env/index.md);
26 | 2. Generate the test data; see [Generating the benchmark dataset](../../../charts/tpcds-data-generation/README.md);
27 | 3. Run the benchmark; see [Running the TPC-DS benchmark](../../../charts/tpcds-benchmark/README.md).
28 |
29 | ## Benchmark Configuration
30 |
31 | During the benchmark, 60 executor pods are scheduled in total, 10 per worker node. Each executor pod is assigned 3 CPU cores and 12g of memory (9g on-heap + 3g off-heap), so each node's CPU request is 30 cores and its memory request and limit are both 120g. A sketch of the corresponding Spark settings follows below.
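As a concrete reference, the sizing above maps to standard Spark properties roughly as follows (a sketch only: the 3g of off-heap memory is expressed here as `spark.executor.memoryOverhead`, and how these values are actually injected depends on the chart templates in this repository):

```shell
--conf spark.executor.instances=60 \
--conf spark.executor.cores=3 \
--conf spark.executor.memory=9g \
--conf spark.executor.memoryOverhead=3g
```

With 10 such executors per worker node, this adds up to the per-node CPU request of 30 cores and the 120g memory request/limit stated above.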
32 |
33 | ## Benchmark Results (Reference)
34 |
35 | This benchmark ran a 3 TB (SF=3072) TPC-DS suite on ECS and ACK clusters with the same number and type of instances; the results:
36 |
37 | - On ECS, 3 runs averaged 4786 seconds; on ACK, 5 runs averaged 4758.6 seconds, about 0.5% lower. Given the normal run-to-run variance in query times, the two environments can be considered to deliver practically identical query performance.
38 |
-------------------------------------------------------------------------------- /docs/benchmark/spark-on-x86-vs-on-arm64/cost_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-x86-vs-on-arm64/cost_comparison.png
-------------------------------------------------------------------------------- /docs/benchmark/spark-on-x86-vs-on-arm64/index.md: --------------------------------------------------------------------------------
1 | # Spark on ACK Benchmark: x86_64 vs. arm64
2 |
3 | To compare the performance and cost of Spark on ACK on the x86 and arm64 architectures, this benchmark created x86 and arm64 ACK clusters of identical scale: the same node count, vCPU count, and memory size, with the same number and size of cloud disks attached. The main difference is that the x86 cluster uses ECS instances of the g7 family, while the arm64 cluster uses the g8y family.
4 |
5 | ## Benchmark Environment
6 |
7 | The test environments used in this benchmark:
8 |
9 | | | **x86** | **arm64** |
10 | | ------------ | ------------------------------------------------------------ | ----------------------------------------------------------- |
11 | | **Cluster type** | ACK Pro | ACK Pro |
12 | | **K8s version** | 1.26.3-aliyun.1 | 1.26.3-aliyun.1 |
13 | | **Region** | China North 2 (Beijing) | China North 2 (Beijing) |
14 | | **Instance type** | ecs.g7.8xlarge (32 vCPU + 128 GB) | ecs.g8y.8xlarge (32 vCPU + 128 GB) |
15 | | **Node count** | 1 master node (g7.2xlarge)<br>6 worker nodes (g7.8xlarge) | 1 master node (g8y.2xlarge)<br>6 worker nodes (g8y.8xlarge) |
16 | | **OS** | Alibaba Cloud Linux 3.2104 LTS 64-bit | Alibaba Cloud Linux 3.2104 LTS 64-bit, ARM edition |
17 | | **Image ID** | aliyun_3_x64_20G_alibase_20230727.vhd | aliyun_3_arm64_20G_alibase_20230515.vhd |
18 |
19 | ## Test Dataset
20 |
21 | The dataset used in this benchmark has a scale of SF=3072 with 640 data partitions.
22 |
23 | ## Test Results
24 |
25 | The x86/arm64 performance comparison is shown below:
26 |
27 | ![x86/arm64 performance comparison](performance_comparison.png)
28 |
29 | The x86/arm64 cost comparison is shown below:
30 |
31 | ![x86/arm64 cost comparison](cost_comparison.png)
32 |
33 | Results of the 6-node, 3 TB benchmark:
34 |
35 | - Data generation: arm64 took `3.67%` less time than x86 and reduced ECS cost by `26.35%`.
36 | - End-to-end queries: arm64 took `5.11%` less time than x86 and reduced ECS cost by `27.45%`.
37 |
38 | Notes:
39 |
40 | - ECS cost is computed as `total cost = total query time x cluster unit price`.
41 | - The same job parameters were used on both architectures without per-architecture tuning, so the numbers do not represent peak performance; they are intended for comparing performance/cost across the two architectures.
42 |
-------------------------------------------------------------------------------- /docs/benchmark/spark-on-x86-vs-on-arm64/performance_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/benchmark/spark-on-x86-vs-on-arm64/performance_comparison.png
-------------------------------------------------------------------------------- /docs/benchmark/tpcds-benchmark.md: --------------------------------------------------------------------------------
1 | # Running the TPC-DS Benchmark
2 |
3 | This article describes how to run the TPC-DS benchmark.
4 |
5 | ## Prerequisites
6 |
7 | - [Git](https://git-scm.com/), [Docker](https://www.docker.com/), [kubectl](https://kubernetes.io/docs/reference/kubectl/), and [Helm 3](https://helm.sh/) are installed on your local machine;
8 | - The ossutil tool is installed on your local machine; see [Install ossutil](https://help.aliyun.com/zh/oss/developer-reference/install-ossutil);
9 | - The benchmark environment has been set up; see [Setting up the Spark on ACK benchmark environment](setup.md);
10 | - The TPC-DS benchmark dataset has been generated and uploaded to OSS; see [Generating the TPC-DS dataset](tpcds-data-generation.md).
11 |
12 | ## Submitting the benchmark job
13 |
14 | 1. Run the following command to set the benchmark job parameters:
15 |
16 |    ```shell
17 |    # Scale factor
18 |    SCALE_FACTOR=3072
19 |    ```
20 |
21 | 2. Run the following command to submit the benchmark job:
22 |
23 |    ```shell
24 |    helm install tpcds-benchmark charts/tpcds-benchmark \
25 |        --namespace spark \
26 |        --create-namespace \
27 |        --set image.registry=${IMAGE_REGISTRY} \
28 |        --set image.repository=${IMAGE_REPOSITORY} \
29 |        --set image.tag=${IMAGE_TAG} \
30 |        --set oss.bucket=${OSS_BUCKET} \
31 |        --set oss.endpoint=${OSS_ENDPOINT} \
32 |        --set benchmark.scaleFactor=${SCALE_FACTOR} \
33 |        --set benchmark.numIterations=1
34 |    ```
35 |
36 |    You can pass more options of the form `--set key=value` to configure the benchmark; for the supported options, see `charts/tpcds-benchmark/values.yaml`.
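   For example, to run only a subset of queries rather than the full suite, the `benchmark.queries` list from `values.yaml` can be set on the command line using Helm's list syntax (query names follow the `qNN-v2.4` naming used in the results), appending for instance:

   ```shell
   --set 'benchmark.queries={q70-v2.4,q82-v2.4,q64-v2.4}'
   ```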
37 |
38 | 3. Run the following command to watch the benchmark job status in real time:
39 |
40 |    ```shell
41 |    kubectl get -n spark -w sparkapplication tpcds-benchmark-${SCALE_FACTOR}gb
42 |    ```
43 |
44 | 4. Run the following command to tail the driver pod logs in real time:
45 |
46 |    ```shell
47 |    kubectl logs -n spark -f tpcds-benchmark-${SCALE_FACTOR}gb-driver
48 |    ```
49 |
50 | ## Viewing the benchmark results
51 |
52 | 1. Run the following command to list the benchmark output:
53 |
54 |    ```shell
55 |    ossutil ls -s oss://${OSS_BUCKET}/spark/result/tpcds/${SCALE_FACTOR}gb/
56 |    ```
57 |
58 |    Expected output:
59 |
60 |    ```shell
61 |    oss://example-bucket/spark/result/tpcds/SF=3072/
62 |    oss://example-bucket/spark/result/tpcds/SF=3072/timestamp=1716901969870/
63 |    oss://example-bucket/spark/result/tpcds/SF=3072/timestamp=1716901969870/_SUCCESS
64 |    oss://example-bucket/spark/result/tpcds/SF=3072/timestamp=1716901969870/part-00000-80c681de-ae8d-4449-b647-5e3d373edef1-c000.json
65 |    oss://example-bucket/spark/result/tpcds/SF=3072/timestamp=1716901969870/summary.csv/
66 |    oss://example-bucket/spark/result/tpcds/SF=3072/timestamp=1716901969870/summary.csv/_SUCCESS
67 |    oss://example-bucket/spark/result/tpcds/SF=3072/timestamp=1716901969870/summary.csv/part-00000-5a5d1e4a-3fe0-43a1-8248-3259af4f10a7-c000.csv
68 |    Object Number is: 7
69 |
70 |    0.172532(s) elapsed
71 |    ```
72 |
73 | 2. Run the following command to download the benchmark result from OSS and save it locally as `result.csv`:
74 |
75 |    ```shell
76 |    ossutil cp oss://example-bucket/spark/result/tpcds/SF=3072/timestamp=1716901969870/summary.csv/part-00000-5a5d1e4a-3fe0-43a1-8248-3259af4f10a7-c000.csv result.csv
77 |    ```
78 |
79 | 3. Run the following command to view the benchmark results:
80 |
81 |    ```shell
82 |    cat result.csv
83 |    ```
84 |
85 |    Expected output (truncated):
86 |
87 |    ```shell
88 |    q1-v2.4,13.169382888,13.169382888,13.169382888,0.0
89 |    q10-v2.4,9.502788331,9.502788331,9.502788331,0.0
90 |    q11-v2.4,57.161809588,57.161809588,57.161809588,0.0
91 |    q12-v2.4,5.344221526999999,5.344221526999999,5.344221526999999,0.0
92 |    q13-v2.4,16.183193874,16.183193874,16.183193874,0.0
93 |    q14a-v2.4,121.433786224,121.433786224,121.433786224,0.0
94 |    q14b-v2.4,112.871190193,112.871190193,112.871190193,0.0
95 |    q15-v2.4,14.63114106,14.63114106,14.63114106,0.0
96 |    q16-v2.4,47.082124609,47.082124609,47.082124609,0.0
97 |    q17-v2.4,14.320191869,14.320191869,14.320191869,0.0
98 |    q18-v2.4,30.619759895999998,30.619759895999998,30.619759895999998,0.0
99 |    q19-v2.4,7.874492828999999,7.874492828999999,7.874492828999999,0.0
100 |    q2-v2.4,34.106892226999996,34.106892226999996,34.106892226999996,0.0
101 |    q20-v2.4,6.1991251609999996,6.1991251609999996,6.1991251609999996,0.0
102 |    ...
103 |    ```
104 |
105 | The output has five columns: query name, minimum runtime (s), maximum runtime (s), average runtime (s), and standard deviation (s). Because this example ran only one iteration of the queries, the minimum/maximum/average runtimes are identical and the standard deviation is 0.
106 |
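To spot the slowest queries quickly, the summary can be sorted by the average-runtime column (column 4) with standard coreutils:

```shell
sort -t, -k4 -rn result.csv | head -n 10
```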
107 | ## Cleanup
108 |
109 | 1. Run the following command to delete the benchmark job:
110 |
111 |    ```shell
112 |    helm uninstall -n spark tpcds-benchmark
113 |    ```
114 |
115 | 2. Run the following command to delete the PVC resource:
116 |
117 |    ```shell
118 |    kubectl delete -f oss-pvc.yaml
119 |    ```
120 |
121 | 3. Run the following command to delete the PV resource:
122 |
123 |    ```shell
124 |    kubectl delete -f oss-pv.yaml
125 |    ```
126 |
127 | 4. Run the following command to delete the Secret resource:
128 |
129 |    ```shell
130 |    kubectl delete -f oss-secret.yaml
131 |    ```
132 |
133 | 5. If the bucket created in this example is no longer needed, run the following command to delete it:
134 |
135 |    ```shell
136 |    ossutil rm oss://${OSS_BUCKET} -b
137 |    ```
138 |
139 | Note:
140 |
141 | - Deleting an OSS bucket is irreversible; proceed carefully to avoid data loss.
142 |
143 | 6. Run the following command to destroy the test cluster environment:
144 |
145 |    ```shell
146 |    terraform -chdir=terraform/alicloud destroy
147 |    ```
148 |
-------------------------------------------------------------------------------- /docs/benchmark/tpcds-data-generation.md: --------------------------------------------------------------------------------
1 | # Generating the TPC-DS Dataset
2 |
3 | This article describes how to generate the dataset required by the TPC-DS benchmark.
4 |
5 | ## Prerequisites
6 |
7 | - [Git](https://git-scm.com/), [Docker](https://www.docker.com/), [kubectl](https://kubernetes.io/docs/reference/kubectl/), and [Helm 3](https://helm.sh/) are installed on your local machine.
8 | - The ossutil tool is installed on your local machine; see [Install ossutil](https://help.aliyun.com/zh/oss/developer-reference/install-ossutil);
9 | - The benchmark environment has been set up; see [Setting up the Spark on ACK benchmark environment](setup.md);
10 |
11 | ## Submitting the data generation job
12 |
13 | 1. Run the following command to set the data generation job parameters:
14 |
15 |    ```shell
16 |    # Scale factor
17 |    SCALE_FACTOR=3072
18 |
19 |    # Number of partitions
20 |    NUM_PARTITIONS=640
21 |    ```
22 |
23 | 2. Run the following command to submit the data generation job:
24 |
25 |    ```shell
26 |    helm install tpcds-data-generation charts/tpcds-data-generation \
27 |        --namespace spark \
28 |        --create-namespace \
29 |        --set image.registry=${IMAGE_REGISTRY} \
30 |        --set image.repository=${IMAGE_REPOSITORY} \
31 |        --set image.tag=${IMAGE_TAG} \
32 |        --set oss.bucket=${OSS_BUCKET} \
33 |        --set oss.endpoint=${OSS_ENDPOINT} \
34 |        --set benchmark.scaleFactor=${SCALE_FACTOR} \
35 |        --set benchmark.numPartitions=${NUM_PARTITIONS}
36 |    ```
37 |
38 | 3. Run the following command to watch the Spark job status in real time:
39 |
40 |    ```shell
41 |    kubectl get -n spark -w sparkapplication tpcds-data-generation-${SCALE_FACTOR}gb
42 |    ```
43 |
44 | 4. Run the following command to tail the driver logs in real time:
45 |
46 |    ```shell
47 |    kubectl logs -n spark -f tpcds-data-generation-${SCALE_FACTOR}gb-driver
48 |    ```
49 |
50 | ## Viewing the dataset
51 |
52 | After the job completes, run the following command to view the directory structure of the generated dataset:
53 |
54 | ```shell
55 | ossutil ls -d oss://${OSS_BUCKET}/spark/data/tpcds/${SCALE_FACTOR}gb/
56 | ```
57 |
58 | Expected output:
59 |
60 | ```text
61 | oss://example-bucket/spark/data/tpcds/SF=3072/
62 | oss://example-bucket/spark/data/tpcds/SF=3072/call_center/
63 | oss://example-bucket/spark/data/tpcds/SF=3072/catalog_page/
64 | oss://example-bucket/spark/data/tpcds/SF=3072/catalog_returns/
65 | oss://example-bucket/spark/data/tpcds/SF=3072/catalog_sales/
66 | oss://example-bucket/spark/data/tpcds/SF=3072/customer/
67 | oss://example-bucket/spark/data/tpcds/SF=3072/customer_address/
68 | oss://example-bucket/spark/data/tpcds/SF=3072/customer_demographics/
69 | oss://example-bucket/spark/data/tpcds/SF=3072/date_dim/
70 | oss://example-bucket/spark/data/tpcds/SF=3072/household_demographics/
71 | oss://example-bucket/spark/data/tpcds/SF=3072/income_band/
72 | oss://example-bucket/spark/data/tpcds/SF=3072/inventory/
73 | oss://example-bucket/spark/data/tpcds/SF=3072/item/
74 | oss://example-bucket/spark/data/tpcds/SF=3072/promotion/
75 | oss://example-bucket/spark/data/tpcds/SF=3072/reason/
76 | oss://example-bucket/spark/data/tpcds/SF=3072/ship_mode/
77 | oss://example-bucket/spark/data/tpcds/SF=3072/store/
78 | oss://example-bucket/spark/data/tpcds/SF=3072/store_returns/
79 | oss://example-bucket/spark/data/tpcds/SF=3072/store_sales/
80 | oss://example-bucket/spark/data/tpcds/SF=3072/time_dim/
81 | oss://example-bucket/spark/data/tpcds/SF=3072/warehouse/
82 | oss://example-bucket/spark/data/tpcds/SF=3072/web_page/
83 | oss://example-bucket/spark/data/tpcds/SF=3072/web_returns/
84 | oss://example-bucket/spark/data/tpcds/SF=3072/web_sales/
85 | oss://example-bucket/spark/data/tpcds/SF=3072/web_site/
86 | Object and Directory Number is: 25
87 |
88 | 0.446278(s) elapsed
89 | ```
90 |
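To check the total size of the generated dataset, recent ossutil builds also provide a `du` command (shown here as a sketch; if your ossutil version lacks `du`, fall back to `ossutil ls -s` with the same prefix):

```shell
ossutil du oss://${OSS_BUCKET}/spark/data/tpcds/${SCALE_FACTOR}gb/
```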
-------------------------------------------------------------------------------- /docs/bestpractice/emrspark-ess.md: --------------------------------------------------------------------------------
1 | This document describes how to run Spark jobs on ACK and optimize their performance with EMR Spark Core and the Remote Shuffle Service.
2 | 
3 | ### Prerequisites
4 | - An ACK standard dedicated cluster with 20 worker nodes of the big-data instance type ecs.d1ne.6xlarge.
5 | - Alibaba Cloud OSS, with a bucket created to substitute into the OSS settings in the YAML file below.
6 | - A 10 TB dataset generated with TPC-DS and stored on Alibaba Cloud OSS. For details, see [Generating data](./generate-data.md).
7 | 
8 | ### Environment preparation
9 | - **Mount the data disks on the worker nodes**
10 | 
11 |   An ecs.d1ne.6xlarge instance ships with 12 x 5500 GB HDD data disks by default. These disks must be mounted before they can be used, as follows:
12 | 
13 |   ```shell
14 |   wget https://shilei-tpc-ds.oss-cn-beijing.aliyuncs.com/tools/mount.tgz
15 |   tar -xzvf mount.tgz
16 |   cd mount/
17 |   ./mount
18 |   # SSH password: after you enter the SSH password, the disks are mounted automatically
19 |   ```
20 | 
21 | - **Install ack-spark-operator**
22 | 
23 |   Installing the ack-spark-operator component lets you use the ACK Spark Operator to simplify job submission.
24 | 
25 |   1). Log in to the Container Service console.
26 | 
27 |   2). In the left navigation pane of the console, choose **Marketplace > App Catalog**.
28 | 
29 |   3). On the **App Catalog** page, find and click **ack-spark-operator**.
30 | 
31 |   4). On the right side of the **App Catalog - ack-spark-operator** page, click **Create**.
32 | 
33 | - **Install ack-spark-history-server** (optional)
34 | 
35 |   ACK Spark History Server records the logs and events produced while Spark runs tasks and provides a UI that helps with troubleshooting.
36 | 
37 |   When creating the **ack-spark-history-server** component, configure the OSS settings on the **Parameters** tab; they are used to store Spark history data.
38 | 
39 |   1). Log in to the Container Service console.
40 | 
41 |   2). In the left navigation pane of the console, choose **Marketplace > App Catalog**.
42 | 
43 |   3). On the **App Catalog** page, find and click **ack-spark-history-server**.
44 | 
45 |   4). On the right side of the **App Catalog - ack-spark-history-server** page, click **Create**.
46 | 
47 | - **Deploy remote-shuffle-service**
48 | 
49 |   To obtain the installation instructions for remote-shuffle-service, contact us in the DingTalk group.
50 | 
51 | 
52 | ### Submit the Spark job
53 | 
54 | ```yaml
55 | apiVersion: "sparkoperator.k8s.io/v1beta2"
56 | kind: SparkApplication
57 | metadata:
58 |   name: tpcds-benchmark-emrspark-ess-10t
59 |   namespace: default
60 | spec:
61 |   type: Scala
62 |   mode: cluster
63 |   image: "" # contact us in the DingTalk group to obtain the image
64 |   imagePullPolicy: Always
65 |   mainClass: com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone
66 |   mainApplicationFile: "oss:///jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar"
67 |   arguments:
68 |     - "--dataset_location"
69 |     - "oss:///datasets/"
70 |     - "--output_location"
71 |     - "oss:///outputs/ack-pr-10t-emr-with-ess"
72 |     - "--iterations"
73 |     - "1"
74 |     - "--shuffle_partitions"
75 |     - "1000"
76 |     - "--scale_factor"
77 |     - "10000"
78 |     - "--regenerate_dataset"
79 |     - "false"
80 |     - "--regenerate_metadata"
81 |     - "false"
82 |     - "--only_generate_data_and_meta"
83 |     - "false"
84 |     - "--format"
85 |     - "parquet"
86 |     - "--query_exclude_list"
87 |     - "q14a,q14b,q67"
88 |   sparkVersion: 2.4.5
89 |   restartPolicy:
90 |     type: Never
91 |   hadoopConf:
92 |     "fs.oss.impl": "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem"
93 |     "fs.oss.endpoint": ""
94 |     "fs.oss.accessKeyId": ""
95 |     "fs.oss.accessKeySecret": ""
96 |     hive.metastore.uris: thrift://service-hive-metastore.default:9083
97 |     hive.metastore.client.socket.timeout: 600s
98 |   sparkConf:
99 |     spark.eventLog.enabled: "true"
100 |     spark.eventLog.dir: "oss:///spark/eventlogs"
101 |     spark.driver.extraJavaOptions: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
102 |     spark.driver.maxResultSize: 40g
103 |     spark.executor.extraJavaOptions: "-XX:MaxDirectMemorySize=6g -XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
104 |     spark.locality.wait.node: "0"
105 |     spark.locality.wait.process: "0"
106 |     spark.locality.wait.rack: "0"
107 |     spark.locality.wait: "0"
108 |     spark.memory.fraction: "0.8"
109 |     spark.memory.offHeap.enabled: "false"
110 |     spark.memory.offHeap.size: "17179869184"
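    # Added note: spark.memory.offHeap.size only takes effect when
    # spark.memory.offHeap.enabled is "true"; with off-heap memory disabled
    # here, the size value is ignored and is kept only so that off-heap
    # memory can be enabled by flipping a single flag.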
111 |     spark.sql.adaptive.bloomFilterJoin.enabled: "false"
112 |     spark.sql.adaptive.enabled: "false"
113 |     spark.sql.analyze.column.async.delay: "200"
114 |     spark.sql.auto.reused.cte.enabled: "true"
115 |     spark.sql.broadcastTimeout: "3600"
116 |     spark.sql.columnVector.offheap.enabled: "false"
117 |     spark.sql.crossJoin.enabled: "true"
118 |     spark.sql.delete.optimizeInSubquery: "true"
119 |     spark.sql.dynamic.runtime.filter.bbf.enabled: "false"
120 |     spark.sql.dynamic.runtime.filter.enabled: "true"
121 |     spark.sql.dynamic.runtime.filter.exact.enabled: "true"
122 |     spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520"
123 |     spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120"
124 |     spark.sql.files.openCostInBytes: "34108864"
125 |     spark.sql.inMemoryColumnarStorage.compressed: "true"
126 |     spark.sql.join.preferNativeJoin: "false"
127 |     spark.sql.native.codecache: "true"
128 |     spark.sql.native.codegen.wholeStage: "false"
129 |     spark.sql.native.nativewrite: "false"
130 |     spark.sql.pkfk.optimize.enable: "true"
131 |     spark.sql.pkfk.riJoinElimination: "true"
132 |     spark.sql.shuffle.partitions: "1000"
133 |     spark.sql.simplifyDecimal.enabled: "true"
134 |     spark.sql.sources.parallelPartitionDiscovery.parallelism: "432"
135 |     spark.sql.sources.parallelPartitionDiscovery.threshold: "32"
136 |     spark.shuffle.reduceLocality.enabled: "false"
137 |     spark.shuffle.service.enabled: "true"
138 |     spark.dynamicAllocation.enabled: "false"
139 |     spark.shuffle.manager: org.apache.spark.shuffle.ess.EssShuffleManager
140 |     spark.ess.master.host: emr-rss-master.spark-rss
141 |     spark.ess.master.port: "9099"
142 |     spark.ess.push.data.buffer.size: 64k
143 |     spark.ess.push.data.max.inflight: "2048"
144 |     spark.ess.rpc.io.clientThreads: "8"
145 |     spark.ess.rpc.io.serverThreads: "8"
146 |     spark.ess.data.io.clientThreads: "8"
147 |     spark.ess.data.io.numConnectionsPerPeer: "8"
148 |   driver:
149 |     cores: 15
150 |     coreLimit: 15000m
151 |     memory: 50g
152 |     labels:
153 |       version: 2.4.5
154 |     serviceAccount: spark
155 |     env:
156 |       - name: TZ
157 |         value: "Asia/Shanghai"
158 |   executor:
159 |     cores: 4
160 |     coreLimit: 6000m
161 |     instances: 20
162 |     memory: 24g
163 |     memoryOverhead: 10g
164 |     deleteOnTermination: false
165 |     labels:
166 |       version: 2.4.5
167 |     env:
168 |       - name: TZ
169 |         value: "Asia/Shanghai"
170 | ```
171 | For the complete YAML file, see [tpcds-benchmark-with-emrspark-ess](../../kubernetes/emr/tpcds-benchmark-with-emrspark-ess.yaml). The jar referenced by spec.mainApplicationFile
172 | can be [downloaded here](../../kubernetes/emr/jar/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar) and uploaded to your own OSS bucket.
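Assuming the manifest above is saved locally as `tpcds-benchmark-with-emrspark-ess.yaml` (a filename chosen here for illustration), with the image field and the OSS placeholders filled in, the job can be submitted and watched with kubectl:

```shell
kubectl apply -f tpcds-benchmark-with-emrspark-ess.yaml
kubectl get sparkapplication tpcds-benchmark-emrspark-ess-10t -n default -w
```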
-------------------------------------------------------------------------------- /docs/bestpractice/generate-data.md: --------------------------------------------------------------------------------
1 | This document describes how to generate test data on ACK with EMR Spark and TPC-DS.
2 | 
3 | ### Prerequisites
4 | - An ACK standard dedicated cluster with 20 worker nodes of the big-data instance type ecs.d1ne.6xlarge.
5 | - Alibaba Cloud OSS, with a bucket created to substitute into the OSS settings in the YAML file below.
6 | 
7 | ### Environment preparation
8 | 
9 | - **Install ack-spark-operator**
10 | 
11 |   Installing the ack-spark-operator component lets you use the ACK Spark Operator to simplify job submission.
12 | 
13 |   1). Log in to the Container Service console.
14 | 
15 |   2). In the left navigation pane of the console, choose **Marketplace > App Catalog**.
16 | 
17 |   3). On the **App Catalog** page, find and click **ack-spark-operator**.
18 | 
19 |   4). On the right side of the **App Catalog - ack-spark-operator** page, click **Create**.
20 | 
21 | - **Install ack-spark-history-server** (optional)
22 | 
23 |   ACK Spark History Server records the logs and events produced while Spark runs tasks and provides a UI that helps with troubleshooting.
24 | 
25 |   When creating the **ack-spark-history-server** component, configure the OSS settings on the **Parameters** tab; they are used to store Spark history data.
26 | 
27 |   1). Log in to the Container Service console.
28 | 
29 |   2). In the left navigation pane of the console, choose **Marketplace > App Catalog**.
30 | 
31 |   3). On the **App Catalog** page, find and click **ack-spark-history-server**.
32 | 
33 |   4). On the right side of the **App Catalog - ack-spark-history-server** page, click **Create**.
34 | 
35 | 
36 | ### Submit the Spark job
37 | 
38 | ```yaml
39 | apiVersion: "sparkoperator.k8s.io/v1beta2"
40 | kind: SparkApplication
41 | metadata:
42 |   name: tpcds-data-generation-10t
43 |   namespace: default
44 | spec:
45 |   type: Scala
46 |   mode: cluster
47 |   image: registry.cn-beijing.aliyuncs.com/zf-spark/spark-2.4.5:for-tpc-ds-2
48 |   imagePullPolicy: Always
49 |   mainClass: com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone
50 |   mainApplicationFile: "oss:///jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar"
51 |   arguments:
52 |     - "--dataset_location"
53 |     - "oss:///datasets/"
54 |     - "--output_location"
55 |     - "oss:///outputs/ack-pr-10t-emr"
56 |     - "--iterations"
57 |     - "1"
58 |     - "--shuffle_partitions"
59 |     - "1000"
60 |     - "--scale_factor"
61 |     - "10000" # size of the dataset to generate; the default unit is GB
62 |     - "--regenerate_dataset"
63 |     - "true"
64 |     - "--regenerate_metadata"
65 |     - "true"
66 |     - "--only_generate_data_and_meta"
67 |     - "true"
68 |     - "--format"
69 |     - "parquet"
70 |   sparkVersion: 2.4.5
71 |   restartPolicy:
72 |     type: Never
73 |   sparkConf:
74 |     spark.eventLog.enabled: "true"
75 |     spark.eventLog.dir: "oss:///spark/eventlogs"
76 |     spark.driver.extraJavaOptions: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
77 |     spark.driver.maxResultSize: 40g
78 |     spark.executor.extraJavaOptions: "-XX:MaxDirectMemorySize=32g -XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
79 |     spark.locality.wait.node: "0"
80 |     spark.locality.wait.process: "0"
81 |     spark.locality.wait.rack: "0"
82 |     spark.locality.wait: "0"
83 |     spark.memory.fraction: "0.8"
84 |     spark.memory.offHeap.enabled: "false"
85 |     spark.memory.offHeap.size: "17179869184"
86 |     spark.sql.adaptive.bloomFilterJoin.enabled: "false"
87 |     spark.sql.adaptive.enabled: "false"
88 |     spark.sql.analyze.column.async.delay: "200"
89 |     spark.sql.auto.reused.cte.enabled: "true"
90 |     spark.sql.broadcastTimeout: "3600"
91 |     spark.sql.columnVector.offheap.enabled: "false"
92 |     spark.sql.crossJoin.enabled: "true"
93 |     spark.sql.delete.optimizeInSubquery: "true"
94 |     spark.sql.dynamic.runtime.filter.bbf.enabled: "false"
95 |     spark.sql.dynamic.runtime.filter.enabled: "true"
96 |     spark.sql.dynamic.runtime.filter.exact.enabled: "true"
97 |     spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520"
98 |     spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120"
99 |     spark.sql.files.openCostInBytes: "34108864"
100 |     spark.sql.inMemoryColumnarStorage.compressed: "true"
101 |     spark.sql.join.preferNativeJoin: "false"
102 |     spark.sql.native.codecache: "true"
103 |     spark.sql.native.codegen.wholeStage: "false"
104 |     spark.sql.native.nativewrite: "false"
105 |     spark.sql.pkfk.optimize.enable: "true"
106 |     spark.sql.pkfk.riJoinElimination: "true"
107 |     spark.sql.shuffle.partitions: "1000"
108 |     spark.sql.simplifyDecimal.enabled: "true"
109 |     spark.sql.sources.parallelPartitionDiscovery.parallelism: "432"
110 |     spark.sql.sources.parallelPartitionDiscovery.threshold: "32"
111 |     spark.shuffle.reduceLocality.enabled: "false"
112 |     spark.shuffle.service.enabled: "true"
113 |     spark.dynamicAllocation.enabled: "false"
114 |   driver:
115 |     cores: 15
116 |     coreLimit: 15000m
117 |     memory: 30g
118 |     labels:
119 |       version: 2.4.5
120 |     serviceAccount: spark
121 |     env:
122 |       - name: TZ
123 |         value: "Asia/Shanghai"
124 |   executor:
125 |     cores: 8
126 |     coreLimit: 8000m
127 |     instances: 20
128 |     memory: 24g
129 |     labels:
130 |       version: 2.4.5
131 |     env:
132 |       - name: TZ
133 |         value: "Asia/Shanghai"
134 | ```
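After submitting the manifest with `kubectl apply`, progress can be followed through the SparkApplication status and the driver log. The driver pod name below assumes the Spark Operator's default `<application-name>-driver` naming convention:

```shell
kubectl get sparkapplication tpcds-data-generation-10t -n default -w
kubectl logs -n default -f tpcds-data-generation-10t-driver
```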
135 | For the complete YAML file, see [tpcds-data-generation](../../kubernetes/emr/tpcds-data-generation.yaml). The jar referenced by spec.mainApplicationFile
136 | can be [downloaded here](../../kubernetes/emr/jar/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar) and uploaded to your own OSS bucket.
-------------------------------------------------------------------------------- /docs/img/alluxio-overview.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/alluxio-overview.jpg
-------------------------------------------------------------------------------- /docs/img/alluxio_capacity.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/alluxio_capacity.png
-------------------------------------------------------------------------------- /docs/img/apache-spark-per-10t.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/apache-spark-per-10t.jpg
-------------------------------------------------------------------------------- /docs/img/apache-spark-total-10t.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/apache-spark-total-10t.jpg
-------------------------------------------------------------------------------- /docs/img/create_ack_cluster.jpeg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/create_ack_cluster.jpeg
-------------------------------------------------------------------------------- /docs/img/emr-spark-ess-jindofs-per-1t.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/emr-spark-ess-jindofs-per-1t.jpg
-------------------------------------------------------------------------------- /docs/img/emr-spark-ess-jindofs-total-1t.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/emr-spark-ess-jindofs-total-1t.jpg
-------------------------------------------------------------------------------- /docs/img/emr-spark-jindofs-per-1t.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/emr-spark-jindofs-per-1t.jpg
-------------------------------------------------------------------------------- /docs/img/emr-spark-jindofs-total-1t.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/emr-spark-jindofs-total-1t.jpg
-------------------------------------------------------------------------------- /docs/img/emr-spark-rss-per-10t.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/emr-spark-rss-per-10t.jpg -------------------------------------------------------------------------------- /docs/img/emr-spark-rss-total-10t.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/emr-spark-rss-total-10t.jpg -------------------------------------------------------------------------------- /docs/img/get_spark_history_svc.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/get_spark_history_svc.jpeg -------------------------------------------------------------------------------- /docs/img/get_sparkapplication_id.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/get_sparkapplication_id.jpeg -------------------------------------------------------------------------------- /docs/img/install_spark_history.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/install_spark_history.jpeg -------------------------------------------------------------------------------- /docs/img/install_spark_operator.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/install_spark_operator.jpeg -------------------------------------------------------------------------------- /docs/img/jindofs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/jindofs.png -------------------------------------------------------------------------------- /docs/img/localhost_spark_ui.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/localhost_spark_ui.jpeg -------------------------------------------------------------------------------- /docs/img/mount_disk.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/mount_disk.jpeg -------------------------------------------------------------------------------- /docs/img/port-forward_svc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/port-forward_svc.jpg 
-------------------------------------------------------------------------------- /docs/img/spark_vs_alluxio.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/spark_vs_alluxio.jpg
-------------------------------------------------------------------------------- /docs/img/sparkapplication_svc.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/sparkapplication_svc.jpg
-------------------------------------------------------------------------------- /docs/img/tpcds_per_query.jpeg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/img/tpcds_per_query.jpeg
-------------------------------------------------------------------------------- /docs/performance/emr-spark.md: --------------------------------------------------------------------------------
1 | EMR Spark is a big-data processing solution that runs on the Alibaba Cloud platform. On top of open-source Apache Spark it adds extensive improvements in performance, functionality and stability, along with deep integration with Alibaba Cloud's base services. Its core technologies include:
2 | 
3 | - SparkSQL transactions, with support for update and delete statements.
4 | - SQL constraints such as PK, FK and NOT NULL, applied during SQL optimization.
5 | - Relational Cache: materialized views for SparkSQL.
6 | - A multi-tenant, highly available SparkSQL JDBC Server.
7 | - A set of SparkSQL performance optimizations:
8 |   - Runtime Filter support.
9 |   - Adaptive Execution, which adjusts job behavior at runtime.
10 |   - Further improved CBO join reordering, including a genetic algorithm.
11 |   - An optimized shuffle pipeline with asynchronous, non-blocking shuffle IO.
-------------------------------------------------------------------------------- /docs/performance/jindofs.md: --------------------------------------------------------------------------------
1 | Separating compute from storage has become a trend in cloud computing. Before this separation, the coupled compute-storage architecture was the norm, but it has inherent problems. When scaling a cluster out, compute and storage capacity rarely need to grow in proportion: a user may want to expand only compute or only storage, which the traditional coupled architecture cannot accommodate. When scaling in, manual intervention may be required, after which data must be re-synchronized across nodes; with multiple replicas to synchronize, this can lead to data loss. A compute-storage separated architecture solves these problems and lets users reason only about the cluster's compute capacity, but it also introduces network latency on data reads and writes.
2 | 
3 | JindoFS is a cloud-native file system that combines OSS with local storage. As the new-generation storage system of the E-MapReduce product, it provides efficient and reliable storage for the compute layers above it.
4 | 
5 | JindoFS offers two storage modes: block mode (Block) and cache mode (Cache).
6 | 
7 | JindoFS uses a heterogeneous multi-replica mechanism across local storage and OSS. Its Storage Service provides the data storage capability: OSS serves as the storage backend to guarantee high data reliability, while local storage holds redundant replicas that accelerate reads. In addition, JindoFS metadata is managed by the local Namespace Service, which keeps metadata operations fast (comparable to HDFS metadata performance).
8 | 
9 | ![jindofs](../img/jindofs.png)
-------------------------------------------------------------------------------- /docs/performance/oss.md: --------------------------------------------------------------------------------
1 | When running Spark jobs, user data is often stored on OSS. For scenarios with many executors and small files, the following settings can improve performance:
2 | ```yaml
3 | fs.oss.paging.maximum: 1000
4 | fs.oss.multipart.download.threads: 32
5 | fs.oss.max.total.tasks: 256
6 | fs.oss.connection.maximum: 2048
7 | ```
8 | For more options, see [hadoop-aliyun](https://hadoop.apache.org/docs/stable/hadoop-aliyun/tools/hadoop-aliyun/index.html).
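These are Hadoop FileSystem options, so with a plain spark-submit they can be passed through Spark's `spark.hadoop.` configuration prefix. A sketch (the application class and jar are hypothetical):

```shell
spark-submit \
  --conf spark.hadoop.fs.oss.paging.maximum=1000 \
  --conf spark.hadoop.fs.oss.multipart.download.threads=32 \
  --conf spark.hadoop.fs.oss.max.total.tasks=256 \
  --conf spark.hadoop.fs.oss.connection.maximum=2048 \
  --class com.example.MyApp \
  my-app.jar
```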
-------------------------------------------------------------------------------- /docs/performance/remote-shuffle-service.md: --------------------------------------------------------------------------------
1 | TODO
-------------------------------------------------------------------------------- /docs/performance/serverless-spark/index.md: --------------------------------------------------------------------------------
1 | # Serverless Spark
2 | 
3 | ## What is Serverless Spark?
4 | 
5 | [Elastic Container Instance (ECI)](https://help.aliyun.com/ack/serverless-kubernetes/user-guide/eci-pod/) provides a basic container pod runtime for Kubernetes. By dynamically scheduling the driver and executor pods of a Spark job onto ECI, Spark jobs can be executed in a serverless fashion. Each container instance is fully and strongly isolated underneath by a lightweight virtualization security sandbox, so instances do not affect one another.
6 | 
7 | ![Serverless Spark](serverless-spark.jpeg)
8 | 
9 | Running Spark jobs on ECI has the following advantages:
10 | 
11 | - **Huge capacity**: a cluster can obtain up to 20,000 pods of capacity without extra configuration, and there is no need to plan capacity in advance.
12 | - **Elasticity in seconds**: thousands of pods can be created in a very short time to deliver large amounts of compute, without pod creation latency degrading during business peaks.
13 | - **Cost savings**: ECI pods are created on demand and billed by usage, so idle resources are not wasted; spot instances and combinations of multiple instance types are supported to reduce cost further.
14 | 
15 | ## How do I run Spark jobs on ECI?
16 | 
17 | ECI virtual nodes carry a specific label and taint, so running a Spark job on ECI only requires adding the following configuration to the driver or executor pods. First, add `type: virtual-kubelet` to nodeSelector so that the pods are scheduled onto ECI. Second, add an extra toleration so that the pods tolerate the ECI taint:
18 | 
19 | ```yaml
20 | nodeSelector:
21 |   type: virtual-kubelet
22 | 
23 | tolerations:
24 |   - key: virtual-kubelet.io/provider
25 |     operator: Equal
26 |     value: alibabacloud
27 |     effect: NoSchedule
28 | ```
29 | A complete SparkApplication that applies this configuration to both the driver and the executors looks like this:
30 | ```yaml
31 | apiVersion: sparkoperator.k8s.io/v1beta2
32 | kind: SparkApplication
33 | metadata:
34 |   name: spark-pi
35 | spec:
36 |   type: Scala
37 |   mode: cluster
38 |   image: apache/spark:3.5.0
39 |   mainClass: org.apache.spark.examples.SparkPi
40 |   mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.0.jar
41 |   sparkVersion: 3.5.0
42 |   driver:
43 |     cores: 1
44 |     coreLimit: 1200m
45 |     memory: 512m
46 |     serviceAccount: spark
47 |     annotations:
48 |       k8s.aliyun.com/eci-use-specs: "2-4Gi"
49 |     nodeSelector:
50 |       type: virtual-kubelet
51 |     tolerations:
52 |       - key: virtual-kubelet.io/provider
53 |         operator: Equal
54 |         value: alibabacloud
55 |         effect: NoSchedule
56 |   executor:
57 |     instances: 2
58 |     cores: 2
59 |     memory: 4g
60 |     annotations:
61 |       k8s.aliyun.com/eci-use-specs: "2-4Gi"
62 |     nodeSelector:
63 |       type: virtual-kubelet
64 |     tolerations:
65 |       - key: virtual-kubelet.io/provider
66 |         operator: Equal
67 |         value: alibabacloud
68 |         effect: NoSchedule
69 | ```
70 | 
71 | In the example above, the annotation `k8s.aliyun.com/eci-use-specs: "2-4Gi"` specifies a 2 vCPU + 4 GiB memory specification for the ECI pods. Note that not every combination of vCPU and memory is supported; for details, see [Create an ECI pod by specifying vCPU and memory](https://help.aliyun.com/ack/serverless-kubernetes/user-guide/specify-cpu-and-memory-specifications-to-create-an-elastic-container-instance/).
72 | 
73 | There are several ways to specify the ECI pod specification, for example by vCPU and memory, or by ECS instance type.
74 | 
75 | 
76 | 
77 | ## Accelerating image pulls with ImageCache
78 | 
79 | ECI supports an image cache feature so that Kubernetes users can accelerate image pulls and speed up pod creation. Concretely, ECI provides a cluster-scoped CRD resource named ImageCache; for detailed usage, see [Use image caches to accelerate pod creation](https://help.aliyun.com/ack/serverless-kubernetes/user-guide/use-image-caches-to-accelerate-the-creation-of-pods/).
80 | 
81 | Taking the Spark job above as an example (it uses the community image `apache/spark:3.5.0`), let's compare the image pull speed before and after ImageCache is used. Before using it, inspect the events of the driver pod:
82 | 
83 | ```shell
84 | $ kubectl describe pod spark-pi-driver
85 | ...
86 | Events:
87 |   Type     Reason                 Age   From               Message
88 |   ----     ------                 ----  ----               -------
89 |   Normal   Scheduled              24m   default-scheduler  Successfully assigned spark-operator/spark-pi-driver to virtual-kubelet-cn-beijing-i
90 |   Normal   UserInstanceTypeSpec   24m   EciService         [eci.containergroup]The user-specified instanceType for current eci instance is 2.0-4.0Gi
91 |   Warning  ImageCacheMissed       24m   EciService         [eci.imagecache]Missed image cache.
92 |   Normal   ImageCacheAutoCreated  24m   EciService         [eci.imagecache]Image cache imc-2ze5hdcnngenmwc1jmwf is auto created
93 |   Normal   Pulling                24m   kubelet            Pulling image "apache/spark:3.5.0"
94 |   Normal   Pulled                 23m   kubelet            Successfully pulled image "apache/spark:3.5.0" in 1m41.289s (1m41.289s including waiting)
95 |   Normal   Created                23m   kubelet            Created container spark-kubernetes-driver
96 |   Normal   Started                23m   kubelet            Started container spark-kubernetes-driver
97 | ```
98 | 
99 | The events show that the ImageCache was missed and a new ImageCache with ID `imc-2ze5hdcnngenmwc1jmwf` was auto-created. Next, add the following annotation to the driver and executors to explicitly reference this image cache:
100 | 
101 | ```yaml
102 | annotations:
103 |   k8s.aliyun.com/eci-image-snapshot-id: imc-2ze5hdcnngenmwc1jmwf
104 | ```
105 | 
106 | Run the job again and observe the image pull speed of the driver pod:
107 | 
108 | ```shell
109 | $ kubectl describe pod spark-pi-driver
110 | ...
111 | Events:
112 |   Type    Reason                   Age   From               Message
113 |   ----    ------                   ----  ----               -------
114 |   Normal  Scheduled                13s   default-scheduler  Successfully assigned spark-operator/spark-pi-driver to virtual-kubelet-cn-beijing-i
115 |   Normal  UserInstanceTypeSpec     24s   EciService         [eci.containergroup]The user-specified instanceType for current eci instance is 2.0-4.0Gi
116 |   Normal  SuccessfulHitImageCache  23s   EciService         [eci.imagecache]Successfully hit image cache imc-2ze5hdcnngenmwc1jmwf, eci will be scheduled with this image cache.
117 |   Normal  Pulled                   4s    kubelet            Container image "apache/spark:3.5.0" already present on machine
118 |   Normal  Created                  4s    kubelet            Created container spark-kubernetes-driver
119 |   Normal  Started                  3s    kubelet            Started container spark-kubernetes-driver
120 | ```
121 | 
122 | The driver pod events show that the image cache was hit, so the image did not need to be pulled again.
123 | 
124 | In practice, you can add the following annotation to the driver/executor pods to create and match image caches automatically, without specifying an image cache ID explicitly:
125 | 
126 | ```yaml
127 | annotations:
128 |   k8s.aliyun.com/eci-image-cache: "true"
129 | ```
130 | 
-------------------------------------------------------------------------------- /docs/performance/serverless-spark/serverless-spark.jpeg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/docs/performance/serverless-spark/serverless-spark.jpeg
-------------------------------------------------------------------------------- /docs/performance/spark-operator.md: --------------------------------------------------------------------------------
1 | Spark on Kubernetes Operator lets users run Spark applications on Kubernetes in the same general way as other workloads. To make Spark run better on Kubernetes, we have also made several optimizations to the Spark Operator:
2 | 
3 | - Compared with the blocking, serial scheduling of the community Spark Operator, the ACK version supports non-blocking parallel scheduling with a throughput of up to 350 pods/s, so Spark jobs are scheduled onto nodes quickly.
4 | - Enhanced support in the Spark kernel for Kubernetes-native capabilities such as tolerations, labels, and node names.
5 | - Dynamic allocation support in the Spark kernel, improving resource utilization by up to 30%.
6 | - Support for running Spark jobs with a custom scheduler.
-------------------------------------------------------------------------------- /docs/quickstart/benchmark_code.md: --------------------------------------------------------------------------------
1 | 1. [Environment setup](benchmark_env.md)
2 | 2. [Benchmark code development](benchmark_code.md)
3 | 3. [Running the benchmark on ACK](benchmark_steps.md)
4 | 4. [Analyzing the results](benchmark_result.md)
5 | 5. 
[Troubleshooting](debugging_guide.md)
6 | 
7 | *Note: to make testing easier, a prebuilt image (registry.cn-beijing.aliyuncs.com/yukong/ack-spark-benchmark:1.0.0) is provided and can be used directly.*
8 | 
9 | ### Preparation
10 | 
11 | The benchmark code depends on two Databricks tools: the spark-sql-perf TPC-DS benchmark package, and the test data generator tpcds-kit.
12 | 
13 | #### 1) Package the tpcds dependency jar
14 | 
15 | Databricks tpcds benchmark: https://github.com/databricks/spark-sql-perf
16 | 
17 | ```shell
18 | git clone https://github.com/databricks/spark-sql-perf.git
19 | sbt package
20 | ```
21 | 
22 | This produces the jar spark-sql-perf_2.11-0.5.1-SNAPSHOT, which the benchmark project depends on.
23 | 
24 | 
25 | 
26 | #### 2) Build tpcds-kit
27 | 
28 | The standard TPC-DS test data generation tool: https://github.com/databricks/tpcds-kit
29 | 
30 | ```shell
31 | git clone https://github.com/davies/tpcds-kit.git
32 | yum install gcc gcc-c++ bison flex cmake ncurses-devel
33 | cd tpcds-kit/tools
34 | cp Makefile.suite Makefile # copy Makefile.suite to Makefile
35 | make
36 | # verify
37 | ./dsqgen --help
38 | ```
39 | 
40 | The build produces binary executables; this experiment mainly relies on two of them: dsdgen (data generation) and dsqgen (query generation).
41 | 
42 | ### Writing the code
43 | 
44 | #### 1) Data generation
45 | 
46 | DataGeneration.scala
47 | 
48 | ```scala
49 | package com.aliyun.spark.benchmark.tpcds
50 | 
51 | import com.databricks.spark.sql.perf.tpcds.TPCDSTables
52 | import org.apache.log4j.{Level, LogManager}
53 | import org.apache.spark.sql.SparkSession
54 | 
55 | import scala.util.Try
56 | 
57 | object DataGeneration {
58 |   def main(args: Array[String]) {
59 |     val tpcdsDataDir = args(0)
60 |     val dsdgenDir = args(1)
61 |     val format = Try(args(2).toString).getOrElse("parquet")
62 |     val scaleFactor = Try(args(3).toString).getOrElse("1")
63 |     val genPartitions = Try(args(4).toInt).getOrElse(100)
64 |     val partitionTables = Try(args(5).toBoolean).getOrElse(false)
65 |     val clusterByPartitionColumns = Try(args(6).toBoolean).getOrElse(false)
66 |     val onlyWarn = Try(args(7).toBoolean).getOrElse(false)
67 | 
68 |     println(s"DATA DIR is $tpcdsDataDir")
69 |     println(s"Tools dsdgen executable located in $dsdgenDir")
70 |     println(s"Scale factor is $scaleFactor GB")
71 | 
72 |     val spark = SparkSession
73 |       .builder
74 |       .appName(s"TPCDS Generate Data $scaleFactor GB")
75 |       .getOrCreate()
76 | 
77 |     if (onlyWarn) {
78 |       println(s"Only WARN")
79 |       LogManager.getLogger("org").setLevel(Level.WARN)
80 |     }
81 | 
82 |     val tables = new TPCDSTables(spark.sqlContext,
83 |       dsdgenDir = dsdgenDir,
84 |       scaleFactor = scaleFactor,
85 |       useDoubleForDecimal = false,
86 |       useStringForDate = false)
87 | 
88 |     println(s"Generating TPCDS data")
89 | 
90 |     tables.genData(
91 |       location = tpcdsDataDir,
92 |       format = format,
93 |       overwrite = true, // overwrite the data that is already there
94 |       partitionTables = partitionTables, // create the partitioned fact tables
95 |       clusterByPartitionColumns = clusterByPartitionColumns, // shuffle to get partitions coalesced into single files.
96 |       filterOutNullPartitionValues = false, // true to filter out the partition with NULL key value
97 |       tableFilter = "", // "" means generate all tables
98 |       numPartitions = genPartitions) // how many dsdgen partitions to run - number of input tasks.
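    // Added note: TPCDSTables shells out to the dsdgen binary on the executors,
    // so dsdgen must exist at `dsdgenDir` inside the executor image (with the
    // Dockerfile shown later, that is presumably /tmp/tpcds-kit/tools).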
99 | 100 | println(s"Data generated at $tpcdsDataDir") 101 | 102 | spark.stop() 103 | } 104 | } 105 | ``` 106 | 107 | #### 2)查询数据 108 | 109 | BenchmarkSQL.scala 110 | 111 | ```scala 112 | package com.aliyun.spark.benchmark.tpcds 113 | 114 | import com.databricks.spark.sql.perf.tpcds.{TPCDS, TPCDSTables} 115 | import org.apache.spark.sql.SparkSession 116 | import org.apache.spark.sql.functions._ 117 | import org.apache.spark.sql.functions.col 118 | import org.apache.log4j.{Level, LogManager} 119 | import scala.util.Try 120 | 121 | object BenchmarkSQL { 122 | def main(args: Array[String]) { 123 | val tpcdsDataDir = args(0) 124 | val resultLocation = args(1) 125 | val dsdgenDir = args(2) 126 | val format = Try(args(3).toString).getOrElse("parquet") 127 | val scaleFactor = Try(args(4).toString).getOrElse("1") 128 | val iterations = args(5).toInt 129 | val optimizeQueries = Try(args(6).toBoolean).getOrElse(false) 130 | val filterQueries = Try(args(7).toString).getOrElse("") 131 | val onlyWarn = Try(args(8).toBoolean).getOrElse(false) 132 | 133 | val databaseName = "tpcds_db" 134 | val timeout = 24*60*60 135 | 136 | println(s"DATA DIR is $tpcdsDataDir") 137 | 138 | val spark = SparkSession 139 | .builder 140 | .appName(s"TPCDS SQL Benchmark $scaleFactor GB") 141 | .getOrCreate() 142 | 143 | if (onlyWarn) { 144 | println(s"Only WARN") 145 | LogManager.getLogger("org").setLevel(Level.WARN) 146 | } 147 | 148 | val tables = new TPCDSTables(spark.sqlContext, 149 | dsdgenDir = dsdgenDir, 150 | scaleFactor = scaleFactor, 151 | useDoubleForDecimal = false, 152 | useStringForDate = false) 153 | 154 | if (optimizeQueries) { 155 | Try { 156 | spark.sql(s"create database $databaseName") 157 | } 158 | tables.createExternalTables(tpcdsDataDir, format, databaseName, overwrite = true, discoverPartitions = true) 159 | tables.analyzeTables(databaseName, analyzeColumns = true) 160 | spark.conf.set("spark.sql.cbo.enabled", "true") 161 | } else { 162 | tables.createTemporaryTables(tpcdsDataDir, format) 163 | } 164 | 165 | val tpcds = new TPCDS(spark.sqlContext) 166 | 167 | var query_filter : Seq[String] = Seq() 168 | if (!filterQueries.isEmpty) { 169 | println(s"Running only queries: $filterQueries") 170 | query_filter = filterQueries.split(",").toSeq 171 | } 172 | 173 | val filtered_queries = query_filter match { 174 | case Seq() => tpcds.tpcds2_4Queries 175 | case _ => tpcds.tpcds2_4Queries.filter(q => query_filter.contains(q.name)) 176 | } 177 | 178 | // Start experiment 179 | val experiment = tpcds.runExperiment( 180 | filtered_queries, 181 | iterations = iterations, 182 | resultLocation = resultLocation, 183 | forkThread = true) 184 | 185 | experiment.waitForFinish(timeout) 186 | 187 | // Collect general results 188 | val resultPath = experiment.resultPath 189 | println(s"Reading result at $resultPath") 190 | val specificResultTable = spark.read.json(resultPath) 191 | specificResultTable.show() 192 | 193 | // Summarize results 194 | val result = specificResultTable 195 | .withColumn("result", explode(col("results"))) 196 | .withColumn("executionSeconds", col("result.executionTime")/1000) 197 | .withColumn("queryName", col("result.name")) 198 | result.select("iteration", "queryName", "executionSeconds").show() 199 | println(s"Final results at $resultPath") 200 | 201 | val aggResults = result.groupBy("queryName").agg( 202 | callUDF("percentile", col("executionSeconds").cast("long"), lit(0.5)).as('medianRuntimeSeconds), 203 | callUDF("min", col("executionSeconds").cast("long")).as('minRuntimeSeconds), 204 | 
      callUDF("max", col("executionSeconds").cast("long")).as('maxRuntimeSeconds)
205 |     ).orderBy(col("queryName"))
206 |     aggResults.repartition(1).write.csv(s"$resultPath/summary.csv")
207 |     aggResults.show(105)
208 | 
209 |     spark.stop()
210 |   }
211 | }
212 | ```
213 | 
214 | ### Building the image
215 | 
216 | After the benchmark code is compiled into a jar, it can be packaged into an image together with the other jars it depends on. The Dockerfile is as follows:
217 | 
218 | ```dockerfile
219 | FROM registry.cn-hangzhou.aliyuncs.com/acs/spark:ack-2.4.5-f757ab6
220 | RUN mkdir -p /opt/spark/jars
221 | RUN mkdir -p /tmp/tpcds-kit
222 | COPY ./target/scala-2.11/spark-tpcds-assembly-0.1.jar /opt/spark/jars/
223 | COPY ./lib/*.jar /opt/spark/jars/
224 | COPY ./tpcds-kit/tools.tar.gz /tmp/tpcds-kit/
225 | RUN cd /tmp/tpcds-kit/ && tar -xzvf tools.tar.gz
226 | ```
227 | 
-------------------------------------------------------------------------------- /docs/quickstart/benchmark_env.md: --------------------------------------------------------------------------------
1 | 1. [Environment setup](benchmark_env.md)
2 | 2. [Benchmark code development](benchmark_code.md)
3 | 3. [Running the benchmark on ACK](benchmark_steps.md)
4 | 4. [Analyzing the results](benchmark_result.md)
5 | 5. [Troubleshooting](debugging_guide.md)
6 | 
7 | ## Environment preparation
8 | 
9 | ### 1) ACK cluster
10 | 
11 | Create an ACK standard dedicated cluster with 20 worker nodes of the network-enhanced big-data instance type ecs.d1ne.6xlarge.
12 | 
13 | ![create_ack_cluster.jpeg](../img/create_ack_cluster.jpeg)
14 | 
15 | 
16 | 
17 | Each ecs.d1ne.6xlarge comes with 12 x 5 TB HDD data disks, which must be partitioned, formatted and mounted before use; for the steps, see [Partition and format a Linux data disk](https://help.aliyun.com/document_detail/34377.html?spm=a2c4g.11174283.6.813.4be652feB9omRD#title-f8r-9od-yn9). After formatting and mounting, run df -h; you should see mounts like the following. The 12 paths under /mnt are used later by Alluxio.
18 | 
19 | ![mount_disk.jpeg](../img/mount_disk.jpeg)
20 | 
21 | 
22 | 
23 | ### 2) OSS
24 | 
25 | Activate the OSS service in the Alibaba Cloud OSS console and create a bucket to store the data generated by TPC-DS, the benchmark results, and the logs produced during the run. In this experiment the bucket is named cloudnativeai.
26 | 
27 | 
28 | 
29 | ### 3) Install ack-spark-operator
30 | 
31 | In the ACK console, find ack-spark-operator under **Marketplace** - **App Catalog** and click the **Create** button on the right to install the Spark operator.
32 | 
33 | ![install_spark_operator.jpeg](../img/install_spark_operator.jpeg)
34 | 
35 | After the installation completes, verify it with:
36 | 
37 | ```shell
38 | kubectl get deployment ack-spark-operator -n spark-operator
39 | ```
40 | 
41 | 
42 | 
43 | ### 4) Install ack-spark-history-server
44 | 
45 | ![install_spark_history.jpeg](../img/install_spark_history.jpeg)
46 | 
47 | ack-spark-history-server records the logs and events produced while Spark runs tasks and provides a UI that helps with troubleshooting.
48 | 
49 | Find ack-spark-history-server under **Marketplace** - **App Catalog**, configure the OSS settings on the parameters page (used to store Spark history data), and click the **Create** button on the right to install the Spark history server.
50 | 
51 | ```yaml
52 | oss:
53 |   enableOSS: false
54 |   # Please input your accessKeyId
55 |   alibabaCloudAccessKeyId: ""
56 |   # Please input your accessKeySecret
57 |   alibabaCloudAccessKeySecret: ""
58 |   # oss bucket endpoint such as oss-cn-beijing.aliyuncs.com
59 |   alibabaCloudOSSEndpoint: ""
60 |   # oss file path such as oss://bucket-name/path
61 |   eventsDir: "oss://cloudnativeai/spark/spark-events"
62 | ```
63 | 
64 | After the installation completes, verify it with:
65 | 
66 | ```shell
67 | kubectl get service ack-spark-history-server -n {YOUR-NAMESPACE}
68 | ```
69 | 
70 | 
71 | 
72 | ### 5) Install Alluxio
73 | 
74 | Alluxio is installed into ACK with helm. First download alluxio:
75 | 
76 | ```shell
77 | wget http://kubeflow.oss-cn-beijing.aliyuncs.com/alluxio-0.6.8.tgz
78 | tar -xvf alluxio-0.6.8.tgz
79 | ```
80 | 
81 | Then create a config.yaml configuration file next to the alluxio directory; the key settings are as follows:
82 | 
83 | ```yaml
84 | # Site properties for all the components
85 | properties:
86 |   fs.oss.accessKeyId: YOUR-ACCESS-KEY-ID
87 |   fs.oss.accessKeySecret: YOUR-ACCESS-KEY-SECRET
88 |   fs.oss.endpoint: oss-cn-beijing-internal.aliyuncs.com
89 |   alluxio.master.mount.table.root.ufs: oss://cloudnativeai/
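  # Added note: alluxio.master.mount.table.root.ufs mounts the OSS bucket as the
  # root of the Alluxio namespace, so a path like alluxio:///datasets resolves
  # to oss://cloudnativeai/datasets.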
90 |   alluxio.master.persistence.blacklist: .staging,_temporary
91 |   alluxio.security.stale.channel.purge.interval: 365d
92 |   alluxio.user.metrics.collection.enabled: 'true'
93 |   alluxio.user.short.circuit.enabled: 'true'
94 |   alluxio.user.file.write.tier.default: 1
95 |   alluxio.user.block.size.bytes.default: 64MB #default 64MB
96 |   alluxio.user.file.writetype.default: CACHE_THROUGH
97 |   alluxio.user.file.metadata.load.type: ONCE
98 |   alluxio.user.file.readtype.default: CACHE
99 |   #alluxio.worker.allocator.class: alluxio.worker.block.allocator.MaxFreeAllocator
100 |   alluxio.worker.allocator.class: alluxio.worker.block.allocator.RoundRobinAllocator
101 |   alluxio.worker.file.buffer.size: 128MB
102 |   alluxio.worker.evictor.class: alluxio.worker.block.evictor.LRUEvictor
103 |   alluxio.job.master.client.threads: 5000
104 |   alluxio.job.worker.threadpool.size: 300
105 | ```
106 | 
107 | Replace the OSS accessKey, endpoint and related settings with your own.
108 | 
109 | ```yaml
110 | tieredstore:
111 |   levels:
112 |   - level: 0
113 |     alias: HDD
114 |     mediumtype: HDD-0,HDD-1,HDD-2,HDD-3,HDD-4,HDD-5,HDD-6,HDD-7,HDD-8,HDD-9,HDD-10,HDD-11
115 |     path: /mnt/disk1,/mnt/disk2,/mnt/disk3,/mnt/disk4,/mnt/disk5,/mnt/disk6,/mnt/disk7,/mnt/disk8,/mnt/disk9,/mnt/disk10,/mnt/disk11,/mnt/disk12
116 |     type: hostPath
117 |     quota: 1024G,1024G,1024G,1024G,1024G,1024G,1024G,1024G,1024G,1024G,1024G,1024G
118 |     high: 0.95
119 |     low: 0.7
120 | ```
121 | 
122 | The mediumtype and path entries in tieredstore correspond to the data disks mounted on the ACK worker nodes.
123 | 
124 | For the complete configuration file, see [config.yaml](../../kubernetes/alluxio/config.yaml). Download config.yaml, adjust the settings, label the worker nodes of the ACK cluster with "alluxio=true" (see the example below), and then install alluxio with the helm command.
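One way to apply the label (the node name is a placeholder):

```shell
kubectl label nodes <your-worker-node> alluxio=true
```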
125 | 
126 | Install alluxio:
127 | 
128 | ```shell
129 | helm install -f config.yaml -n alluxio alluxio alluxio
130 | ```
131 | 
132 | After the installation completes, check whether it succeeded:
133 | 
134 | ```shell
135 | kubectl get pod -n alluxio
136 | ```
137 | 
138 | Then enter the alluxio master and check whether the data disks are mounted:
139 | 
140 | ```shell
141 | kubectl exec -it alluxio-master-0 -n alluxio -- /bin/bash
142 | 
143 | ./bin/alluxio fsadmin report capacity
144 | ```
145 | 
146 | If you can see the mounted data disks on every worker node, Alluxio is installed and configured successfully.
147 | 
148 | ![alluxio_capacity.png](../img/alluxio_capacity.png)
-------------------------------------------------------------------------------- /docs/quickstart/benchmark_result.md: --------------------------------------------------------------------------------
1 | 1. [Environment setup](benchmark_env.md)
2 | 2. [Benchmark code development](benchmark_code.md)
3 | 3. [Running the benchmark on ACK](benchmark_steps.md)
4 | 4. [Analyzing the results](benchmark_result.md)
5 | 5. [Troubleshooting](debugging_guide.md)
6 | 
7 | ## Benchmark environment
8 | 
9 | ### Hardware configuration
10 | 
11 | - **ACK cluster**
12 | 
13 | | Cluster type | ACK standard dedicated cluster |
14 | | -------------- | ---------------------------------------------------- |
15 | | ECS instances | Instance type: ecs.d1ne.6xlarge<br>Aliyun Linux 2.1903<br>CPU: 24 cores, memory: 96 GB<br>Data disks: 5500 GB HDD x 12 |
16 | | Number of worker nodes | 20 |
17 | 
18 | 
19 | 
20 | ### Software configuration
21 | 
22 | - **Software versions**
23 | 
24 | spark version: 2.4.5
25 | 
26 | alluxio version: 2.3.0
27 | 
28 | - **Spark configuration**
29 | 
30 | | spark.driver.cores | 5 |
31 | | -------------------------- | ----- |
32 | | spark.driver.memory (MB) | 20480 |
33 | | spark.executor.cores | 7 |
34 | | spark.executor.memory (MB) | 20480 |
35 | | spark.executor.instances | 20 |
36 | 
37 | ## Benchmark results
38 | 
39 | ### Spark with and without Alluxio
40 | 
41 | ![tpcds_per_query.jpeg](../img/tpcds_per_query.jpeg)
42 | 
43 | Total query runtime:
44 | 
45 | 
46 | 
47 | | | total (min) |
48 | | ---------------- | ---------- |
49 | | Spark with OSS | 180 |
50 | | Spark with Alluxio Cold | 145 |
51 | | Spark with Alluxio Warm | 137 |
52 | 
53 | 
54 | 
55 | ![spark_vs_alluxio.jpg](../img/spark_vs_alluxio.jpg)
-------------------------------------------------------------------------------- /docs/quickstart/debugging_guide.md: --------------------------------------------------------------------------------
1 | 1. [Environment setup](benchmark_env.md)
2 | 2. [Benchmark code development](benchmark_code.md)
3 | 3. [Running the benchmark on ACK](benchmark_steps.md)
4 | 4. [Analyzing the results](benchmark_result.md)
5 | 5. [Troubleshooting](debugging_guide.md)
6 | 
7 | During testing, misconfigured parameters (for example, too little memory) easily lead to OOM and similar errors. There are two ways to investigate such problems, illustrated here with the tpcds-query-runner-with-alluxio job.
8 | 
9 | ## spark-ui
10 | 
11 | While a job is running, you can watch the execution of each SQL query in real time through the Spark UI, as follows:
12 | 
13 | Run kubectl get services and you will see output like the following:
14 | 
15 | ![sparkapplication_svc.jpg](../img/sparkapplication_svc.jpg)
16 | 
17 | Here tpcds-benchmark-sql-ui-svc is the service corresponding to the Spark UI.
18 | 
19 | Run the following command:
20 | 
21 | ```shell
22 | kubectl port-forward svc/tpcds-benchmark-sql-ui-svc 4040:4040
23 | ```
24 | 
25 | You will see:
26 | 
27 | ![port-forward_svc.jpg](../img/port-forward_svc.jpg)
28 | 
29 | Then open localhost:4040 in your browser to see the job execution details in the Spark UI.
30 | 
31 | ![localhost_spark_ui.jpeg](../img/localhost_spark_ui.jpeg)
32 | 
33 | 
34 | 
35 | ## ack-spark-history-server
36 | 
37 | To inspect historical data after a job has finished, use this approach; the steps are as follows:
38 | 
39 | Run the following command to get the ID of the SparkApplication job:
40 | 
41 | ```shell
42 | kubectl get sparkapplication tpcds-benchmark-sql -o yaml
43 | ```
44 | 
45 | ![get_sparkapplication_id.jpeg](../img/get_sparkapplication_id.jpeg)
46 | 
47 | The sparkApplicationId can be read from the figure above.
48 | 
49 | Then get the endpoint of ack-spark-history-server:
50 | 
51 | ```shell
52 | kubectl get service ack-spark-history-server
53 | ```
54 | 
55 | ![get_spark_history_svc.jpeg](../img/get_spark_history_svc.jpeg)
56 | 
57 | Then enter the EXTERNAL-IP and port shown above in your browser to see the history of all Spark jobs, and locate the relevant record via the sparkApplicationId obtained in the previous step.
-------------------------------------------------------------------------------- /kubernetes/emr/jar/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/kubernetes/emr/jar/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar
-------------------------------------------------------------------------------- /kubernetes/emr/tpcds-benchmark-with-emrspark-ess-jindofs.yaml: --------------------------------------------------------------------------------
1 | apiVersion: "sparkoperator.k8s.io/v1beta2"
2 | kind: SparkApplication
3 | metadata:
4 |   name: tpcds-benchmark-emrspark-ess-jindofs-1t
5 |   namespace: default
6 | spec:
7 |   type: Scala
8 |   mode: cluster
9 |   image: registry.cn-beijing.aliyuncs.com/zf-spark/spark-2.4.5:for-tpc-ds-2
10 |   imagePullPolicy: Always
11 |   mainClass: com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone
12 |   mainApplicationFile: 
"jfs://default/jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar" 13 | arguments: 14 | - "--dataset_location" 15 | - "jfs://default/datasets/" 16 | - "--output_location" 17 | - "jfs://default/results-1t/" 18 | - "--iterations" 19 | - "1" 20 | - "--shuffle_partitions" 21 | - "1000" 22 | - "--scale_factor" 23 | - "1000" 24 | - "--regenerate_dataset" 25 | - "false" 26 | - "--regenerate_metadata" 27 | - "false" 28 | - "--only_generate_data_and_meta" 29 | - "false" 30 | - "--db_suffix" 31 | - "cluster_180405" 32 | - "--query_exclude_list" 33 | - "q23a,q23b,q24a,q24b,q77" 34 | - "--format" 35 | - "parquet" 36 | sparkVersion: 2.4.5 37 | restartPolicy: 38 | type: Never 39 | sparkConf: 40 | spark.driver.extraLibraryPath: /opt/spark/lib/native 41 | spark.executor.extraLibraryPath: /opt/spark/lib/native 42 | #CBO 43 | spark.sql.cbo.enabled: "true" 44 | spark.sql.cbo.joinReorder.enabled: "true" 45 | spark.sql.cbo.joinReorder.dp.star.filter: "false" 46 | spark.sql.cbo.joinReorder.dp.threshold: "12" 47 | spark.sql.cbo.outerJoinReorder.enabled: "true" 48 | #RF 49 | spark.sql.dynamic.runtime.filter.enabled: "true" 50 | spark.sql.dynamic.runtime.filter.bbf.enabled: "false" 51 | spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520" 52 | spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120" 53 | spark.sql.emr.fileindex.enabled: "false" 54 | spark.sql.intersect.groupby.placement: "true" 55 | spark.sql.extract.common.conjunct.filter: "true" 56 | spark.sql.infer.filter.from.joincondition: "true" 57 | spark.dynamicAllocation.enabled: "false" 58 | spark.ess.master.host: emr-rss-master.spark-rss 59 | spark.ess.master.port: "9099" 60 | spark.ess.rpc.io.clientThreads: "8" 61 | spark.ess.data.io.clientThreads: "8" 62 | spark.ess.data.io.numConnectionsPerPeer: "8" 63 | spark.ess.data.io.mode: NIO 64 | spark.shuffle.manager: org.apache.spark.shuffle.ess.EssShuffleManager 65 | spark.sql.uncorrelated.scalar.subquery.preexecution.enabled: "true" 66 | driver: 67 | cores: 5 68 | coreLimit: 5000m 69 | memory: 20g 70 | labels: 71 | version: 2.4.5 72 | serviceAccount: spark 73 | env: 74 | - name: TZ 75 | value: "Asia/Shanghai" 76 | - name: CLIENT_NAMESPACE_RPC_ADDRESS 77 | value: jindofs-master.jindofs:8101 78 | - name: CLIENT_STORAGE_RPC_PORT 79 | value: "6101" 80 | - name: CLIENT_STORAGE_RPC_HOST 81 | valueFrom: 82 | fieldRef: 83 | fieldPath: status.hostIP 84 | - name: JFS_CACHE_DATA_CACHE_ENABLE 85 | value: "1" 86 | executor: 87 | cores: 7 88 | coreLimit: 7000m 89 | instances: 20 90 | memory: 20g 91 | memoryOverhead: 6g 92 | labels: 93 | version: 2.4.5 94 | env: 95 | - name: SPARKLOGENV 96 | value: spark-executor 97 | - name: TZ 98 | value: "Asia/Shanghai" 99 | - name: CLIENT_NAMESPACE_RPC_ADDRESS 100 | value: jindofs-master.jindofs:8101 101 | - name: CLIENT_STORAGE_RPC_PORT 102 | value: "6101" 103 | - name: CLIENT_STORAGE_RPC_HOST 104 | valueFrom: 105 | fieldRef: 106 | fieldPath: status.hostIP 107 | - name: JFS_CACHE_DATA_CACHE_ENABLE 108 | value: "1" 109 | -------------------------------------------------------------------------------- /kubernetes/emr/tpcds-benchmark-with-emrspark-ess.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: tpcds-benchmark-emrspark-ess-10t 5 | namespace: default 6 | spec: 7 | type: Scala 8 | mode: cluster 9 | image: registry.cn-beijing.aliyuncs.com/zf-spark/spark-2.4.5:for-tpc-ds-2 10 | imagePullPolicy: Always 11 | mainClass: 
com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone 12 | mainApplicationFile: "oss:///jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar" 13 | arguments: 14 | - "--dataset_location" 15 | - "oss:///datasets/" 16 | - "--output_location" 17 | - "oss:///outputs/ack-pr-10t-emr-with-ess" 18 | - "--iterations" 19 | - "1" 20 | - "--shuffle_partitions" 21 | - "1000" 22 | - "--scale_factor" 23 | - "10000" 24 | - "--regenerate_dataset" 25 | - "false" 26 | - "--regenerate_metadata" 27 | - "false" 28 | - "--only_generate_data_and_meta" 29 | - "false" 30 | - "--format" 31 | - "parquet" 32 | - "--query_exclude_list" 33 | - "q14a,q14b,q67" 34 | sparkVersion: 2.4.5 35 | restartPolicy: 36 | type: Never 37 | hadoopConf: 38 | "fs.oss.impl": "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem" 39 | "fs.oss.endpoint": "" 40 | "fs.oss.accessKeyId": "" 41 | "fs.oss.accessKeySecret": "" 42 | hive.metastore.uris: thrift://service-hive-metastore.default:9083 43 | hive.metastore.client.socket.timeout: 600s 44 | sparkConf: 45 | spark.eventLog.enabled: "true" 46 | spark.eventLog.dir: "oss:///spark/eventlogs" 47 | spark.driver.extraJavaOptions: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 48 | spark.driver.maxResultSize: 40g 49 | spark.executor.extraJavaOptions: "-XX:MaxDirectMemorySize=6g -XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 50 | spark.locality.wait.node: "0" 51 | spark.locality.wait.process: "0" 52 | spark.locality.wait.rack: "0" 53 | spark.locality.wait: "0" 54 | spark.memory.fraction: "0.8" 55 | spark.memory.offHeap.enabled: "false" 56 | spark.memory.offHeap.size: "17179869184" 57 | spark.sql.adaptive.bloomFilterJoin.enabled: "false" 58 | spark.sql.adaptive.enabled: "false" 59 | spark.sql.analyze.column.async.delay: "200" 60 | spark.sql.auto.reused.cte.enabled: "true" 61 | spark.sql.broadcastTimeout: "3600" 62 | spark.sql.columnVector.offheap.enabled: "false" 63 | spark.sql.crossJoin.enabled: "true" 64 | spark.sql.delete.optimizeInSubquery: "true" 65 | spark.sql.dynamic.runtime.filter.bbf.enabled: "false" 66 | spark.sql.dynamic.runtime.filter.enabled: "true" 67 | spark.sql.dynamic.runtime.filter.exact.enabled: "true" 68 | spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520" 69 | spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120" 70 | spark.sql.files.openCostInBytes: "34108864" 71 | spark.sql.inMemoryColumnarStorage.compressed: "true" 72 | spark.sql.join.preferNativeJoin: "false" 73 | spark.sql.native.codecache: "true" 74 | spark.sql.native.codegen.wholeStage: "false" 75 | spark.sql.native.nativewrite: "false" 76 | spark.sql.pkfk.optimize.enable: "true" 77 | spark.sql.pkfk.riJoinElimination: "true" 78 | spark.sql.shuffle.partitions: "1000" 79 | spark.sql.simplifyDecimal.enabled: "true" 80 | spark.sql.sources.parallelPartitionDiscovery.parallelism: "432" 81 | spark.sql.sources.parallelPartitionDiscovery.threshold: "32" 82 | spark.shuffle.reduceLocality.enabled: "false" 83 | spark.shuffle.service.enabled: "true" 84 | spark.dynamicAllocation.enabled: "false" 85 | spark.shuffle.manager: org.apache.spark.shuffle.ess.EssShuffleManager 86 | spark.ess.master.host: emr-rss-master.spark-rss 87 | spark.ess.master.port: "9099" 88 | spark.ess.push.data.buffer.size: 64k 89 | spark.ess.push.data.max.inflight: "2048" 90 | spark.ess.rpc.io.clientThreads: "8" 91 | spark.ess.rpc.io.serverThreads: "8" 92 | spark.ess.data.io.clientThreads: "8" 93 | spark.ess.data.io.numConnectionsPerPeer: "8" 94 | driver: 95 | cores: 15 96 | 
coreLimit: 15000m 97 | memory: 50g 98 | labels: 99 | version: 2.4.5 100 | serviceAccount: spark 101 | env: 102 | - name: TZ 103 | value: "Asia/Shanghai" 104 | executor: 105 | cores: 4 106 | coreLimit: 6000m 107 | instances: 20 108 | memory: 24g 109 | memoryOverhead: 10g 110 | deleteOnTermination: false 111 | labels: 112 | version: 2.4.5 113 | env: 114 | - name: TZ 115 | value: "Asia/Shanghai" -------------------------------------------------------------------------------- /kubernetes/emr/tpcds-benchmark-with-emrspark-jindofs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: tpcds-benchmark-emrspark-ess-1t 5 | namespace: default 6 | spec: 7 | type: Scala 8 | mode: cluster 9 | image: registry.cn-beijing.aliyuncs.com/zf-spark/spark-2.4.5:for-tpc-ds-2 10 | imagePullPolicy: Always 11 | mainClass: com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone 12 | mainApplicationFile: "oss:///jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar" 13 | arguments: 14 | - "--dataset_location" 15 | - "oss:///datasets/" 16 | - "--output_location" 17 | - "oss:///outputs/ack-pr-10t-emr-with-ess" 18 | - "--iterations" 19 | - "1" 20 | - "--shuffle_partitions" 21 | - "1000" 22 | - "--scale_factor" 23 | - "10000" 24 | - "--regenerate_dataset" 25 | - "false" 26 | - "--regenerate_metadata" 27 | - "false" 28 | - "--only_generate_data_and_meta" 29 | - "false" 30 | - "--format" 31 | - "parquet" 32 | - "--query_exclude_list" 33 | - "q14a,q14b,q67" 34 | sparkVersion: 2.4.5 35 | restartPolicy: 36 | type: Never 37 | hadoopConf: 38 | hive.metastore.uris: thrift://service-hive-metastore.default:9083 39 | hive.metastore.client.socket.timeout: 600s 40 | sparkConf: 41 | spark.eventLog.enabled: "true" 42 | spark.eventLog.dir: "oss:///spark/eventlogs" 43 | spark.driver.extraJavaOptions: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 44 | spark.driver.maxResultSize: 40g 45 | spark.executor.extraJavaOptions: "-XX:MaxDirectMemorySize=6g -XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 46 | spark.locality.wait.node: "0" 47 | spark.locality.wait.process: "0" 48 | spark.locality.wait.rack: "0" 49 | spark.locality.wait: "0" 50 | spark.memory.fraction: "0.8" 51 | spark.memory.offHeap.enabled: "false" 52 | spark.memory.offHeap.size: "17179869184" 53 | spark.sql.adaptive.bloomFilterJoin.enabled: "false" 54 | spark.sql.adaptive.enabled: "false" 55 | spark.sql.analyze.column.async.delay: "200" 56 | spark.sql.auto.reused.cte.enabled: "true" 57 | spark.sql.broadcastTimeout: "3600" 58 | spark.sql.columnVector.offheap.enabled: "false" 59 | spark.sql.crossJoin.enabled: "true" 60 | spark.sql.delete.optimizeInSubquery: "true" 61 | spark.sql.dynamic.runtime.filter.bbf.enabled: "false" 62 | spark.sql.dynamic.runtime.filter.enabled: "true" 63 | spark.sql.dynamic.runtime.filter.exact.enabled: "true" 64 | spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520" 65 | spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120" 66 | spark.sql.files.openCostInBytes: "34108864" 67 | spark.sql.inMemoryColumnarStorage.compressed: "true" 68 | spark.sql.join.preferNativeJoin: "false" 69 | spark.sql.native.codecache: "true" 70 | spark.sql.native.codegen.wholeStage: "false" 71 | spark.sql.native.nativewrite: "false" 72 | spark.sql.pkfk.optimize.enable: "true" 73 | spark.sql.pkfk.riJoinElimination: "true" 74 | spark.sql.shuffle.partitions: "1000" 75 | 
spark.sql.simplifyDecimal.enabled: "true" 76 | spark.sql.sources.parallelPartitionDiscovery.parallelism: "432" 77 | spark.sql.sources.parallelPartitionDiscovery.threshold: "32" 78 | spark.shuffle.reduceLocality.enabled: "false" 79 | spark.shuffle.service.enabled: "true" 80 | spark.dynamicAllocation.enabled: "false" 81 | spark.local.dir: /mnt/diskb/spark-data,/mnt/diskc/spark-data,/mnt/diskd/spark-data,/mnt/diske/spark-data,/mnt/diskf/spark-data,/mnt/diskg/spark-data,/mnt/diskh/spark-data,/mnt/diski/spark-data,/mnt/diskj/spark-data,/mnt/diskk/spark-data,/mnt/diskl/spark-data,/mnt/diskm/spark-data 82 | spark.shuffle.manager: org.apache.spark.shuffle.sort.SortShuffleManager 83 | volumes: 84 | - name: diskb 85 | hostPath: 86 | path: /mnt/diskb 87 | type: Directory 88 | - name: diskc 89 | hostPath: 90 | path: /mnt/diskc 91 | type: Directory 92 | - name: diskd 93 | hostPath: 94 | path: /mnt/diskd 95 | type: Directory 96 | - name: diske 97 | hostPath: 98 | path: /mnt/diske 99 | type: Directory 100 | - name: diskf 101 | hostPath: 102 | path: /mnt/diskf 103 | type: Directory 104 | - name: diskg 105 | hostPath: 106 | path: /mnt/diskg 107 | type: Directory 108 | - name: diskh 109 | hostPath: 110 | path: /mnt/diskh 111 | type: Directory 112 | - name: diski 113 | hostPath: 114 | path: /mnt/diski 115 | type: Directory 116 | - name: diskj 117 | hostPath: 118 | path: /mnt/diskj 119 | type: Directory 120 | - name: diskk 121 | hostPath: 122 | path: /mnt/diskk 123 | type: Directory 124 | - name: diskl 125 | hostPath: 126 | path: /mnt/diskl 127 | type: Directory 128 | - name: diskm 129 | hostPath: 130 | path: /mnt/diskm 131 | type: Directory 132 | driver: 133 | cores: 15 134 | coreLimit: 15000m 135 | memory: 50g 136 | labels: 137 | version: 2.4.5 138 | serviceAccount: spark 139 | env: 140 | - name: TZ 141 | value: "Asia/Shanghai" 142 | executor: 143 | cores: 4 144 | coreLimit: 6000m 145 | instances: 20 146 | memory: 24g 147 | memoryOverhead: 10g 148 | deleteOnTermination: false 149 | labels: 150 | version: 2.4.5 151 | env: 152 | - name: TZ 153 | value: "Asia/Shanghai" 154 | volumeMounts: 155 | - mountPath: /mnt/diskb 156 | name: diskb 157 | - mountPath: /mnt/diskc 158 | name: diskc 159 | - mountPath: /mnt/diskd 160 | name: diskd 161 | - mountPath: /mnt/diske 162 | name: diske 163 | - mountPath: /mnt/diskf 164 | name: diskf 165 | - mountPath: /mnt/diskg 166 | name: diskg 167 | - mountPath: /mnt/diskh 168 | name: diskh 169 | - mountPath: /mnt/diski 170 | name: diski 171 | - mountPath: /mnt/diskj 172 | name: diskj 173 | - mountPath: /mnt/diskk 174 | name: diskk 175 | - mountPath: /mnt/diskl 176 | name: diskl 177 | - mountPath: /mnt/diskm 178 | name: diskm -------------------------------------------------------------------------------- /kubernetes/emr/tpcds-benchmark-with-emrspark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: tpcds-benchmark-emrspark-10t 5 | namespace: default 6 | spec: 7 | type: Scala 8 | mode: cluster 9 | image: registry.cn-beijing.aliyuncs.com/zf-spark/spark-2.4.5:for-tpc-ds-2 10 | imagePullPolicy: Always 11 | mainClass: com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone 12 | mainApplicationFile: "oss:///jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar" 13 | arguments: 14 | - "--dataset_location" 15 | - "oss:///datasets/" 16 | - "--output_location" 17 | - "oss:///outputs/ack-pr-10t-emr" 18 | - "--iterations" 19 | - "1" 20 | - "--shuffle_partitions" 21 | - 
"1000" 22 | - "--scale_factor" 23 | - "10000" 24 | - "--regenerate_dataset" 25 | - "false" 26 | - "--regenerate_metadata" 27 | - "false" 28 | - "--only_generate_data_and_meta" 29 | - "false" 30 | - "--format" 31 | - "parquet" 32 | - "--query_exclude_list" 33 | - "q14a,q14b,q67" 34 | sparkVersion: 2.4.5 35 | restartPolicy: 36 | type: Never 37 | hadoopConf: 38 | "fs.oss.impl": "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem" 39 | "fs.oss.endpoint": "" 40 | "fs.oss.accessKeyId": "" 41 | "fs.oss.accessKeySecret": "" 42 | hive.metastore.uris: thrift://service-hive-metastore.default:9083 43 | hive.metastore.client.socket.timeout: 600s 44 | sparkConf: 45 | spark.eventLog.enabled: "true" 46 | spark.eventLog.dir: "oss:///spark/eventlogs" 47 | spark.driver.extraJavaOptions: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 48 | spark.driver.maxResultSize: 40g 49 | spark.executor.extraJavaOptions: "-XX:MaxDirectMemorySize=6g -XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 50 | spark.locality.wait.node: "0" 51 | spark.locality.wait.process: "0" 52 | spark.locality.wait.rack: "0" 53 | spark.locality.wait: "0" 54 | spark.memory.fraction: "0.8" 55 | spark.memory.offHeap.enabled: "false" 56 | spark.memory.offHeap.size: "17179869184" 57 | spark.sql.adaptive.bloomFilterJoin.enabled: "false" 58 | spark.sql.adaptive.enabled: "false" 59 | spark.sql.analyze.column.async.delay: "200" 60 | spark.sql.auto.reused.cte.enabled: "true" 61 | spark.sql.broadcastTimeout: "3600" 62 | spark.sql.columnVector.offheap.enabled: "false" 63 | spark.sql.crossJoin.enabled: "true" 64 | spark.sql.delete.optimizeInSubquery: "true" 65 | spark.sql.dynamic.runtime.filter.bbf.enabled: "false" 66 | spark.sql.dynamic.runtime.filter.enabled: "true" 67 | spark.sql.dynamic.runtime.filter.exact.enabled: "true" 68 | spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520" 69 | spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120" 70 | spark.sql.files.openCostInBytes: "34108864" 71 | spark.sql.inMemoryColumnarStorage.compressed: "true" 72 | spark.sql.join.preferNativeJoin: "false" 73 | spark.sql.native.codecache: "true" 74 | spark.sql.native.codegen.wholeStage: "false" 75 | spark.sql.native.nativewrite: "false" 76 | spark.sql.pkfk.optimize.enable: "true" 77 | spark.sql.pkfk.riJoinElimination: "true" 78 | spark.sql.shuffle.partitions: "1000" 79 | spark.sql.simplifyDecimal.enabled: "true" 80 | spark.sql.sources.parallelPartitionDiscovery.parallelism: "432" 81 | spark.sql.sources.parallelPartitionDiscovery.threshold: "32" 82 | spark.shuffle.reduceLocality.enabled: "false" 83 | spark.shuffle.service.enabled: "false" 84 | spark.dynamicAllocation.enabled: "false" 85 | spark.local.dir: /mnt/diskb/spark-data,/mnt/diskc/spark-data,/mnt/diskd/spark-data,/mnt/diske/spark-data,/mnt/diskf/spark-data,/mnt/diskg/spark-data,/mnt/diskh/spark-data,/mnt/diski/spark-data,/mnt/diskj/spark-data,/mnt/diskk/spark-data,/mnt/diskl/spark-data,/mnt/diskm/spark-data 86 | spark.shuffle.manager: org.apache.spark.shuffle.sort.SortShuffleManager 87 | volumes: 88 | - name: diskb 89 | hostPath: 90 | path: /mnt/diskb 91 | type: Directory 92 | - name: diskc 93 | hostPath: 94 | path: /mnt/diskc 95 | type: Directory 96 | - name: diskd 97 | hostPath: 98 | path: /mnt/diskd 99 | type: Directory 100 | - name: diske 101 | hostPath: 102 | path: /mnt/diske 103 | type: Directory 104 | - name: diskf 105 | hostPath: 106 | path: /mnt/diskf 107 | type: Directory 108 | - name: diskg 109 | hostPath: 
110 | path: /mnt/diskg 111 | type: Directory 112 | - name: diskh 113 | hostPath: 114 | path: /mnt/diskh 115 | type: Directory 116 | - name: diski 117 | hostPath: 118 | path: /mnt/diski 119 | type: Directory 120 | - name: diskj 121 | hostPath: 122 | path: /mnt/diskj 123 | type: Directory 124 | - name: diskk 125 | hostPath: 126 | path: /mnt/diskk 127 | type: Directory 128 | - name: diskl 129 | hostPath: 130 | path: /mnt/diskl 131 | type: Directory 132 | - name: diskm 133 | hostPath: 134 | path: /mnt/diskm 135 | type: Directory 136 | driver: 137 | cores: 15 138 | coreLimit: 15000m 139 | memory: 50g 140 | labels: 141 | version: 2.4.5 142 | serviceAccount: spark 143 | env: 144 | - name: TZ 145 | value: "Asia/Shanghai" 146 | executor: 147 | cores: 4 148 | coreLimit: 6000m 149 | instances: 20 150 | memory: 24g 151 | memoryOverhead: 10g 152 | deleteOnTermination: false 153 | labels: 154 | version: 2.4.5 155 | env: 156 | - name: TZ 157 | value: "Asia/Shanghai" 158 | volumeMounts: 159 | - mountPath: /mnt/diskb 160 | name: diskb 161 | - mountPath: /mnt/diskc 162 | name: diskc 163 | - mountPath: /mnt/diskd 164 | name: diskd 165 | - mountPath: /mnt/diske 166 | name: diske 167 | - mountPath: /mnt/diskf 168 | name: diskf 169 | - mountPath: /mnt/diskg 170 | name: diskg 171 | - mountPath: /mnt/diskh 172 | name: diskh 173 | - mountPath: /mnt/diski 174 | name: diski 175 | - mountPath: /mnt/diskj 176 | name: diskj 177 | - mountPath: /mnt/diskk 178 | name: diskk 179 | - mountPath: /mnt/diskl 180 | name: diskl 181 | - mountPath: /mnt/diskm 182 | name: diskm -------------------------------------------------------------------------------- /kubernetes/emr/tpcds-data-generation.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: tpcds-data-generation-10t 5 | namespace: default 6 | spec: 7 | type: Scala 8 | mode: cluster 9 | image: registry.cn-beijing.aliyuncs.com/zf-spark/spark-2.4.5:for-tpc-ds-2 10 | imagePullPolicy: Always 11 | mainClass: com.databricks.spark.sql.perf.tpcds.TPCDS_Standalone 12 | mainApplicationFile: "oss:///jars/spark-sql-perf-assembly-0.5.0-SNAPSHOT.jar" 13 | arguments: 14 | - "--dataset_location" 15 | - "oss:///datasets/" 16 | - "--output_location" 17 | - "oss:///outputs/ack-pr-10t-emr" 18 | - "--iterations" 19 | - "1" 20 | - "--shuffle_partitions" 21 | - "1000" 22 | - "--scale_factor" 23 | - "10000" 24 | - "--regenerate_dataset" 25 | - "true" 26 | - "--regenerate_metadata" 27 | - "true" 28 | - "--only_generate_data_and_meta" 29 | - "true" 30 | - "--format" 31 | - "parquet" 32 | sparkVersion: 2.4.5 33 | restartPolicy: 34 | type: Never 35 | sparkConf: 36 | spark.eventLog.enabled: "true" 37 | spark.eventLog.dir: "oss:///spark/eventlogs" 38 | spark.driver.extraJavaOptions: "-XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 39 | spark.driver.maxResultSize: 40g 40 | spark.executor.extraJavaOptions: "-XX:MaxDirectMemorySize=32g -XX:-PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps" 41 | spark.locality.wait.node: "0" 42 | spark.locality.wait.process: "0" 43 | spark.locality.wait.rack: "0" 44 | spark.locality.wait: "0" 45 | spark.memory.fraction: "0.8" 46 | spark.memory.offHeap.enabled: "false" 47 | spark.memory.offHeap.size: "17179869184" 48 | spark.sql.adaptive.bloomFilterJoin.enabled: "false" 49 | spark.sql.adaptive.enabled: "false" 50 | spark.sql.analyze.column.async.delay: "200" 51 | 
spark.sql.auto.reused.cte.enabled: "true" 52 | spark.sql.broadcastTimeout: "3600" 53 | spark.sql.columnVector.offheap.enabled: "false" 54 | spark.sql.crossJoin.enabled: "true" 55 | spark.sql.delete.optimizeInSubquery: "true" 56 | spark.sql.dynamic.runtime.filter.bbf.enabled: "false" 57 | spark.sql.dynamic.runtime.filter.enabled: "true" 58 | spark.sql.dynamic.runtime.filter.exact.enabled: "true" 59 | spark.sql.dynamic.runtime.filter.table.size.lower.limit: "1069547520" 60 | spark.sql.dynamic.runtime.filter.table.size.upper.limit: "5368709120" 61 | spark.sql.files.openCostInBytes: "34108864" 62 | spark.sql.inMemoryColumnarStorage.compressed: "true" 63 | spark.sql.join.preferNativeJoin: "false" 64 | spark.sql.native.codecache: "true" 65 | spark.sql.native.codegen.wholeStage: "false" 66 | spark.sql.native.nativewrite: "false" 67 | spark.sql.pkfk.optimize.enable: "true" 68 | spark.sql.pkfk.riJoinElimination: "true" 69 | spark.sql.shuffle.partitions: "1000" 70 | spark.sql.simplifyDecimal.enabled: "true" 71 | spark.sql.sources.parallelPartitionDiscovery.parallelism: "432" 72 | spark.sql.sources.parallelPartitionDiscovery.threshold: "32" 73 | spark.shuffle.reduceLocality.enabled: "false" 74 | spark.shuffle.service.enabled: "true" 75 | spark.dynamicAllocation.enabled: "false" 76 | driver: 77 | cores: 15 78 | coreLimit: 15000m 79 | memory: 30g 80 | labels: 81 | version: 2.4.5 82 | serviceAccount: spark 83 | env: 84 | - name: TZ 85 | value: "Asia/Shanghai" 86 | executor: 87 | cores: 8 88 | coreLimit: 8000m 89 | instances: 20 90 | memory: 24g 91 | labels: 92 | version: 2.4.5 93 | env: 94 | - name: TZ 95 | value: "Asia/Shanghai" -------------------------------------------------------------------------------- /kubernetes/spark/tpcds-benchmark-with-alluxio.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: tpcds-benchmark-with-alluxio 5 | namespace: default 6 | spec: 7 | type: Scala 8 | mode: cluster 9 | image: registry.cn-beijing.aliyuncs.com/yukong/ack-spark-benchmark:1.0.0 10 | imagePullPolicy: Always 11 | sparkVersion: 2.4.5 12 | mainClass: com.aliyun.spark.benchmark.tpcds.BenchmarkSQL 13 | mainApplicationFile: "local:///opt/spark/jars/ack-spark-benchmark-assembly-0.1.jar" 14 | arguments: 15 | # TPC-DS data location 16 | - "alluxio://alluxio-master-0.alluxio.svc.cluster.local:19998/spark/data/tpc-ds-data/1000g" 17 | # results location 18 | - "oss:///spark/result/tpcds-benchmark-result-1000g-with-alluxio" 19 | # Path to kit in the docker image 20 | - "/tmp/tpcds-kit/tools" 21 | # Data Format 22 | - "parquet" 23 | # Scale factor (in GB) 24 | - "1000" 25 | # Number of iterations 26 | - "1" 27 | # Optimize queries 28 | - "false" 29 | # Filter queries, will run all if empty - "q70-v2.4,q82-v2.4,q64-v2.4" 30 | - "" 31 | # Logging set to WARN 32 | - "true" 33 | hostNetwork: true 34 | dnsPolicy: ClusterFirstWithHostNet 35 | restartPolicy: 36 | type: Never 37 | timeToLiveSeconds: 86400 38 | hadoopConf: 39 | # OSS 40 | "fs.oss.impl": "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem" 41 | "fs.oss.endpoint": "" 42 | "fs.oss.accessKeyId": "" 43 | "fs.oss.accessKeySecret": "" 44 | # OSS performance best practice 45 | "fs.oss.paging.maximum": 1000 46 | "fs.oss.multipart.download.threads": 32 47 | "fs.oss.max.total.tasks": 256 48 | "fs.oss.connection.maximum": 2048 49 | sparkConf: 50 | "spark.kubernetes.allocation.batch.size": "200" 51 | "spark.sql.adaptive.join.enabled": "true"
52 | "spark.eventLog.enabled": "true" 53 | "spark.eventLog.dir": "oss:///spark/spark-events" 54 | volumes: 55 | - name: "spark-local-dir-1" 56 | hostPath: 57 | path: "/mnt/disk1" 58 | type: Directory 59 | - name: "spark-local-dir-2" 60 | hostPath: 61 | path: "/mnt/disk2" 62 | type: Directory 63 | - name: "spark-local-dir-3" 64 | hostPath: 65 | path: "/mnt/disk3" 66 | type: Directory 67 | - name: "spark-local-dir-4" 68 | hostPath: 69 | path: "/mnt/disk4" 70 | type: Directory 71 | - name: "spark-local-dir-5" 72 | hostPath: 73 | path: "/mnt/disk5" 74 | type: Directory 75 | - name: "spark-local-dir-6" 76 | hostPath: 77 | path: "/mnt/disk6" 78 | type: Directory 79 | - name: "spark-local-dir-7" 80 | hostPath: 81 | path: "/mnt/disk7" 82 | type: Directory 83 | - name: "spark-local-dir-8" 84 | hostPath: 85 | path: "/mnt/disk8" 86 | type: Directory 87 | - name: "spark-local-dir-9" 88 | hostPath: 89 | path: "/mnt/disk9" 90 | type: Directory 91 | - name: "spark-local-dir-10" 92 | hostPath: 93 | path: "/mnt/disk10" 94 | type: Directory 95 | - name: "spark-local-dir-11" 96 | hostPath: 97 | path: "/mnt/disk11" 98 | type: Directory 99 | - name: "spark-local-dir-12" 100 | hostPath: 101 | path: "/mnt/disk12" 102 | type: Directory 103 | driver: 104 | cores: 5 105 | memory: "20480m" 106 | labels: 107 | version: 2.4.5 108 | spark-app: spark-tpcds 109 | role: driver 110 | serviceAccount: spark 111 | executor: 112 | cores: 7 113 | instances: 20 114 | memory: "20480m" 115 | memoryOverhead: "8g" 116 | labels: 117 | version: 2.4.5 118 | role: executor 119 | volumeMounts: 120 | - name: "spark-local-dir-1" 121 | mountPath: "/mnt/disk1" 122 | - name: "spark-local-dir-2" 123 | mountPath: "/mnt/disk2" 124 | - name: "spark-local-dir-3" 125 | mountPath: "/mnt/disk3" 126 | - name: "spark-local-dir-4" 127 | mountPath: "/mnt/disk4" 128 | - name: "spark-local-dir-5" 129 | mountPath: "/mnt/disk5" 130 | - name: "spark-local-dir-6" 131 | mountPath: "/mnt/disk6" 132 | - name: "spark-local-dir-7" 133 | mountPath: "/mnt/disk7" 134 | - name: "spark-local-dir-8" 135 | mountPath: "/mnt/disk8" 136 | - name: "spark-local-dir-9" 137 | mountPath: "/mnt/disk9" 138 | - name: "spark-local-dir-10" 139 | mountPath: "/mnt/disk10" 140 | - name: "spark-local-dir-11" 141 | mountPath: "/mnt/disk11" 142 | - name: "spark-local-dir-12" 143 | mountPath: "/mnt/disk12" -------------------------------------------------------------------------------- /kubernetes/spark/tpcds-benchmark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: tpcds-benchmark 5 | namespace: default 6 | spec: 7 | type: Scala 8 | mode: cluster 9 | image: registry.cn-beijing.aliyuncs.com/yukong/ack-spark-benchmark:1.0.0 10 | imagePullPolicy: Always 11 | sparkVersion: 2.4.5 12 | mainClass: com.aliyun.spark.benchmark.tpcds.BenchmarkSQL 13 | mainApplicationFile: "local:///opt/spark/jars/ack-spark-benchmark-assembly-0.1.jar" 14 | arguments: 15 | # TPC-DS data localtion 16 | - "oss:///spark/data/tpc-ds-data/1000g" 17 | # results location 18 | - "oss:///spark/result/tpcds-benchmark-result-1000g" 19 | # Path to kit in the docker image 20 | - "/tmp/tpcds-kit/tools" 21 | # Data Format 22 | - "parquet" 23 | # Scale factor (in GB) 24 | - "1000" 25 | # Number of iterations 26 | - "1" 27 | # Optimize queries 28 | - "false" 29 | # Filter queries, will run all if empty - "q70-v2.4,q82-v2.4,q64-v2.4" 30 | - "" 31 | # Logging set to WARN 32 | - "true" 33 | hostNetwork: 
true 34 | dnsPolicy: ClusterFirstWithHostNet 35 | restartPolicy: 36 | type: Never 37 | timeToLiveSeconds: 86400 38 | hadoopConf: 39 | # OSS 40 | "fs.oss.impl": "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem" 41 | "fs.oss.endpoint": "" 42 | "fs.oss.accessKeyId": "" 43 | "fs.oss.accessKeySecret": "" 44 | # OSS performance best practice 45 | "fs.oss.paging.maximum": 1000 46 | "fs.oss.multipart.download.threads": 32 47 | "fs.oss.max.total.tasks": 256 48 | "fs.oss.connection.maximum": 2048 49 | sparkConf: 50 | "spark.kubernetes.allocation.batch.size": "200" 51 | "spark.sql.adaptive.join.enabled": "true" 52 | "spark.eventLog.enabled": "true" 53 | "spark.eventLog.dir": "oss:///spark/spark-events" 54 | driver: 55 | cores: 5 56 | memory: "20480m" 57 | labels: 58 | version: 2.4.5 59 | spark-app: spark-tpcds 60 | role: driver 61 | serviceAccount: spark 62 | executor: 63 | cores: 7 64 | instances: 20 65 | memory: "20480m" 66 | memoryOverhead: "8g" 67 | labels: 68 | version: 2.4.5 69 | role: executor -------------------------------------------------------------------------------- /kubernetes/spark/tpcds-data-generation.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: tpcds-data-generation 5 | namespace: default 6 | spec: 7 | type: Scala 8 | image: registry.cn-beijing.aliyuncs.com/yukong/ack-spark-benchmark:1.0.0 9 | sparkVersion: 2.4.5 10 | mainClass: com.aliyun.spark.benchmark.tpcds.DataGeneration 11 | mainApplicationFile: "local:///opt/spark/jars/ack-spark-benchmark-assembly-0.1.jar" 12 | mode: cluster 13 | arguments: 14 | # TPC-DS data location 15 | - "oss:///spark/data/tpc-ds-data/1000g" 16 | # Path to kit in the docker image 17 | - "/tmp/tpcds-kit/tools" 18 | # Data Format 19 | - "parquet" 20 | # Scale factor (in GB) 21 | - "100000" 22 | # Generate data num partitions 23 | - "100" 24 | # Create the partitioned fact tables 25 | - "false" 26 | # Shuffle to get partitions coalesced into single files.
27 | - "false" 28 | # Logging set to WARN 29 | - "true" 30 | hadoopConf: 31 | # OSS 32 | "fs.oss.impl": "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem" 33 | "fs.oss.endpoint": "" 34 | "fs.oss.accessKeyId": "" 35 | "fs.oss.accessKeySecret": "" 36 | # OSS performance best practice 37 | "fs.oss.paging.maximum": 1000 38 | "fs.oss.multipart.download.threads": 32 39 | "fs.oss.max.total.tasks": 256 40 | "fs.oss.connection.maximum": 2048 41 | sparkConf: 42 | "spark.kubernetes.allocation.batch.size": "100" 43 | "spark.sql.adaptive.enabled": "true" 44 | "spark.eventLog.enabled": "true" 45 | "spark.eventLog.dir": "oss:///spark/spark-events" 46 | driver: 47 | cores: 6 48 | memory: "20480m" 49 | serviceAccount: spark 50 | executor: 51 | instances: 20 52 | cores: 8 53 | memory: "61440m" 54 | memoryOverhead: 2g 55 | restartPolicy: 56 | type: Never -------------------------------------------------------------------------------- /lib/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AliyunContainerService/benchmark-for-spark/6cb2181a1db5659465922218d15f8483e7aa7360/lib/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.9.3 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.1") 2 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/ack/spark/tpcds/Benchmark.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.ack.spark.tpcds 2 | 3 | import scala.util.Try 4 | 5 | import com.databricks.spark.sql.perf.tpcds.{TPCDS, TPCDSTables} 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.functions.col 9 | import org.apache.log4j.{Level, LogManager} 10 | import org.apache.spark.sql.types.DoubleType 11 | import scopt.OParser 12 | 13 | case class BenchmarkConfig( 14 | tpcdsDataPath: String = "", 15 | outputPath: String = "", 16 | dsdgenPath: String = "/opt/tpcds-kit/tools", 17 | format: String = "parquet", 18 | scaleFactor: Int = 1, 19 | iterations: Int = 1, 20 | optimizeQueries: Boolean = false, 21 | queries: String = "", 22 | onlyWarn: Boolean = false 23 | ) 24 | 25 | object Benchmark { 26 | 27 | def main(args: Array[String]): Unit = { 28 | val builder = OParser.builder[BenchmarkConfig] 29 | 30 | val parser = { 31 | import builder._ 32 | OParser.sequence( 33 | programName("Benchmark"), 34 | opt[String]("data") 35 | .required() 36 | .valueName("") 37 | .action((x, c) => c.copy(tpcdsDataPath = x)) 38 | .text("path of tpcds data"), 39 | opt[String]("result") 40 | .required() 41 | .valueName("") 42 | .action((x, c) => c.copy(outputPath = x)) 43 | .text("path of benchmark result"), 44 | opt[String]("dsdgen") 45 | .optional() 46 | .valueName("") 47 | .action((x, c) => c.copy(dsdgenPath = x)) 48 | .text("path of tpcds-kit tools"), 49 | opt[String]("format") 50 | .valueName("") 51 | .action((x, c) => c.copy(format = x)) 52 | .text("data format"), 53 | opt[Int]("scale-factor") 54 | .optional() 55 | .valueName("") 56 | .action((x, c) => c.copy(scaleFactor = 
x)) 57 | .text("scale factor of tpcds data (in GB)"), 58 | opt[Int]("iterations") 59 | .optional() 60 | .action((x, c) => c.copy(iterations = x)) 61 | .text("number of iterations"), 62 | opt[Unit]("optimize-queries") 63 | .optional() 64 | .action((_, c) => c.copy(optimizeQueries = true)) 65 | .text("whether to optimize queries"), 66 | opt[String]("queries") 67 | .optional() 68 | .action((x, c) => c.copy(queries = x)) 69 | .text("queries to execute (empty means all queries)"), 70 | opt[Unit]("only-warn") 71 | .optional() 72 | .action((_, c) => c.copy(onlyWarn = true)) 73 | .text("set logging level to warning") 74 | ) 75 | } 76 | 77 | val option = OParser.parse(parser, args, BenchmarkConfig()) 78 | if (option.isEmpty) { 79 | System.exit(1) 80 | } 81 | val config = option.get 82 | val databaseName = "tpcds_db" 83 | val timeout = 24 * 60 * 60 84 | 85 | println(s"DATA DIR is ${config.tpcdsDataPath}") 86 | 87 | val spark = SparkSession.builder 88 | .appName(s"TPCDS SQL Benchmark ${config.scaleFactor} GB") 89 | .getOrCreate() 90 | 91 | if (config.onlyWarn) { 92 | println(s"Only WARN") 93 | LogManager.getLogger("org").setLevel(Level.WARN) 94 | } 95 | 96 | val tables = new TPCDSTables( 97 | spark.sqlContext, 98 | dsdgenDir = config.dsdgenPath, 99 | scaleFactor = config.scaleFactor.toString, 100 | useDoubleForDecimal = false, 101 | useStringForDate = false 102 | ) 103 | 104 | if (config.optimizeQueries) { 105 | Try { 106 | spark.sql(s"create database $databaseName") 107 | } 108 | tables.createExternalTables( 109 | config.tpcdsDataPath, 110 | config.format, 111 | databaseName, 112 | overwrite = true, 113 | discoverPartitions = true 114 | ) 115 | tables.analyzeTables(databaseName, analyzeColumns = true) 116 | spark.conf.set("spark.sql.cbo.enabled", "true") 117 | } else { 118 | tables.createTemporaryTables(config.tpcdsDataPath, config.format) 119 | } 120 | 121 | val tpcds = new TPCDS(spark.sqlContext) 122 | 123 | var query_filter: Seq[String] = Seq() 124 | if (!config.queries.isEmpty) { 125 | println(s"Running only queries: ${config.queries}") 126 | query_filter = config.queries.split(",").toSeq 127 | } 128 | 129 | val filtered_queries = query_filter match { 130 | case Seq() => tpcds.tpcds2_4Queries 131 | case _ => tpcds.tpcds2_4Queries.filter(q => query_filter.contains(q.name)) 132 | } 133 | 134 | // Start experiment 135 | val experiment = tpcds.runExperiment( 136 | filtered_queries, 137 | iterations = config.iterations, 138 | resultLocation = config.outputPath, 139 | forkThread = true 140 | ) 141 | 142 | experiment.waitForFinish(timeout) 143 | 144 | // Collect general results 145 | val resultPath = experiment.resultPath 146 | println(s"Reading result at ${resultPath}") 147 | val specificResultTable = spark.read.json(resultPath) 148 | specificResultTable.show() 149 | 150 | // Summarize results 151 | val result = specificResultTable 152 | .withColumn("result", explode(col("results"))) 153 | .withColumn("executionSeconds", col("result.executionTime") / 1000) 154 | .withColumn("queryName", col("result.name")) 155 | result.select("iteration", "queryName", "executionSeconds").show() 156 | 157 | val aggResults = result 158 | .groupBy("queryName") 159 | .agg( 160 | min("executionSeconds").cast(DoubleType).as("MinRuntimeInSeconds"), 161 | max("executionSeconds").cast(DoubleType).as("MaxRuntimeInSeconds"), 162 | mean("executionSeconds").cast(DoubleType).as("MeanRuntimeInSeconds"), 163 | stddev_pop("executionSeconds") 164 | .cast(DoubleType) 165 | .as("StandardDeviationInSeconds")
166 | ) 167 | .orderBy("queryName") 168 | 169 | aggResults 170 | .repartition(1) 171 | .write 172 | .csv(s"${resultPath}/summary.csv") 173 | 174 | aggResults.show(105) 175 | 176 | spark.stop() 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/main/scala/com/aliyun/ack/spark/tpcds/DataGeneration.scala: -------------------------------------------------------------------------------- 1 | package com.aliyun.ack.spark.tpcds 2 | 3 | import scala.util.Try 4 | 5 | import com.databricks.spark.sql.perf.tpcds.TPCDSTables 6 | import org.apache.log4j.{Level, LogManager} 7 | import org.apache.spark.sql.SparkSession 8 | import scopt.OParser 9 | 10 | case class DataGenerationConfig( 11 | outputPath: String = "", 12 | dsdgenPath: String = "/opt/tpcds-kit/tools", 13 | format: String = "parquet", 14 | scaleFactor: Int = 1, 15 | partitionTable: Boolean = false, 16 | numPartitions: Int = 1, 17 | coalesced: Boolean = false, 18 | onlyWarn: Boolean = false 19 | ) 20 | 21 | object DataGeneration { 22 | 23 | def main(args: Array[String]): Unit = { 24 | 25 | val builder = OParser.builder[DataGenerationConfig] 26 | 27 | val parser = { 28 | import builder._ 29 | OParser.sequence( 30 | programName("DataGeneration"), 31 | opt[String]("output") 32 | .required() 33 | .valueName("") 34 | .action((x, c) => c.copy(outputPath = x)) 35 | .text("output path of tpcds data"), 36 | opt[String]("dsdgen") 37 | .optional() 38 | .valueName("") 39 | .action((x, c) => c.copy(dsdgenPath = x)) 40 | .text("path of tpcds-kit tools"), 41 | opt[String]("format") 42 | .optional() 43 | .valueName("") 44 | .action((x, c) => c.copy(format = x)) 45 | .text("data format"), 46 | opt[Int]("scale-factor") 47 | .optional() 48 | .valueName("") 49 | .action((x, c) => c.copy(scaleFactor = x)) 50 | .text("scale factor of tpcds data (in GB)"), 51 | opt[Unit]("create-partitions") 52 | .action((_, c) => c.copy(partitionTable = true)) 53 | .optional() 54 | .text("whether to create the partitioned fact tables"), 55 | opt[Int]("num-partitions") 56 | .optional() 57 | .action((x, c) => c.copy(numPartitions = x)) 58 | .text("number of partitions"), 59 | opt[Unit]("coalesced") 60 | .optional() 61 | .action((_, c) => c.copy(coalesced = true)) 62 | .text( 63 | "whether to shuffle to get partitions coalesced into single files" 64 | ), 65 | opt[Unit]("only-warn") 66 | .optional() 67 | .action((_, c) => c.copy(onlyWarn = true)) 68 | .text("set logging level to warning") 69 | ) 70 | } 71 | 72 | val option = OParser.parse(parser, args, DataGenerationConfig()) 73 | if (option.isEmpty) { 74 | System.exit(1) 75 | } 76 | val config = option.get 77 | 78 | println(s"DATA DIR is ${config.outputPath}") 79 | println(s"Tools dsdgen executable located in ${config.dsdgenPath}") 80 | println(s"Scale factor is ${config.scaleFactor} GB") 81 | 82 | val spark = SparkSession.builder 83 | .appName(s"TPCDS Generate Data ${config.scaleFactor} GB") 84 | .getOrCreate() 85 | 86 | if (config.onlyWarn) { 87 | println(s"Only WARN") 88 | LogManager.getLogger("org").setLevel(Level.WARN) 89 | } 90 | 91 | val tables = new TPCDSTables( 92 | spark.sqlContext, 93 | dsdgenDir = config.dsdgenPath, 94 | scaleFactor = config.scaleFactor.toString, 95 | useDoubleForDecimal = false, 96 | useStringForDate = false 97 | ) 98 | 99 | println(s"Generating TPCDS data") 100 | 101 | tables.genData( 102 | location = config.outputPath, 103 | format = config.format, 104 | overwrite = true, // overwrite the data that is already there 105 |
partitionTables = 106 | config.partitionTable, // create the partitioned fact tables 107 | clusterByPartitionColumns = 108 | config.coalesced, // shuffle to get partitions coalesced into single files. 109 | filterOutNullPartitionValues = 110 | false, // true to filter out the partition with NULL key value 111 | tableFilter = "", // "" means generate all tables 112 | numPartitions = 113 | config.numPartitions // how many dsdgen partitions to run - number of input tasks. 114 | ) 115 | 116 | println(s"Data generated at ${config.outputPath}") 117 | 118 | spark.stop() 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /terraform/alicloud/datasources.tf: -------------------------------------------------------------------------------- 1 | data "alicloud_cs_kubernetes_addons" "default" { 2 | cluster_id = module.cs.cluster_id 3 | } 4 | 5 | data "alicloud_cs_cluster_credential" "default" { 6 | cluster_id = module.cs.cluster_id 7 | output_file = "~/.kube/config" 8 | } 9 | -------------------------------------------------------------------------------- /terraform/alicloud/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | alicloud = { 4 | source = "hashicorp/alicloud" 5 | version = "1.223.2" 6 | } 7 | } 8 | 9 | required_version = ">= 1.8.0" 10 | } 11 | 12 | resource "random_string" "suffix" { 13 | length = 16 14 | lower = true 15 | upper = false 16 | special = false 17 | } 18 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/celeborn/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_cs_kubernetes_node_pool" "celeborn-master" { 2 | node_pool_name = "celeborn-master" 3 | cluster_id = var.cluster_id 4 | vswitch_ids = var.vswitch_ids 5 | desired_size = var.master_instance_count 6 | instance_types = [var.master_instance_type] 7 | image_type = "AliyunLinux3" 8 | system_disk_category = "cloud_essd" 9 | system_disk_size = 40 10 | system_disk_performance_level = "PL1" 11 | 12 | data_disks { 13 | category = "cloud_essd" 14 | size = 300 15 | performance_level = "PL1" 16 | device = "/dev/vdb" 17 | } 18 | 19 | data_disks { 20 | category = "cloud_essd" 21 | size = 40 22 | performance_level = "PL1" 23 | device = "/dev/vdc" 24 | } 25 | 26 | labels { 27 | key = "celeborn.apache.org/role" 28 | value = "master" 29 | } 30 | 31 | taints { 32 | key = "celeborn.apache.org/role" 33 | value = "master" 34 | effect = "NoSchedule" 35 | } 36 | 37 | user_data = base64encode(file("${path.module}/master_user_data.sh")) 38 | 39 | resource_group_id = var.resource_group_id 40 | security_group_ids = [var.security_group_id] 41 | } 42 | 43 | resource "alicloud_cs_kubernetes_node_pool" "celeborn-worker" { 44 | node_pool_name = "celeborn-worker" 45 | cluster_id = var.cluster_id 46 | vswitch_ids = var.vswitch_ids 47 | desired_size = var.worker_instance_count 48 | instance_types = [var.worker_instance_type] 49 | image_type = "AliyunLinux3" 50 | system_disk_category = "cloud_essd" 51 | system_disk_size = 40 52 | system_disk_performance_level = "PL1" 53 | 54 | labels { 55 | key = "celeborn.apache.org/role" 56 | value = "worker" 57 | } 58 | 59 | taints { 60 | key = "celeborn.apache.org/role" 61 | value = "worker" 62 | effect = "NoSchedule" 63 | } 64 | 65 | user_data = base64encode(file("${path.module}/worker_user_data.sh")) 66 | 67 | resource_group_id = var.resource_group_id 68 |
security_group_ids = [var.security_group_id] 69 | } 70 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/celeborn/master_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eux 4 | 5 | yum install -y parted e2fsprogs 6 | 7 | output=$(fdisk -l | awk '/^Disk \//' | grep -o -E '/dev/(vd[a-z]|xvd[a-z]|nvme[0-9]n1)') 8 | disks=() 9 | while IFS= read -r line; do 10 | disks+=("$line") 11 | done <<<"$output" 12 | 13 | n=${#disks[@]} 14 | 15 | # Create one primary partition for every disk except for the first and the last one. 16 | for ((i = 1; i < n - 1; i++)); do 17 | disk="${disks[i]}" 18 | parted "${disk}" mklabel gpt 19 | parted "${disk}" mkpart primary 1 100% 20 | parted "${disk}" align-check optimal 1 21 | done 22 | partprobe 23 | 24 | # Create XFS file system for the first partition of every data disk. 25 | for ((i = 1; i < n - 1; i++)); do 26 | disk="${disks[i]}" 27 | if [[ ${disk} =~ "/dev/nvme" ]]; then 28 | mkfs -t xfs "${disk}p1" 29 | elif [[ ${disk} =~ "/dev/vd" ]]; then 30 | mkfs -t xfs "${disk}1" 31 | elif [[ ${disk} =~ "/dev/xvd" ]]; then 32 | mkfs -t xfs "${disk}1" 33 | fi 34 | done 35 | 36 | # Mount file systems to /mnt/disk1, /mnt/disk2, etc. 37 | cp /etc/fstab /etc/fstab.bak 38 | 39 | for ((i = 1; i < n - 1; i++)); do 40 | dir="/mnt/disk$i" 41 | mkdir -p ${dir} 42 | if [[ ${disks[i]} =~ "/dev/nvme" ]]; then 43 | echo "$(blkid "${disks[i]}p1" | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 44 | elif [[ ${disks[i]} =~ "/dev/vd" ]]; then 45 | echo "$(blkid "${disks[i]}1" | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 46 | elif [[ ${disks[i]} =~ "/dev/xvd" ]]; then 47 | echo "$(blkid "${disks[i]}1" | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 48 | fi 49 | done 50 | 51 | mount -a 52 | 53 | chmod a+w /mnt/disk* 54 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/celeborn/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 4 | } 5 | 6 | variable "cluster_id" { 7 | type = string 8 | description = "The id of managed Kubernetes cluster." 9 | } 10 | 11 | variable "vswitch_ids" { 12 | type = list(string) 13 | description = "The list of vswitch id." 14 | } 15 | 16 | variable "master_instance_count" { 17 | type = number 18 | description = "Instance count of Celeborn master node pool." 19 | } 20 | 21 | variable "master_instance_type" { 22 | type = string 23 | description = "Instance type of Celeborn master node pool." 24 | default = "ecs.g7.2xlarge" 25 | } 26 | 27 | variable "worker_instance_count" { 28 | type = number 29 | description = "Instance count of Celeborn worker node pool." 30 | } 31 | 32 | variable "worker_instance_type" { 33 | type = string 34 | description = "Instance type of Celeborn worker node pool." 35 | default = "ecs.i4.8xlarge" 36 | } 37 | 38 | variable "resource_group_id" { 39 | type = string 40 | description = "The id of resource group." 41 | } 42 | 43 | variable "security_group_id" { 44 | type = string 45 | description = "The id of security group."
46 | } 47 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/celeborn/worker_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eux 4 | 5 | yum install -y parted e2fsprogs 6 | 7 | output=$(fdisk -l | awk '/^Disk \//' | grep -o -E '/dev/nvme[0-9]n1') 8 | disks=() 9 | while IFS= read -r line; do 10 | disks+=("$line") 11 | done <<<"$output" 12 | 13 | n=${#disks[@]} 14 | 15 | # Create one primary partition for every disk. 16 | for ((i = 0; i < n; i++)); do 17 | disk="${disks[i]}" 18 | parted "${disk}" mklabel gpt 19 | parted "${disk}" mkpart primary 1 100% 20 | parted "${disk}" align-check optimal 1 21 | done 22 | partprobe 23 | 24 | # Create XFS file system for the first partition of every disk. 25 | for ((i = 0; i < n; i++)); do 26 | disk="${disks[i]}" 27 | if [[ ${disk} =~ "/dev/nvme" ]]; then 28 | mkfs -t xfs "${disk}p1" 29 | elif [[ ${disk} =~ "/dev/vd" ]]; then 30 | mkfs -t xfs "${disk}1" 31 | elif [[ ${disk} =~ "/dev/xvd" ]]; then 32 | mkfs -t xfs "${disk}1" 33 | fi 34 | done 35 | 36 | # Mount file systems to /mnt/disk1, /mnt/disk2, etc. 37 | cp /etc/fstab /etc/fstab.bak 38 | 39 | for ((i = 0; i < n; i++)); do 40 | dir="/mnt/disk$((i + 1))" 41 | mkdir -p ${dir} 42 | if [[ ${disks[i]} =~ "/dev/nvme" ]]; then 43 | echo "$(blkid "${disks[i]}p1" | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 44 | elif [[ ${disks[i]} =~ "/dev/vd" ]]; then 45 | echo "$(blkid "${disks[i]}1" | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 46 | elif [[ ${disks[i]} =~ "/dev/xvd" ]]; then 47 | echo "$(blkid "${disks[i]}1" | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 48 | fi 49 | done 50 | 51 | mount -a 52 | 53 | chmod a+w /mnt/disk* 54 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/cs/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_cs_managed_kubernetes" "default" { 2 | name = "ack-${var.suffix}" 3 | timezone = "Asia/Shanghai" 4 | version = "1.32.1-aliyun.1" 5 | 6 | worker_vswitch_ids = var.worker_vswitch_ids 7 | pod_vswitch_ids = var.pod_vswitch_ids 8 | service_cidr = "172.16.0.0/16" 9 | addons { 10 | name = "terway-eniip" 11 | } 12 | 13 | proxy_mode = "ipvs" 14 | cluster_domain = "cluster.local" 15 | deletion_protection = false 16 | cluster_spec = "ack.pro.small" 17 | load_balancer_spec = "slb.s1.small" 18 | new_nat_gateway = true 19 | slb_internet_enabled = true 20 | resource_group_id = var.resource_group_id 21 | security_group_id = var.security_group_id 22 | } 23 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/cs/outputs.tf: -------------------------------------------------------------------------------- 1 | output "cluster_id" { 2 | value = alicloud_cs_managed_kubernetes.default.id 3 | } 4 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/cs/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 4 | } 5 | 6 | variable "worker_vswitch_ids" { 7 | type = list(string) 8 | description = "The id list of worker vswitch." 9 | } 10 | 11 | variable "pod_vswitch_ids" { 12 | type = list(string) 13 | description = "The id list of pod vswitch."
14 | } 15 | 16 | variable "resource_group_id" { 17 | type = string 18 | description = "The id of resource group." 19 | } 20 | 21 | variable "security_group_id" { 22 | type = string 23 | description = "The id of security group." 24 | } 25 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/ecs/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_security_group" "default" { 2 | name = "sg-${var.suffix}" 3 | vpc_id = var.vpc_id 4 | resource_group_id = var.resource_group_id 5 | security_group_type = var.security_group_type 6 | } 7 | 8 | resource "alicloud_security_group_rule" "default" { 9 | type = "ingress" 10 | ip_protocol = "all" 11 | port_range = "-1/-1" 12 | cidr_ip = "192.168.0.0/16" 13 | security_group_id = alicloud_security_group.default.id 14 | priority = 1 15 | } 16 | 17 | resource "alicloud_security_group_rule" "icmp" { 18 | type = "ingress" 19 | ip_protocol = "icmp" 20 | port_range = "-1/-1" 21 | cidr_ip = "0.0.0.0/0" 22 | security_group_id = alicloud_security_group.default.id 23 | priority = 1 24 | } 25 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/ecs/outputs.tf: -------------------------------------------------------------------------------- 1 | output "security_group_id" { 2 | value = alicloud_security_group.default.id 3 | } 4 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/ecs/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 4 | } 5 | 6 | variable "vpc_id" { 7 | type = string 8 | description = "The id of the vpc." 9 | } 10 | 11 | variable "resource_group_id" { 12 | type = string 13 | description = "The id of the resource group." 14 | } 15 | 16 | variable "security_group_type" { 17 | type = string 18 | description = "The type of the security group." 19 | default = "normal" 20 | } 21 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/fluid/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_cs_kubernetes_node_pool" "fluid" { 2 | node_pool_name = "np-fluid-${var.suffix}" 3 | cluster_id = var.cluster_id 4 | vswitch_ids = var.vswitch_ids 5 | desired_size = var.instance_count 6 | instance_types = [var.instance_type] 7 | image_type = "AliyunLinux3" 8 | system_disk_category = "cloud_essd" 9 | system_disk_size = 40 10 | system_disk_performance_level = "PL1" 11 | 12 | labels { 13 | key = "benchmark.node.role" 14 | value = "fluid" 15 | } 16 | 17 | resource_group_id = var.resource_group_id 18 | security_group_ids = [var.security_group_id] 19 | } 20 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/fluid/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 4 | } 5 | 6 | variable "cluster_id" { 7 | type = string 8 | description = "The id of managed kubernetes cluster." 
9 | } 10 | 11 | variable "vswitch_ids" { 12 | type = list(string) 13 | } 14 | 15 | variable "instance_count" { 16 | type = number 17 | } 18 | 19 | variable "instance_type" { 20 | type = string 21 | } 22 | 23 | variable "resource_group_id" { 24 | type = string 25 | description = "The id of resource group." 26 | } 27 | 28 | variable "security_group_id" { 29 | type = string 30 | description = "The id of security group." 31 | } 32 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/oss/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_oss_bucket" "default" { 2 | bucket = "bucket-${var.suffix}" 3 | acl = "private" 4 | storage_class = "Standard" 5 | redundancy_type = "LRS" 6 | } 7 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/oss/outputs.tf: -------------------------------------------------------------------------------- 1 | output "id" { 2 | value = alicloud_oss_bucket.default.id 3 | description = "The name of the bucket." 4 | } 5 | 6 | output "extranet_endpoint" { 7 | value = alicloud_oss_bucket.default.extranet_endpoint 8 | description = "The extranet access endpoint of the bucket" 9 | } 10 | 11 | output "intranet_endpoint" { 12 | value = alicloud_oss_bucket.default.intranet_endpoint 13 | description = "The intranet access endpoint of the bucket." 14 | } 15 | 16 | output "location" { 17 | value = alicloud_oss_bucket.default.location 18 | description = "The location of the bucket." 19 | } 20 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/oss/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 4 | } 5 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/resource-manager/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_resource_manager_resource_group" "default" { 2 | resource_group_name = "rg-${var.suffix}" 3 | display_name = "rg-${var.suffix}" 4 | } 5 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/resource-manager/outputs.tf: -------------------------------------------------------------------------------- 1 | output "resource_group_id" { 2 | value = alicloud_resource_manager_resource_group.default.id 3 | } 4 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/resource-manager/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 
4 | } 5 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/spark/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_cs_kubernetes_node_pool" "spark-master" { 2 | node_pool_name = "spark-master" 3 | cluster_id = var.cluster_id 4 | vswitch_ids = var.vswitch_ids 5 | instance_types = [var.master_instance_type] 6 | image_type = "AliyunLinux3" 7 | system_disk_category = "cloud_essd" 8 | system_disk_size = 40 9 | system_disk_performance_level = "PL1" 10 | 11 | labels { 12 | key = "spark.tpcds.benchmark/role" 13 | value = "spark-master" 14 | } 15 | 16 | desired_size = var.master_instance_count 17 | resource_group_id = var.resource_group_id 18 | security_group_ids = [var.security_group_id] 19 | } 20 | 21 | resource "alicloud_cs_kubernetes_node_pool" "spark-worker" { 22 | node_pool_name = "spark-worker" 23 | cluster_id = var.cluster_id 24 | vswitch_ids = var.vswitch_ids 25 | desired_size = var.worker_instance_count 26 | instance_types = [var.worker_instance_type] 27 | image_type = "AliyunLinux3" 28 | system_disk_category = "cloud_essd" 29 | system_disk_size = 40 30 | system_disk_performance_level = "PL1" 31 | data_disks { 32 | category = "cloud_essd" 33 | size = 300 34 | performance_level = "PL1" 35 | device = "/dev/vdb" 36 | } 37 | data_disks { 38 | category = "cloud_essd" 39 | size = 300 40 | performance_level = "PL1" 41 | device = "/dev/vdc" 42 | } 43 | data_disks { 44 | category = "cloud_essd" 45 | size = 300 46 | performance_level = "PL1" 47 | device = "/dev/vdd" 48 | } 49 | data_disks { 50 | category = "cloud_essd" 51 | size = 300 52 | performance_level = "PL1" 53 | device = "/dev/vde" 54 | } 55 | data_disks { 56 | category = "cloud_essd" 57 | size = 300 58 | performance_level = "PL1" 59 | device = "/dev/vdf" 60 | } 61 | data_disks { 62 | category = "cloud_essd" 63 | size = 300 64 | performance_level = "PL1" 65 | device = "/dev/vdg" 66 | } 67 | data_disks { 68 | category = "cloud_essd" 69 | size = 40 70 | performance_level = "PL1" 71 | device = "/dev/vdh" 72 | } 73 | 74 | labels { 75 | key = "spark.tpcds.benchmark/role" 76 | value = "spark-worker" 77 | } 78 | 79 | taints { 80 | key = "spark.tpcds.benchmark/role" 81 | value = "spark-worker" 82 | effect = "NoSchedule" 83 | } 84 | 85 | kubelet_configuration { 86 | eviction_hard = { 87 | "imagefs.available" = "5%" 88 | "memory.available" = "100Mi" 89 | "nodefs.available" = "5%" 90 | "nodefs.inodesFree" = "5%" 91 | } 92 | system_reserved = { 93 | cpu = "300m" 94 | memory = "600Mi" 95 | pid = "1000" 96 | } 97 | kube_reserved = { 98 | cpu = "300m" 99 | memory = "600Mi" 100 | pid = "1000" 101 | } 102 | } 103 | 104 | user_data = base64encode(file("${path.module}/user_data.sh")) 105 | 106 | resource_group_id = var.resource_group_id 107 | security_group_ids = [var.security_group_id] 108 | } 109 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/spark/user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Echo commands as they run 4 | set -ex 5 | 6 | # Install parted 7 | yum install -y parted e2fsprogs 8 | 9 | # Create a new partition on each data disk 10 | disks=(/dev/vdb /dev/vdc /dev/vdd /dev/vde /dev/vdf /dev/vdg) 11 | for disk in ${disks[@]}; do 12 | parted ${disk} mklabel gpt 13 | parted ${disk} mkpart primary 1 100% 14 | parted ${disk} align-check optimal 1 15 | done 16 | partprobe 17 | 18 | # Create a file system on each partition 19 | for disk in ${disks[@]}; do 20 | mkfs -t xfs ${disk}1
21 | done 22 | 23 | # Mount the partitions 24 | cp /etc/fstab /etc/fstab.bak 25 | n=${#disks[@]} 26 | for ((i = 0; i < n; i++)); do 27 | dir="/mnt/disk$(($i + 1))" 28 | mkdir -p ${dir} 29 | echo "$(blkid ${disks[i]}1 | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 30 | chmod g+w ${dir} 31 | done 32 | mount -a 33 | 34 | # Make the mount points writable by all users 35 | chmod a+w /mnt/disk* 36 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/spark/user_data_arm64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Echo commands as they run 4 | set -ex 5 | 6 | # Install parted 7 | yum install -y parted e2fsprogs 8 | 9 | # Create a new partition on each data disk 10 | disks=(/dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1 /dev/nvme6n1) 11 | for disk in ${disks[@]}; do 12 | parted ${disk} mklabel gpt 13 | parted ${disk} mkpart primary 1 100% 14 | parted ${disk} align-check optimal 1 15 | done 16 | partprobe 17 | 18 | # Create a file system on each partition 19 | for disk in ${disks[@]}; do 20 | mkfs -t xfs ${disk}p1 21 | done 22 | 23 | # Mount the partitions 24 | cp /etc/fstab /etc/fstab.bak 25 | n=${#disks[@]} 26 | for ((i = 0; i < n; i++)); do 27 | dir="/mnt/disk$(($i + 1))" 28 | mkdir -p ${dir} 29 | echo "$(blkid ${disks[i]}p1 | awk '{print $2}' | sed 's/\"//g') ${dir} xfs defaults 0 0" >>/etc/fstab 30 | done 31 | mount -a 32 | 33 | # Make the mount points writable by all users 34 | chmod a+w /mnt/disk* 35 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/spark/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 4 | } 5 | 6 | variable "cluster_id" { 7 | type = string 8 | description = "The id of managed kubernetes cluster." 9 | } 10 | 11 | variable "vswitch_ids" { 12 | type = list(string) 13 | } 14 | 15 | variable "master_instance_count" { 16 | type = number 17 | default = 1 18 | } 19 | 20 | variable "master_instance_type" { 21 | type = string 22 | } 23 | 24 | variable "worker_instance_count" { 25 | type = number 26 | default = 1 27 | } 28 | 29 | variable "worker_instance_type" { 30 | type = string 31 | } 32 | 33 | variable "resource_group_id" { 34 | type = string 35 | description = "The id of resource group." 36 | } 37 | 38 | variable "security_group_id" { 39 | type = string 40 | description = "The id of security group."
41 | } 42 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/vpc/main.tf: -------------------------------------------------------------------------------- 1 | resource "alicloud_vpc" "default" { 2 | vpc_name = "vpc-${var.suffix}" 3 | cidr_block = "192.168.0.0/16" 4 | resource_group_id = var.resource_group_id 5 | } 6 | 7 | resource "alicloud_vswitch" "default" { 8 | vswitch_name = "vsw-${var.suffix}" 9 | cidr_block = "192.168.0.0/24" 10 | vpc_id = alicloud_vpc.default.id 11 | zone_id = var.zone_id 12 | } 13 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/vpc/outputs.tf: -------------------------------------------------------------------------------- 1 | output "vpc_id" { 2 | value = alicloud_vpc.default.id 3 | } 4 | 5 | output "vswitch_id" { 6 | value = alicloud_vswitch.default.id 7 | } 8 | -------------------------------------------------------------------------------- /terraform/alicloud/modules/vpc/variables.tf: -------------------------------------------------------------------------------- 1 | variable "suffix" { 2 | type = string 3 | description = "The suffix of name." 4 | } 5 | 6 | variable "zone_id" { 7 | type = string 8 | description = "The AZ for the VSwitch." 9 | } 10 | 11 | variable "resource_group_id" { 12 | type = string 13 | description = "The id of the resource group." 14 | } 15 | -------------------------------------------------------------------------------- /terraform/alicloud/provider.tf: -------------------------------------------------------------------------------- 1 | provider "alicloud" { 2 | region = var.region 3 | profile = var.profile 4 | } 5 | -------------------------------------------------------------------------------- /terraform/alicloud/root.tf: -------------------------------------------------------------------------------- 1 | # Create resource group 2 | module "resource_manager" { 3 | source = "./modules/resource-manager" 4 | suffix = random_string.suffix.id 5 | } 6 | 7 | # Create VPC and vswitch 8 | module "vpc" { 9 | source = "./modules/vpc" 10 | suffix = random_string.suffix.id 11 | zone_id = var.zone_id 12 | resource_group_id = module.resource_manager.resource_group_id 13 | } 14 | 15 | # Create security group 16 | module "ecs" { 17 | source = "./modules/ecs" 18 | suffix = random_string.suffix.id 19 | vpc_id = module.vpc.vpc_id 20 | resource_group_id = module.resource_manager.resource_group_id 21 | } 22 | 23 | # module "oss" { 24 | # source = "./modules/oss" 25 | # suffix = random_string.suffix.id 26 | # } 27 | 28 | # Create ACK 29 | module "cs" { 30 | source = "./modules/cs" 31 | suffix = random_string.suffix.id 32 | worker_vswitch_ids = [module.vpc.vswitch_id] 33 | pod_vswitch_ids = [module.vpc.vswitch_id] 34 | resource_group_id = module.resource_manager.resource_group_id 35 | security_group_id = module.ecs.security_group_id 36 | } 37 | 38 | # Create node pool for spark 39 | module "spark" { 40 | source = "./modules/spark" 41 | suffix = random_string.suffix.id 42 | cluster_id = module.cs.cluster_id 43 | vswitch_ids = [module.vpc.vswitch_id] 44 | master_instance_count = var.spark_master_instance_count 45 | master_instance_type = var.spark_master_instance_type 46 | worker_instance_count = var.spark_worker_instance_count 47 | worker_instance_type = var.spark_worker_instance_type 48 | resource_group_id = module.resource_manager.resource_group_id 49 | security_group_id = module.ecs.security_group_id 50 | } 51 | 52 | # Create node 
pool for celeborn 53 | module "celeborn" { 54 | source = "./modules/celeborn" 55 | suffix = random_string.suffix.id 56 | cluster_id = module.cs.cluster_id 57 | vswitch_ids = [module.vpc.vswitch_id] 58 | master_instance_count = var.celeborn_master_instance_count 59 | master_instance_type = var.celeborn_master_instance_type 60 | worker_instance_count = var.celeborn_worker_instance_count 61 | worker_instance_type = var.celeborn_worker_instance_type 62 | resource_group_id = module.resource_manager.resource_group_id 63 | security_group_id = module.ecs.security_group_id 64 | } 65 | 66 | # Create node pool for fluid 67 | module "fluid" { 68 | source = "./modules/fluid" 69 | suffix = random_string.suffix.id 70 | cluster_id = module.cs.cluster_id 71 | vswitch_ids = [module.vpc.vswitch_id] 72 | instance_count = var.fluid_instance_count 73 | instance_type = var.fluid_instance_type 74 | resource_group_id = module.resource_manager.resource_group_id 75 | security_group_id = module.ecs.security_group_id 76 | } 77 | -------------------------------------------------------------------------------- /terraform/alicloud/terraform.tfvars: -------------------------------------------------------------------------------- 1 | # Alicloud 2 | profile = "default" 3 | zone_id = "cn-beijing-i" 4 | 5 | # Spark master node pool 6 | spark_master_instance_count = 0 7 | spark_master_instance_type = "ecs.g7.4xlarge" 8 | 9 | # Spark worker node pool 10 | spark_worker_instance_count = 0 11 | spark_worker_instance_type = "ecs.g7.8xlarge" 12 | 13 | # Celeborn master node pool 14 | celeborn_master_instance_count = 0 15 | celeborn_master_instance_type = "ecs.g8i.2xlarge" 16 | 17 | # Celeborn worker node pool 18 | celeborn_worker_instance_count = 0 19 | celeborn_worker_instance_type = "ecs.i4.8xlarge" 20 | 21 | # Fluid node pool 22 | fluid_instance_count = 0 23 | fluid_instance_type = "ecs.i3.2xlarge" 24 | -------------------------------------------------------------------------------- /terraform/alicloud/variables.tf: -------------------------------------------------------------------------------- 1 | variable "profile" { 2 | type = string 3 | default = "default" 4 | } 5 | 6 | variable "region" { 7 | type = string 8 | default = "cn-beijing" 9 | } 10 | 11 | variable "zone_id" { 12 | type = string 13 | default = "cn-beijing-i" 14 | } 15 | 16 | variable "bucket_name" { 17 | type = string 18 | description = "The name of bucket." 19 | default = "ack-spark-benchmark" 20 | } 21 | 22 | # Spark 23 | variable "spark_master_instance_count" { 24 | type = number 25 | } 26 | 27 | variable "spark_master_instance_type" { 28 | type = string 29 | } 30 | 31 | variable "spark_worker_instance_count" { 32 | type = number 33 | } 34 | 35 | variable "spark_worker_instance_type" { 36 | type = string 37 | } 38 | 39 | # Celeborn 40 | variable "celeborn_master_instance_count" { 41 | type = number 42 | description = "Instance count of Celeborn master node pool." 43 | } 44 | 45 | variable "celeborn_master_instance_type" { 46 | type = string 47 | description = "Instance type of Celeborn master node pool." 48 | default = "" 49 | } 50 | 51 | variable "celeborn_worker_instance_count" { 52 | type = number 53 | description = "Instance count of Celeborn worker node pool." 54 | } 55 | 56 | variable "celeborn_worker_instance_type" { 57 | type = string 58 | description = "Instance type of Celeborn worker node pool."
59 | } 60 | 61 | # Fluid 62 | variable "fluid_instance_count" { 63 | type = number 64 | } 65 | 66 | variable "fluid_instance_type" { 67 | type = string 68 | } 69 | --------------------------------------------------------------------------------
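A minimal end-to-end usage sketch for the files above, assuming an Alibaba Cloud credential profile is configured, kubectl points at the kubeconfig that datasources.tf writes, the spark-operator and the `spark` service account already exist in the cluster, and the empty `oss:///...` locations and `fs.oss.*` credentials in the manifests have been filled in; the command sequence is illustrative, not a script shipped in this repository:

# Provision the ACK cluster and node pools; datasources.tf writes the kubeconfig to ~/.kube/config.
terraform -chdir=terraform/alicloud init
terraform -chdir=terraform/alicloud apply

# Build the benchmark assembly jar (the images referenced by the manifests are expected to contain it).
sbt assembly

# Generate the TPC-DS dataset, wait for the run to complete, then launch the benchmark.
kubectl apply -f kubernetes/spark/tpcds-data-generation.yaml
kubectl wait sparkapplication/tpcds-data-generation \
  --for=jsonpath='{.status.applicationState.state}'=COMPLETED --timeout=24h
kubectl apply -f kubernetes/spark/tpcds-benchmark.yaml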