├── .github
│   ├── ISSUE_TEMPLATE
│   │   └── bug_report.md
│   └── workflows
│       ├── cd.yml
│       └── ci.yml
├── .gitignore
├── .vscode
│   └── launch.json
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── THIRD-PARTY-LICENSES.txt
├── documentation
│   └── resources
│       ├── demo.gif
│       ├── howitworks.png
│       ├── logo.png
│       └── usage.png
├── spark-plugin
│   ├── .gitignore
│   ├── build.sbt
│   ├── example_3_1_3
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               └── io
│   │                   └── dataflint
│   │                       └── example
│   │                           └── ShakespeareSpark313.scala
│   ├── example_3_2_4
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               └── io
│   │                   └── dataflint
│   │                       └── example
│   │                           ├── Shakespeare324Exported.scala
│   │                           └── ShakespeareSpark324.scala
│   ├── example_3_3_3
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               └── io
│   │                   └── dataflint
│   │                       └── example
│   │                           ├── IcebergExample333.scala
│   │                           ├── Shakespeare333Exported.scala
│   │                           └── ShakespeareSpark333.scala
│   ├── example_3_4_1
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               └── io
│   │                   └── dataflint
│   │                       └── example
│   │                           ├── IcebergExample.scala
│   │                           ├── SalesFilterer.scala
│   │                           ├── SalesFiltererFixed.scala
│   │                           ├── Shakespeare341.scala
│   │                           ├── Shakespeare341Exported.scala
│   │                           ├── ShakespearePartitionedWriter.scala
│   │                           ├── ShakespearePartitionedWriterFixed.scala
│   │                           ├── ShakespeareUnpartitionedWriter.scala
│   │                           ├── ShakespeareUnpartitionedWriterFixed.scala
│   │                           └── SimpleStreaming.scala
│   ├── example_3_4_1_remote
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               └── io
│   │                   └── dataflint
│   │                       └── example
│   │                           └── Shakespeare341Remote.scala
│   ├── example_3_5_1
│   │   └── src
│   │       └── main
│   │           └── scala
│   │               ├── io
│   │               │   └── dataflint
│   │               │       └── example
│   │               │           ├── AccessPatternExample.scala
│   │               │           ├── CacheExample.scala
│   │               │           ├── DataFusionCometExample.scala
│   │               │           ├── DeltaLakeExample.scala
│   │               │           ├── DeltaLakeStreaming.scala
│   │               │           ├── IcebergExample.scala
│   │               │           ├── JobGroupExample.scala
│   │               │           ├── JobGroupExportedLocal.scala
│   │               │           ├── JoinExample.scala
│   │               │           ├── KafkaStreaming.scala
│   │               │           ├── LargeBroadcastExample.scala
│   │               │           ├── LargeFilterCondition.scala
│   │               │           ├── PartitionSkewExample.scala
│   │               │           ├── SchedulingSmallTasks.scala
│   │               │           ├── SchedulingSmallTasksSkipAlerts.scala
│   │               │           ├── SetJobDescriptionAndUDFName.scala
│   │               │           ├── Shakespeare351.scala
│   │               │           ├── Shakespeare351Exported.scala
│   │               │           ├── Shakespeare351ExportedLocal.scala
│   │               │           └── Shakespeare351ExportedLocal2.scala
│   │               └── org
│   │                   └── apache
│   │                       └── spark
│   │                           └── dataflint
│   │                               └── jobgroup
│   │                                   └── tests
│   │                                       └── JobGroupTests.scala
│   ├── plugin
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── META-INF
│   │           │       └── services
│   │           │           └── org.apache.spark.status.AppHistoryServerPlugin
│   │           └── scala
│   │               ├── io
│   │               │   └── dataflint
│   │               │       └── spark
│   │               │           ├── SparkDataflint.scala
│   │               │           └── SparkDataflintPlugin.scala
│   │               └── org
│   │                   └── apache
│   │                       └── spark
│   │                           ├── dataflint
│   │                           │   ├── DataflintSparkUILoader.scala
│   │                           │   ├── api
│   │                           │   │   ├── DataFlintTab.scala
│   │                           │   │   ├── DataflintApplicationInfoPage.scala
│   │                           │   │   ├── DataflintCachedStoragePage.scala
│   │                           │   │   ├── DataflintIcebergPage.scala
│   │                           │   │   ├── DataflintJettyUtils.scala
│   │                           │   │   ├── DataflintSQLMetricsPage.scala
│   │                           │   │   ├── DataflintSQLPlanPage.scala
│   │                           │   │   ├── DataflintSQLStagesRddPage.scala
│   │                           │   │   └── api.scala
│   │                           │   ├── iceberg
│   │                           │   │   ├── ClassLoaderChecker.scala
│   │                           │   │   └── DataflintIcebergMetricsReporter.scala
│   │                           │   ├── jobgroup
│   │                           │   │   └── JobGroupExtractor.scala
│   │                           │   ├── listener
│   │                           │   │   ├── DataflintDatabricksLiveListener.scala
│   │                           │   │   ├── DataflintListener.scala
│   │                           │   │   ├── DataflintStore.scala
│   │                           │   │   ├── LiveRDDsListener.scala
│   │                           │   │   └── model.scala
│   │                           │   ├── package.scala
│   │                           │   └── saas
│   │                           │       ├── DataflintRunExporterListener.scala
│   │                           │       ├── EnumSerializer.scala
│   │                           │       ├── ExecutorsMetricsSerializer.scala
│   │                           │       ├── GZipUtils.scala
│   │                           │       ├── JavaEnumNameSerializer.scala
│   │                           │       ├── S3Uploader.scala
│   │                           │       ├── SparkMetadataSerializer.scala
│   │                           │       ├── SparkMetadataStore.scala
│   │                           │       ├── SparkRunSerializer.scala
│   │                           │       ├── SparkRunStore.scala
│   │                           │       ├── StageTaskSummary.scala
│   │                           │       ├── StoreDataExtractor.scala
│   │                           │       └── StoreMetadataExtractor.scala
│   │                           └── deploy
│   │                               └── history
│   │                                   ├── DataFlintHistoryServerPlugin.scala
│   │                                   └── FsDataflintHistoryProvider.scala
│   ├── project
│   │   ├── build.properties
│   │   └── publish.sbt
│   ├── sonatype.sbt
│   └── test_data
│       └── will_play_text.csv
└── spark-ui
    ├── .env
    ├── .generatelicensefile.yaml
    ├── .gitignore
    ├── gulpfile.js
    ├── package-lock.json
    ├── package.json
    ├── public
    │   ├── favicon.ico
    │   ├── icon.png
    │   ├── index.html
    │   ├── logo-grey.png
    │   ├── logo.png
    │   └── manifest.json
    ├── src
    │   ├── App.tsx
    │   ├── Hooks.ts
    │   ├── Router.tsx
    │   ├── Store.ts
    │   ├── components
    │   │   ├── AlertBadge
    │   │   │   ├── AlertBadge.tsx
    │   │   │   └── MultiAlertsBadge.tsx
    │   │   ├── AppDrawer
    │   │   │   ├── AppDrawer.tsx
    │   │   │   └── DrawerFooter.tsx
    │   │   ├── ColumnPicker
    │   │   │   └── ColumnPicker.tsx
    │   │   ├── ConfigTable.tsx
    │   │   ├── ExceptionIcon.tsx
    │   │   ├── InfoBox
    │   │   │   ├── InfoBox.module.css
    │   │   │   └── InfoBox.tsx
    │   │   ├── Modals
    │   │   │   └── DisconnectedModal.tsx
    │   │   ├── NoQuery
    │   │   │   └── NoQuery.tsx
    │   │   ├── Progress.tsx
    │   │   ├── ResourceBar.tsx
    │   │   ├── ResourceGraph
    │   │   │   ├── ColorsOutput.ts
    │   │   │   └── ResourceGraph.tsx
    │   │   ├── SqlContainer.tsx
    │   │   ├── SqlFlow
    │   │   │   ├── BytesDistributionChart.tsx
    │   │   │   ├── DurationDistributionChart.tsx
    │   │   │   ├── NumbersDistributionChart.tsx
    │   │   │   ├── SqlFlow.tsx
    │   │   │   ├── SqlLayoutService.ts
    │   │   │   ├── StageIcon.tsx
    │   │   │   ├── StageIconDrawer.tsx
    │   │   │   ├── StageNode.tsx
    │   │   │   └── node-style.module.css
    │   │   ├── SqlTable
    │   │   │   ├── SqlTable.tsx
    │   │   │   ├── TableTypes.tsx
    │   │   │   └── TableUtils.tsx
    │   │   ├── StatusBar.tsx
    │   │   └── SummaryBar.tsx
    │   ├── index.tsx
    │   ├── interfaces
    │   │   ├── AppStore.ts
    │   │   ├── ApplicationInfo.ts
    │   │   ├── CachedStorage.ts
    │   │   ├── IcebergInfo.ts
    │   │   ├── Mixpanel.ts
    │   │   ├── SQLPlan.ts
    │   │   ├── SparkApplications.ts
    │   │   ├── SparkConfiguration.ts
    │   │   ├── SparkExecutors.ts
    │   │   ├── SparkJobs.ts
    │   │   ├── SparkSQLs.ts
    │   │   ├── SparkStages.ts
    │   │   ├── SqlMetrics.ts
    │   │   └── StagesRdd.ts
    │   ├── react-app-env.d.ts
    │   ├── reducers
    │   │   ├── Alerts
    │   │   │   ├── BroadcastTooLargeAlert.ts
    │   │   │   ├── IcebergReplacesReducer.ts
    │   │   │   ├── JoinToBroadcastAlert.ts
    │   │   │   ├── LargeCrossJoinScanAlert.ts
    │   │   │   ├── LongFilterConditions.ts
    │   │   │   ├── MaxPartitionToBigAlert.ts
    │   │   │   ├── MemoryAlertsReducer.ts
    │   │   │   ├── MemorySQLInputOutputAlerts.ts
    │   │   │   ├── PartitionSkewAlert.ts
    │   │   │   ├── SmallTasksAlert.ts
    │   │   │   └── WastedCoresAlertsReducer.ts
    │   │   ├── AlertsReducer.ts
    │   │   ├── ChatSlice.ts
    │   │   ├── ConfigReducer.ts
    │   │   ├── ExecutorsReducer.ts
    │   │   ├── GeneralSlice.ts
    │   │   ├── JobsColumnSlice.ts
    │   │   ├── MetricsReducer.ts
    │   │   ├── PlanGraphUtils.ts
    │   │   ├── PlanParsers
    │   │   │   ├── CoalesceParser.test.ts
    │   │   │   ├── CoalesceParser.ts
    │   │   │   ├── CollectLimitParser.ts
    │   │   │   ├── ExchangeParser.spec.ts
    │   │   │   ├── ExchangeParser.ts
    │   │   │   ├── FilterParser.ts
    │   │   │   ├── JoinParser.spec.ts
    │   │   │   ├── JoinParser.ts
    │   │   │   ├── PlanParserUtils.ts
    │   │   │   ├── ProjectParser.ts
    │   │   │   ├── ScanFileParser.spec.ts
    │   │   │   ├── ScanFileParser.ts
    │   │   │   ├── SortParser.ts
    │   │   │   ├── SortParset.spec.ts
    │   │   │   ├── TakeOrderedAndProjectParser.spec.ts
    │   │   │   ├── TakeOrderedAndProjectParser.ts
    │   │   │   ├── WindowParser.spec.ts
    │   │   │   ├── WindowParser.ts
    │   │   │   ├── WriteToHDFSParser.spec.ts
    │   │   │   ├── WriteToHDFSParser.ts
    │   │   │   ├── hashAggregateParser.spec.ts
    │   │   │   └── hashAggregateParser.ts
    │   │   ├── SQLNodeStageReducer.ts
    │   │   ├── SparkSlice.ts
    │   │   ├── SqlReducer.ts
    │   │   ├── SqlReducerUtils.ts
    │   │   └── StatusReducer.ts
    │   ├── services
    │   │   ├── MixpanelService.tsx
    │   │   ├── SparkApi.tsx
    │   │   └── TabsService.tsx
    │   ├── tabs
    │   │   ├── AlertsTab.tsx
    │   │   ├── ChatTab.tsx
    │   │   ├── ConfigurationTab.tsx
    │   │   ├── ResourcesTab.tsx
    │   │   ├── StatusTab.tsx
    │   │   └── SummaryTab.tsx
    │   ├── theme.ts
    │   └── utils
    │       ├── ConfigParser.ts
    │       ├── FormatUtils.ts
    │       ├── UrlConsts.ts
    │       └── UrlUtils.ts
    └── tsconfig.json
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: menishmueli
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **Environment**
14 | spark version: 3.2/3.3/3.4/3.5
15 | platform: EMR/DataProc/K8s/standalone/databricks
16 |
17 | **To Reproduce**
18 | Steps to reproduce the behavior:
19 | 1. Go to '...'
20 | 2. Click on '....'
21 | 3. Scroll down to '....'
22 | 4. See error
23 |
24 | **Expected behavior**
25 | A clear and concise description of what you expected to happen.
26 |
27 | **Screenshots**
28 | If applicable, add screenshots to help explain your problem.
29 |
30 | **Additional context**
31 | Add any other context about the problem here.
32 |
--------------------------------------------------------------------------------
/.github/workflows/cd.yml:
--------------------------------------------------------------------------------
1 | name: CD
2 | on:
3 | workflow_run:
4 | workflows: [CI]
5 | branches: [main]
6 | types:
7 | - completed
8 | jobs:
9 | publish:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v4
13 | with:
14 | fetch-depth: 0
15 | - uses: actions/setup-java@v4
16 | with:
17 | distribution: temurin
18 | java-version: 8
19 | cache: sbt
20 |
21 | - name: Set up Node.js
22 | uses: actions/setup-node@v2
23 | with:
24 | node-version: 20
25 |
26 | - name: Install npm dependencies
27 | run: npm ci
28 | working-directory: ./spark-ui
29 |
30 | - name: build frontend
31 | run: npm run deploy
32 | working-directory: ./spark-ui
33 |
34 | - name: package plugin
35 | run: sbt package
36 | working-directory: ./spark-plugin
37 |
38 | - name: publish to maven staging
39 | run: sbt ci-release
40 | working-directory: ./spark-plugin
41 | env:
42 | PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
43 | PGP_SECRET: ${{ secrets.PGP_SECRET }}
44 | SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
45 | SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
46 |
47 | - name: Changelog
48 | uses: scottbrenner/generate-changelog-action@master
49 | if: startsWith(github.ref, 'refs/tags/v')
50 | id: Changelog
51 | env:
52 | REPO: ${{ github.repository }}
53 |
54 | - name: Create Release
55 | id: create_release
56 | uses: actions/create-release@latest
57 | if: startsWith(github.ref, 'refs/tags/v')
58 | env:
59 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
60 | with:
61 | tag_name: ${{ github.ref }}
62 | release_name: Release ${{ github.ref }}
63 | body: |
64 | See: https://dataflint.gitbook.io/dataflint-for-spark/overview/release-notes#version-${{ github.ref_name }}
65 |
66 | commits change log:
67 | ${{ steps.Changelog.outputs.changelog }}
68 | draft: false
69 | prerelease: false
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | # This file was automatically generated by sbt-github-actions using the
2 | # githubWorkflowGenerate task. You should add and commit this file to
3 | # your git repository. It goes without saying that you shouldn't edit
4 | # this file by hand! Instead, if you wish to make changes, you should
5 | # change your sbt build configuration to revise the workflow description
6 | # to meet your needs, then regenerate this file.
7 |
8 | name: CI
9 |
10 | on:
11 | pull_request:
12 | branches: ['**']
13 | push:
14 | branches: ['**']
15 |
16 | jobs:
17 | build:
18 | name: Build and Test
19 | strategy:
20 | matrix:
21 | os: [ubuntu-latest]
22 | java: [temurin@8]
23 | runs-on: ubuntu-latest
24 | steps:
25 | - name: Checkout current branch (full)
26 | uses: actions/checkout@v4
27 | with:
28 | fetch-depth: 0
29 |
30 | - name: Setup Java (temurin@8)
31 | if: matrix.java == 'temurin@8'
32 | uses: actions/setup-java@v3
33 | with:
34 | distribution: temurin
35 | java-version: 8
36 | cache: sbt
37 |
38 | - name: Set up Node.js
39 | uses: actions/setup-node@v2
40 | with:
41 | node-version: 20
42 |
43 | - name: Install dependencies
44 | run: npm ci
45 | working-directory: ./spark-ui
46 |
47 | - name: Run frontend unit tests
48 | run: npm run test
49 | working-directory: ./spark-ui
50 |
51 | - name: Build and test plugin
52 | run: sbt +test
53 | working-directory: ./spark-plugin
54 |
55 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .DS_Store
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.2",
6 | "configurations": [
7 | {
8 | "type": "chrome",
9 | "request": "launch",
10 | "name": "Launch Chrome against localhost",
11 | "url": "http://localhost:4000",
12 | "webRoot": "${workspaceFolder}"
13 | }
14 | ]
15 | }
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to DataFlint
2 |
3 | ## Getting started with development
4 |
5 | ### Setup
6 |
7 | Requirements:
8 | 1. Node v21.5.0
9 | 2. Java 8 or 11
10 | 3. Scala 2.12
11 | 4. SBT 1.3.13
12 | 5. IntelliJ IDEA with Scala and SBT plugins
13 | 6. Visual Studio Code
14 |
15 | ### Installation Steps
16 |
17 | 1. Clone the repository:
18 | ```
19 | git clone https://github.com/dataflint/spark.git
20 | cd spark
21 | ```
22 |
23 | 2. Set up the Spark Plugin:
24 | - Open the `spark-plugin` folder with IntelliJ IDEA
25 | - Ensure Scala and SBT plugins are installed in IntelliJ
26 |
27 | 3. Set up the UI:
28 | - Open the repository with Visual Studio Code
29 | - Install UI dependencies:
30 | ```
31 | cd spark-ui
32 | npm install
33 | ```
34 |
35 | 4. Build the UI for the plugin:
36 | ```
37 | cd spark-ui
38 | npm run deploy
39 | ```
40 |
41 | 5. (Optional) Install Local CORS Proxy (LCP) for local development:
42 | ```
43 | brew install lcp
44 | ```
45 |
46 | ### Running the Project
47 |
48 | 1. Run one of the examples in the `spark-examples-351` project (under `spark-plugin/example_3_5_1`) using IntelliJ
49 |
50 | 2. Access the Spark UI:
51 | - Browse to `http://localhost:10000`
52 | - Open DataFlint from the Spark UI and verify it loads successfully
53 |
54 | ### Live Frontend Development
55 |
56 | For live frontend development, follow these steps:
57 |
58 | 1. Start the development server and proxy:
59 | ```
60 | cd spark-ui
61 | npm run start
62 | npm run proxy
63 | ```
64 |
65 | 2. Access the development UI:
66 | - Browse to `http://localhost:4000`
67 | - This should run the DataFlint UI with live reloading
68 |
69 | ## Contributing Guidelines
70 |
71 | - Please ensure your code follows the project's coding standards
72 | - Submit pull requests for review
73 |
74 | Thank you for contributing to DataFlint!
75 |
--------------------------------------------------------------------------------
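For quick reference, every example app in this repository enables DataFlint through the same `SparkSession` settings, so the "Running the Project" step above amounts to launching one of them. Below is a minimal sketch of such a local run; the object name and the toy workload are illustrative, while the configuration keys and values are taken from the example sources later in this listing:

```scala
import org.apache.spark.sql.SparkSession

// Minimal local-run sketch, mirroring the examples under spark-plugin/example_3_5_1.
object LocalDataflintSketch extends App {
  val spark = SparkSession
    .builder()
    .appName("DataFlint Local Dev")
    .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") // load the DataFlint plugin
    .config("spark.dataflint.telemetry.enabled", false)                 // disable telemetry while developing
    .config("spark.ui.port", "10000")                                   // Spark UI at http://localhost:10000
    .config("spark.sql.maxMetadataStringLength", "10000")
    .master("local[*]")
    .getOrCreate()

  // Any small workload will do; it just gives the UI something to show.
  spark.range(0, 1000000).selectExpr("id % 10 as bucket").groupBy("bucket").count().show()

  scala.io.StdIn.readLine() // keep the UI alive until Enter is pressed
  spark.stop()
}
```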
/documentation/resources/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/demo.gif
--------------------------------------------------------------------------------
/documentation/resources/howitworks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/howitworks.png
--------------------------------------------------------------------------------
/documentation/resources/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/logo.png
--------------------------------------------------------------------------------
/documentation/resources/usage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/usage.png
--------------------------------------------------------------------------------
/spark-plugin/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # From https://github.com/github/gitignore/blob/master/Scala.gitignore
3 | *.class
4 | *.log
5 |
6 | # From https://github.com/github/gitignore/blob/master/Global/SBT.gitignore
7 | dist/*
8 | target/
9 | lib_managed/
10 | src_managed/
11 | project/boot/
12 | project/plugins/project/
13 | .history
14 | .cache
15 | .lib/
16 |
17 | # From https://github.com/github/gitignore/blob/master/Global/Eclipse.gitignore
18 | .metadata
19 | bin/
20 | tmp/
21 | *.tmp
22 | *.bak
23 | *.swp
24 | *~.nib
25 | local.properties
26 | .settings/
27 | .loadpath
28 | .recommenders
29 |
30 | .externalToolBuilders/
31 | *.launch
32 | *.pydevproject
33 | .cproject
34 | .factorypath
35 | .buildpath
36 | .target
37 | .tern-project
38 | .texlipse
39 | .springBeans
40 | .recommenders/
41 |
42 | # Scala IDE specific (Scala & Java development for Eclipse)
43 | .cache-main
44 | .cache-tests
45 | .classpath
46 | .project
47 | .scala_dependencies
48 | .worksheet
49 |
50 | .idea
51 |
52 | # custom
53 | null/
54 | plugin/src/main/resources/io/
55 |
56 | # custom
57 | .bsp
58 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_1_3/src/main/scala/io/dataflint/example/ShakespeareSpark313.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 |
6 | import java.nio.file.Paths
7 |
8 | object ShakespeareSpark313 extends App {
9 | def df(spark: SparkSession): DataFrame = spark.read
10 | .format("csv")
11 | .option("sep", ";")
12 | .option("inferSchema", true)
13 | .load("./test_data/will_play_text.csv")
14 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
15 | .repartition(1000)
16 |
17 | val spark = SparkSession
18 | .builder()
19 | .appName("Shakespeare Statistics")
20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
21 | .config("spark.dataflint.telemetry.enabled", false)
22 | .config("spark.ui.port", "10000")
23 | .master("local[*]")
24 | .getOrCreate()
25 |
26 | import spark.implicits._
27 |
28 | val shakespeareText = df(spark)
29 |
30 | shakespeareText.printSchema()
31 |
32 | val count = shakespeareText.count()
33 | println(s"number of records : $count")
34 |
35 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count()
36 | println(s"number of unique speakers : $uniqueSpeakers")
37 |
38 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count()
39 |
40 | println(s"number of unique words : $uniqueWords")
41 |
42 | scala.io.StdIn.readLine()
43 | spark.stop()
44 | }
45 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_2_4/src/main/scala/io/dataflint/example/Shakespeare324Exported.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | object Shakespeare324Exported extends App {
6 | def df(spark: SparkSession): DataFrame = spark.read
7 | .format("csv")
8 | .option("sep", ";")
9 | .option("inferSchema", true)
10 | .load("./test_data/will_play_text.csv")
11 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
12 | .repartition(1000)
13 |
14 | val spark = SparkSession
15 | .builder
16 | .appName("Shakespeare Statistics Exported")
17 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
18 | .config("spark.dataflint.telemetry.enabled", false)
19 | .config("spark.ui.port", "10000")
20 | .config("spark.sql.maxMetadataStringLength", "10000")
21 | .config("spark.eventLog.enabled", "true")
22 | .master("local[*]")
23 | .getOrCreate()
24 |
25 | val shakespeareText = df(spark)
26 |
27 | val count = shakespeareText.count()
28 | println(s"number of records : $count")
29 |
30 | spark.stop()
31 | }
32 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_2_4/src/main/scala/io/dataflint/example/ShakespeareSpark324.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 |
6 | import java.nio.file.Paths
7 |
8 | object ShakespeareSpark324 extends App {
9 | def fsPath(resource: String): String =
10 | Paths.get(this.getClass.getResource(resource).toURI).toString
11 |
12 | def df(spark: SparkSession): DataFrame = spark.read
13 | .format("csv")
14 | .option("sep", ";")
15 | .option("inferSchema", true)
16 | .load("./test_data/will_play_text.csv")
17 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
18 | .repartition(1000)
19 |
20 | val spark = SparkSession
21 | .builder()
22 | .appName("Shakespeare Statistics")
23 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
24 | .config("spark.dataflint.telemetry.enabled", false)
25 | .config("spark.ui.port", "10000")
26 | .master("local[*]")
27 | .getOrCreate()
28 |
29 | import spark.implicits._
30 |
31 | val shakespeareText = df(spark)
32 |
33 | shakespeareText.printSchema()
34 |
35 | val count = shakespeareText.count()
36 | println(s"number of records : $count")
37 |
38 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count()
39 | println(s"number of unique speakers : $uniqueSpeakers")
40 |
41 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count()
42 |
43 | println(s"number of unique words : $uniqueWords")
44 |
45 | scala.io.StdIn.readLine()
46 | spark.stop()
47 | }
48 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_3_3/src/main/scala/io/dataflint/example/Shakespeare333Exported.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 | object Shakespeare333Exported extends App {
8 | def df(spark: SparkSession): DataFrame = spark.read
9 | .format("csv")
10 | .option("sep", ";")
11 | .option("inferSchema", true)
12 | .load("./test_data/will_play_text.csv")
13 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
14 | .repartition(1000)
15 |
16 | val spark = SparkSession
17 | .builder
18 | .appName("Shakespeare Statistics Exported")
19 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
20 | .config("spark.dataflint.telemetry.enabled", false)
21 | .config("spark.ui.port", "10000")
22 | .config("spark.sql.maxMetadataStringLength", "10000")
23 | .config("spark.eventLog.enabled", "true")
24 | .master("local[*]")
25 | .getOrCreate()
26 |
27 | val shakespeareText = df(spark)
28 |
29 | val count = shakespeareText.count()
30 | println(s"number of records : $count")
31 |
32 | spark.stop()
33 | }
34 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_3_3/src/main/scala/io/dataflint/example/ShakespeareSpark333.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 |
6 | object ShakespeareSpark333 extends App {
7 | def df(spark: SparkSession): DataFrame = spark.read
8 | .format("csv")
9 | .option("sep", ";")
10 | .option("inferSchema", true)
11 | .load("./test_data/will_play_text.csv")
12 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
13 | .repartition(1000)
14 |
15 | val spark = SparkSession
16 | .builder()
17 | .appName("Shakespeare Statistics")
18 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
19 | .config("spark.dataflint.telemetry.enabled", false)
20 | .config("spark.ui.port", "10000")
21 | .master("local[*]")
22 | .getOrCreate()
23 |
24 | import spark.implicits._
25 |
26 | val shakespeareText = df(spark)
27 |
28 | shakespeareText.printSchema()
29 |
30 | val count = shakespeareText.count()
31 | println(s"number of records : $count")
32 |
33 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count()
34 | println(s"number of unique speakers : $uniqueSpeakers")
35 |
36 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count()
37 |
38 | println(s"number of unique words : $uniqueWords")
39 |
40 | scala.io.StdIn.readLine()
41 | spark.stop()
42 | }
43 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/IcebergExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object IcebergExample extends App{
6 | val spark = SparkSession
7 | .builder()
8 | .appName("Iceberg Example")
9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
10 | .config("spark.dataflint.telemetry.enabled", false)
11 | .config("spark.ui.port", "10000")
12 | .config("spark.sql.maxMetadataStringLength", "10000")
13 | .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
14 | .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
15 | .config("spark.sql.catalog.spark_catalog.type", "hive")
16 | .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
17 | .config("spark.sql.catalog.local.type", "hadoop")
18 | .config("spark.sql.catalog.local.warehouse", "/tmp/iceberg-example/warehouse")
19 | .config("spark.sql.defaultCatalog", "local")
20 | .config("spark.sql.catalog.local.metrics-reporter-impl", "org.apache.spark.dataflint.iceberg.DataflintIcebergMetricsReporter")
21 | .master("local[*]")
22 | .getOrCreate()
23 |
24 | spark.sparkContext.setJobDescription("Drop table if exists")
25 | spark.sql("DROP TABLE IF EXISTS demo.nyc.taxis PURGE")
26 |
27 | spark.sparkContext.setJobDescription("Create table")
28 | spark.sql(
29 | """
30 | |CREATE TABLE demo.nyc.taxis
31 | |(
32 | | vendor_id bigint,
33 | | trip_id bigint,
34 | | trip_distance float,
35 | | fare_amount double,
36 | | store_and_fwd_flag string
37 | |)
38 | |PARTITIONED BY (vendor_id);
39 | |""".stripMargin)
40 |
41 | spark.sparkContext.setJobDescription("Insert 4 records to table")
42 | spark.sql(
43 | """
44 | |INSERT INTO demo.nyc.taxis
45 | |VALUES (1, 1000371, 1.8, 15.32, 'N'), (2, 1000372, 2.5, 22.15, 'N');
46 | |""".stripMargin)
47 |
48 | spark.sparkContext.setJobDescription("Select from table")
49 | spark.sql("SELECT * FROM demo.nyc.taxis").show()
50 |
51 | scala.io.StdIn.readLine()
52 | }
53 |
54 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/SalesFilterer.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 | object SalesFilterer extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("Sales Filterer")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.ui.port", "10000")
13 | .config("spark.eventLog.enabled", true)
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .config("spark.dataflint.telemetry.enabled", false)
16 | .master("local[1]")
17 | .getOrCreate()
18 |
19 | import spark.implicits._
20 |
21 | spark.read
22 | .load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales")
23 | .filter($"ss_quantity" > 1)
24 | .write
25 | .mode(SaveMode.Overwrite)
26 | .partitionBy("ss_quantity")
27 | .parquet("/tmp/store_sales")
28 |
29 | // scala.io.StdIn.readLine()
30 | spark.stop()
31 | }
32 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/SalesFiltererFixed.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | object SalesFiltererFixed extends App {
6 | val spark = SparkSession
7 | .builder()
8 | .appName("Sales Filterer Fixed")
9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
10 | .config("spark.ui.port", "10000")
11 | .config("spark.eventLog.enabled", true)
12 | .config("spark.sql.maxMetadataStringLength", "10000")
13 | .config("spark.dataflint.telemetry.enabled", false)
14 | .master("local[1]")
15 | .getOrCreate()
16 |
17 | import spark.implicits._
18 |
19 | spark.read
20 | .load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales")
21 | .filter($"ss_quantity" > 1)
22 | .repartition($"ss_quantity")
23 | .write
24 | .mode(SaveMode.Overwrite)
25 | .partitionBy("ss_quantity")
26 | .parquet("/tmp/store_sales")
27 |
28 | // scala.io.StdIn.readLine()
29 | spark.stop()
30 | }
31 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/Shakespeare341.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 |
6 | import java.nio.file.Paths
7 |
8 | object Shakespeare341 extends App {
9 | def df(spark: SparkSession): DataFrame = spark.read
10 | .format("csv")
11 | .option("sep", ";")
12 | .option("inferSchema", true)
13 | .load("./test_data/will_play_text.csv")
14 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
15 | .repartition(1000)
16 |
17 | val spark = SparkSession
18 | .builder()
19 | .appName("Shakespeare Statistics")
20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
21 | .config("spark.dataflint.telemetry.enabled", false)
22 | .config("spark.ui.port", "10000")
23 | .config("spark.sql.maxMetadataStringLength", "10000")
24 | .master("local[*]")
25 | .getOrCreate()
26 |
27 | import spark.implicits._
28 |
29 | val shakespeareText = df(spark)
30 |
31 | shakespeareText.printSchema()
32 |
33 | val count = shakespeareText.count()
34 | println(s"number of records : $count")
35 |
36 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count()
37 | println(s"number of unique speakers : $uniqueSpeakers")
38 |
39 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count()
40 |
41 | println(s"number of unique words : $uniqueWords")
42 |
43 | scala.io.StdIn.readLine()
44 | spark.stop()
45 | }
46 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/Shakespeare341Exported.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 |
8 | object Shakespeare341Exported extends App {
9 | def df(spark: SparkSession): DataFrame = spark.read
10 | .format("csv")
11 | .option("sep", ";")
12 | .option("inferSchema", true)
13 | .load("./test_data/will_play_text.csv")
14 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
15 | .repartition(1000)
16 |
17 | val spark = SparkSession
18 | .builder
19 | .appName("Shakespeare Statistics Exported")
20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
21 | .config("spark.dataflint.telemetry.enabled", false)
22 | .config("spark.sql.maxMetadataStringLength", "10000")
23 | .config("spark.eventLog.enabled", "true")
24 | .master("local[*]")
25 | .getOrCreate()
26 |
27 | val shakespeareText = df(spark)
28 |
29 | val count = shakespeareText.count()
30 | println(s"number of records : $count")
31 |
32 | spark.stop()
33 | }
34 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespearePartitionedWriter.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 | object ShakespearePartitionedWriter extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("Shakespeare Partitioned Writer")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.dataflint.telemetry.enabled", false)
13 | .config("spark.ui.port", "10000")
14 | .config("spark.eventLog.enabled", true)
15 | .config("spark.sql.maxMetadataStringLength", "10000")
16 | .master("local[*]")
17 | .getOrCreate()
18 |
19 | spark.read
20 | .format("csv")
21 | .option("sep", ";")
22 | .option("inferSchema", true)
23 | .load("./test_data/will_play_text.csv")
24 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
25 | .repartition(200)
26 | .write
27 | .mode(SaveMode.Overwrite)
28 | .partitionBy("play_name")
29 | .parquet("/tmp/shakespear_partitioned")
30 |
31 | spark.stop()
32 | }
33 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespearePartitionedWriterFixed.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 | object ShakespearePartitionedWriterFixed extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("Shakespeare Partitioned Writer Fixed")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.dataflint.telemetry.enabled", false)
13 | .config("spark.ui.port", "10000")
14 | .config("spark.eventLog.enabled", true)
15 | .config("spark.sql.maxMetadataStringLength", "10000")
16 | .master("local[*]")
17 | .getOrCreate()
18 |
19 | import spark.implicits._
20 |
21 | spark.read
22 | .format("csv")
23 | .option("sep", ";")
24 | .option("inferSchema", true)
25 | .load("./test_data/will_play_text.csv")
26 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
27 | .repartition(200)
28 | .repartition($"play_name")
29 | .write
30 | .mode(SaveMode.Overwrite)
31 | .partitionBy("play_name")
32 | .parquet("/tmp/shakespear_partitioned")
33 |
34 | spark.stop()
35 | }
36 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespeareUnpartitionedWriter.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 | object ShakespeareUnpartitionedWriter extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("Shakespeare Unpartitioned Writer")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.dataflint.telemetry.enabled", false)
13 | .config("spark.ui.port", "10000")
14 | .config("spark.eventLog.enabled", true)
15 | .config("spark.sql.maxMetadataStringLength", "10000")
16 | .master("local[*]")
17 | .getOrCreate()
18 |
19 | val shakespeareDF = spark.read
20 | .format("csv")
21 | .option("sep", ";")
22 | .option("inferSchema", true)
23 | .load("./test_data/will_play_text.csv")
24 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
25 | .repartition(200)
26 |
27 | shakespeareDF
28 | .mapPartitions(itr => {
29 | // simulate slow write like in S3
30 | Thread.sleep(200)
31 | itr
32 | })(shakespeareDF.encoder)
33 | .write.mode(SaveMode.Overwrite).parquet("/tmp/shakespear")
34 |
35 | scala.io.StdIn.readLine()
36 | spark.stop()
37 | }
38 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespeareUnpartitionedWriterFixed.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 | object ShakespeareUnpartitionedWriterFixed extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("Shakespeare Unpartitioned Writer Fixed")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.dataflint.telemetry.enabled", false)
13 | .config("spark.ui.port", "10000")
14 | .config("spark.eventLog.enabled", true)
15 | .config("spark.sql.maxMetadataStringLength", "10000")
16 | .master("local[*]")
17 | .getOrCreate()
18 |
19 | val shakespeareDF = spark.read
20 | .format("csv")
21 | .option("sep", ";")
22 | .option("inferSchema", true)
23 | .load("./test_data/will_play_text.csv")
24 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
25 | .repartition(200)
26 |
27 | shakespeareDF
28 | .repartition(1)
29 | .mapPartitions(itr => {
30 | // simulate slow write like in S3
31 | Thread.sleep(200)
32 | itr
33 | })(shakespeareDF.encoder)
34 | .write.mode(SaveMode.Overwrite).parquet("/tmp/shakespear")
35 |
36 | spark.stop()
37 | }
38 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/SimpleStreaming.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.streaming.Trigger
5 |
6 | import java.sql.Timestamp
7 |
8 | object SimpleStreaming extends App {
9 | val spark = SparkSession
10 | .builder()
11 | .appName("Simple Streaming")
12 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
13 | .config("spark.dataflint.telemetry.enabled", false)
14 | .config("spark.ui.port", "10000")
15 | .config("spark.sql.maxMetadataStringLength", "10000")
16 | .master("local[*]")
17 | .getOrCreate()
18 |
19 | import spark.implicits._
20 |
21 | val numbers = (1 to 10).toList
22 | val numbersDF = numbers.toDF("number")
23 |
24 | // Create a streaming DataFrame
25 | val streamingNumbers = spark.readStream
26 | .format("rate")
27 | .option("rowsPerSecond", "1")
28 | .load()
29 | .as[(Long, Timestamp)]
30 | .flatMap(_ => numbers)
31 | .toDF("number")
32 |
33 | // Filter numbers divisible by 2
34 | val filteredStream = streamingNumbers
35 | .mapPartitions(i => {
36 | Thread.sleep(10000)
37 | i
38 | })(streamingNumbers.encoder)
39 | .filter($"number" % 2 === 0)
40 |
41 | // Output the result to the console
42 | val query = filteredStream.writeStream
43 | .outputMode("append")
44 | .format("console")
45 | .trigger(Trigger.ProcessingTime("1 second"))
46 | .start()
47 |
48 | // Wait for the streaming query to finish
49 | query.awaitTermination()
50 |
51 | scala.io.StdIn.readLine()
52 | spark.stop()
53 | }
--------------------------------------------------------------------------------
/spark-plugin/example_3_4_1_remote/src/main/scala/io/dataflint/example/Shakespeare341Remote.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.ivy.Ivy
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.{DataFrame, SparkSession}
6 | import org.apache.spark.sql.functions._
7 |
8 | import java.nio.file.Paths
9 |
10 | object Shakespeare341Remote extends App {
11 | def df(spark: SparkSession): DataFrame = spark.read
12 | .format("csv")
13 | .option("sep", ";")
14 | .option("inferSchema", true)
15 | .load("./test_data/will_play_text.csv")
16 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
17 | .repartition(1000)
18 |
19 | val spark = SparkSession
20 | .builder()
21 | .appName("Shakespeare Statistics")
22 | .config("spark.dataflint.telemetry.enabled", false)
23 | .config("spark.ui.port", "10000")
24 | .master("local[*]")
25 | .getOrCreate()
26 |
27 | import spark.implicits._
28 |
29 | val shakespeareText = df(spark)
30 |
31 | shakespeareText.printSchema()
32 |
33 | val count = shakespeareText.count()
34 | println(s"number of records : $count")
35 |
36 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count()
37 | println(s"number of unique speakers : $uniqueSpeakers")
38 |
39 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count()
40 |
41 | println(s"number of unique words : $uniqueWords")
42 |
43 | scala.io.StdIn.readLine()
44 | spark.stop()
45 | }
46 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/AccessPatternExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.functions._
4 | import org.apache.spark.sql.{DataFrame, SparkSession}
5 |
6 | object AccessPatternExample extends App {
7 | val spark = SparkSession
8 | .builder()
9 | .appName("AccessPatternExample")
10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
11 | .config("spark.dataflint.telemetry.enabled", false)
12 | .config("spark.ui.port", "10000")
13 | .config("spark.dataflint.telemetry.enabled", value = false)
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .master("local[*]")
16 | .getOrCreate()
17 |
18 | import spark.implicits._
19 |
20 | val salesDF = spark.read.load(sys.env("SALES_FILES_LOCATION"))
21 |
22 | spark.sparkContext.setJobDescription("full scan of store_sales")
23 | salesDF.count()
24 |
25 | spark.sparkContext.setJobDescription("scan of store_sales, filter by partition")
26 | salesDF
27 | .where($"ss_sold_date_sk" > 2450858)
28 | .count()
29 |
30 | spark.sparkContext.setJobDescription("scan of store_sales, filter by field")
31 | salesDF
32 | .where($"ss_quantity" > 1)
33 | .count()
34 |
35 | spark.sparkContext.setJobDescription("scan of store_sales, filter by partition and field")
36 | salesDF
37 | .where($"ss_sold_date_sk" > 2450858)
38 | .where($"ss_quantity" > 1)
39 | .count()
40 |
41 | spark.sparkContext.setJobDescription("scan of store_sales, filter by field condition")
42 | salesDF
43 | .where($"ss_sold_date_sk" > 2450858)
44 | .where($"ss_quantity" * 2 > 2)
45 | .count()
46 |
47 | spark.sparkContext.setJobDescription("scan of store_sales, filter by partition and field and field condition")
48 | salesDF
49 | .where($"ss_sold_date_sk" > 2450858)
50 | .where($"ss_store_sk" > 0)
51 | .where($"ss_quantity" * 2 > 2)
52 | .count()
53 |
54 | spark.sparkContext.setJobDescription("scan store_sales by partition, select 3 fields: ss_cdemo_s, ss_net_paid, ss_net_profit")
55 | salesDF
56 | .where($"ss_sold_date_sk" > 2450858)
57 | .select($"ss_cdemo_sk", $"ss_net_paid", $"ss_net_profit")
58 | .show()
59 |
60 | spark.sparkContext.setJobDescription("scan store_sales by partition, select all fields")
61 | salesDF
62 | .where($"ss_sold_date_sk" > 2450858)
63 | .show()
64 |
65 | scala.io.StdIn.readLine()
66 | spark.stop()
67 | }
68 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/CacheExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import io.dataflint.example.SchedulingSmallTasks.spark
4 | import org.apache.spark.sql.SparkSession
5 |
6 | object CacheExample extends App {
7 | val spark = SparkSession
8 | .builder()
9 | .appName("JobGroupExample")
10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
11 | .config("spark.ui.port", "10000")
12 | .config("spark.dataflint.telemetry.enabled", value = false)
13 | .config("spark.eventLog.enabled", "true")
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .master("local[*]")
16 | .getOrCreate()
17 |
18 | import spark.implicits._
19 |
20 | val df = spark.range(0, 10).cache()
21 | val secondCache = df.select($"id" * 2).persist()
22 | secondCache.count()
23 | df.unpersist()
24 | secondCache.unpersist()
25 |
26 | val df2 = spark.range(0, 10000000L).repartition(100).cache()
27 | df2.count()
28 | df.unpersist()
29 |
30 | scala.io.StdIn.readLine()
31 |
32 | spark.stop()
33 | }
34 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DataFusionCometExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 |
6 | object DataFusionCometExample extends App {
7 | def df(spark: SparkSession): DataFrame = spark.read
8 | .format("csv")
9 | .option("sep", ";")
10 | .option("inferSchema", true)
11 | .load("./test_data/will_play_text.csv")
12 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
13 | .repartition(1000)
14 |
15 | val spark = SparkSession
16 | .builder()
17 | .appName("DataFusionCometExample")
18 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin,org.apache.spark.CometPlugin")
19 | .config("spark.shuffle.manager", "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager")
20 | .config("spark.comet.explainFallback.enabled", "true")
21 | .config("spark.memory.offHeap.enabled", "true")
22 | .config("spark.memory.offHeap.size", "16g")
23 | .config("spark.ui.port", "10000")
24 | .config("spark.dataflint.telemetry.enabled", value = false)
25 | .config("spark.sql.maxMetadataStringLength", "10000")
26 | .master("local[*]")
27 | .getOrCreate()
28 |
29 | import spark.implicits._
30 |
31 | val shakespeareText = df(spark)
32 |
33 | shakespeareText.printSchema()
34 |
35 | val count = shakespeareText.count()
36 | println(s"number of records : $count")
37 |
38 | val uniqueSpeakers = shakespeareText.select($"speaker").filter($"line_id".isNotNull).distinct().count()
39 | println(s"number of unique speakers : $uniqueSpeakers")
40 |
41 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count()
42 |
43 | println(s"number of unique words : $uniqueWords")
44 |
45 |
46 | spark.read.load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales").filter($"ss_quantity" > 1).count()
47 |
48 | scala.io.StdIn.readLine()
49 | spark.stop()
50 | }
51 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DeltaLakeExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | import java.sql.Timestamp
6 |
7 | object DeltaLakeExample extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("DeltaLakeExample")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.dataflint.telemetry.enabled", false)
13 | .config("spark.ui.port", "10000")
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
16 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
17 | .master("local[*]")
18 | .getOrCreate()
19 |
20 | import spark.implicits._
21 | spark.sparkContext.setJobDescription("Create Table")
22 | spark.sql("CREATE TABLE IF NOT EXISTS delta.`/tmp/delta-table` USING DELTA AS SELECT col1 as id FROM VALUES 0,1,2,3,4;")
23 |
24 | spark.sparkContext.setJobDescription("Insert data to table")
25 | spark.sql("INSERT OVERWRITE delta.`/tmp/delta-table` SELECT col1 as id FROM VALUES 5,6,7,8,9;")
26 |
27 | spark.sparkContext.setJobDescription("Select data from table")
28 | spark.sql("SELECT * FROM delta.`/tmp/delta-table`;").show()
29 |
30 | spark.sparkContext.setJobDescription("Insert overwrite data to table")
31 | spark.sql("INSERT OVERWRITE delta.`/tmp/delta-table` SELECT col1 as id FROM VALUES 5,6,7,8,9;")
32 |
33 | spark.sparkContext.setJobDescription("Update data from table")
34 | spark.sql("UPDATE delta.`/tmp/delta-table` SET id = id + 100 WHERE id % 2 == 0;")
35 |
36 | spark.sparkContext.setJobDescription("Delete data from table")
37 | spark.sql("DELETE FROM delta.`/tmp/delta-table` WHERE id % 2 == 0;")
38 |
39 | spark.sparkContext.setJobDescription("Create view from table")
40 | spark.sql("CREATE TEMP VIEW newData AS SELECT col1 AS id FROM VALUES 1,3,5,7,9,11,13,15,17,19;")
41 |
42 | spark.sparkContext.setJobDescription("Merge data to table")
43 | spark.sql(
44 | """MERGE INTO delta.`/tmp/delta-table` AS oldData
45 | |USING newData
46 | |ON oldData.id = newData.id
47 | |WHEN MATCHED
48 | | THEN UPDATE SET id = newData.id
49 | |WHEN NOT MATCHED
50 | | THEN INSERT (id) VALUES (newData.id);
51 | |""".stripMargin)
52 |
53 | spark.sparkContext.setJobDescription("Select data from table")
54 | spark.sql("SELECT * FROM delta.`/tmp/delta-table`;").show()
55 |
56 | spark.sparkContext.setJobDescription("Select data from table by version")
57 | spark.sql("SELECT * FROM delta.`/tmp/delta-table` VERSION AS OF 0;").show()
58 |
59 | scala.io.StdIn.readLine()
60 | spark.stop()
61 | }
62 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DeltaLakeStreaming.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | import java.sql.Timestamp
6 |
7 | object DeltaLakeStreaming extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("Simple Streaming")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.dataflint.telemetry.enabled", false)
13 | .config("spark.ui.port", "10000")
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
16 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
17 | .master("local[*]")
18 | .getOrCreate()
19 |
20 | import spark.implicits._
21 |
22 | val numbers = (1 to 10).toList
23 |
24 | // Create a streaming DataFrame
25 | val streamingNumbers = spark.readStream
26 | .format("rate")
27 | .option("rowsPerSecond", "1")
28 | .load()
29 | .as[(Long, Timestamp)]
30 | .flatMap(_ => numbers)
31 | .toDF("number")
32 |
33 | // Filter numbers divisible by 2
34 | val filteredStream = streamingNumbers
35 | .mapPartitions(i => {
36 | Thread.sleep(10000)
37 | i
38 | })(streamingNumbers.encoder)
39 | .filter($"number" % 2 === 0)
40 |
41 | // Output the result to the console
42 | val query = filteredStream.writeStream
43 | .format("delta")
44 | .outputMode("append")
45 | .option("checkpointLocation", "/tmp/delta/events/_checkpoints/")
46 | .start("/tmp/delta/eventsByCustomer")
47 |
48 | // Wait for the streaming query to finish
49 | query.awaitTermination()
50 |
51 | scala.io.StdIn.readLine()
52 | spark.stop()
53 | }
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/JobGroupExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 |
6 | object JobGroupExample extends App {
7 | val spark = SparkSession
8 | .builder()
9 | .appName("JobGroupExample")
10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
11 | .config("spark.ui.port", "10000")
12 | .config("spark.dataflint.telemetry.enabled", value = false)
13 | .config("spark.eventLog.enabled", "true")
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .master("local[*]")
16 | .getOrCreate()
17 |
18 | import spark.implicits._
19 |
20 | val data = Seq(
21 | ("Alice", "Math", 85),
22 | ("Alice", "Physics", 95),
23 | ("Bob", "Math", 78),
24 | ("Bob", "Physics", 88),
25 | ("Charlie", "Math", 92),
26 | ("Charlie", "Physics", 80)
27 | ).toDF("name", "subject", "score")
28 |
29 | data.createOrReplaceTempView("student_scores")
30 |
31 | // Set up and run the first query with a specific group ID
32 | spark.sparkContext.setJobGroup("queryGroup1", "Group 1: Math Scores")
33 | val mathScores = spark.sql("SELECT name, score FROM student_scores WHERE subject = 'Math'")
34 | mathScores.count()
35 |
36 | // Set up and run the second query with a different group ID
37 | spark.sparkContext.setJobGroup("queryGroup2", "Group 2: Average Scores")
38 | val avgScores = spark.sql("SELECT name, AVG(score) as avg_score FROM student_scores GROUP BY name")
39 | avgScores.count()
40 |
41 | scala.io.StdIn.readLine()
42 |
43 | spark.stop()
44 | }
45 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/JobGroupExportedLocal.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object JobGroupExportedLocal extends App {
6 | val spark = SparkSession
7 | .builder()
8 | .appName("JobGroupExample")
9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
10 | .config("spark.ui.port", "10000")
11 | .config("spark.dataflint.telemetry.enabled", value = false)
12 | .config("spark.eventLog.enabled", "true")
13 | .config("spark.dataflint.mode", "local")
14 | .config("spark.dataflint.token", "AKIAZEUOHHYMKVUKYYZB-1234")
15 | .config("spark.sql.maxMetadataStringLength", "10000")
16 | .master("local[*]")
17 | .getOrCreate()
18 |
19 | import spark.implicits._
20 |
21 | val data = Seq(
22 | ("Alice", "Math", 85),
23 | ("Alice", "Physics", 95),
24 | ("Bob", "Math", 78),
25 | ("Bob", "Physics", 88),
26 | ("Charlie", "Math", 92),
27 | ("Charlie", "Physics", 80)
28 | ).toDF("name", "subject", "score")
29 |
30 | data.createOrReplaceTempView("student_scores")
31 |
32 | // Set up and run the first query with a specific group ID
33 | spark.sparkContext.setJobGroup("queryGroup1", "Group 1: Math Scores")
34 | val mathScores = spark.sql("SELECT name, score FROM student_scores WHERE subject = 'Math'")
35 | mathScores.count()
36 |
37 | // Set up and run the second query with a different group ID
38 | spark.sparkContext.setJobGroup("queryGroup2", "Group 2: Average Scores")
39 | val avgScores = spark.sql("SELECT name, AVG(score) as avg_score FROM student_scores GROUP BY name")
40 | avgScores.count()
41 |
42 | spark.stop()
43 | }
44 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/JoinExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions.broadcast
5 |
6 | object JoinExample extends App {
7 | val spark = SparkSession
8 | .builder()
9 | .appName("JoinExample")
10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
11 | .config("spark.ui.port", "10000")
12 | .config("spark.dataflint.telemetry.enabled", value = false)
13 | .config("spark.eventLog.enabled", "true")
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .master("local[*]")
16 | .getOrCreate()
17 |
18 | import spark.implicits._
19 |
20 | val df1 = Seq(1, 2).toDF("id1")
21 | val df2 = Seq(1, 2, 3, 4).toDF("id2")
22 |
23 | spark.sparkContext.setJobDescription("Cross Join Broadcast Nested Loop Join")
24 | val result2 = df1.join(broadcast(df2), $"id1" > $"id2")
25 | result2.show()
26 |
27 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
28 |
29 | spark.sparkContext.setJobDescription("Cross Join Broadcast Cartesian Product")
30 | val result = df1.repartition(2).crossJoin(df2.repartition(2))
31 | result.show()
32 |
33 | // INNER JOIN EXAMPLE (Reduces rows)
34 | val df3 = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d")).toDF("id", "value")
35 | val df4 = Seq((2, "x"), (4, "y")).toDF("id", "desc")
36 |
37 | spark.sparkContext.setJobDescription("Inner Join (reduces rows)")
38 | val innerJoinResult = df3.join(df4, Seq("id"), "inner")
39 | innerJoinResult.show()
40 |
41 | // LEFT OUTER JOIN EXAMPLE (Increases rows)
42 | val df5 = Seq((1, "a"), (1, "a")).toDF("id", "value")
43 | val df6 = Seq((1, "x"), (1, "y"), (1, "a"), (1, "b")).toDF("id", "desc")
44 |
45 | spark.sparkContext.setJobDescription("Left Outer Join (increases rows)")
46 | val leftJoinResult = df5.join(df6, Seq("id"), "left_outer")
47 | leftJoinResult.show()
48 |
49 | scala.io.StdIn.readLine()
50 | spark.stop()
51 | }
52 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/KafkaStreaming.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object KafkaStreaming extends App {
6 |
7 | val spark = SparkSession
8 | .builder()
9 | .appName("Simple Streaming")
10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
11 | .config("spark.dataflint.telemetry.enabled", false)
12 | .config("spark.ui.port", "10000")
13 | .config("spark.sql.maxMetadataStringLength", "10000")
14 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
15 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
16 | .master("local[*]")
17 | .getOrCreate()
18 |
19 | val df = spark
20 | .readStream
21 | .format("kafka")
22 | .option("kafka.bootstrap.servers", "localhost:9092")
23 | .option("subscribe", "testtopic")
24 | .option("startingOffsets", "earliest")
25 | .load()
26 |
27 | val dfSelected = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
28 |
29 |   // Simulate slow processing: each partition sleeps 10 seconds before emitting its rows
30 | val filteredStream = dfSelected
31 | .mapPartitions(i => {
32 | Thread.sleep(10000)
33 | i
34 | })(dfSelected.encoder)
35 |
36 |   // Write the stream to a Delta table at /tmp/delta/kafka2delta
37 | val query = filteredStream.writeStream
38 | .format("delta")
39 | .outputMode("append")
40 | .option("checkpointLocation", "/tmp/delta/kafka2delta/_checkpoints/")
41 | .start("/tmp/delta/kafka2delta")
42 |
43 | // Wait for the streaming query to finish
44 | query.awaitTermination()
45 |
46 | scala.io.StdIn.readLine()
47 | spark.stop()
48 | }
49 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/LargeBroadcastExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 |
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.{DataFrame, SparkSession}
6 |
7 | object LargeBroadcastExample extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("LargeBroadcastExample")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.dataflint.telemetry.enabled", false)
13 | .config("spark.ui.port", "10000")
14 | .config("spark.dataflint.telemetry.enabled", value = false)
15 | .config("spark.sql.maxMetadataStringLength", "10000")
16 | .config("spark.driver.maxResultSize", "10g")
17 | .master("local[*]")
18 | .getOrCreate()
19 |
20 | spark.sparkContext.setJobDescription("Join with large broadcast")
21 |
22 | val smallDfSize = sys.env.get("SMALL_DF_SIZE").map(_.toLong).getOrElse((40 * 1000 * 1000).toLong)
23 | val largeDFSize = sys.env.get("LARGE_DF_SIZE").map(_.toLong).getOrElse((100 * 1000 * 1000).toLong)
24 |
25 | val smallDF = spark.range(1L, smallDfSize).toDF("id")
26 | val largeDF = spark.range(1L, largeDFSize).toDF("item_sk")
27 |
28 | val joinedDF = largeDF.join(broadcast(smallDF), largeDF("item_sk") === smallDF("id"))
29 |
30 | joinedDF.count()
31 |
32 | spark.sparkContext.setJobDescription("Join with shuffle")
33 | val joinedWithShuffleDF = largeDF.join(smallDF, largeDF("item_sk") === smallDF("id"))
34 |
35 | joinedWithShuffleDF.count()
36 |
37 | scala.io.StdIn.readLine()
38 | spark.stop()
39 | }
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/LargeFilterCondition.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | object LargeFilterCondition extends App {
6 | val spark = SparkSession
7 | .builder()
8 | .appName("Large Filter Condition")
9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
10 | .config("spark.dataflint.telemetry.enabled", false)
11 | .config("spark.ui.port", "10000")
12 | .config("spark.dataflint.telemetry.enabled", value = false)
13 | .config("spark.sql.maxMetadataStringLength", "10000")
14 | .master("local[*]")
15 | .getOrCreate()
16 |
17 | import spark.implicits._
18 |
19 | val numOfConditions = 1000
20 | val sizeOfDF = 10000000
21 |
22 | spark.sparkContext.setJobDescription("Filter with long filter condition")
23 |
24 | val filterConditions = Range(0, numOfConditions).map($"id".equalTo(_)).reduce(_ || _)
25 |
26 | val countAfterLongFilter = spark.range(0, sizeOfDF)
27 | .filter(filterConditions)
28 | .count()
29 |
30 | println(s"count after long filter condition: ${countAfterLongFilter}")
31 |
32 | spark.sparkContext.setJobDescription("Filter with long regex condition")
33 |
34 | val regexPattern = Range(0, numOfConditions).map(_.toString).mkString("|")
35 |
36 | val countAfterLongRegexFilter = spark.range(0, sizeOfDF)
37 | .withColumn("num_str", $"id".cast("string"))
38 | .filter($"num_str".rlike(s"^($regexPattern)$$"))
39 | .count()
40 |
41 | println(s"count after long regex filter: ${countAfterLongRegexFilter}")
42 |
43 | spark.sparkContext.setJobDescription("Filter using join")
44 |
45 | val filterTable = spark.range(0, numOfConditions).toDF("id")
46 |
47 | val countAfterJoinFilter = spark.range(0, sizeOfDF)
48 | .join(filterTable, "id")
49 | .count()
50 |
51 | println(s"count after filter using join: ${countAfterJoinFilter}")
52 |
53 | scala.io.StdIn.readLine()
54 | spark.stop()
55 | }
56 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/PartitionSkewExample.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.functions._
4 | import org.apache.spark.sql.{DataFrame, SparkSession}
5 |
6 | object PartitionSkewExample extends App {
7 | val spark = SparkSession
8 | .builder()
9 | .appName("Partition Skew Example")
10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
11 | .config("spark.dataflint.telemetry.enabled", false)
12 | .config("spark.ui.port", "10000")
13 | .config("spark.dataflint.telemetry.enabled", value = false)
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .master("local[*]")
16 | .getOrCreate()
17 |
18 | import spark.implicits._
19 |
20 | val numbers = spark.range(1, 100).repartition(100)
21 |
22 | val count = numbers.mapPartitions(i => {
23 | i.map(i => {
24 | if (i == 50L) {
25 | Thread.sleep(10000)
26 | }
27 | i
28 | });
29 | })(numbers.encoder)
30 | .count()
31 |
32 | println(s"count numbers: $count")
33 |
34 | scala.io.StdIn.readLine()
35 | spark.stop()
36 | }
37 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/SchedulingSmallTasks.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object SchedulingSmallTasks extends App {
6 | val spark = SparkSession
7 | .builder()
8 | .appName("SchedulingSmallTasks")
9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
10 | .config("spark.dataflint.telemetry.enabled", false)
11 | .config("spark.ui.port", "10000")
12 | .config("spark.dataflint.telemetry.enabled", value = false)
13 | .config("spark.sql.maxMetadataStringLength", "10000")
14 | .master("local[*]")
15 | .getOrCreate()
16 |
17 | val numbers = spark.range(0, 10000).repartition(10000).count()
18 |
19 | println(s"count numbers to 10000: $numbers")
20 |
21 | scala.io.StdIn.readLine()
22 | spark.stop()
23 | }
24 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/SchedulingSmallTasksSkipAlerts.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.SparkSession
4 |
5 | object SchedulingSmallTasksSkipAlerts extends App {
6 | val spark = SparkSession
7 | .builder()
8 | .appName("SchedulingSmallTasks")
9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
10 | .config("spark.dataflint.telemetry.enabled", false)
11 | .config("spark.ui.port", "10000")
12 | .config("spark.dataflint.telemetry.enabled", value = false)
13 | .config("spark.sql.maxMetadataStringLength", "10000")
14 | .config("spark.dataflint.alert.disabled", "smallTasks,idleCoresTooHigh")
15 | .master("local[*]")
16 | .getOrCreate()
17 |
18 | val numbers = spark.range(0, 10000).repartition(10000).count()
19 |
20 | println(s"count numbers to 10000: $numbers")
21 |
22 | scala.io.StdIn.readLine()
23 | spark.stop()
24 | }
25 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/SetJobDescriptionAndUDFName.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 |
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.{DataFrame, SparkSession}
6 |
7 | object SetJobDescriptionAndUDFName extends App {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("SetJobDescriptionAndUDFName")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.ui.port", "10000")
13 | .config("spark.dataflint.telemetry.enabled", value = false)
14 | .config("spark.sql.maxMetadataStringLength", "10000")
15 | .master("local[*]")
16 | .getOrCreate()
17 |
18 | import spark.implicits._
19 |
20 | val df = spark.range(1L, 1000).toDF("id")
21 | val plusOne = udf((x: Int) => x + 1)
22 |
23 | df.filter(plusOne($"id") =!= 5).count()
24 |
25 | spark.sparkContext.setJobDescription("Range 1 to 1000 and then filter plus one not equal to 5")
26 | df.filter(plusOne($"id") =!= 5).count()
27 |
28 | val plusOneNamed = udf((x: Int) => x + 1).withName("plusOne")
29 | spark.sparkContext.setJobDescription("Range 1 to 1000 and then filter plus one not equal to 5, named")
30 | df.filter(plusOneNamed($"id") =!= 5).count()
31 |
32 | scala.io.StdIn.readLine()
33 | spark.stop()
34 | }
35 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 | import org.apache.spark.sql.functions._
5 |
6 | object Shakespeare351 extends App {
7 | def df(spark: SparkSession): DataFrame = spark.read
8 | .format("csv")
9 | .option("sep", ";")
10 | .option("inferSchema", true)
11 | .load("./test_data/will_play_text.csv")
12 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
13 | .repartition(1000)
14 |
15 | val spark = SparkSession
16 | .builder()
17 | .appName("Shakespeare Statistics")
18 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
19 | .config("spark.ui.port", "10000")
20 | .config("spark.dataflint.telemetry.enabled", value = false)
21 | .config("spark.sql.maxMetadataStringLength", "10000")
22 | .master("local[*]")
23 | .getOrCreate()
24 |
25 | import spark.implicits._
26 |
27 | val shakespeareText = df(spark)
28 |
29 | shakespeareText.printSchema()
30 |
31 | val count = shakespeareText.count()
32 | println(s"number of records : $count")
33 |
34 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count()
35 | println(s"number of unique speakers : $uniqueSpeakers")
36 |
37 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count()
38 |
39 | println(s"number of unique words : $uniqueWords")
40 |
41 | scala.io.StdIn.readLine()
42 | spark.stop()
43 | }
44 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351Exported.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.functions._
4 | import org.apache.spark.sql.{DataFrame, SparkSession}
5 |
6 | import java.nio.file.Paths
7 |
8 | object Shakespeare351Exported extends App {
9 | def df(spark: SparkSession): DataFrame = spark.read
10 | .format("csv")
11 | .option("sep", ";")
12 | .option("inferSchema", true)
13 | .load("./test_data/will_play_text.csv")
14 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
15 | .repartition(1000)
16 |
17 | val spark = SparkSession
18 | .builder()
19 | .appName("Shakespeare Statistics Exported")
20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
21 | .config("spark.dataflint.telemetry.enabled", false)
22 | .config("spark.ui.port", "10000")
23 | .config("spark.sql.maxMetadataStringLength", "10000")
24 | .config("spark.eventLog.enabled", "true")
25 | .master("local[*]")
26 | .getOrCreate()
27 |
28 | val shakespeareText = df(spark)
29 |
30 | val count = shakespeareText.count()
31 | println(s"number of records : $count")
32 |
33 | spark.stop()
34 | }
35 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351ExportedLocal.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | import java.nio.file.Paths
6 |
7 | object Shakespeare351ExportedLocal extends App {
8 | def df(spark: SparkSession): DataFrame = spark.read
9 | .format("csv")
10 | .option("sep", ";")
11 | .option("inferSchema", true)
12 | .load("./test_data/will_play_text.csv")
13 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
14 | .repartition(1000)
15 |
16 | val spark = SparkSession
17 | .builder
18 | .appName("Shakespeare Statistics Exported")
19 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
20 | .config("spark.dataflint.telemetry.enabled", false)
21 | .config("spark.ui.port", "10000")
22 | .config("spark.dataflint.mode", "local")
23 | .config("spark.dataflint.token", "AKIAZEUOHHYMKVUKYYZB-1234")
24 | .config("spark.sql.maxMetadataStringLength", "10000")
25 | .config("spark.eventLog.enabled", "true")
26 | .master("local[*]")
27 | .getOrCreate()
28 |
29 | val shakespeareText = df(spark)
30 |
31 | val count = shakespeareText.count()
32 | println(s"number of records : $count")
33 |
34 | spark.stop()
35 | }
36 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351ExportedLocal2.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.example
2 |
3 | import org.apache.spark.sql.{DataFrame, SparkSession}
4 |
5 | object Shakespeare351ExportedLocal2 extends App {
6 | def df(spark: SparkSession): DataFrame = spark.read
7 | .format("csv")
8 | .option("sep", ";")
9 | .option("inferSchema", true)
10 | .load("./test_data/will_play_text.csv")
11 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry")
12 | .repartition(1000)
13 |
14 | val spark = SparkSession
15 | .builder
16 | .appName("Shakespeare Statistics Exported General")
17 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
18 | .config("spark.dataflint.telemetry.enabled", false)
19 | .config("spark.ui.port", "10000")
20 | .config("spark.dataflint.mode", "local")
21 | .config("spark.dataflint.token", "CKIAZEUOHHYMKVUKYYZC-1234")
22 | .config("spark.sql.maxMetadataStringLength", "10000")
23 | .config("spark.eventLog.enabled", "true")
24 | .master("local[*]")
25 | .getOrCreate()
26 |
27 | val shakespeareText = df(spark)
28 |
29 | val count = shakespeareText.count()
30 | println(s"number of records : $count")
31 |
32 | spark.stop()
33 | }
34 |
--------------------------------------------------------------------------------
/spark-plugin/example_3_5_1/src/main/scala/org/apache/spark/dataflint/jobgroup/tests/JobGroupTests.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.jobgroup.tests
2 |
3 | import org.apache.spark.dataflint.jobgroup.JobGroupExtractor
4 | import org.apache.spark.sql.SparkSession
5 |
6 | class JobGroupTests extends org.scalatest.funsuite.AnyFunSuiteLike {
7 | test("test job group extractor with 2 groups") {
8 | val spark = SparkSession
9 | .builder()
10 | .appName("JobGroupExample")
11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin")
12 | .config("spark.ui.port", "10000")
13 | .config("spark.sql.maxMetadataStringLength", "10000")
14 | .master("local[*]")
15 | .getOrCreate()
16 |
17 | import spark.implicits._
18 |
19 | val data = Seq(
20 | ("Alice", "Math", 85),
21 | ("Alice", "Physics", 95),
22 | ("Bob", "Math", 78),
23 | ("Bob", "Physics", 88),
24 | ("Charlie", "Math", 92),
25 | ("Charlie", "Physics", 80)
26 | ).toDF("name", "subject", "score")
27 |
28 | data.createOrReplaceTempView("student_scores")
29 |
30 | // Set up and run the first query with a specific group ID
31 | spark.sparkContext.setJobGroup("queryGroup1", "Group 1: Math Scores")
32 | val mathScores = spark.sql("SELECT name, score FROM student_scores WHERE subject = 'Math'")
33 | mathScores.count()
34 |
35 | spark.sparkContext.clearJobGroup()
36 |
37 | // Set up and run the second query with a different group ID
38 | spark.sparkContext.setJobGroup("queryGroup2", "Group 2: Average Scores")
39 | val avgScores = spark.sql("SELECT name, AVG(score) as avg_score FROM student_scores GROUP BY name")
40 | avgScores.count()
41 |
42 | // Optionally, clear job group if needed
43 | spark.sparkContext.clearJobGroup()
44 |
45 | Thread.sleep(1000)
46 |
47 | val extractor = new JobGroupExtractor(spark.sparkContext.ui.get.store, spark.sharedState.statusStore)
48 | val queryGroup1Store = extractor.extract("queryGroup1")
49 | val queryGroup2Store = extractor.extract("queryGroup2")
50 |
51 | assert(queryGroup1Store._2.executionsList().length == 1)
52 | assert(queryGroup2Store._2.executionsList().length == 1)
53 | spark.stop()
54 | }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin:
--------------------------------------------------------------------------------
1 | org.apache.spark.deploy.history.DataFlintHistoryServerPlugin
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/io/dataflint/spark/SparkDataflint.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.spark
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.dataflint.DataflintSparkUILoader
5 |
6 | object SparkDataflint {
7 | def install(context: SparkContext): Unit = {
8 | DataflintSparkUILoader.install(context)
9 | }
10 | }
11 |
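A minimal usage sketch (not part of the repository): SparkDataflint.install gives a programmatic way to attach DataFlint to a running application, as an alternative to the spark.plugins configuration used throughout the example apps; the object name ManualInstallExample and its settings are illustrative.

package io.dataflint.example

import io.dataflint.spark.SparkDataflint
import org.apache.spark.sql.SparkSession

object ManualInstallExample extends App {
  val spark = SparkSession
    .builder()
    .appName("ManualInstallExample")
    .master("local[*]")
    .getOrCreate()

  // Attach the DataFlint tab to the live Spark UI without using spark.plugins
  SparkDataflint.install(spark.sparkContext)

  spark.range(0, 1000).count()

  scala.io.StdIn.readLine()
  spark.stop()
}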
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/io/dataflint/spark/SparkDataflintPlugin.scala:
--------------------------------------------------------------------------------
1 | package io.dataflint.spark
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin}
5 | import org.apache.spark.dataflint.DataflintSparkUILoader
6 | import org.apache.spark.internal.Logging
7 |
8 | import java.util
9 | import scala.collection.JavaConverters.mapAsJavaMapConverter
10 |
11 | class SparkDataflintPlugin extends SparkPlugin {
12 | override def driverPlugin(): DriverPlugin = new SparkDataflintDriverPlugin()
13 |
14 | override def executorPlugin(): ExecutorPlugin = null
15 | }
16 |
17 | class SparkDataflintDriverPlugin extends DriverPlugin with Logging {
18 | var sc: SparkContext = null
19 |
20 | override def init(sc: SparkContext, pluginContext: PluginContext): util.Map[String, String] = {
21 | this.sc = sc
22 | Map[String, String]().asJava
23 | }
24 |
25 | override def registerMetrics(appId: String, pluginContext: PluginContext): Unit = {
26 |     val webUrl = DataflintSparkUILoader.install(sc)
27 | logInfo(s"spark dataflint url is $webUrl/dataflint")
28 | super.registerMetrics(appId, pluginContext)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataFlintTab.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.api
2 |
3 | import org.apache.spark.ui.{SparkUI, UIUtils, WebUITab}
4 |
5 | import javax.servlet.http.HttpServletRequest
6 | import scala.xml.Node
7 |
8 | class DataFlintTab(parent: SparkUI) extends WebUITab(parent,"dataflint") {
9 | override val name: String = "DataFlint"
10 | def render(request: HttpServletRequest): Seq[Node] = {
11 |     val content =
12 |       <div>
13 |       </div>
14 | UIUtils.basicSparkPage(request, content, "DataFlint", true)
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintApplicationInfoPage.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.api
2 |
3 | import org.apache.spark.dataflint.listener.DataflintStore
4 | import org.apache.spark.internal.Logging
5 | import org.apache.spark.ui.{SparkUI, WebUIPage}
6 | import org.json4s.{Extraction, JObject}
7 |
8 | import javax.servlet.http.HttpServletRequest
9 | import scala.xml.Node
10 |
11 | class DataflintApplicationInfoPage(ui: SparkUI, dataflintStore: DataflintStore)
12 | extends WebUIPage("applicationinfo") with Logging {
13 | override def renderJson(request: HttpServletRequest) = {
14 | try {
15 | val runIdConfigFromStore = ui.store.environmentInfo().sparkProperties.find(_._1 == "spark.dataflint.runId").map(_._2)
16 | val runIdPotentiallyFromConfig = if (runIdConfigFromStore.isEmpty) ui.conf.getOption("spark.dataflint.runId") else runIdConfigFromStore
17 | val applicationInfo = ui.store.applicationInfo()
18 | val environmentInfo = dataflintStore.environmentInfo()
19 | val dataFlintApplicationInfo = DataFlintApplicationInfo(runIdPotentiallyFromConfig, applicationInfo, environmentInfo)
20 | val jsonValue = Extraction.decompose(dataFlintApplicationInfo)(org.json4s.DefaultFormats)
21 | jsonValue
22 | }
23 | catch {
24 | case e: Throwable => {
25 | logError("failed to serve dataflint application info", e)
26 | JObject()
27 | }
28 | }
29 | }
30 |
31 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]()
32 | }
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintCachedStoragePage.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.api
2 |
3 | import org.apache.spark.dataflint.listener.{DataflintExecutorStorageInfo, DataflintRDDStorageInfo, DataflintStore}
4 | import org.apache.spark.internal.Logging
5 | import org.apache.spark.status.AppStatusStore
6 | import org.apache.spark.ui.{SparkUI, WebUIPage}
7 | import org.json4s.{Extraction, JObject}
8 |
9 | import javax.servlet.http.HttpServletRequest
10 | import scala.xml.Node
11 |
12 | class DataflintCachedStoragePage(ui: SparkUI, dataflintStore: DataflintStore)
13 | extends WebUIPage("cachedstorage") with Logging {
14 | override def renderJson(request: HttpServletRequest) = {
15 | try {
16 | val liveRddStorage = ui.store.rddList()
17 | val rddStorage = dataflintStore.rddStorageInfo()
18 | val graphs = ui.store.stageList(null)
19 | .filter(_.submissionTime.isDefined) // filter skipped or pending stages
20 | .map(stage => Tuple2(stage.stageId,
21 | ui.store.operationGraphForStage(stage.stageId).rootCluster.childClusters.flatMap(_.childNodes)
22 | .filter(_.cached)
23 | .map(rdd => {
24 |
25 | val liveCached = liveRddStorage.find(_.id == rdd.id).map(
26 | rdd => {
27 | val maxUsageExecutor = rdd.dataDistribution.map(executors => executors.maxBy(_.memoryUsed))
28 | val maxExecutorUsage = maxUsageExecutor.map(executor =>
29 | DataflintExecutorStorageInfo(
30 | executor.memoryUsed,
31 | executor.memoryRemaining,
32 | if(executor.memoryUsed + executor.memoryRemaining != 0) executor.memoryUsed.toDouble / (executor.memoryUsed + executor.memoryRemaining) * 100 else 0
33 | ))
34 | DataflintRDDStorageInfo(rdd.id,
35 | rdd.memoryUsed,
36 | rdd.diskUsed,
37 | rdd.numPartitions,
38 | rdd.storageLevel,
39 | maxExecutorUsage
40 | )}
41 | )
42 | val cached = rddStorage.find(_.rddId == rdd.id)
43 |         liveCached.orElse(cached)
44 | }))).toMap
45 | val jsonValue = Extraction.decompose(graphs)(org.json4s.DefaultFormats)
46 | jsonValue
47 | }
48 | catch {
49 | case e: Throwable => {
50 |       logError("failed to serve dataflint cached storage", e)
51 | JObject()
52 | }
53 | }
54 | }
55 |
56 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]()
57 | }
58 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintIcebergPage.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.api
2 |
3 | import org.apache.spark.dataflint.listener.DataflintStore
4 | import org.apache.spark.internal.Logging
5 | import org.apache.spark.ui.{SparkUI, WebUIPage}
6 | import org.json4s.{Extraction, JObject}
7 |
8 | import javax.servlet.http.HttpServletRequest
9 | import scala.xml.Node
10 |
11 | class DataflintIcebergPage(ui: SparkUI, dataflintStore: DataflintStore)
12 | extends WebUIPage("iceberg") with Logging {
13 | override def renderJson(request: HttpServletRequest) = {
14 | try {
15 | val offset = request.getParameter("offset")
16 | val length = request.getParameter("length")
17 | if (offset == null || length == null) {
18 | JObject()
19 | } else {
20 | val commits = dataflintStore.icebergCommits(offset.toInt, length.toInt)
21 | val icebergInfo = IcebergInfo(commitsInfo = commits)
22 | val jsonValue = Extraction.decompose(icebergInfo)(org.json4s.DefaultFormats)
23 | jsonValue
24 | }
25 | }
26 | catch {
27 | case e: Throwable => {
28 | logError("failed to serve dataflint iceberg", e)
29 | JObject()
30 | }
31 | }
32 | }
33 |
34 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]()
35 | }
36 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintSQLMetricsPage.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.api
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SparkPlanGraph}
5 | import org.apache.spark.ui.{SparkUI, WebUIPage}
6 | import org.json4s.{Extraction, JObject}
7 |
8 | import javax.servlet.http.HttpServletRequest
9 | import scala.xml.Node
10 |
11 | class DataflintSQLMetricsPage(ui: SparkUI, sqlListener: () => Option[SQLAppStatusListener])
12 | extends WebUIPage("sqlmetrics") with Logging {
13 | private var sqlListenerCache: Option[SQLAppStatusListener] = None
14 |
15 | override def renderJson(request: HttpServletRequest) = {
16 | try {
17 | if (sqlListenerCache.isEmpty) {
18 | sqlListenerCache = sqlListener()
19 | }
20 |
21 | val sqlStore = new SQLAppStatusStore(ui.store.store, sqlListenerCache)
22 | val executionId = request.getParameter("executionId")
23 | if (executionId == null) {
24 | JObject()
25 | } else {
26 | val executionIdLong = executionId.toLong
27 | val metrics = sqlStore.executionMetrics(executionIdLong)
28 | val isDatabricks = ui.conf.getOption("spark.databricks.clusterUsageTags.cloudProvider").isDefined
29 | val graph = if (isDatabricks) {
30 | val exec = sqlStore.execution(executionIdLong).get
31 | val planVersion = exec.getClass.getMethod("latestVersion").invoke(exec).asInstanceOf[Long]
32 | sqlStore.getClass.getMethods.filter(_.getName == "planGraph").head.invoke(sqlStore, executionIdLong.asInstanceOf[Object], planVersion.asInstanceOf[Object]).asInstanceOf[SparkPlanGraph]
33 | } else
34 | sqlStore.planGraph(executionIdLong)
35 | val nodesMetrics = graph.allNodes.map(node => NodeMetrics(node.id, node.name, node.metrics.map(metric => {
36 | NodeMetric(metric.name, metrics.get(metric.accumulatorId))
37 | }).toSeq))
38 | // filter nodes without metrics
39 | .filter(nodeMetrics => !nodeMetrics.metrics.forall(_.value.isEmpty))
40 | val jValue = Extraction.decompose(nodesMetrics)(org.json4s.DefaultFormats)
41 | jValue
42 | }
43 | } catch {
44 | case e: Throwable => {
45 | logError("failed to serve dataflint SQL metrics", e)
46 | JObject()
47 | }
48 | }
49 | }
50 |
51 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]()
52 | }
53 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintSQLStagesRddPage.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.api
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.ui.{SparkUI, WebUIPage}
5 | import org.json4s.{Extraction, JObject}
6 |
7 | import javax.servlet.http.HttpServletRequest
8 | import scala.xml.Node
9 |
10 | class DataflintSQLStagesRddPage(ui: SparkUI)
11 | extends WebUIPage("stagesrdd") with Logging {
12 | override def renderJson(request: HttpServletRequest) = {
13 | try {
14 | val graphs = ui.store.stageList(null)
15 | .filter(_.submissionTime.isDefined) // filter skipped or pending stages
16 | .map(stage => Tuple2(stage.stageId,
17 | ui.store.operationGraphForStage(stage.stageId).rootCluster.childClusters
18 | .map(rdd => Tuple2(rdd.id, rdd.name)).toMap))
19 | .toMap
20 | val jsonValue = Extraction.decompose(graphs)(org.json4s.DefaultFormats)
21 | jsonValue
22 | }
23 | catch {
24 | case e: Throwable => {
25 | logError("failed to serve dataflint Jobs RDD", e)
26 | JObject()
27 | }
28 | }
29 | }
30 |
31 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]()
32 | }
33 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/api.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.apache.spark.dataflint.api
18 |
19 | import org.apache.spark.dataflint.listener.{IcebergCommitInfo, DataflintEnvironmentInfo}
20 | import org.apache.spark.status.api.v1.ApplicationInfo
21 |
22 | case class NodeMetric(name: String, value: Option[String])
23 |
24 | case class NodeMetrics(id: Long, name: String, metrics: Seq[NodeMetric])
25 |
26 | case class SqlEnrichedData(executionId: Long, numOfNodes: Int, rddScopesToStages: Option[Map[String, Set[Object]]], nodesPlan: Seq[NodePlan])
27 |
28 | case class NodePlan(id: Long, planDescription: String, rddScopeId: Option[String])
29 |
30 | case class DataFlintApplicationInfo(runId: Option[String], info: ApplicationInfo, environmentInfo: Option[DataflintEnvironmentInfo])
31 |
32 | case class IcebergInfo(commitsInfo: Seq[IcebergCommitInfo])
33 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/iceberg/ClassLoaderChecker.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.iceberg
2 |
3 | import org.apache.iceberg.CatalogUtil
4 | import org.apache.spark.internal.Logging
5 |
6 | object ClassLoaderChecker extends Logging {
7 | def isMetricLoaderInRightClassLoader(): Boolean = {
8 | val metricReporterClass = classOf[DataflintIcebergMetricsReporter]
9 | val classLoaderMetricReporter = metricReporterClass.getClassLoader.toString
10 | val classLoaderIcebergCatalog = classOf[CatalogUtil].getClassLoader.toString
11 | try {
12 | Class.forName(metricReporterClass.getCanonicalName, false, classOf[CatalogUtil].getClassLoader)
13 | }
14 | catch {
15 | case _: NoClassDefFoundError =>
16 | logWarning(s"Cannot load iceberg MetricsReporter from dataflint classloader, which prevents dataflint iceberg observability support. iceberg classloader: ${classOf[CatalogUtil].getClassLoader.toString}")
17 | return false
18 | case _: ClassNotFoundException =>
19 | logWarning(s"Cannot load DataflintIcebergMetricsReporter from iceberg classloader, which prevents dataflint iceberg observability support. iceberg classloader: ${classLoaderIcebergCatalog} metric reporter classloader: ${classLoaderMetricReporter}")
20 | return false
21 | case error: Throwable =>
22 |         logError(s"Unexpected error while trying to load; cannot use DataflintIcebergMetricsReporter. iceberg classloader: ${classLoaderIcebergCatalog} metric reporter classloader: ${classLoaderMetricReporter}", error)
23 | return false
24 | }
25 | true
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/listener/DataflintDatabricksLiveListener.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.listener
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerEvent}
5 | import org.apache.spark.sql.execution.SparkPlanInfo
6 | import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLExecutionStart, SparkPlanGraph, SparkPlanGraphNode}
7 |
8 | import java.lang.reflect.Method
9 | import java.util.concurrent.ConcurrentHashMap
10 | import java.util.concurrent.atomic.AtomicInteger
11 |
12 | object DataflintDatabricksLiveListener {
13 | def apply(listenerBus: LiveListenerBus): DataflintDatabricksLiveListener = {
14 | val rddScopeIdReader = classOf[SparkPlanGraphNode].getMethod("rddScopeId")
15 | new DataflintDatabricksLiveListener(listenerBus, rddScopeIdReader)
16 | }
17 | }
18 |
19 | class DataflintDatabricksLiveListener(listenerBus: LiveListenerBus, rddScopeIdReader: Method) extends SparkListener with Logging {
20 | private val executionToLatestVersion = new ConcurrentHashMap[Long, AtomicInteger]()
21 |
22 | private def publishDatabricksAdditionalSQLEvent(sparkPlanInfo: SparkPlanInfo, executionId: Long, version: Long): Unit = {
23 | val planGraph = SparkPlanGraph(sparkPlanInfo)
24 | val nodesToRddScopeId = planGraph.allNodes.map(node => {
25 | val rddScopeId = rddScopeIdReader.invoke(node).asInstanceOf[String]
26 | node.id -> rddScopeId
27 | }).toMap
28 | val executionInfo = DatabricksAdditionalExecutionInfo(executionId, version, nodesToRddScopeId)
29 | val event = DatabricksAdditionalExecutionEvent(executionInfo)
30 | listenerBus.post(event)
31 | }
32 |
33 | def onExecutionStart(e: SparkListenerSQLExecutionStart): Unit = {
34 | executionToLatestVersion.put(e.executionId, new AtomicInteger(0))
35 | publishDatabricksAdditionalSQLEvent(e.sparkPlanInfo, e.executionId, 0L)
36 | }
37 |
38 | def onAdaptiveExecutionUpdate(e: SparkListenerSQLAdaptiveExecutionUpdate): Unit = {
39 | val version = executionToLatestVersion.get(e.executionId).incrementAndGet()
40 | publishDatabricksAdditionalSQLEvent(e.sparkPlanInfo, e.executionId, version)
41 | }
42 |
43 | override def onOtherEvent(event: SparkListenerEvent): Unit = {
44 | try {
45 | event match {
46 | case e: SparkListenerSQLExecutionStart => onExecutionStart(e)
47 | case e: SparkListenerSQLAdaptiveExecutionUpdate => onAdaptiveExecutionUpdate(e)
48 | case _ => {}
49 | }
50 | } catch {
51 | case e: Exception => logError("Error while processing events in DataflintDatabricksLiveListener", e)
52 | }
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/listener/DataflintListener.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.listener
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
5 | import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLExecutionStart}
6 | import org.apache.spark.status.ElementTrackingStore
7 |
8 | class DataflintListener(store: ElementTrackingStore) extends SparkListener with Logging {
9 |
10 | override def onOtherEvent(event: SparkListenerEvent): Unit = {
11 | try {
12 | event match {
13 | case icebergCommitEvent: IcebergCommitEvent => {
14 | val commitInfo = new IcebergCommitWrapper(icebergCommitEvent.icebergCommit)
15 | store.write(commitInfo)
16 | }
17 | case e: DatabricksAdditionalExecutionEvent => {
18 | val executionInfo = new DatabricksAdditionalExecutionWrapper(e.databricksAdditionalExecutionInfo)
19 | store.write(executionInfo)
20 | }
21 | case e: DataflintEnvironmentInfoEvent => {
22 | val wrapper = new DataflintEnvironmentInfoWrapper(e.environmentInfo)
23 | store.write(wrapper)
24 | }
25 | case _ => {}
26 | }
27 | } catch {
28 | case e: Exception => logError("Error while processing events in DataflintListener", e)
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/listener/DataflintStore.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.listener
2 |
3 | import scala.collection.JavaConverters._
4 | import org.apache.spark.util.Utils
5 | import org.apache.spark.util.kvstore.{KVStore, KVStoreView}
6 |
7 |
8 | class DataflintStore(val store: KVStore) {
9 |   // mapToSeq copied from KVUtils because it does not exist in Spark 3.3
10 | def mapToSeq[T, B](view: KVStoreView[T])(mapFunc: T => B): Seq[B] = {
11 | Utils.tryWithResource(view.closeableIterator()) { iter =>
12 | iter.asScala.map(mapFunc).toList
13 | }
14 | }
15 |
16 | def icebergCommits(offset: Int, length: Int): Seq[IcebergCommitInfo] = {
17 | mapToSeq(store.view(classOf[IcebergCommitWrapper]))(_.info).filter(_.executionId >= offset).take(length).sortBy(_.executionId)
18 | }
19 |
20 | def databricksAdditionalExecutionInfo(offset: Int, length: Int): Seq[DatabricksAdditionalExecutionInfo] = {
21 | mapToSeq(store.view(classOf[DatabricksAdditionalExecutionWrapper]))(_.info).filter(_.executionId >= offset).take(length).sortBy(_.executionId)
22 | }
23 |
24 | def environmentInfo(): Option[DataflintEnvironmentInfo] = {
25 | mapToSeq(store.view(classOf[DataflintEnvironmentInfoWrapper]))(_.info).headOption
26 | }
27 |
28 | def rddStorageInfo(): Seq[DataflintRDDStorageInfo] = {
29 | mapToSeq(store.view(classOf[DataflintRDDStorageInfoWrapper]))(_.info)
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/package.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark
2 |
3 | package object dataflint {
4 | private[dataflint] type EnumValue[A <: Enumeration] = A#Value
5 | }
6 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/EnumSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.spark.dataflint.EnumValue
4 | import org.json4s.{JInt, JString}
5 | import org.json4s.reflect.TypeInfo
6 | import org.json4s.{Formats, JValue, MappingException, Serializer}
7 |
8 | import scala.reflect.ClassTag
9 |
10 | // copied from json4s source code, because some Spark versions depend on json4s versions that lack this class
11 | class EnumSerializer[E <: Enumeration: ClassTag](enumeration: E) extends Serializer[EnumValue[E]] {
12 | import org.json4s.JsonDSL._
13 |
14 | private[this] val EnumerationClass = classOf[Enumeration#Value]
15 |
16 | private[this] def isValid(json: JValue) = json match {
17 | case JInt(value) => enumeration.values.toSeq.map(_.id).contains(value.toInt)
18 | case _ => false
19 | }
20 |
21 | private[this] def enumerationValueToEnumValueOfE(value: enumeration.Value): EnumValue[E] =
22 | value.asInstanceOf[EnumValue[E]]
23 |
24 | def deserialize(implicit format: Formats): PartialFunction[(TypeInfo, JValue), EnumValue[E]] = {
25 | case (TypeInfo(EnumerationClass, _), json) if isValid(json) =>
26 | json match {
27 | case JInt(value) => enumerationValueToEnumValueOfE(enumeration(value.toInt))
28 | case value => throw new MappingException(s"Can't convert $value to $EnumerationClass")
29 | }
30 | }
31 |
32 | def serialize(implicit format: Formats): PartialFunction[Any, JValue] = {
33 | case i: Enumeration#Value if enumeration.values.exists(_ == i) => i.id
34 | }
35 | }
36 |
37 | class EnumNameSerializer[E <: Enumeration: ClassTag](enumeration: E) extends Serializer[EnumValue[E]] {
38 | import org.json4s.JsonDSL._
39 |
40 | private[this] val EnumerationClass = classOf[Enumeration#Value]
41 |
42 | private[this] def enumerationValueToEnumValueOfE(value: enumeration.Value): EnumValue[E] =
43 | value.asInstanceOf[EnumValue[E]]
44 |
45 | def deserialize(implicit format: Formats): PartialFunction[(TypeInfo, JValue), EnumValue[E]] = {
46 | case (_ @TypeInfo(EnumerationClass, _), json) if isValid(json) => {
47 | json match {
48 | case JString(value) => enumerationValueToEnumValueOfE(enumeration.withName(value))
49 | case value => throw new MappingException(s"Can't convert $value to $EnumerationClass")
50 | }
51 | }
52 | }
53 |
54 | private[this] def isValid(json: JValue) = json match {
55 | case JString(value) if enumeration.values.exists(_.toString == value) => true
56 | case _ => false
57 | }
58 |
59 | def serialize(implicit format: Formats): PartialFunction[Any, JValue] = {
60 | case i: Enumeration#Value if enumeration.values.exists(_ == i) => i.toString
61 | }
62 | }
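A minimal usage sketch (not part of the repository), in the spirit of how SparkRunSerializer below builds its Formats: registering EnumNameSerializer so an Enumeration field round-trips by name; ColorEnum, Paint, and the printed values are illustrative.

package org.apache.spark.dataflint.saas

import org.json4s.{Formats, NoTypeHints}
import org.json4s.jackson.{JsonMethods, Serialization}

object ColorEnum extends Enumeration { val Red, Blue = Value }
case class Paint(color: ColorEnum.Value)

object EnumNameSerializerSketch {
  def main(args: Array[String]): Unit = {
    // Round-trip the enumeration by name ("Red") instead of by numeric id
    implicit val formats: Formats =
      Serialization.formats(NoTypeHints) + new EnumNameSerializer(ColorEnum)

    val json = Serialization.write(Paint(ColorEnum.Red)) // {"color":"Red"}
    val back = JsonMethods.parse(json).extract[Paint]
    println(s"$json -> $back")
  }
}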
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/ExecutorsMetricsSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.spark.executor.ExecutorMetrics
4 | import org.apache.spark.metrics.ExecutorMetricType
5 | import org.json4s.{CustomSerializer, JLong, JNull, JObject}
6 | import org.json4s.JValue
7 |
8 | class ExecutorsMetricsSerializer extends CustomSerializer[ExecutorMetrics](implicit format => (
9 | {
10 | case json: JValue =>
11 | val metricsMap = json.extract[Map[String, Long]]
12 | val metrics = new ExecutorMetrics(metricsMap)
13 | metrics
14 | },
15 | {
16 | case Some(metrics: ExecutorMetrics) =>
17 | val metricsMap = ExecutorMetricType.metricToOffset.map { case (metric, _) =>
18 | metric -> metrics.getMetricValue(metric)
19 | }
20 | val metricsObj = JObject(metricsMap.map { case (k, v) => k -> JLong(v) }.toList)
21 | metricsObj
22 | case None => JNull
23 | }
24 | ))
25 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/GZipUtils.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.commons.io.output.ByteArrayOutputStream
4 |
5 | import java.util.zip.GZIPOutputStream
6 |
7 | object GZipUtils {
8 | def compressString(inputString: String): Array[Byte] = {
9 | val input = inputString.getBytes("UTF-8")
10 | val bos = new ByteArrayOutputStream(input.length)
11 | val gzip = new GZIPOutputStream(bos)
12 | gzip.write(input)
13 | gzip.close()
14 | val compressed = bos.toByteArray
15 | bos.close()
16 | compressed
17 | }
18 |
19 | def decompressString(compressed: Array[Byte]): String = {
20 | val bis = new java.io.ByteArrayInputStream(compressed)
21 | val gzip = new java.util.zip.GZIPInputStream(bis)
22 | val bos = new java.io.ByteArrayOutputStream()
23 | val buffer = new Array[Byte](1024)
24 | var len = gzip.read(buffer)
25 | while (len > 0) {
26 | bos.write(buffer, 0, len)
27 | len = gzip.read(buffer)
28 | }
29 | gzip.close()
30 | bis.close()
31 | bos.close()
32 | new String(bos.toByteArray(), "UTF-8")
33 | }
34 | }
35 |
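A minimal round-trip sketch (not part of the repository) for the two helpers above; the sample payload is arbitrary.

package org.apache.spark.dataflint.saas

object GZipUtilsSketch {
  def main(args: Array[String]): Unit = {
    val original = """{"version":"1","applicationInfos":[]}"""
    // compressString gzips the UTF-8 bytes, decompressString restores the string
    val compressed = GZipUtils.compressString(original)
    val restored = GZipUtils.decompressString(compressed)
    assert(restored == original)
    println(s"${original.length} chars -> ${compressed.length} gzipped bytes")
  }
}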
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/JavaEnumNameSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.json4s.CustomSerializer
4 | import org.json4s.JString
5 |
6 | // copied from json4s source code, because some Spark versions depend on json4s versions that lack this class
7 | class JavaEnumNameSerializer[E <: Enum[E]](implicit
8 | ct: Manifest[E]
9 | ) extends CustomSerializer[E](_ =>
10 | ( {
11 | case JString(name) =>
12 | Enum.valueOf(ct.runtimeClass.asInstanceOf[Class[E]], name)
13 | }, {
14 | case dt: E =>
15 | JString(dt.name())
16 | }
17 | )
18 | )
19 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/S3Uploader.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
4 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration
5 | import com.amazonaws.regions.Regions
6 | import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder}
7 | import com.amazonaws.services.s3.model.ObjectMetadata
8 | import org.apache.spark.internal.Logging
9 |
10 | import java.io.ByteArrayInputStream
11 |
12 | class S3Uploader(accessKeyId: String, secretAccessKey: String, mode: String) extends Logging {
13 | val credentials = new BasicAWSCredentials(accessKeyId, secretAccessKey)
14 | private val bucketName = "dataflint-upload-" + mode
15 |
16 | val s3client: AmazonS3 = {
17 | var builder = AmazonS3ClientBuilder.standard()
18 | .withCredentials(new AWSStaticCredentialsProvider(credentials))
19 |
20 | if(mode == "local") {
21 | logInfo(s"Uploading to S3 with localstack")
22 | builder = builder.withEndpointConfiguration(new EndpointConfiguration("s3.localhost.localstack.cloud:4566", Regions.US_EAST_1.getName))
23 | } else {
24 | builder = builder.enableAccelerateMode()
25 |
26 | }
27 |
28 | builder.build()
29 | }
30 |
31 | def uploadToS3(jsonContent: String, fileKey: String, shouldGzip: Boolean): Unit = {
32 | try {
33 | val startTimeMillis = System.currentTimeMillis()
34 |
35 | val metadata = new ObjectMetadata()
36 | val jsonToSend = if(shouldGzip) GZipUtils.compressString(jsonContent) else jsonContent.getBytes("UTF-8")
37 | if(shouldGzip) {
38 | metadata.setContentType("application/x-gzip")
39 | } else {
40 | metadata.setContentType("application/json")
41 | }
42 | metadata.setContentLength(jsonToSend.length)
43 |
44 | val inputStream = new ByteArrayInputStream(jsonToSend)
45 |
46 | s3client.putObject(bucketName, fileKey, inputStream, metadata)
47 | val endTimeMillis = System.currentTimeMillis()
48 | val durationMs = endTimeMillis - startTimeMillis
49 | logDebug(s"Upload file $fileKey took ${durationMs}ms")
50 | } catch {
51 | case e: Exception => e.printStackTrace()
52 | }
53 | }
54 | }
55 |
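A minimal usage sketch (not part of the repository): in "local" mode the client above targets the LocalStack endpoint and the dataflint-upload-local bucket; the credentials and file key here are placeholders, not real values.

package org.apache.spark.dataflint.saas

object S3UploaderSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder credentials; "local" mode routes requests to s3.localhost.localstack.cloud:4566
    val uploader = new S3Uploader("ACCESS_KEY_ID", "SECRET_ACCESS_KEY", "local")
    val json = """{"hello":"dataflint"}"""
    uploader.uploadToS3(json, "runs/example/run.json.gz", shouldGzip = true)
  }
}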
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkMetadataSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.spark.JobExecutionStatus
4 | import org.apache.spark.rdd.DeterministicLevel
5 | import org.apache.spark.status.api.v1.StageStatus
6 | import org.json4s.jackson.{JsonMethods, Serialization}
7 | import org.json4s.{Formats, NoTypeHints}
8 |
9 | import java.io.{File, PrintWriter}
10 |
11 | object SparkMetadataSerializer {
12 | implicit val formats: Formats = Serialization.formats(NoTypeHints) + new JavaEnumNameSerializer[JobExecutionStatus]() + new JavaEnumNameSerializer[StageStatus]() + new EnumSerializer(DeterministicLevel)
13 |
14 | def serialize(data: SparkMetadataStore): String = {
15 | Serialization.write(data)
16 | }
17 |
18 | def deserialize(json: String): SparkMetadataStore = {
19 | JsonMethods.parse(json).extract[SparkMetadataStore]
20 | }
21 |
22 | def serializeAndSave(data: SparkMetadataStore, filePath: String): Unit = {
23 | val jsonData = serialize(data)
24 | val writer = new PrintWriter(new File(filePath))
25 | try {
26 | writer.write(jsonData)
27 | } finally {
28 | writer.close()
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkMetadataStore.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.spark.status.api.v1
4 |
5 | case class SparkMetadataMetrics(
6 | containerMemoryGb: Double,
7 | executorJvmMemoryGb: Double,
8 | totalInputBytes: Long,
9 | totalOutputBytes: Long,
10 | totalSpillBytes: Long,
11 | totalShuffleWriteBytes: Long,
12 | totalShuffleReadBytes: Long,
13 | totalCachedMemoryBytes: Long,
14 | totalCachedDiskBytes: Long,
15 | maxExecutorCachedMemoryUsagePercentage: Double, executorPeakMemoryBytes: Long,
16 | containerPeakMemoryBytes: Long,
17 | executorJvmMemoryUsage: Double,
18 | driverJvmPeakMemoryBytes: Long,
19 | driverJvmMemoryUsage: Double,
20 | containerMemoryUsage: Double,
21 | totalDCU: Double,
22 | coreHourUsage: Double,
23 | memoryGbHour: Double,
24 | isAnySqlQueryFailed: Boolean,
25 | taskErrorRate: Double,
26 | idleCoresRatio: Double,
27 | CoresWastedRatio: Double,
28 | executorsDurationMs: Long,
29 | driverDurationMs: Long,
30 |
31 | )
32 |
33 | case class SparkMetadataStore(version: String,
34 | runId: String,
35 | accessKey: String,
36 | applicationInfo: v1.ApplicationInfo,
37 | metrics: SparkMetadataMetrics,
38 | conf: Map[String, String])
39 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkRunSerializer.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.spark.JobExecutionStatus
4 | import org.apache.spark.rdd.DeterministicLevel
5 | import org.apache.spark.status.api.v1.StageStatus
6 | import org.json4s.jackson.{JsonMethods, Serialization}
7 | import org.json4s.{Formats, NoTypeHints}
8 |
9 | import java.io.{File, PrintWriter}
10 |
11 | object SparkRunSerializer {
12 | implicit val formats: Formats = Serialization.formats(NoTypeHints) + new JavaEnumNameSerializer[JobExecutionStatus]() + new JavaEnumNameSerializer[StageStatus]() + new EnumSerializer(DeterministicLevel) + new ExecutorsMetricsSerializer()
13 |
14 | def serialize(data: SparkRunStore): String = {
15 | Serialization.write(data)
16 | }
17 |
18 | def deserialize(json: String): SparkRunStore = {
19 | JsonMethods.parse(json).extract[SparkRunStore]
20 | }
21 |
22 | def serializeAndSave(data: SparkRunStore, filePath: String): Unit = {
23 | val jsonData = serialize(data)
24 | val writer = new PrintWriter(new File(filePath))
25 | try {
26 | writer.write(jsonData)
27 | } finally {
28 | writer.close()
29 | }
30 | }
31 | }
32 |
33 |
34 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkRunStore.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.spark.dataflint.listener.{DatabricksAdditionalExecutionWrapper, DataflintEnvironmentInfoEvent, DataflintEnvironmentInfoWrapper, DataflintRDDStorageInfoWrapper, IcebergCommitWrapper}
4 | import org.apache.spark.sql.execution.ui.{SQLExecutionUIData, SparkPlanGraphWrapper}
5 | import org.apache.spark.status._
6 |
7 | case class SparkRunStore(
8 | version: String,
9 | applicationInfos: Seq[ApplicationInfoWrapper],
10 | applicationEnvironmentInfo: Seq[ApplicationEnvironmentInfoWrapper],
11 | resourceProfiles: Seq[ResourceProfileWrapper],
12 | jobDatas: Seq[JobDataWrapper],
13 | stageDatas: Seq[StageDataWrapper],
14 | executorSummaries: Seq[ExecutorSummaryWrapper],
15 | taskDatas: Seq[TaskDataWrapper],
16 | rddStorageInfos: Seq[RDDStorageInfoWrapper],
17 | streamBlockDatas: Seq[StreamBlockData],
18 | rddOperationGraphs: Seq[RDDOperationGraphWrapper],
19 | poolDatas: Seq[PoolData],
20 | appSummaries: Seq[AppSummary],
21 | executorStageSummaries: Seq[ExecutorStageSummaryWrapper],
22 | speculationStageSummaries: Seq[SpeculationStageSummaryWrapper],
23 | sparkPlanGraphWrapper: Seq[SparkPlanGraphWrapper],
24 | sqlExecutionUIData: Seq[SQLExecutionUIData],
25 | stageTaskSummary: Seq[StageTaskSummary],
26 | databricksAdditionalExecutionInfo: Seq[DatabricksAdditionalExecutionWrapper],
27 | icebergCommit: Seq[IcebergCommitWrapper],
28 | dataflintEnvironmentInfo: Seq[DataflintEnvironmentInfoWrapper],
29 | dataflintRDDStorageInfo: Seq[DataflintRDDStorageInfoWrapper]
30 | )
31 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/StageTaskSummary.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import com.fasterxml.jackson.annotation.JsonIgnore
4 | import org.apache.spark.status.api.v1
5 | import org.apache.spark.util.kvstore.KVIndex
6 |
7 | case class StageTaskSummary(
8 | stageId: Int,
9 | stageAttemptId: Int,
10 | summary: v1.TaskMetricDistributions) {
11 | @KVIndex
12 | @JsonIgnore
13 | def id: Array[Any] = Array(stageId, stageAttemptId)
14 | }
15 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/StoreDataExtractor.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.dataflint.saas
2 |
3 | import org.apache.spark.dataflint.listener.{DatabricksAdditionalExecutionWrapper, DataflintEnvironmentInfoWrapper, DataflintRDDStorageInfoWrapper, IcebergCommitWrapper}
4 | import org.apache.spark.sql.execution.ui.{SQLExecutionUIData, SparkPlanGraphWrapper}
5 | import org.apache.spark.status._
6 |
7 | import scala.collection.convert.ImplicitConversions.`iterator asScala`
8 | import scala.reflect.{ClassTag, classTag}
9 |
10 | class StoreDataExtractor(store: AppStatusStore) {
11 | private val version: String = "1"
12 | private val kvStore = store.store.asInstanceOf[ElementTrackingStore]
13 |
14 | def extract(): SparkRunStore = {
15 | SparkRunStore(
16 | version = version,
17 | applicationInfos = readAll[ApplicationInfoWrapper],
18 | applicationEnvironmentInfo = readAll[ApplicationEnvironmentInfoWrapper],
19 | resourceProfiles = readAll[ResourceProfileWrapper],
20 | jobDatas = readAll[JobDataWrapper],
21 | stageDatas = readAll[StageDataWrapper],
22 | executorSummaries = readAll[ExecutorSummaryWrapper],
23 | taskDatas = readAll[TaskDataWrapper],
24 | rddStorageInfos = readAll[RDDStorageInfoWrapper],
25 | streamBlockDatas = readAll[StreamBlockData],
26 | rddOperationGraphs = readAll[RDDOperationGraphWrapper],
27 | poolDatas = readAll[PoolData],
28 | appSummaries = readAll[AppSummary],
29 | executorStageSummaries = readAll[ExecutorStageSummaryWrapper],
30 | speculationStageSummaries = readAll[SpeculationStageSummaryWrapper],
31 | sparkPlanGraphWrapper = readAll[SparkPlanGraphWrapper],
32 | sqlExecutionUIData = readAll[SQLExecutionUIData],
33 | stageTaskSummary = calculateTaskSummary(),
34 | databricksAdditionalExecutionInfo = readAll[DatabricksAdditionalExecutionWrapper],
35 | icebergCommit = readAll[IcebergCommitWrapper],
36 | dataflintEnvironmentInfo = readAll[DataflintEnvironmentInfoWrapper],
37 | dataflintRDDStorageInfo = readAll[DataflintRDDStorageInfoWrapper]
38 | )
39 | }
40 |
41 | private def calculateTaskSummary(): Seq[StageTaskSummary] = {
42 | val quantiles = Array(0.0, 0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99, 1.0)
43 | store.stageList(null).map(stage => {
44 | store.taskSummary(stage.stageId, stage.attemptId, quantiles).map(
45 | StageTaskSummary(stage.stageId, stage.attemptId, _)
46 | )
47 | }).filter(_.isDefined).map(_.get)
48 | }
49 |
50 | private def readAll[T: ClassTag]: Seq[T] = {
51 | val view = kvStore.view(classTag[T].runtimeClass)
52 | val it = view.closeableIterator()
53 | try {
54 | it.toSeq.asInstanceOf[Seq[T]]
55 | } finally {
56 | it.close()
57 | }
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/deploy/history/DataFlintHistoryServerPlugin.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.deploy.history
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.dataflint.DataflintSparkUILoader
5 | import org.apache.spark.dataflint.listener.DataflintListener
6 | import org.apache.spark.scheduler.SparkListener
7 | import org.apache.spark.status.{AppHistoryServerPlugin, ElementTrackingStore, LiveRDDsListener}
8 | import org.apache.spark.ui.SparkUI
9 |
10 | class DataFlintHistoryServerPlugin extends AppHistoryServerPlugin {
11 | override def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] = {
12 | Seq(new DataflintListener(store), new LiveRDDsListener(store))
13 | }
14 |
15 | override def setupUI(ui: SparkUI): Unit = {
16 | DataflintSparkUILoader.loadUI(ui)
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/spark-plugin/plugin/src/main/scala/org/apache/spark/deploy/history/FsDataflintHistoryProvider.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.deploy.history
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.dataflint.DataflintSparkUILoader
5 | import org.apache.spark.status.AppHistoryServerPlugin
6 | import org.apache.spark.util.{Clock, SystemClock, Utils}
7 |
8 | import java.util.ServiceLoader
9 | import scala.collection.JavaConverters._
10 |
11 | // This class is no longer needed, as history server loading is now done via DataFlintListenerHistoryServerPlugin.
12 | // It will be removed in the future, but since users have already configured it as their history provider, removing it now would break their setups.
13 | class FsDataflintHistoryProvider(conf: SparkConf, clock: Clock) extends FsHistoryProvider(conf, clock) {
14 | def this(conf: SparkConf) = {
15 | this(conf, new SystemClock())
16 | }
17 |
18 | override def getAppUI(appId: String, attemptId: Option[String]): Option[LoadedAppUI] = {
19 | super.getAppUI(appId, attemptId)
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/spark-plugin/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.9.8
2 |
--------------------------------------------------------------------------------
/spark-plugin/project/publish.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.11.2")
2 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2")
3 | addSbtPlugin("com.github.sbt" % "sbt-git" % "2.0.1")
4 | addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.6.0")
--------------------------------------------------------------------------------
/spark-plugin/sonatype.sbt:
--------------------------------------------------------------------------------
1 |
2 | import xerial.sbt.Sonatype._
3 |
4 | ThisBuild / sonatypeCredentialHost := "s01.oss.sonatype.org"
5 |
6 | sonatypeProfileName := "io.dataflint"
7 |
8 | ThisBuild / sonatypeProfileName := "io.dataflint"
9 |
10 | ThisBuild / publishMavenStyle := true
11 |
12 | ThisBuild / licenses := Seq("APL2" -> url("http://www.apache.org/licenses/LICENSE-2.0.txt"))
13 |
14 | ThisBuild / sonatypeProjectHosting := Some(GitHubHosting("menishmueli", "dataflint/spark", "menishmueli@gmail.com"))
15 |
16 | ThisBuild / description := "Open Source Data-Application Performance Monitoring for Apache Spark"
17 |
18 | ThisBuild / homepage := Some(url("https://github.com/dataflint/spark"))
19 | ThisBuild / scmInfo := Some(
20 | ScmInfo(
21 | url("https://github.com/dataflint/spark"),
22 | "scm:git@github.com:dataflint/spark.git"
23 | )
24 | )
25 | ThisBuild / developers := List(
26 | Developer(
27 | id = "menishmueli",
28 | name = "Meni Shmueli",
29 | email = "menishmueli@gmail.com",
30 | url = url("http://dataflint.io")
31 | )
32 | )
33 |
--------------------------------------------------------------------------------
/spark-ui/.env:
--------------------------------------------------------------------------------
1 | REACT_APP_VERSION=$npm_package_version
2 | INLINE_RUNTIME_CHUNK=false
3 | GENERATE_SOURCEMAP=false
4 | SKIP_PREFLIGHT_CHECK=true
5 |
--------------------------------------------------------------------------------
/spark-ui/.generatelicensefile.yaml:
--------------------------------------------------------------------------------
1 | inputs:
2 | - ./package.json
3 | output: ./../THIRD-PARTY-LICENSES.txt
4 | overwrite: true
5 | eol: lf
6 | ci: true
7 | no-spinner: true
8 | replace:
9 | '@bcoe/v8-coverage@0.2.3': https://raw.githubusercontent.com/demurgos/v8-coverage/refs/heads/master/ts/LICENSE.md
10 | doctrine@3.0.0: https://raw.githubusercontent.com/eslint/doctrine/refs/heads/master/LICENSE
11 | doctrine@2.1.0: https://raw.githubusercontent.com/eslint/doctrine/refs/heads/master/LICENSE
12 | harmony-reflect@1.6.2: https://raw.githubusercontent.com/tvcutsem/harmony-reflect/refs/heads/master/LICENSE
13 | sockjs@0.3.24: https://raw.githubusercontent.com/sockjs/sockjs-node/refs/heads/main/LICENSE
14 |
15 |
--------------------------------------------------------------------------------
/spark-ui/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 |
8 | # testing
9 | /coverage
10 |
11 | # production
12 | /build
13 |
14 | # IDEs and editors
15 | /.idea
16 | /.vscode
17 |
18 | # misc
19 | .DS_Store
20 | .env.local
21 | .env.development.local
22 | .env.test.local
23 | .env.production.local
24 |
25 | npm-debug.log*
26 | yarn-debug.log*
27 | yarn-error.log*
28 |
--------------------------------------------------------------------------------
/spark-ui/gulpfile.js:
--------------------------------------------------------------------------------
1 | const gulp = require('gulp');
2 | const inlinesource = require('gulp-inline-source');
3 | const replace = require('gulp-replace');
4 |
5 | gulp.task('default', () => {
6 | return gulp
7 | .src('./build/*.html')
8 | .pipe(replace('.js">', '.js" inline>'))
9 | .pipe(replace('', ''))
10 | .pipe(replace('', ''))
11 | .pipe(replace('rel="stylesheet">', 'rel="stylesheet" inline>'))
12 | .pipe(
13 | inlinesource({
14 | compress: false
15 | })
16 | )
17 | .pipe(gulp.dest('./build'));
18 | });
19 |
--------------------------------------------------------------------------------
/spark-ui/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/favicon.ico
--------------------------------------------------------------------------------
/spark-ui/public/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/icon.png
--------------------------------------------------------------------------------
/spark-ui/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
12 |
13 |
22 | DataFlint
23 |
24 |
28 |
29 |
30 |
31 |
32 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/spark-ui/public/logo-grey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/logo-grey.png
--------------------------------------------------------------------------------
/spark-ui/public/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/logo.png
--------------------------------------------------------------------------------
/spark-ui/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "Your Orders",
3 | "name": "Your Orders",
4 | "icons": [
5 | {
6 | "src": "favicon.ico",
7 | "sizes": "64x64 32x32 24x24 16x16",
8 | "type": "image/x-icon"
9 | }
10 | ],
11 | "start_url": ".",
12 | "display": "standalone",
13 | "theme_color": "#000000",
14 | "background_color": "#ffffff"
15 | }
16 |
--------------------------------------------------------------------------------
/spark-ui/src/App.tsx:
--------------------------------------------------------------------------------
1 | import Box from "@mui/material/Box";
2 | import * as React from "react";
3 | import { Outlet, useLocation, useNavigate } from "react-router-dom";
4 | import { AppDrawer } from "./components/AppDrawer/AppDrawer";
5 | import DisconnectedModal from "./components/Modals/DisconnectedModal";
6 | import Progress from "./components/Progress";
7 | import { useAppDispatch, useAppSelector } from "./Hooks";
8 | import SparkAPI from "./services/SparkApi";
9 | import { getTabByUrl, Tab, TabToUrl } from "./services/TabsService";
10 | import {
11 | BASE_CURRENT_PAGE,
12 | BASE_PATH,
13 | IS_HISTORY_SERVER_MODE,
14 | } from "./utils/UrlConsts";
15 |
16 | const DOCUMENT_TITLE_PREFIX = "DataFlint - ";
17 |
18 | export default function App() {
19 | const location = useLocation();
20 | const navigate = useNavigate();
21 |
22 | const dispatch = useAppDispatch();
23 | const store = useAppSelector((state) => state.spark);
24 | const [selectedTab, setSelectedTab] = React.useState(Tab.Status);
25 |
26 | React.useEffect(() => {
27 | const sparkAPI = new SparkAPI(
28 | BASE_PATH,
29 | BASE_CURRENT_PAGE,
30 | dispatch,
31 | IS_HISTORY_SERVER_MODE,
32 | );
33 | const cleanerFunc = sparkAPI.start();
34 | return cleanerFunc;
35 | }, []);
36 |
37 | React.useEffect(() => {
38 | if (store.runMetadata?.appName) {
39 | document.title = DOCUMENT_TITLE_PREFIX + store.runMetadata.appName;
40 | }
41 | }, [store.runMetadata?.appName]);
42 |
43 | React.useEffect(() => {
44 | if (!location || !location.pathname) return;
45 |
46 | setSelectedTab(getTabByUrl(location.pathname));
47 | }, [location]);
48 |
49 | const onTabChanged = (tab: Tab): void => {
50 | setSelectedTab(tab);
51 | navigate(TabToUrl[tab]);
52 | };
53 |
54 | return !store.isInitialized ? (
55 |
56 | ) : (
57 |
58 |
59 |
64 |
68 | theme.palette.mode === "light"
69 | ? theme.palette.grey[100]
70 | : theme.palette.grey[900],
71 | flexGrow: 1,
72 | height: "100vh",
73 | overflow: "hidden",
74 | }}
75 | >
76 |
77 |
78 |
79 | );
80 | }
81 |
--------------------------------------------------------------------------------
/spark-ui/src/Hooks.ts:
--------------------------------------------------------------------------------
1 | import { TypedUseSelectorHook, useDispatch, useSelector } from "react-redux";
2 | import type { AppDispatch, RootState } from "./Store";
3 |
4 | // Use throughout your app instead of plain `useDispatch` and `useSelector`
5 | export const useAppDispatch: () => AppDispatch = useDispatch;
6 | export const useAppSelector: TypedUseSelectorHook<RootState> = useSelector;
7 |
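Usage note (illustrative sketch, not a file in this repository): the typed hooks above replace the plain react-redux hooks in components so that selectors are checked against RootState. The component below is hypothetical; it only assumes the `spark` slice exposes `isConnected` and `isInitialized`, as DisconnectedModal does elsewhere in this UI.

import * as React from "react";
import { useAppSelector } from "./Hooks"; // path assumed

// Hypothetical component: renders a banner while the UI is waiting to reconnect.
export function ConnectionBanner(): JSX.Element | null {
  // `state` is typed as RootState, so `state.spark` and its fields are type-checked.
  const { isConnected, isInitialized } = useAppSelector((state) => state.spark);
  if (!isInitialized || isConnected) return null;
  return <div>Trying to reconnect...</div>;
}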
--------------------------------------------------------------------------------
/spark-ui/src/Router.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { createHashRouter } from "react-router-dom";
3 | import App from "./App";
4 | import { AlertsTab } from "./tabs/AlertsTab";
5 | import ConfigurationTab from "./tabs/ConfigurationTab";
6 | import { ResourcesTab } from "./tabs/ResourcesTab";
7 | import StatusTab from "./tabs/StatusTab";
8 | import SummaryTab from "./tabs/SummaryTab";
9 | import { isHistoryServer } from "./utils/UrlUtils";
10 |
11 | const isHistoryServerMode = isHistoryServer();
12 |
13 | export const reactRouter = createHashRouter([
14 | {
15 | path: "/",
16 | element: <App />,
17 | children: [
18 | {
19 | index: true,
20 | element: isHistoryServerMode ? <SummaryTab /> : <StatusTab />,
21 | },
22 | {
23 | path: "/status",
24 | element: <StatusTab />,
25 | },
26 | {
27 | path: "/config",
28 | element: <ConfigurationTab />,
29 | },
30 | {
31 | path: "/alerts",
32 | element: <AlertsTab />,
33 | },
34 | {
35 | path: "/summary",
36 | element: <SummaryTab />,
37 | },
38 | {
39 | path: "/resources",
40 | element: <ResourcesTab />,
41 | },
42 | ],
43 | },
44 | ]);
45 |
--------------------------------------------------------------------------------
/spark-ui/src/Store.ts:
--------------------------------------------------------------------------------
1 | import { configureStore } from "@reduxjs/toolkit";
2 | import ChatSlice from "./reducers/ChatSlice";
3 | import GeneralSlice from "./reducers/GeneralSlice";
4 | import jobsColumnsReducer from "./reducers/JobsColumnSlice";
5 | import SparkSlice from "./reducers/SparkSlice";
6 |
7 | const store = configureStore({
8 | reducer: {
9 | spark: SparkSlice,
10 | chat: ChatSlice,
11 | general: GeneralSlice,
12 | jobsColumns: jobsColumnsReducer,
13 | },
14 | });
15 |
16 | // Infer the `RootState` and `AppDispatch` types from the store itself
17 | export type RootState = ReturnType<typeof store.getState>;
18 | // Inferred type: { spark, chat, general, jobsColumns } state slices
19 | export type AppDispatch = typeof store.dispatch;
20 |
21 | export default store;
22 |
--------------------------------------------------------------------------------
/spark-ui/src/components/AlertBadge/AlertBadge.tsx:
--------------------------------------------------------------------------------
1 | import ErrorIcon from "@mui/icons-material/Error";
2 | import WarningIcon from "@mui/icons-material/Warning";
3 | import { Alert, AlertTitle, styled } from "@mui/material";
4 | import Tooltip, { tooltipClasses, TooltipProps } from "@mui/material/Tooltip";
5 | import * as React from "react";
6 | import { Alert as DataflintAlert } from "../../interfaces/AppStore";
7 |
8 | type InfoBoxProps = {
9 | alert?: DataflintAlert;
10 | margin?: string;
11 | placement?:
12 | | "top"
13 | | "right"
14 | | "bottom"
15 | | "left"
16 | | "bottom-end"
17 | | "bottom-start"
18 | | "left-end"
19 | | "left-start"
20 | | "right-end"
21 | | "right-start"
22 | | "top-end"
23 | | "top-start";
24 | };
25 |
26 | export const TransperantTooltip = styled(
27 | ({ className, ...props }: TooltipProps) => (
28 |
29 | ),
30 | )(({ theme }) => ({
31 | [`& .${tooltipClasses.tooltip}`]: {
32 | backgroundColor: "transparent",
33 | },
34 | }));
35 |
36 | export default function AlertBadge({ alert, margin, placement }: InfoBoxProps) {
37 | return alert !== undefined ? (
38 |
42 | : }
45 | >
46 | {alert.title}
47 | {alert.message}
48 | {alert.shortSuggestion !== undefined && (
49 | <>
50 |
51 | Recommended Fix:
52 |
53 | {alert.shortSuggestion}
54 | >
55 | )}
56 |
57 |
58 | }
59 | >
60 | {alert.type === "warning" ? (
61 |
71 | ) : (
72 |
82 | )}
83 |
84 | ) : null;
85 | }
86 |
--------------------------------------------------------------------------------
/spark-ui/src/components/AlertBadge/MultiAlertsBadge.tsx:
--------------------------------------------------------------------------------
1 | import ErrorIcon from "@mui/icons-material/Error";
2 | import WarningIcon from "@mui/icons-material/Warning";
3 | import { Alert, AlertTitle } from "@mui/material";
4 | import * as React from "react";
5 | import { Alert as DataflintAlert } from "../../interfaces/AppStore";
6 | import { TransperantTooltip } from "./AlertBadge";
7 |
8 | type ToggableAlertProps = {
9 | alerts: DataflintAlert[];
10 | };
11 |
12 | export default function MultiAlertBadge({ alerts }: ToggableAlertProps) {
13 | const alert = alerts.length > 0 ? alerts[0] : undefined;
14 | return alert !== undefined ? (
15 |
19 | : }
22 | >
23 | {alert.title}
24 | {alert.message}
25 | {alerts.length > 1 ? (
26 |
27 |
+ {alerts.length} additional alerts
28 |
29 | ) : (
30 | ""
31 | )}
32 |
33 |
34 | }
35 | >
36 | {alert.type === "warning" ? (
37 |
42 | ) : (
43 |
48 | )}
49 |
50 | ) : null;
51 | }
52 |
--------------------------------------------------------------------------------
/spark-ui/src/components/AppDrawer/DrawerFooter.tsx:
--------------------------------------------------------------------------------
1 | import Button from "@mui/material/Button";
2 | import * as React from "react";
3 | import {
4 | BASE_CURRENT_PAGE,
5 | IS_HISTORY_SERVER_MODE
6 | } from "../../utils/UrlConsts";
7 | import { getBaseAppUrl, getProxyBasePath, isDataFlintSaaSUI } from "../../utils/UrlUtils";
8 |
9 | export default function DrawerFooter({ version }: { version?: string }) {
10 | const onSparkUiClick = (): void => {
11 | window.location.href = `${getBaseAppUrl(BASE_CURRENT_PAGE)}/jobs/`;
12 | };
13 |
14 | const onHistoryServerClick = (): void => {
15 | const basePath = getProxyBasePath();
16 | window.location.href = basePath + "/history/";
17 | };
18 |
19 | return (
20 |
29 |
32 | {IS_HISTORY_SERVER_MODE && !isDataFlintSaaSUI() ? (
33 |
36 | ) : null}
37 | {`Version ${version}`}
38 |
39 | );
40 | }
41 |
--------------------------------------------------------------------------------
/spark-ui/src/components/ColumnPicker/ColumnPicker.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | Checkbox,
3 | FormControl,
4 | InputLabel,
5 | ListItemText,
6 | MenuItem,
7 | OutlinedInput,
8 | Select,
9 | } from "@mui/material";
10 | import React from "react";
11 |
12 | const ITEM_HEIGHT = 48;
13 | const ITEM_PADDING_TOP = 8;
14 | const MenuProps = {
15 | PaperProps: {
16 | style: {
17 | maxHeight: ITEM_HEIGHT * 4.5 + ITEM_PADDING_TOP,
18 | width: 300,
19 | },
20 | },
21 | };
22 |
23 | interface ColumnPickerProps {
24 | headCells: { id: string; label: string }[];
25 | visibleColumns: string[];
26 | onToggleColumn: (columnId: string[]) => void;
27 | }
28 |
29 | const ColumnPicker: React.FC<ColumnPickerProps> = ({
30 | headCells,
31 | visibleColumns,
32 | onToggleColumn,
33 | }) => {
34 | const handleChange = (event: any) => {
35 | const {
36 | target: { value },
37 | } = event;
38 | // Update visible columns
39 | onToggleColumn(typeof value === "string" ? value.split(",") : value);
40 | };
41 |
42 | return (
43 |
44 | Columns
45 | }
52 | renderValue={(selected) =>
53 | headCells
54 | .filter((headCell) => selected.includes(headCell.id))
55 | .map((headCell) => headCell.label)
56 | .join(", ")
57 | }
58 | MenuProps={MenuProps}
59 | >
60 | {headCells.map((headCell) => (
61 |
65 | ))}
66 |
67 |
68 | );
69 | };
70 |
71 | export default ColumnPicker;
--------------------------------------------------------------------------------
/spark-ui/src/components/ConfigTable.tsx:
--------------------------------------------------------------------------------
1 | import Paper from "@mui/material/Paper";
2 | import Table from "@mui/material/Table";
3 | import TableBody from "@mui/material/TableBody";
4 | import TableCell from "@mui/material/TableCell";
5 | import TableContainer from "@mui/material/TableContainer";
6 | import TableHead from "@mui/material/TableHead";
7 | import TableRow from "@mui/material/TableRow";
8 | import * as React from "react";
9 | import { ConfigEntries } from "../interfaces/AppStore";
10 |
11 | type ConfigTableProps = {
12 | config: ConfigEntries;
13 | };
14 |
15 | export default function ConfigTable({ config }: ConfigTableProps) {
16 | return (
17 |
18 |
19 |
20 |
21 | Name
22 | Value
23 | Documentation
24 |
25 |
26 |
27 | {config.map((row) => (
28 |
29 | {row.name}
30 |
31 | {row.value ?? row.default}
32 | {row.value === undefined || row.value === row.default
33 | ? " (default)"
34 | : ""}
35 |
36 |
37 | {row.documentation}
38 |
39 |
40 | ))}
41 |
42 |
43 |
44 | );
45 | }
46 |
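Illustrative note (assumption, not from the repo): the rendering above implies that each ConfigEntries row carries name, value, default, and documentation fields, and that the value column falls back to the default with a "(default)" suffix. A sample row, assuming that shape:

// Hypothetical entry; spark.sql.shuffle.partitions defaults to 200 in Spark.
const exampleRow = {
  name: "spark.sql.shuffle.partitions",
  value: undefined, // not explicitly set by the user
  default: "200",
  documentation: "Number of partitions used when shuffling data for joins or aggregations.",
};
// Rendered value cell: "200 (default)", because row.value ?? row.default is "200"
// and row.value === undefined triggers the "(default)" suffix.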
--------------------------------------------------------------------------------
/spark-ui/src/components/ExceptionIcon.tsx:
--------------------------------------------------------------------------------
1 | import ErrorOutlineIcon from "@mui/icons-material/ErrorOutline";
2 | import { Fade, Snackbar } from "@mui/material";
3 | import { styled } from "@mui/material/styles";
4 | import Tooltip, { tooltipClasses, TooltipProps } from "@mui/material/Tooltip";
5 | import * as React from "react";
6 |
7 | const CustomWidthTooltip = styled(({ className, ...props }: TooltipProps) => (
8 |
9 | ))({
10 | [`& .${tooltipClasses.tooltip}`]: {
11 | maxWidth: 500,
12 | maxHeight: 300,
13 | overflow: "auto",
14 | whiteSpace: "pre",
15 | },
16 | });
17 |
18 | const onTooltipClick = (
19 | event: React.MouseEvent,
20 | failureReason: string,
21 | setOpenSnackbar: React.Dispatch<React.SetStateAction<boolean>>,
22 | ) => {
23 | event.stopPropagation();
24 | setOpenSnackbar(true);
25 | navigator.clipboard.writeText(failureReason);
26 | };
27 |
28 | const formatFailureReason = (failureReason: string) => {
29 | const regex = /(Caused by:.*?)(?=\n)/s;
30 | const match = regex.exec(failureReason);
31 |
32 | if (match) {
33 | const causedByText = match[1].trim();
34 | return `${causedByText}\nFull stacktrace:\n${failureReason}`;
35 | }
36 |
37 | return failureReason;
38 | };
39 |
40 | const ExceptionIcon: React.FC<{ failureReason: string }> = ({
41 | failureReason,
42 | }): JSX.Element => {
43 | const [openSnackbar, setOpenSnackbar] = React.useState(false);
44 |
45 | const handleClose = (
46 | event: React.SyntheticEvent | Event,
47 | reason?: string,
48 | ) => {
49 | if (reason === "clickaway") {
50 | return;
51 | }
52 |
53 | setOpenSnackbar(false);
54 | };
55 |
56 | const formatedFailureReason = formatFailureReason(failureReason);
57 | return (
58 | onTooltipClick(event, failureReason, setOpenSnackbar)}
60 | >
61 |
69 |
73 |
74 |
80 |
81 | );
82 | };
83 |
84 | export default ExceptionIcon;
85 |
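Behavior note (illustrative sketch; the regex is re-declared here because formatFailureReason is not exported): the "Caused by:" line is lifted to the top of the tooltip, followed by the full stack trace.

// Hypothetical failure string, for illustration only.
const sampleFailure =
  "org.apache.spark.SparkException: Job aborted due to stage failure\n" +
  "Caused by: java.lang.OutOfMemoryError: Java heap space\n" +
  "\tat org.example.SomeClass.someMethod(SomeClass.java:42)";

const match = /(Caused by:.*?)(?=\n)/s.exec(sampleFailure);
const formatted = match
  ? `${match[1].trim()}\nFull stacktrace:\n${sampleFailure}`
  : sampleFailure;
// formatted starts with "Caused by: java.lang.OutOfMemoryError: Java heap space"
// and then repeats the complete stack trace under "Full stacktrace:".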
--------------------------------------------------------------------------------
/spark-ui/src/components/InfoBox/InfoBox.module.css:
--------------------------------------------------------------------------------
1 | @keyframes blinkAnimation {
2 | 0% {
3 | opacity: 1;
4 | }
5 | 50% {
6 | opacity: 0.2;
7 | }
8 | 100% {
9 | opacity: 1;
10 | }
11 | }
12 |
13 | .blink {
14 | animation: blinkAnimation 0.5s linear;
15 | }
16 |
--------------------------------------------------------------------------------
/spark-ui/src/components/Modals/DisconnectedModal.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Fade, Modal, Typography } from "@mui/material";
2 | import Backdrop from "@mui/material/Backdrop";
3 | import React, { FC } from "react";
4 | import { useAppSelector } from "../../Hooks";
5 |
6 | const style = {
7 | position: "absolute" as "absolute",
8 | top: "50%",
9 | left: "50%",
10 | transform: "translate(-50%, -50%)",
11 | width: 400,
12 | bgcolor: "#383838",
13 | outline: "none",
14 | borderRadius: "4px",
15 | boxShadow: 24,
16 | p: 4,
17 | };
18 |
19 | const DisconnectedModal: FC = (): JSX.Element => {
20 | const { isConnected, isInitialized } = useAppSelector((state) => state.spark);
21 |
22 | const open = !isConnected && isInitialized;
23 |
24 | return (
25 |
36 |
37 |
38 |
39 | Server disconnected
40 |
41 |
42 | Trying to reconnect...
43 |
44 |
45 |
46 |
47 | );
48 | };
49 |
50 | export default DisconnectedModal;
51 |
--------------------------------------------------------------------------------
/spark-ui/src/components/NoQuery/NoQuery.tsx:
--------------------------------------------------------------------------------
1 | import { Alert } from "@mui/material";
2 | import React from "react";
3 |
4 | const NoQuery = () => {
5 | return <Alert>No Spark SQL query currently running</Alert>;
6 | };
7 |
8 | export default NoQuery;
9 |
--------------------------------------------------------------------------------
/spark-ui/src/components/Progress.tsx:
--------------------------------------------------------------------------------
1 | import { Box, CircularProgress } from "@mui/material";
2 | import React, { FC } from "react";
3 | import "reactflow/dist/style.css";
4 |
5 | const Progress: FC = ({}): JSX.Element => {
6 | return (
7 |
13 |
14 |
15 | );
16 | };
17 |
18 | export default Progress;
19 |
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlContainer.tsx:
--------------------------------------------------------------------------------
1 | import React, { FC } from "react";
2 | import "reactflow/dist/style.css";
3 | import { useAppSelector } from "../Hooks";
4 | import Progress from "./Progress";
5 | import SqlFlow from "./SqlFlow/SqlFlow";
6 |
7 | const SqlContainer: FC = (): JSX.Element => {
8 | const sql = useAppSelector((state) => state.spark.sql);
9 | return sql === undefined ? (
10 |
11 | ) : (
12 |
13 |
14 |
15 | );
16 | };
17 |
18 | export default SqlContainer;
19 |
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlFlow/BytesDistributionChart.tsx:
--------------------------------------------------------------------------------
1 | import { ApexOptions } from "apexcharts";
2 | import React from "react";
3 | import ReactApexChart from "react-apexcharts";
4 | import { humanFileSize } from "../../utils/FormatUtils";
5 |
6 | export default function BytesDistributionChart({
7 | bytesDist,
8 | title
9 | }: {
10 | bytesDist: number[];
11 | title: string,
12 | }): JSX.Element {
13 | const series = [
14 | {
15 | name: title,
16 | data: bytesDist,
17 | },
18 | ];
19 |
20 | const options: ApexOptions = {
21 | plotOptions: {
22 | bar: {
23 | horizontal: false,
24 | },
25 | },
26 | chart: {
27 | animations: {
28 | enabled: false,
29 | },
30 | toolbar: {
31 | show: false,
32 | },
33 | zoom: {
34 | enabled: false,
35 | },
36 | },
37 | dataLabels: {
38 | enabled: false,
39 | },
40 | stroke: {
41 | show: true,
42 | width: 2,
43 | colors: ["transparent"],
44 | },
45 | xaxis: {
46 | categories: [
47 | "min",
48 | "0.1",
49 | "0.2",
50 | "0.3",
51 | "0.4",
52 | "0.5",
53 | "0.6",
54 | "0.7",
55 | "0.8",
56 | "0.9",
57 | "max",
58 | ],
59 | },
60 | yaxis: {
61 | title: {
62 | text: title,
63 | },
64 | labels: {
65 | formatter: (value: number, timestamp?: number, opts?: any) =>
66 | humanFileSize(value),
67 | },
68 | },
69 | theme: {
70 | mode: "dark",
71 | },
72 | };
73 |
74 | return (
75 |
76 | );
77 | }
78 |
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlFlow/DurationDistributionChart.tsx:
--------------------------------------------------------------------------------
1 | import { ApexOptions } from "apexcharts";
2 | import { duration } from "moment";
3 | import React from "react";
4 | import ReactApexChart from "react-apexcharts";
5 | import { humanizeTimeDiff } from "../../utils/FormatUtils";
6 |
7 | export default function DurationDistributionChart({
8 | durationDist,
9 | }: {
10 | durationDist: number[];
11 | }): JSX.Element {
12 | const series = [
13 | {
14 | name: "Duration",
15 | data: durationDist,
16 | },
17 | ];
18 |
19 | const options: ApexOptions = {
20 | plotOptions: {
21 | bar: {
22 | horizontal: false,
23 | },
24 | },
25 | chart: {
26 | animations: {
27 | enabled: false,
28 | },
29 | toolbar: {
30 | show: false,
31 | },
32 | zoom: {
33 | enabled: false,
34 | },
35 | },
36 | dataLabels: {
37 | enabled: false,
38 | },
39 | stroke: {
40 | show: true,
41 | width: 2,
42 | colors: ["transparent"],
43 | },
44 | xaxis: {
45 | categories: [
46 | "min",
47 | "0.1",
48 | "0.2",
49 | "0.3",
50 | "0.4",
51 | "0.5",
52 | "0.6",
53 | "0.7",
54 | "0.8",
55 | "0.9",
56 | "max",
57 | ],
58 | },
59 | yaxis: {
60 | title: {
61 | text: "Duration",
62 | },
63 | labels: {
64 | formatter: (value: number, timestamp?: number, opts?: any) =>
65 | humanizeTimeDiff(duration(value)),
66 | },
67 | },
68 | theme: {
69 | mode: "dark",
70 | },
71 | };
72 |
73 | return (
74 |
75 | );
76 | }
77 |
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlFlow/NumbersDistributionChart.tsx:
--------------------------------------------------------------------------------
1 | import { ApexOptions } from "apexcharts";
2 | import React from "react";
3 | import ReactApexChart from "react-apexcharts";
4 |
5 | export default function NumbersDistributionChart({
6 | numbersDist,
7 | title
8 | }: {
9 | numbersDist: number[];
10 | title: string,
11 | }): JSX.Element {
12 | const series = [
13 | {
14 | name: title,
15 | data: numbersDist,
16 | },
17 | ];
18 |
19 | const options: ApexOptions = {
20 | plotOptions: {
21 | bar: {
22 | horizontal: false,
23 | },
24 | },
25 | chart: {
26 | animations: {
27 | enabled: false,
28 | },
29 | toolbar: {
30 | show: false,
31 | },
32 | zoom: {
33 | enabled: false,
34 | },
35 | },
36 | dataLabels: {
37 | enabled: false,
38 | },
39 | stroke: {
40 | show: true,
41 | width: 2,
42 | colors: ["transparent"],
43 | },
44 | xaxis: {
45 | categories: [
46 | "min",
47 | "0.1",
48 | "0.2",
49 | "0.3",
50 | "0.4",
51 | "0.5",
52 | "0.6",
53 | "0.7",
54 | "0.8",
55 | "0.9",
56 | "max",
57 | ],
58 | },
59 | yaxis: {
60 | title: {
61 | text: title,
62 | },
63 | },
64 | theme: {
65 | mode: "dark",
66 | },
67 | };
68 |
69 | return (
70 |
71 | );
72 | }
73 |
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlFlow/SqlLayoutService.ts:
--------------------------------------------------------------------------------
1 | import dagre from "dagre";
2 | import { Edge, Node, Position } from "reactflow";
3 | import { v4 as uuidv4 } from "uuid";
4 | import {
5 | EnrichedSparkSQL,
6 | EnrichedSqlEdge,
7 | EnrichedSqlNode,
8 | GraphFilter,
9 | } from "../../interfaces/AppStore";
10 | import { StageNodeName } from "./StageNode";
11 |
12 | const nodeWidth = 280;
13 | const nodeHeight = 280;
14 |
15 | const getLayoutedElements = (
16 | nodes: Node[],
17 | edges: Edge[],
18 | ): { layoutNodes: Node[]; layoutEdges: Edge[] } => {
19 | const dagreGraph = new dagre.graphlib.Graph();
20 | dagreGraph.setDefaultEdgeLabel(() => ({}));
21 | dagreGraph.setGraph({ rankdir: "LR" });
22 |
23 | nodes.forEach((node) => {
24 | dagreGraph.setNode(node.id, { width: nodeWidth, height: nodeHeight });
25 | });
26 |
27 | edges.forEach((edge) => {
28 | dagreGraph.setEdge(edge.source, edge.target);
29 | });
30 |
31 | dagre.layout(dagreGraph);
32 |
33 | nodes.forEach((node) => {
34 | const nodeWithPosition = dagreGraph.node(node.id);
35 | node.targetPosition = Position.Left;
36 | node.sourcePosition = Position.Right;
37 |
38 | // We are shifting the dagre node position (anchor=center center) to the top left
39 | // so it matches the React Flow node anchor point (top left).
40 | node.position = {
41 | x: nodeWithPosition.x - nodeWidth / 2,
42 | y: nodeWithPosition.y - nodeHeight / 2,
43 | };
44 |
45 | return node;
46 | });
47 |
48 | return { layoutNodes: nodes, layoutEdges: edges };
49 | };
50 |
51 | class SqlLayoutService {
52 | static SqlElementsToLayout(
53 | sql: EnrichedSparkSQL,
54 | graphFilter: GraphFilter,
55 | ): { layoutNodes: Node[]; layoutEdges: Edge[] } {
56 | const { nodesIds, edges } = sql.filters[graphFilter];
57 |
58 | const flowNodes: Node[] = sql.nodes
59 | .filter((node) => nodesIds.includes(node.nodeId))
60 | .map((node: EnrichedSqlNode) => {
61 | return {
62 | id: node.nodeId.toString(),
63 | data: { sqlId: sql.id, node: node },
64 | type: StageNodeName,
65 | position: { x: 0, y: 0 },
66 | };
67 | });
68 | const flowEdges: Edge[] = edges.map((edge: EnrichedSqlEdge) => {
69 | return {
70 | id: uuidv4(),
71 | source: edge.fromId.toString(),
72 | animated: true,
73 | target: edge.toId.toString(),
74 | };
75 | });
76 |
77 | const { layoutNodes, layoutEdges } = getLayoutedElements(
78 | flowNodes,
79 | flowEdges,
80 | );
81 | return { layoutNodes: layoutNodes, layoutEdges: layoutEdges };
82 | }
83 | }
84 |
85 | export default SqlLayoutService;
86 |
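Numeric illustration of the anchor shift in getLayoutedElements above: dagre reports each node position as its center, while React Flow expects the top-left corner, so half of the node size is subtracted on each axis (the values below are made up).

const nodeWidth = 280;
const nodeHeight = 280;
const dagreCenter = { x: 140, y: 420 }; // hypothetical center returned by dagre
const reactFlowPosition = {
  x: dagreCenter.x - nodeWidth / 2,  // 140 - 140 = 0
  y: dagreCenter.y - nodeHeight / 2, // 420 - 140 = 280
};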
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlFlow/node-style.module.css:
--------------------------------------------------------------------------------
1 | .node {
2 | display: flex;
3 | justify-content: center;
4 | width: 280px;
5 | height: 280px;
6 | color: black;
7 | padding: 8px;
8 | background: white;
9 | border: 3px solid lightblue;
10 | border-radius: 10px;
11 | }
12 |
13 | .textWrapper {
14 | display: flex;
15 | flex-direction: column;
16 | justify-content: flex-start;
17 | height: 190px;
18 | }
19 |
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlTable/TableTypes.tsx:
--------------------------------------------------------------------------------
1 | export type Order = "asc" | "desc";
2 |
3 | export interface EnhancedTableProps {
4 | onRequestSort: (
5 | event: React.MouseEvent<unknown>,
6 | property: keyof Data,
7 | ) => void;
8 | order: Order;
9 | orderBy: string;
10 | headCells: HeadCell[];
11 | visibleColumns: string[];
12 | }
13 |
14 | export interface Data {
15 | id: string;
16 | status: string;
17 | description: string;
18 | duration: number;
19 | durationPercentage: number;
20 | dcu: number;
21 | dcuPercentage: number;
22 | input: number;
23 | output: number;
24 | idleCores: number;
25 | spill: number;
26 | totalTasks: number;
27 | shuffleReadBytes: number;
28 | shuffleWriteBytes: number;
29 | executorRunTime: number;
30 | failureReason: string;
31 | }
32 |
33 | export interface HeadCell {
34 | disablePadding: boolean;
35 | id: keyof Data;
36 | label: string;
37 | numeric: boolean;
38 | initiallyVisible: boolean;
39 | }
40 |
--------------------------------------------------------------------------------
/spark-ui/src/components/SqlTable/TableUtils.tsx:
--------------------------------------------------------------------------------
1 | import { Order } from "./TableTypes";
2 |
3 | export function descendingComparator<T>(a: T, b: T, orderBy: keyof T) {
4 | if (b[orderBy] < a[orderBy]) {
5 | return -1;
6 | }
7 | if (b[orderBy] > a[orderBy]) {
8 | return 1;
9 | }
10 | return 0;
11 | }
12 |
13 | export function getComparator<Key extends keyof any>(
14 | order: Order,
15 | orderBy: Key,
16 | ): (
17 | a: { [key in Key]: number | string },
18 | b: { [key in Key]: number | string },
19 | ) => number {
20 | return order === "desc"
21 | ? (a, b) => descendingComparator(a, b, orderBy)
22 | : (a, b) => -descendingComparator(a, b, orderBy);
23 | }
24 |
25 | // Since 2020 all major browsers ensure sort stability with Array.prototype.sort().
26 | // stableSort() brings sort stability to non-modern browsers (notably IE11). If you
27 | // only support modern browsers you can replace stableSort(exampleArray, exampleComparator)
28 | // with exampleArray.slice().sort(exampleComparator)
29 | export function stableSort<T>(
30 | array: readonly T[],
31 | comparator: (a: T, b: T) => number,
32 | ) {
33 | const stabilizedThis = array.map((el, index) => [el, index] as [T, number]);
34 | stabilizedThis.sort((a, b) => {
35 | const order = comparator(a[0], b[0]);
36 | if (order !== 0) {
37 | return order;
38 | }
39 | return a[1] - b[1];
40 | });
41 | return stabilizedThis.map((el) => el[0]);
42 | }
43 |
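Usage sketch (illustrative, assuming the generic signatures above): getComparator builds a comparator for the chosen column and order, and stableSort applies it while preserving the original order of ties.

import { getComparator, stableSort } from "./TableUtils"; // path assumed

type Row = { id: string; duration: number };
const rows: readonly Row[] = [
  { id: "a", duration: 30 },
  { id: "b", duration: 10 },
  { id: "c", duration: 20 },
];
// Sort descending by duration: rows come back in the order "a", "c", "b".
const sorted = stableSort(rows, getComparator("desc", "duration"));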
--------------------------------------------------------------------------------
/spark-ui/src/index.tsx:
--------------------------------------------------------------------------------
1 | import CssBaseline from "@mui/material/CssBaseline";
2 | import { ThemeProvider } from "@mui/material/styles";
3 | import * as React from "react";
4 | import * as ReactDOM from "react-dom/client";
5 | import { Provider } from "react-redux";
6 | import { RouterProvider } from "react-router-dom";
7 | import { reactRouter } from "./Router";
8 | import store from "./Store";
9 | import theme from "./theme";
10 |
11 | document.addEventListener("DOMContentLoaded", (event) => {
12 | const rootElement = document.getElementById("root");
13 | const root = ReactDOM.createRoot(rootElement!);
14 |
15 | if (location.protocol === "https:") {
16 | var meta = document.createElement("meta");
17 | meta.httpEquiv = "Content-Security-Policy";
18 | meta.content = "upgrade-insecure-requests";
19 | document.getElementsByTagName("head")[0].appendChild(meta);
20 | }
21 |
22 | root.render(
23 |
24 |
25 | {/* CssBaseline kickstarts an elegant, consistent, and simple baseline to build upon. */}
26 |
27 |
28 |
29 | ,
30 | );
31 | });
32 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/ApplicationInfo.ts:
--------------------------------------------------------------------------------
1 | import { SparkApplication } from "./SparkApplications";
2 |
3 | export interface EnvironmentInfo {
4 | driverXmxBytes?: number;
5 | }
6 |
7 | export interface ApplicationInfo {
8 | runId?: string;
9 | info: SparkApplication;
10 | environmentInfo?: EnvironmentInfo;
11 | }
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/CachedStorage.ts:
--------------------------------------------------------------------------------
1 | export interface DataflintExecutorStorageInfo {
2 | memoryUsed: number;
3 | memoryRemaining: number;
4 | memoryUsagePercentage: number;
5 | }
6 |
7 | export interface RddStorageInfo {
8 | rddId: number;
9 | memoryUsed: number;
10 | diskUsed: number;
11 | numOfPartitions: number;
12 | storageLevel: string;
13 | maxMemoryExecutorInfo: DataflintExecutorStorageInfo | undefined;
14 | }
15 |
16 | export interface CachedStorage {
17 | [stageId: string]: RddStorageInfo[];
18 | }
19 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/IcebergInfo.ts:
--------------------------------------------------------------------------------
1 | export interface IcebergInfo {
2 | commitsInfo: IcebergCommitsInfo[];
3 | }
4 |
5 | export interface IcebergCommitsInfo {
6 | executionId: number;
7 | tableName: string;
8 | commitId: number;
9 | operation: string;
10 | metrics: IcebergCommitMetrics;
11 | }
12 |
13 | export interface IcebergCommitMetrics {
14 | durationMS: number;
15 | attempts: number;
16 | addedDataFiles: number;
17 | removedDataFiles: number;
18 | totalDataFiles: number;
19 | addedDeleteFiles: number;
20 | addedEqualityDeleteFiles: number;
21 | addedPositionalDeleteFiles: number;
22 | removedDeleteFiles: number;
23 | removedEqualityDeleteFiles: number;
24 | removedPositionalDeleteFiles: number;
25 | totalDeleteFiles: number;
26 | addedRecords: number;
27 | removedRecords: number;
28 | totalRecords: number;
29 | addedFilesSizeInBytes: number;
30 | removedFilesSizeInBytes: number;
31 | totalFilesSizeInBytes: number;
32 | addedPositionalDeletes: number;
33 | removedPositionalDeletes: number;
34 | totalPositionalDeletes: number;
35 | addedEqualityDeletes: number;
36 | removedEqualityDeletes: number;
37 | totalEqualityDeletes: number;
38 | }
39 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/Mixpanel.ts:
--------------------------------------------------------------------------------
1 | export enum MixpanelEvents {
2 | SparkAppInitilized = "Spark App initilized",
3 | SqlSummarySelected = "Sql Summary Selected",
4 | KeepAlive = "Keep Alive",
5 | }
6 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/SQLPlan.ts:
--------------------------------------------------------------------------------
1 | export type SQLPlans = SQLPlan[];
2 |
3 | export interface SQLPlan {
4 | executionId: number;
5 | numOfNodes: number;
6 | nodesPlan: SQLNodePlan[];
7 | rddScopesToStages?: Record;
8 | }
9 |
10 | export interface StartAndAttempt {
11 | stageId: string;
12 | attemptId: string;
13 | }
14 |
15 | export interface SQLNodePlan {
16 | id: number;
17 | planDescription: string;
18 | rddScopeId?: string;
19 | }
20 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/SparkApplications.ts:
--------------------------------------------------------------------------------
1 | export type SparkApplications = SparkApplication[];
2 |
3 | export interface SparkApplication {
4 | id: string;
5 | name: string;
6 | attempts: Attempt[];
7 | }
8 |
9 | export interface Attempt {
10 | attemptId?: string;
11 | startTime: string;
12 | endTime: string;
13 | lastUpdated: string;
14 | duration: number;
15 | sparkUser: string;
16 | completed: boolean;
17 | appSparkVersion: string;
18 | startTimeEpoch: number;
19 | endTimeEpoch: number;
20 | lastUpdatedEpoch: number;
21 | }
22 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/SparkConfiguration.ts:
--------------------------------------------------------------------------------
1 | export interface SparkConfiguration {
2 | runtime: Runtime;
3 | sparkProperties: string[][];
4 | hadoopProperties: string[][];
5 | systemProperties: string[][];
6 | metricsProperties: string[][];
7 | classpathEntries: string[][];
8 | resourceProfiles: ResourceProfile[];
9 | }
10 |
11 | export interface Runtime {
12 | javaVersion: string;
13 | javaHome: string;
14 | scalaVersion: string;
15 | }
16 |
17 | export interface ResourceProfile {
18 | id: number;
19 | executorResources: ExecutorResources;
20 | taskResources: TaskResources;
21 | }
22 |
23 | export interface ExecutorResources {
24 | cores: Cores;
25 | memory: Memory;
26 | offHeap: OffHeap;
27 | }
28 |
29 | export interface Cores {
30 | resourceName: string;
31 | amount: number;
32 | discoveryScript: string;
33 | vendor: string;
34 | }
35 |
36 | export interface Memory {
37 | resourceName: string;
38 | amount: number;
39 | discoveryScript: string;
40 | vendor: string;
41 | }
42 |
43 | export interface OffHeap {
44 | resourceName: string;
45 | amount: number;
46 | discoveryScript: string;
47 | vendor: string;
48 | }
49 |
50 | export interface TaskResources {
51 | cpus: Cpus;
52 | }
53 |
54 | export interface Cpus {
55 | resourceName: string;
56 | amount: number;
57 | }
58 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/SparkExecutors.ts:
--------------------------------------------------------------------------------
1 | export type SparkExecutors = SparkExecutor[];
2 |
3 | export interface SparkExecutor {
4 | id: string;
5 | hostPort: string;
6 | isActive: boolean;
7 | rddBlocks: number;
8 | memoryUsed: number;
9 | diskUsed: number;
10 | totalCores: number;
11 | maxTasks: number;
12 | activeTasks: number;
13 | failedTasks: number;
14 | completedTasks: number;
15 | totalTasks: number;
16 | totalDuration: number;
17 | totalGCTime: number;
18 | totalInputBytes: number;
19 | totalShuffleRead: number;
20 | totalShuffleWrite: number;
21 | isBlacklisted: boolean;
22 | maxMemory: number;
23 | addTime: string;
24 | executorLogs: ExecutorLogs;
25 | memoryMetrics: MemoryMetrics;
26 | blacklistedInStages: any[];
27 | peakMemoryMetrics?: PeakMemoryMetrics;
28 | attributes: Attributes;
29 | resources: Resources;
30 | resourceProfileId: number;
31 | isExcluded: boolean;
32 | excludedInStages: any[];
33 | removeTime: string;
34 | removeReason: string;
35 | }
36 |
37 | export interface ExecutorLogs {
38 | stdout?: string;
39 | stderr?: string;
40 | }
41 |
42 | export interface MemoryMetrics {
43 | usedOnHeapStorageMemory: number;
44 | usedOffHeapStorageMemory: number;
45 | totalOnHeapStorageMemory: number;
46 | totalOffHeapStorageMemory: number;
47 | }
48 |
49 | export interface PeakMemoryMetrics {
50 | JVMHeapMemory: number;
51 | JVMOffHeapMemory: number;
52 | OnHeapExecutionMemory: number;
53 | OffHeapExecutionMemory: number;
54 | OnHeapStorageMemory: number;
55 | OffHeapStorageMemory: number;
56 | OnHeapUnifiedMemory: number;
57 | OffHeapUnifiedMemory: number;
58 | DirectPoolMemory: number;
59 | MappedPoolMemory: number;
60 | ProcessTreeJVMVMemory: number;
61 | ProcessTreeJVMRSSMemory: number;
62 | ProcessTreePythonVMemory: number;
63 | ProcessTreePythonRSSMemory: number;
64 | ProcessTreeOtherVMemory: number;
65 | ProcessTreeOtherRSSMemory: number;
66 | MinorGCCount: number;
67 | MinorGCTime: number;
68 | MajorGCCount: number;
69 | MajorGCTime: number;
70 | TotalGCTime: number;
71 | }
72 |
73 | export interface Attributes {}
74 |
75 | export interface Resources {}
76 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/SparkJobs.ts:
--------------------------------------------------------------------------------
1 | export type SparkJobs = SparkJob[];
2 |
3 | export interface SparkJob {
4 | jobId: number;
5 | name: string;
6 | description: string;
7 | submissionTime: string;
8 | completionTime: string;
9 | stageIds: number[];
10 | status: string;
11 | numTasks: number;
12 | numActiveTasks: number;
13 | numCompletedTasks: number;
14 | numSkippedTasks: number;
15 | numFailedTasks: number;
16 | numKilledTasks: number;
17 | numCompletedIndices: number;
18 | numActiveStages: number;
19 | numCompletedStages: number;
20 | numSkippedStages: number;
21 | numFailedStages: number;
22 | killedTasksSummary: KilledTasksSummary;
23 | jobGroup?: string;
24 | }
25 |
26 | export interface KilledTasksSummary {}
27 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/SparkSQLs.ts:
--------------------------------------------------------------------------------
1 | export type SparkSQLs = SparkSQL[];
2 |
3 | export interface SparkSQL {
4 | id: string;
5 | status: string;
6 | description: string;
7 | planDescription: string;
8 | submissionTime: string;
9 | duration: number;
10 | runningJobIds: number[];
11 | successJobIds: number[];
12 | failedJobIds: number[];
13 | nodes: SqlNode[];
14 | edges: SqlEdge[];
15 | }
16 |
17 | export interface SqlNode {
18 | nodeId: number;
19 | nodeName: string;
20 | metrics: SqlMetric[];
21 | wholeStageCodegenId?: number;
22 | }
23 |
24 | export interface SqlMetric {
25 | name: string;
26 | value: string;
27 | }
28 |
29 | export interface SqlEdge {
30 | fromId: number;
31 | toId: number;
32 | }
33 |
34 | export enum SqlStatus {
35 | Running = "RUNNING",
36 | Completed = "COMPLETED",
37 | Failed = "FAILED",
38 | }
39 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/SqlMetrics.ts:
--------------------------------------------------------------------------------
1 | export type NodesMetrics = NodeMetrics[];
2 |
3 | export interface NodeMetrics {
4 | id: number;
5 | name: string;
6 | metrics: Metric[];
7 | }
8 |
9 | export interface Metric {
10 | name: string;
11 | value: string;
12 | }
13 |
--------------------------------------------------------------------------------
/spark-ui/src/interfaces/StagesRdd.ts:
--------------------------------------------------------------------------------
1 | export type StagesRdd = Record>;
2 |
--------------------------------------------------------------------------------
/spark-ui/src/react-app-env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="react-scripts" />
2 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/Alerts/BroadcastTooLargeAlert.ts:
--------------------------------------------------------------------------------
1 | import { Alerts, SparkSQLStore } from "../../interfaces/AppStore";
2 | import { humanFileSize, parseBytesString } from "../../utils/FormatUtils";
3 |
4 | const BROADCAST_SIZE_THRESHOLD = 1 * 1024 * 1024 * 1024;
5 |
6 | export function reduceBroadcastTooLargeAlert(sql: SparkSQLStore, alerts: Alerts) {
7 | sql.sqls.forEach((sql) => {
8 | sql.nodes.forEach((node) => {
9 | if (node.nodeName === "BroadcastExchange" || (node.nodeName === "Exchange" && node.parsedPlan?.type === "Exchange" && node.parsedPlan?.plan.isBroadcast)) {
10 | const broadcastSizeMetric = parseBytesString(
11 | node.metrics.find((metric) => metric.name === "data size")?.value ?? "0",
12 | );
13 |
14 | if (broadcastSizeMetric > BROADCAST_SIZE_THRESHOLD) {
15 | const broadcastSizeString = humanFileSize(broadcastSizeMetric);
16 | alerts.push({
17 | id: `largeBroadcast_${sql.id}_${node.nodeId}_${broadcastSizeString}`,
18 | name: "largeBroadcast",
19 | title: "Large data Broadcast",
20 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`,
21 | message: `The data broadcast size is ${broadcastSizeString}, which exceeds the 1GB threshold and can cause performance issues`,
22 | suggestion: `
23 | 1. spark.sql.autoBroadcastJoinThreshold config might be set to a large number which is not optimal
24 | 2. The broadcast hint is applied on a large dataframe which is not optimal`,
25 | type: "warning",
26 | source: {
27 | type: "sql",
28 | sqlId: sql.id,
29 | sqlNodeId: node.nodeId,
30 | },
31 | });
32 | }
33 | }
34 | });
35 | });
36 | }
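Threshold check, spelled out with illustrative numbers (parseBytesString and humanFileSize behavior is not shown here): the alert fires once the parsed "data size" metric exceeds 1 GiB.

const BROADCAST_SIZE_THRESHOLD = 1 * 1024 * 1024 * 1024; // 1 GiB = 1,073,741,824 bytes
const broadcastSizeBytes = 2.5 * 1024 * 1024 * 1024;     // hypothetical 2.5 GiB broadcast
const shouldAlert = broadcastSizeBytes > BROADCAST_SIZE_THRESHOLD; // true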
--------------------------------------------------------------------------------
/spark-ui/src/reducers/Alerts/LargeCrossJoinScanAlert.ts:
--------------------------------------------------------------------------------
1 | import {
2 | Alerts,
3 | SparkSQLStore,
4 | } from "../../interfaces/AppStore";
5 |
6 | // 10 billion scanned-rows threshold
7 | const CROSS_JOIN_SCANNED_ROWS_THRESHOLD = 10_000_000_000;
8 |
9 | export function reduceLargeCrossJoinScanAlert(
10 | sql: SparkSQLStore,
11 | alerts: Alerts,
12 | ) {
13 | sql.sqls.forEach((sql) => {
14 | sql.nodes.forEach((node) => {
15 | // Check if this is a cross join node (BroadcastNestedLoopJoin or CartesianProduct)
16 | if (node.nodeName === "BroadcastNestedLoopJoin" || node.nodeName === "CartesianProduct") {
17 | // Find the Cross Join Scanned Rows metric
18 | const crossJoinScannedRowsMetric = node.metrics.find(
19 | (metric) => metric.name === "Cross Join Scanned Rows"
20 | );
21 |
22 | if (crossJoinScannedRowsMetric !== undefined) {
23 | // Parse the value as a number
24 | const scannedRows = parseFloat(crossJoinScannedRowsMetric.value.replace(/,/g, ""));
25 |
26 | // Check if the scanned rows exceeds the threshold
27 | if (!isNaN(scannedRows) && scannedRows > CROSS_JOIN_SCANNED_ROWS_THRESHOLD) {
28 | // Format the number with commas for thousands separators
29 | const formattedScannedRows = scannedRows.toLocaleString();
30 |
31 | alerts.push({
32 | id: `largeCrossJoinScan_${sql.id}_${node.nodeId}`,
33 | name: "largeCrossJoinScan",
34 | title: "Large Cross Join Scan",
35 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`,
36 | message: `Cross join is scanning ${formattedScannedRows} rows, which is too large and can cause performance issues or query failure`,
37 | suggestion: `
38 | 1. Add specific join conditions to convert the cross join to a more efficient join type
39 | 2. Avoid using cross joins for large datasets
40 | `,
41 | type: "error",
42 | source: {
43 | type: "sql",
44 | sqlId: sql.id,
45 | sqlNodeId: node.nodeId,
46 | },
47 | });
48 | }
49 | }
50 | }
51 | });
52 | });
53 | }
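Parsing illustration (hypothetical metric value): the "Cross Join Scanned Rows" metric arrives as a comma-separated string, so commas are stripped before comparing against the 10-billion-row threshold.

const CROSS_JOIN_SCANNED_ROWS_THRESHOLD = 10_000_000_000;
const metricValue = "12,500,000,000,000";                            // hypothetical metric string
const scannedRows = parseFloat(metricValue.replace(/,/g, ""));       // 12500000000000
const shouldAlert = scannedRows > CROSS_JOIN_SCANNED_ROWS_THRESHOLD; // true
const formattedScannedRows = scannedRows.toLocaleString();           // "12,500,000,000,000" (locale-dependent)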
--------------------------------------------------------------------------------
/spark-ui/src/reducers/Alerts/LongFilterConditions.ts:
--------------------------------------------------------------------------------
1 | import { Alerts, SparkSQLStore } from "../../interfaces/AppStore";
2 |
3 | const FILTER_CONDITION_TOO_LONG_CHARACTERS_THRESHOLD = 1000;
4 |
5 | export function reduceLongFilterConditions(sql: SparkSQLStore, alerts: Alerts) {
6 | sql.sqls.forEach((sql) => {
7 | sql.nodes.forEach((node) => {
8 | const filterCondition = node.nodeName === "Filter" && node.parsedPlan?.type === "Filter" ? node.parsedPlan.plan.condition : undefined;
9 | if (filterCondition !== undefined && filterCondition.length > FILTER_CONDITION_TOO_LONG_CHARACTERS_THRESHOLD) {
10 | const filterConditionLength = filterCondition.length;
11 | alerts.push({
12 | id: `longFilterCondition${sql.id}_${node.nodeId}_${filterConditionLength}`,
13 | name: "longFilterCondition",
14 | title: "Long Filter Condition",
15 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`,
16 | message: `Condition length is ${filterConditionLength}, which is too long and can cause performance issues`,
17 | suggestion: `
18 | 1. Try to convert your filter condition to a join statement, by creating a DF of your filter condition and inner joining it with your main DF
19 | 2. Consider rewriting your filter condition to be shorter
20 | `,
21 | type: "warning",
22 | source: {
23 | type: "sql",
24 | sqlId: sql.id,
25 | sqlNodeId: node.nodeId,
26 | },
27 | });
28 | }
29 |
30 | });
31 | });
32 | }
--------------------------------------------------------------------------------
/spark-ui/src/reducers/Alerts/PartitionSkewAlert.ts:
--------------------------------------------------------------------------------
1 | import { duration } from "moment";
2 | import {
3 | Alerts,
4 | SparkSQLStore,
5 | SparkStagesStore,
6 | } from "../../interfaces/AppStore";
7 | import { humanizeTimeDiff } from "../../utils/FormatUtils";
8 |
9 | export function reducePartitionSkewAlert(
10 | sql: SparkSQLStore,
11 | stages: SparkStagesStore,
12 | alerts: Alerts,
13 | ) {
14 | sql.sqls.forEach((sql) => {
15 | sql.nodes.forEach((node) => {
16 | const stageInfo = node.stage;
17 | if (stageInfo === undefined || stageInfo.type !== "onestage") {
18 | return;
19 | }
20 | const stageData = stages.find(
21 | (stage) => stage.stageId === stageInfo.stageId,
22 | );
23 |
24 | if (stageData?.hasPartitionSkew === true) {
25 | const maxTaskDurationTxt =
26 | stageData.maxTaskDuration === undefined
27 | ? ""
28 | : humanizeTimeDiff(duration(stageData.maxTaskDuration));
29 | const medianTaskDurationTxt =
30 | stageData.mediumTaskDuration === undefined
31 | ? ""
32 | : humanizeTimeDiff(duration(stageData.mediumTaskDuration));
33 | const skewRatio =
34 | stageData.maxTaskDuration === 0
35 | ? 0
36 | : (stageData.maxTaskDuration ?? 0) /
37 | (stageData.mediumTaskDuration ?? 1);
38 |
39 | alerts.push({
40 | id: `partitionSkew_${sql.id}_${node.nodeId}`,
41 | name: "partitionSkew",
42 | title: "Partition Skew",
43 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`,
44 | message: `Partition skew ratio of ${skewRatio.toFixed(
45 | 1,
46 | )}X, median task duration is ${medianTaskDurationTxt} and max task duration is ${maxTaskDurationTxt}`,
47 | suggestion: `
48 |                 1. Fix the partition skew by repartitioning your data differently
49 |                 2. Alternatively, keep the skew and decrease the number of executors/cores, so fewer resources are wasted
50 | `,
51 | type: "warning",
52 | source: {
53 | type: "sql",
54 | sqlId: sql.id,
55 | sqlNodeId: node.nodeId,
56 | },
57 | });
58 | }
59 | });
60 | });
61 | }
62 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/Alerts/SmallTasksAlert.ts:
--------------------------------------------------------------------------------
1 | import { duration } from "moment";
2 | import {
3 | Alerts,
4 | SparkSQLStore,
5 | SparkStagesStore,
6 | } from "../../interfaces/AppStore";
7 | import { humanizeTimeDiff } from "../../utils/FormatUtils";
8 |
9 | const LARGE_TASKS_NUM_THRESHOLD = 5000;
10 | const MEDIAN_TASK_TIME_THRESHOLD_MS = 500;
11 | const TASKS_RECOMMENDED_DECREASE_RATIO = 10;
12 |
13 | export function reduceSmallTasksAlert(
14 | sql: SparkSQLStore,
15 | stages: SparkStagesStore,
16 | alerts: Alerts,
17 | ) {
18 | sql.sqls.forEach((sql) => {
19 | sql.nodes.forEach((node) => {
20 | const stageInfo = node.stage;
21 | if (stageInfo === undefined || stageInfo.type !== "onestage") {
22 | return;
23 | }
24 | const stageData = stages.find(
25 | (stage) => stage.stageId === stageInfo.stageId,
26 | );
27 |
28 | if (stageData !== undefined &&
29 | stageData.numTasks > LARGE_TASKS_NUM_THRESHOLD &&
30 | stageData.mediumTaskDuration !== undefined &&
31 | stageData.mediumTaskDuration < MEDIAN_TASK_TIME_THRESHOLD_MS) {
32 | const medianTaskDurationTxt =
33 | stageData.mediumTaskDuration === undefined
34 | ? ""
35 | : humanizeTimeDiff(duration(stageData.mediumTaskDuration));
36 | const recommendedTaskNum = Math.ceil(stageData.numTasks / TASKS_RECOMMENDED_DECREASE_RATIO);
37 |
38 | alerts.push({
39 | id: `SmallTasks_${sql.id}_${node.nodeId}`,
40 | name: "smallTasks",
41 | title: "Large Number Of Small Tasks",
42 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`,
43 | message: `${stageData.numTasks} tasks with median task duration of ${medianTaskDurationTxt}, which causes large scheduling overhead for Spark`,
44 | suggestion: `
45 |                 1. Repartition to fewer tasks, to reduce scheduling overhead, by running .repartition(${recommendedTaskNum})
46 |                 2. Instead of repartitioning, you can run .coalesce(${recommendedTaskNum}) to decrease the number of tasks without shuffling, at the expense of less parallelism
47 | 3. If you need to hash-partition, call repartition like this: .repartition(${recommendedTaskNum}, "hash_key1", "hash_key2")
48 | `,
49 | shortSuggestion: `.repartition(${recommendedTaskNum}) before this transformation`,
50 | type: "warning",
51 | source: {
52 | type: "sql",
53 | sqlId: sql.id,
54 | sqlNodeId: node.nodeId,
55 | },
56 | });
57 | }
58 | });
59 | });
60 | }
61 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/Alerts/WastedCoresAlertsReducer.ts:
--------------------------------------------------------------------------------
1 | import { Alerts, ConfigStore, StatusStore } from "../../interfaces/AppStore";
2 |
3 | const WASTED_CORES_RATIO_THRESHOLD = 50.0;
4 |
5 | export function reduceWastedCoresAlerts(
6 | statusStore: StatusStore,
7 | config: ConfigStore,
8 | alerts: Alerts,
9 | ) {
10 | if (
11 | statusStore.executors !== undefined &&
12 | statusStore.executors.idleCoresRate > WASTED_CORES_RATIO_THRESHOLD
13 | ) {
14 | const idleCores = statusStore.executors.idleCoresRate;
15 |
16 |     let suggestionMessage = "decrease the number of cores or executors";
17 | if (config.resourceControlType === "databricks") {
18 | suggestionMessage =
19 | "Reduce your cluster size or machine type via databricks cluster UI";
20 | } else if (config.resourceControlType === "static") {
21 |       suggestionMessage = `1. decrease the number of cores per executor by lowering spark.executor.cores
22 | 2. decrease the number of executors by lowering spark.executor.instances, or, if using dynamic allocation, by tuning its configuration`;
23 | } else if (config.resourceControlType === "dynamic") {
24 |       suggestionMessage = `1. decrease the number of cores per executor by lowering spark.executor.cores
25 | 2. tune your Dynamic Allocation config, specifically lower spark.dynamicAllocation.executorAllocationRatio or increase spark.dynamicAllocation.schedulerBacklogTimeout`;
26 | }
27 |
28 | alerts.push({
29 | id: `idleCoresTooHigh${idleCores.toFixed(2)}`,
30 | name: "idleCoresTooHigh",
31 | title: "Idle Cores Too High",
32 |       location: "In: Summary Page -> Idle Cores",
33 | message: `Idle Cores is ${idleCores.toFixed(
34 | 2,
35 |       )}% which is too high, and suggests your cluster is over-provisioned on cores or executors`,
36 | suggestion: suggestionMessage,
37 | type: "warning",
38 | source: {
39 | type: "status",
40 | metric: "idleCores",
41 | },
42 | });
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/AlertsReducer.ts:
--------------------------------------------------------------------------------
1 | import {
2 | Alerts,
3 | AlertsStore,
4 | ConfigStore,
5 | SparkExecutorsStore,
6 | SparkSQLStore,
7 | SparkStagesStore,
8 | StatusStore,
9 | } from "../interfaces/AppStore";
10 | import { reduceBroadcastTooLargeAlert } from "./Alerts/BroadcastTooLargeAlert";
11 | import { reduceIcebergReplaces } from "./Alerts/IcebergReplacesReducer";
12 | import { reduceJoinToBroadcastAlert } from "./Alerts/JoinToBroadcastAlert";
13 | import { reduceLargeCrossJoinScanAlert } from "./Alerts/LargeCrossJoinScanAlert";
14 | import { reduceLongFilterConditions } from "./Alerts/LongFilterConditions";
15 | import { reduceMaxPartitionToBigAlert } from "./Alerts/MaxPartitionToBigAlert";
16 | import { reduceMemoryAlerts } from "./Alerts/MemoryAlertsReducer";
17 | import { reduceSQLInputOutputAlerts } from "./Alerts/MemorySQLInputOutputAlerts";
18 | import { reducePartitionSkewAlert } from "./Alerts/PartitionSkewAlert";
19 | import { reduceSmallTasksAlert } from "./Alerts/SmallTasksAlert";
20 | import { reduceWastedCoresAlerts } from "./Alerts/WastedCoresAlertsReducer";
21 | import { parseAlertDisabledConfig } from "../utils/ConfigParser";
22 |
23 | export function reduceAlerts(
24 | sqlStore: SparkSQLStore,
25 | statusStore: StatusStore,
26 | stageStore: SparkStagesStore,
27 | config: ConfigStore,
28 | executors: SparkExecutorsStore,
29 | environmentInfo: any
30 | ): AlertsStore {
31 | const alerts: Alerts = [];
32 | reduceMemoryAlerts(statusStore, config, environmentInfo, executors, alerts);
33 | reduceWastedCoresAlerts(statusStore, config, alerts);
34 | reduceSQLInputOutputAlerts(sqlStore, alerts);
35 | reducePartitionSkewAlert(sqlStore, stageStore, alerts);
36 | reduceSmallTasksAlert(sqlStore, stageStore, alerts);
37 | reduceIcebergReplaces(sqlStore, alerts);
38 | reduceLongFilterConditions(sqlStore, alerts);
39 | reduceBroadcastTooLargeAlert(sqlStore, alerts);
40 | reduceJoinToBroadcastAlert(sqlStore, alerts);
41 | reduceLargeCrossJoinScanAlert(sqlStore, alerts);
42 | reduceMaxPartitionToBigAlert(sqlStore, stageStore, alerts);
43 | const disabledAlerts = parseAlertDisabledConfig(config.alertDisabled);
44 | const filteredAlerts = alerts.filter(alert => !disabledAlerts.has(alert.name));
45 | return {
46 | alerts: filteredAlerts,
47 | };
48 | }
49 |
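
To isolate the disable-filtering step at the end of reduceAlerts, here is a small sketch; the helper name filterDisabledAlerts is hypothetical and only mirrors the last two statements above:

import { Alerts } from "../interfaces/AppStore";
import { parseAlertDisabledConfig } from "../utils/ConfigParser";

// Hypothetical helper: drops alerts whose name appears in spark.dataflint.alert.disabled
export function filterDisabledAlerts(alerts: Alerts, alertDisabled: string | undefined): Alerts {
  const disabled = parseAlertDisabledConfig(alertDisabled);
  return alerts.filter((alert) => !disabled.has(alert.name));
}

// filterDisabledAlerts(alerts, "partitionSkew,smallTasks") keeps every alert
// except the partitionSkew and smallTasks ones.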
--------------------------------------------------------------------------------
/spark-ui/src/reducers/ChatSlice.ts:
--------------------------------------------------------------------------------
1 | import { MessageModel } from "@chatscope/chat-ui-kit-react";
2 | import { createSlice, PayloadAction } from "@reduxjs/toolkit";
3 |
4 | export const initialState: {
5 | messages: MessageModel[];
6 | isTyping: boolean;
7 | inputText: string;
8 | apiKey: string | undefined;
9 | } = {
10 | messages: [
11 | {
12 | message: "Hello, ask me anything about your spark job!",
13 | sentTime: "just now",
14 | sender: "ChatGPT",
15 | direction: "outgoing",
16 | position: "normal",
17 | },
18 | ],
19 | isTyping: false,
20 | apiKey: undefined,
21 | inputText: "",
22 | };
23 |
24 | const chatSlice = createSlice({
25 | name: "chat",
26 | initialState,
27 | reducers: {
28 | addMessage: (
29 | state,
30 | action: PayloadAction<{
31 | message: MessageModel;
32 | }>,
33 | ) => {
34 | state.messages.push(action.payload.message);
35 | },
36 | setIsTyping: (
37 | state,
38 | action: PayloadAction<{
39 | isTyping: boolean;
40 | }>,
41 | ) => {
42 | state.isTyping = action.payload.isTyping;
43 | },
44 | setApiKey: (
45 | state,
46 | action: PayloadAction<{
47 | apiKey: string;
48 | }>,
49 | ) => {
50 | state.apiKey =
51 | action.payload.apiKey === "" ? undefined : action.payload.apiKey;
52 | },
53 | setInputText: (
54 | state,
55 | action: PayloadAction<{
56 | inputText: string;
57 | }>,
58 | ) => {
59 | state.inputText = action.payload.inputText;
60 | },
61 | },
62 | });
63 |
64 | // Export the action creators and the reducer
65 | export const { addMessage, setIsTyping, setApiKey, setInputText } =
66 | chatSlice.actions;
67 |
68 | export default chatSlice.reducer;
69 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/GeneralSlice.ts:
--------------------------------------------------------------------------------
1 | import { createSlice, PayloadAction } from "@reduxjs/toolkit";
2 | import { GraphFilter, SQLNodeExchangeStageData, SQLNodeStageData } from "../interfaces/AppStore";
3 |
4 | export const initialState: {
5 | sqlMode: GraphFilter;
6 | selectedStage: SQLNodeStageData | SQLNodeExchangeStageData | undefined;
7 | } = {
8 | sqlMode: "advanced",
9 | selectedStage: undefined
10 | };
11 |
12 | const generalSlice = createSlice({
13 | name: "general",
14 | initialState,
15 | reducers: {
16 | setSQLMode: (
17 | state,
18 | action: PayloadAction<{
19 | newMode: GraphFilter;
20 | }>,
21 | ) => {
22 | state.sqlMode = action.payload.newMode;
23 | },
24 | setSelectedStage: (
25 | state,
26 | action: PayloadAction<{
27 | selectedStage: SQLNodeStageData | SQLNodeExchangeStageData | undefined;
28 | }>,
29 | ) => {
30 | state.selectedStage = action.payload.selectedStage;
31 | },
32 | },
33 | });
34 |
35 | // Export the action creators and the reducer
36 | export const { setSQLMode, setSelectedStage } = generalSlice.actions;
37 |
38 | export default generalSlice.reducer;
39 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/CoalesceParser.test.ts:
--------------------------------------------------------------------------------
1 | import { ParsedCoalescePlan } from '../../interfaces/AppStore';
2 | import { parseCoalesce } from './CoalesceParser';
3 |
4 | describe('CoalesceParser', () => {
5 | it('should parse the partition number from a Coalesce plan', () => {
6 | const input = 'Coalesce 10';
7 | const expected: ParsedCoalescePlan = { partitionNum: 10 };
8 | const result = parseCoalesce(input);
9 | expect(result).toEqual(expected);
10 | });
11 | });
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/CoalesceParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedCoalescePlan } from "../../interfaces/AppStore";
2 |
3 | export function parseCoalesce(input: string): ParsedCoalescePlan {
4 | return {
5 | partitionNum: parseInt(input.split(" ")[1]),
6 | };
7 | }
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/CollectLimitParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedCollectLimitPlan } from "../../interfaces/AppStore";
2 |
3 | export function parseCollectLimit(input: string): ParsedCollectLimitPlan {
4 | return {
5 | limit: parseInt(input.split(" ")[1]),
6 | };
7 | }
8 |
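
An illustrative Jest-style check for parseCollectLimit, in the spirit of CoalesceParser.test.ts above; the input string is an assumed CollectLimit node description:

import { parseCollectLimit } from "./CollectLimitParser";

describe("parseCollectLimit", () => {
  it("parses the limit from a CollectLimit plan", () => {
    expect(parseCollectLimit("CollectLimit 21")).toEqual({ limit: 21 });
  });
});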
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/ExchangeParser.spec.ts:
--------------------------------------------------------------------------------
1 | import { parseExchange } from "./ExchangeParser";
2 |
3 | describe("parseExchange", () => {
4 | test("parses hash partitioning correctly", () => {
5 | const input =
6 | "Exchange hashpartitioning(ss_quantity#9, 200), REPARTITION_BY_COL, [plan_id=40]";
7 | expect(parseExchange(input)).toEqual({
8 | type: "hashpartitioning",
9 | fields: ["ss_quantity"],
10 | isBroadcast: false,
11 | });
12 | });
13 |
14 | test("parses single partition correctly", () => {
15 | const input =
16 | "Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=14514]";
17 | expect(parseExchange(input)).toEqual({
18 | type: "SinglePartition",
19 | fields: [],
20 | isBroadcast: false,
21 | });
22 | });
23 |
24 | test("parses range partitioning correctly", () => {
25 | const input =
26 | "Exchange rangepartitioning(ca_county#787 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=83408]";
27 | expect(parseExchange(input)).toEqual({
28 | type: "rangepartitioning",
29 | fields: ["ca_county ASC NULLS FIRST"],
30 | isBroadcast: false,
31 | });
32 | });
33 |
34 | test("parses broadcast correctly", () => {
35 | const input =
36 | "Exchange SinglePartition, EXECUTOR_BROADCAST, [plan_id=270]";
37 | expect(parseExchange(input)).toEqual({
38 | type: "SinglePartition",
39 | fields: [],
40 | isBroadcast: true,
41 | });
42 | });
43 |
44 | // Add more test cases as necessary
45 | });
46 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/ExchangeParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedExchangePlan } from "../../interfaces/AppStore";
2 | import { bracedSplit, hashNumbersRemover } from "./PlanParserUtils";
3 |
4 | export function parseExchange(input: string): ParsedExchangePlan {
5 | const typeRegex = /Exchange (\w+)/;
6 |
7 | const typeMatch = input.match(typeRegex);
8 |
9 | const parenthesisContent = input.match(/\(([^)]+)\)/)?.[1] ?? "";
10 | const allFields = bracedSplit(parenthesisContent).map((field) =>
11 | hashNumbersRemover(field.trim()),
12 | );
13 | // Remove the last element if it is a number (partition number)
14 | if (allFields.length > 0 && !isNaN(Number(allFields[allFields.length - 1]))) {
15 | allFields.pop();
16 | }
17 |
18 | const type = typeMatch ? typeMatch[1] : "";
19 |
20 | const isBroadcast = input.includes("EXECUTOR_BROADCAST");
21 |
22 | return { type, fields: allFields, isBroadcast: isBroadcast };
23 | }
24 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/FilterParser.ts:
--------------------------------------------------------------------------------
1 | import { ParseFilterPlan } from "../../interfaces/AppStore";
2 | import {
3 | hashNumbersRemover,
4 | removeFromEnd,
5 | removeFromStart,
6 | } from "./PlanParserUtils";
7 |
8 | export function parseFilter(input: string): ParseFilterPlan {
9 | let filterStr = input;
10 | filterStr = removeFromStart(filterStr, "Filter ");
11 | filterStr = removeFromStart(filterStr, "PhotonFilter ");
12 | filterStr = removeFromStart(filterStr, "GpuFilter ");
13 | filterStr = removeFromStart(filterStr, "CometFilter ");
14 |
15 | if (filterStr.startsWith("(")) {
16 | filterStr = removeFromStart(filterStr, "(");
17 | filterStr = removeFromEnd(filterStr, ")");
18 | }
19 | const condition = hashNumbersRemover(filterStr);
20 | return { condition: condition };
21 | }
22 |
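
An illustrative spec for parseFilter; the plan string below is a plausible Filter node description showing the prefix stripping, parenthesis removal and #id cleanup the parser performs:

import { parseFilter } from "./FilterParser";

describe("parseFilter", () => {
  it("strips the Filter prefix, wrapping parentheses and #ids", () => {
    const input = "Filter (isnotnull(ss_quantity#9) AND (ss_quantity#9 >= 1))";
    expect(parseFilter(input)).toEqual({
      condition: "isnotnull(ss_quantity) AND (ss_quantity >= 1)",
    });
  });
});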
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/JoinParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedJoinPlan } from "../../interfaces/AppStore";
2 | import {
3 | bracedSplit,
4 | hashNumbersRemover,
5 | removeFromEnd,
6 | removeFromStart,
7 | } from "./PlanParserUtils";
8 |
9 | export function parseJoin(input: string): ParsedJoinPlan {
10 | if (input.startsWith("BroadcastNestedLoopJoin")) {
11 | const regex = /BroadcastNestedLoopJoin(?:\s+\w+)?,\s+(\w+)(?:,\s*\((.*)\))?/;
12 | const match = hashNumbersRemover(input).match(regex);
13 | if (!match) {
14 | throw new Error("Invalid input format");
15 | }
16 | const [, , conditionStr] = match;
17 | let joinCondition = conditionStr ? conditionStr.trim() : undefined;
18 | return { joinType: "BroadcastNestedLoopJoin", joinSideType: "Cross", joinCondition };
19 | }
20 |
21 | const regex = /^(\w+)\s+\[(.*?)\], \[(.*?)\], (\w+)(?:,\s+(.*))?$/;
22 | const match = hashNumbersRemover(input).match(regex);
23 |
24 | if (!match) {
25 | throw new Error("Invalid input format");
26 | }
27 |
28 | const [, joinType, leftKeysStr, rightKeysStr, joinSideType, conditionStr] =
29 | match;
30 | let leftKeys: string[] | undefined;
31 | let rightKeys: string[] | undefined;
32 | let joinCondition: string | undefined;
33 |
34 | if (leftKeysStr) {
35 | leftKeys = bracedSplit(leftKeysStr);
36 | }
37 |
38 | if (rightKeysStr) {
39 | rightKeys = bracedSplit(rightKeysStr);
40 | }
41 |
42 | if (conditionStr) {
43 | joinCondition = conditionStr;
44 | joinCondition = removeFromEnd(joinCondition, ", false");
45 | joinCondition = removeFromStart(joinCondition, "BuildRight, ");
46 | joinCondition = removeFromStart(joinCondition, "BuildLeft, ");
47 | }
48 |
49 | if (joinCondition === "BuildRight") {
50 | joinCondition = undefined;
51 | }
52 | if (joinCondition === "BuildLeft") {
53 | joinCondition = undefined;
54 | }
55 |
56 | return {
57 | joinType,
58 | leftKeys,
59 | rightKeys,
60 | joinCondition,
61 | joinSideType,
62 | };
63 | }
64 |
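
An illustrative spec for the non-BroadcastNestedLoopJoin branch, using an assumed SortMergeJoin node description:

import { parseJoin } from "./JoinParser";

describe("parseJoin", () => {
  it("parses join type, keys and side from a SortMergeJoin plan", () => {
    const input = "SortMergeJoin [ss_customer_sk#3], [sr_customer_sk#73], Inner";
    expect(parseJoin(input)).toEqual({
      joinType: "SortMergeJoin",
      leftKeys: ["ss_customer_sk"],
      rightKeys: ["sr_customer_sk"],
      joinCondition: undefined,
      joinSideType: "Inner",
    });
  });
});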
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/PlanParserUtils.ts:
--------------------------------------------------------------------------------
1 | export function onlyUnique(value: string, index: number, array: string[]) {
2 | return array.indexOf(value) === index;
3 | }
4 |
5 | export function hashNumbersRemover(input: string): string {
6 | return input.replace(/#\d+L/g, "").replace(/#\d+/g, "");
7 | }
8 |
9 | export function truncateString(str: string, num: number): string {
10 | if (str.length <= num) {
11 | return str;
12 | }
13 | return str.slice(0, num) + "...";
14 | }
15 |
16 | export function truncateMiddle(str: string, maxLength: number): string {
17 | if (str.length <= maxLength) {
18 | return str;
19 | }
20 |
21 | const prefixLength = Math.ceil(maxLength / 2) - 1; // Subtract 1 for the '...'
22 | const suffixLength = Math.floor(maxLength / 2);
23 |
24 | const prefix = str.substring(0, prefixLength);
25 | const suffix = str.substring(str.length - suffixLength);
26 |
27 | return `${prefix}...${suffix}`;
28 | }
29 |
30 | export function removeFromStart(str: string, strToRemove: string): string {
31 | if (str.startsWith(strToRemove)) {
32 | return str.slice(strToRemove.length);
33 | }
34 | return str;
35 | }
36 |
37 | export function removeFromEnd(str: string, strToRemove: string) {
38 | if (str.endsWith(strToRemove)) {
39 | return str.slice(0, -strToRemove.length);
40 | }
41 | return str;
42 | }
43 |
44 | export function bracedSplit(input: string): string[] {
45 | const result: string[] = [];
46 | let buffer = "";
47 | let bracketCount = 0;
48 | let inQuotes = false;
49 |
50 | for (let i = 0; i < input.length; i++) {
51 | const char = input[i];
52 |
53 | if (char === "(") bracketCount++;
54 | if (char === ")") bracketCount--;
55 | if (char === '"') inQuotes = !inQuotes;
56 |
57 | if (char === "," && bracketCount === 0 && !inQuotes) {
58 | result.push(buffer.trim());
59 | buffer = "";
60 | } else {
61 | buffer += char;
62 | }
63 | }
64 | if (buffer) result.push(buffer.trim());
65 | return result;
66 | }
67 |
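
An illustrative look at how the two main helpers above behave; the inputs are made-up plan fragments:

import { bracedSplit, hashNumbersRemover } from "./PlanParserUtils";

// Splits on top-level commas only; commas inside parentheses are kept together
bracedSplit("ss_store_sk, sum(ss_net_profit, ss_tax), 200");
// -> ["ss_store_sk", "sum(ss_net_profit, ss_tax)", "200"]

// Strips Spark's #<id> / #<id>L attribute suffixes
hashNumbersRemover("ss_quantity#9 >= 1 AND count#123L > 0");
// -> "ss_quantity >= 1 AND count > 0"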
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/ProjectParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedProjectPlan } from "../../interfaces/AppStore";
2 | import { bracedSplit, hashNumbersRemover } from "./PlanParserUtils";
3 |
4 | export function parseProject(input: string): ParsedProjectPlan {
5 | const fieldsStr = hashNumbersRemover(
6 | input.replace("Project [", "").replace("PhotonProject [", "").replace("GpuProject [", "").slice(0, -1),
7 | );
8 | const fields = bracedSplit(fieldsStr).map((field) => field.trim());
9 | return { fields: fields };
10 | }
11 |
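
An illustrative spec for parseProject, assuming a typical Project node description:

import { parseProject } from "./ProjectParser";

describe("parseProject", () => {
  it("extracts projected fields without #ids", () => {
    expect(parseProject("Project [ss_quantity#9, ss_list_price#12]")).toEqual({
      fields: ["ss_quantity", "ss_list_price"],
    });
  });
});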
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/ScanFileParser.ts:
--------------------------------------------------------------------------------
1 | import { ParseFileScanPlan } from "../../interfaces/AppStore";
2 | import { hashNumbersRemover } from "./PlanParserUtils";
3 |
4 | export function parseFileScan(
5 | input: string,
6 | nodeName: string,
7 | ): ParseFileScanPlan {
8 | input = hashNumbersRemover(input);
9 | const result: ParseFileScanPlan = {};
10 | const matches = {
11 | format: /Format: (\w+),/.exec(input),
12 | Location: /Location: InMemoryFileIndex\([\w\s]+\)\[(.*?)\]/.exec(input),
13 | PartitionFilters: /PartitionFilters: \[(.*?)\]/.exec(input),
14 | PushedFilters: /PushedFilters: \[(.*?)\]/.exec(input),
15 | ReadSchema: /ReadSchema: struct<([\w\W]+)>/.exec(input),
16 | };
17 |
18 | if (matches.format) result.format = matches.format[1];
19 | if (matches.Location && matches.Location[1].includes("...")) {
20 | const paths = matches.Location[1].split(",");
21 | result.Location = paths.length ? paths[0] : undefined;
22 | } else if (matches.Location) {
23 | result.Location = matches.Location[1];
24 | }
25 |
26 | if (matches.PartitionFilters) {
27 | if (matches.PartitionFilters[1].includes("...")) {
28 | result.PartitionFilters = undefined;
29 | } else {
30 | result.PartitionFilters = matches.PartitionFilters[1]
31 | .split(",")
32 | .map((filter) => filter.trim())
33 | .filter(Boolean);
34 | }
35 | }
36 |
37 | if (matches.PushedFilters) {
38 | if (matches.PushedFilters[1].includes("...")) {
39 |       result.PushedFilters = undefined;
40 | } else {
41 | result.PushedFilters = matches.PushedFilters[1]
42 | .split(",")
43 | .map((filter) => filter.trim())
44 | .filter(Boolean);
45 | }
46 | }
47 |
48 | if (matches.ReadSchema) {
49 | if (matches.ReadSchema[1].includes("...")) {
50 | result.ReadSchema = undefined;
51 | } else {
52 | const fields = matches.ReadSchema[1].split(/,(?![^()]*\))/);
53 | const schema: { [key: string]: string } = {};
54 | fields.forEach((field) => {
55 | const [name, type] = field.split(":");
56 | if (name !== undefined && type !== undefined) {
57 | schema[name.trim()] = type.trim();
58 | }
59 | });
60 | result.ReadSchema = schema;
61 | }
62 | }
63 | if (nodeName.split(" ").length === 3) {
64 | result.tableName = nodeName.split(" ")[2];
65 | }
66 |
67 | return result;
68 | }
69 |
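
An illustrative spec with an abridged FileScan description; real node strings carry more fields, and the location and schema here are made up:

import { parseFileScan } from "./ScanFileParser";

describe("parseFileScan", () => {
  it("extracts format, location, filters and schema", () => {
    const input =
      "FileScan parquet [ss_quantity#9] Batched: true, Format: Parquet, " +
      "Location: InMemoryFileIndex(1 paths)[file:/tmp/store_sales], " +
      "PartitionFilters: [], PushedFilters: [IsNotNull(ss_quantity)], " +
      "ReadSchema: struct<ss_quantity:int>";
    expect(parseFileScan(input, "Scan parquet store_sales")).toEqual({
      format: "Parquet",
      Location: "file:/tmp/store_sales",
      PartitionFilters: [],
      PushedFilters: ["IsNotNull(ss_quantity)"],
      ReadSchema: { ss_quantity: "int" },
      tableName: "store_sales",
    });
  });
});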
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/SortParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedSortPlan } from "../../interfaces/AppStore";
2 | import { hashNumbersRemover } from "./PlanParserUtils";
3 |
4 | export function parseSort(input: string): ParsedSortPlan {
5 | const match = hashNumbersRemover(input).match(/\[(.*?)\]/);
6 | if (!match) {
7 | return { fields: [] };
8 | }
9 | const fields = match[1].split(",").map((field) => field.trim());
10 | return { fields: fields };
11 | }
12 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/SortParset.spec.ts:
--------------------------------------------------------------------------------
1 | import { parseSort } from "./SortParser";
2 |
3 | describe("parseSort", () => {
4 | it("should correctly parse the example input", () => {
5 | const input =
6 | "Sort [supplier_count#2941L DESC NULLS LAST, p_brand#267 ASC NULLS FIRST, p_type#268 ASC NULLS FIRST, p_size#269L ASC NULLS FIRST], true, 0";
7 | const expected = {
8 | fields: [
9 | "supplier_count DESC NULLS LAST",
10 | "p_brand ASC NULLS FIRST",
11 | "p_type ASC NULLS FIRST",
12 | "p_size ASC NULLS FIRST",
13 | ],
14 | };
15 | expect(parseSort(input)).toEqual(expected);
16 | });
17 | });
18 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/TakeOrderedAndProjectParser.spec.ts:
--------------------------------------------------------------------------------
1 | import { ParsedTakeOrderedAndProjectPlan } from "../../interfaces/AppStore";
2 | import { parseTakeOrderedAndProject } from "./TakeOrderedAndProjectParser";
3 |
4 | // Parametrized Unit Tests using Jest
5 | describe("parseTakeOrderedAndProject", () => {
6 | const testCases: {
7 | input: string;
8 | expected: ParsedTakeOrderedAndProjectPlan;
9 | }[] = [
10 | {
11 | input:
12 | "TakeOrderedAndProject(limit=100, orderBy=[s_store_name#1001 ASC NULLS FIRST], output=[s_store_name#1001,sum(ss_net_profit)#26850])",
13 | expected: {
14 | output: ["s_store_name", "sum(ss_net_profit)"],
15 | orderBy: ["s_store_name ASC NULLS FIRST"],
16 | limit: 100,
17 | },
18 | },
19 | // ... add other test cases here
20 | ];
21 |
22 | testCases.forEach(({ input, expected }) => {
23 | it(`should parse "${input}" correctly`, () => {
24 | expect(parseTakeOrderedAndProject(input)).toEqual(expected);
25 | });
26 | });
27 | });
28 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/TakeOrderedAndProjectParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedTakeOrderedAndProjectPlan } from "../../interfaces/AppStore";
2 | import { hashNumbersRemover } from "./PlanParserUtils";
3 |
4 | export function parseTakeOrderedAndProject(
5 | input: string,
6 | ): ParsedTakeOrderedAndProjectPlan {
7 | const cleanInput = hashNumbersRemover(input);
8 | const outputMatch = cleanInput.match(/output=\[([^\]]+)\]/);
9 | const orderByMatch = cleanInput.match(/orderBy=\[([^\]]+)\]/);
10 | const limitMatch = cleanInput.match(/limit=(\d+)/);
11 |
12 | return {
13 | output: outputMatch ? outputMatch[1].split(",") : [],
14 | orderBy: orderByMatch ? orderByMatch[1].split(",") : [],
15 | limit: limitMatch ? parseInt(limitMatch[1], 10) : 0,
16 | };
17 | }
18 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/WindowParser.spec.ts:
--------------------------------------------------------------------------------
1 | import { ParsedWindowPlan } from "../../interfaces/AppStore";
2 | import { parseWindow } from "./WindowParser";
3 |
4 | describe("parseWindow", () => {
5 | it("should parse simple Window", () => {
6 | const input =
7 | "Window [approx_count_distinct(user_id#0, 0.05, 0, 0) windowspecdefinition(category#2, day#3, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS distinct_users#113L], [category#2, day#3]";
8 | const expected: ParsedWindowPlan = {
9 | selectFields: ["approx_count_distinct(user_id, 0.05, 0, 0) windowspecdefinition(category, day, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS distinct_users"],
10 | partitionFields: ["category", "day"],
11 | sortFields: [],
12 | };
13 | expect(parseWindow(input)).toEqual(expected);
14 | });
15 |
16 | it("should parse window with sort field", () => {
17 | const input =
18 | "Window [row_number() windowspecdefinition(category#2, day#3 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#63004], [category#2], [day#3 ASC NULLS FIRST]";
19 | const expected: ParsedWindowPlan = {
20 | selectFields: ["row_number() windowspecdefinition(category, day ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number"],
21 | partitionFields: ["category"],
22 | sortFields: ["day ASC NULLS FIRST"],
23 | };
24 | expect(parseWindow(input)).toEqual(expected);
25 | });
26 |
27 | it("should parse window with sort field", () => {
28 | const input = "Window [row_number() windowspecdefinition(category#2, day#3 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#63004], [category#2], [day#3 ASC NULLS FIRST]"
29 | const expected: ParsedWindowPlan = {
30 | selectFields: ["row_number() windowspecdefinition(category, day ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number"],
31 | partitionFields: ["category"],
32 | sortFields: ["day ASC NULLS FIRST"],
33 | };
34 | expect(parseWindow(input)).toEqual(expected);
35 | });
36 |
37 |
38 |
39 | })
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/WindowParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedWindowPlan } from "../../interfaces/AppStore";
2 | import {
3 | bracedSplit,
4 | hashNumbersRemover
5 | } from "./PlanParserUtils";
6 |
7 | export function parseWindow(input: string): ParsedWindowPlan {
8 | // Improved regex to correctly capture each part of the window specification
9 | const regex = /Window \[(.*?)\](?:,\s*\[(.*?)\])?(?:,\s*\[(.*?)\])?/;
10 |
11 | // Remove any unwanted hash numbers
12 | const sanitizedInput = hashNumbersRemover(input);
13 |
14 | // Match the input string with the regex
15 | const match = sanitizedInput.match(regex);
16 |
17 | if (!match) {
18 | return { partitionFields: [], selectFields: [], sortFields: [] };
19 | }
20 |
21 | // Extract the matched groups (select, partition, sort)
22 | const selectFields = bracedSplit(match[1]);
23 |
24 | // Handle case when there are no partition or sort fields
25 | const partitionFields = match[2] ? bracedSplit(match[2]) : [];
26 | const sortFields = match[3] ? bracedSplit(match[3]) : [];
27 |
28 | return { partitionFields, selectFields, sortFields };
29 | }
30 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/WriteToHDFSParser.spec.ts:
--------------------------------------------------------------------------------
1 | import { parseWriteToHDFS } from "./WriteToHDFSParser"; // Ensure to export functions from parser.ts
2 |
3 | const testData = [
4 | {
5 | input:
6 | "Execute InsertIntoHadoopFsRelationCommand file:/Users/menishmueli/Documents/GitHub/tpch-spark/dbgen/output/Q01, false, CSV, [header=true, path=file:///Users/menishmueli/Documents/GitHub/tpch-spark/dbgen/output/Q01], Overwrite, [l_returnflag, l_linestatus, sum(l_quantity), sum(l_extendedprice), sum(UDF(l_extendedprice, l_discount)), sum(UDF(UDF(l_extendedprice, l_discount), l_tax)), avg(l_quantity), avg(l_extendedprice), avg(l_discount), count(l_quantity)]",
7 | expected: {
8 | location:
9 | "file:/Users/menishmueli/Documents/GitHub/tpch-spark/dbgen/output/Q01",
10 | format: "CSV",
11 | mode: "Overwrite",
12 | },
13 | },
14 | {
15 | input:
16 | 'Execute InsertIntoHadoopFsRelationCommand file:/tmp/output/partitiondata, false, [speaker#76], Parquet, [__partition_columns=["speaker"], path=/tmp/output/partitiondata], Append, [line_id, play_name, speech_number, line_number, speaker, text_entry]',
17 | expected: {
18 | location: "file:/tmp/output/partitiondata",
19 | format: "Parquet",
20 | partitionKeys: ["speaker"],
21 | mode: "Append",
22 | },
23 | },
24 | {
25 | input:
26 | "Execute InsertIntoHadoopFsRelationCommand file:/tmp/data, false, [speaker#94], Parquet, [path=file:/tmp/data], Append, `spark_catalog`.`local_catalog`.`my_table`, org.apache.spark.sql.execution.datasources.CatalogFileIndex(file:/tmp/data), [line_id, play_name, speech_number, line_number, text_entry, speaker]",
27 | expected: {
28 | location: "file:/tmp/data",
29 | format: "Parquet",
30 | partitionKeys: ["speaker"],
31 | mode: "Append",
32 | tableName: "`spark_catalog`.`local_catalog`.`my_table`",
33 | },
34 | },
35 | {
36 | input:
37 | "Execute InsertIntoHadoopFsRelationCommand file:/tmp/data2, false, Parquet, [path=file:/tmp/data2], Append, `spark_catalog`.`local_catalog`.`my_table`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:/tmp/data2), [line_id, play_name, speech_number, line_number, text_entry, speaker]",
38 | expected: {
39 | location: "file:/tmp/data2",
40 | format: "Parquet",
41 | mode: "Append",
42 | tableName: "`spark_catalog`.`local_catalog`.`my_table`",
43 | },
44 | },
45 | ];
46 |
47 | describe("parseWriteToHDFS", () => {
48 | testData.forEach((data, idx) => {
49 | it(`parses string ${idx + 1} correctly`, () => {
50 | const result = parseWriteToHDFS(data.input);
51 | expect(result).toEqual(data.expected);
52 | });
53 | });
54 | });
55 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/WriteToHDFSParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedWriteToHDFSPlan } from "../../interfaces/AppStore";
2 | import { hashNumbersRemover } from "./PlanParserUtils";
3 |
4 | export function specialSplit(input: string): string[] {
5 | const result: string[] = [];
6 | let buffer = "";
7 | let bracketCount = 0;
8 | let inQuotes = false;
9 |
10 | for (let i = 0; i < input.length; i++) {
11 | const char = input[i];
12 |
13 | if (char === "[") bracketCount++;
14 | if (char === "]") bracketCount--;
15 | if (char === '"') inQuotes = !inQuotes;
16 |
17 | if (char === "," && bracketCount === 0 && !inQuotes) {
18 | result.push(buffer.trim());
19 | buffer = "";
20 | } else {
21 | buffer += char;
22 | }
23 | }
24 |
25 | if (buffer) result.push(buffer.trim());
26 | return result;
27 | }
28 |
29 | export function parseWriteToHDFS(input: string): ParsedWriteToHDFSPlan {
30 | input = input.replace("Execute InsertIntoHadoopFsRelationCommand", "").trim();
31 | const parts = specialSplit(input);
32 |
33 | let parsed: ParsedWriteToHDFSPlan = {
34 | location: parts[0],
35 | format: "unknown",
36 | mode: "unknown",
37 | tableName: undefined,
38 | partitionKeys: undefined,
39 | };
40 |
41 |   if (parts[2].includes("[")) {
42 |     parsed.partitionKeys = hashNumbersRemover(parts[2].slice(1, -1)).split(
43 |       ",",
44 |     );
45 |     parsed.format = parts[3];
46 |     parsed.mode = parts[5];
47 | } else {
48 | parsed.format = parts[2];
49 | parsed.mode = parts[4];
50 | }
51 |
52 | if (parts[4].includes("`")) {
53 | parsed.tableName = parts[4];
54 | } else if (parts[5].includes("`")) {
55 | parsed.tableName = parts[5];
56 | } else if (parts.length > 6 && parts[6].includes("`")) {
57 | parsed.tableName = parts[6];
58 | }
59 |
60 | return parsed;
61 | }
62 |
--------------------------------------------------------------------------------
/spark-ui/src/reducers/PlanParsers/hashAggregateParser.ts:
--------------------------------------------------------------------------------
1 | import { ParsedHashAggregatePlan } from "../../interfaces/AppStore";
2 | import { bracedSplit, hashNumbersRemover, onlyUnique } from "./PlanParserUtils";
3 |
4 | export function parseHashAggregate(input: string): ParsedHashAggregatePlan {
5 | const cleanInput = hashNumbersRemover(input);
6 | const keysMatch = cleanInput.match(/keys=\[([^\]]+)\]/);
7 | const functionsMatch = cleanInput.match(/functions=\[([^\]]+)\]/);
8 |
9 | let keys: string[] = [];
10 | let functions: string[] = [];
11 | let operations: string[] = [];
12 |
13 | if (keysMatch && keysMatch[1]) {
14 | keys = bracedSplit(keysMatch[1]).map((key) => key.trim());
15 | }
16 |
17 | if (functionsMatch && functionsMatch[1]) {
18 | functions = bracedSplit(functionsMatch[1]).map((func) => func.trim());
19 |
20 | // Extracting only the outermost operation
21 | operations = functions
22 | .map((func) => {
23 | if (func.includes("count(distinct")) {
24 | return "count_distinct";
25 | }
26 | const match = func.match(/^\w+/);
27 | return match ? match[0] : "";
28 | })
29 | .filter(Boolean)
30 | .filter(onlyUnique);
31 | }
32 |
33 | return {
34 | keys,
35 | functions,
36 | operations,
37 | };
38 | }
39 |
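
An illustrative spec for parseHashAggregate, using an assumed HashAggregate node description:

import { parseHashAggregate } from "./hashAggregateParser";

describe("parseHashAggregate", () => {
  it("extracts keys, functions and de-duplicated operation names", () => {
    const input =
      "HashAggregate(keys=[ss_store_sk#7], functions=[sum(ss_net_profit#22), count(ss_item_sk#2)])";
    expect(parseHashAggregate(input)).toEqual({
      keys: ["ss_store_sk"],
      functions: ["sum(ss_net_profit)", "count(ss_item_sk)"],
      operations: ["sum", "count"],
    });
  });
});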
--------------------------------------------------------------------------------
/spark-ui/src/services/MixpanelService.tsx:
--------------------------------------------------------------------------------
1 | import mixpanel from "mixpanel-browser";
2 | import { MixpanelEvents } from "../interfaces/Mixpanel";
3 |
4 | const KEEP_ALIVE_INTERVAL_MS = 60 * 1000;
5 |
6 | const baseProperties = {
7 | dataflintVersion: process.env.REACT_APP_VERSION ?? "unknown-version",
8 | };
9 |
10 | export class MixpanelService {
11 | static mixpanelTelemetryConfigDisabled = false;
12 |
13 | static setMixpanelTelemetryConfigDisabled(): void {
14 | MixpanelService.mixpanelTelemetryConfigDisabled = true;
15 | }
16 |
17 | static InitMixpanel(): void {
18 | if (!this.ShouldTrack()) return;
19 |
20 | const MIX_PANEL_TOKEN = "114c37f7dc10c79978b850277136c232";
21 |
22 | // For debugging add debug: true to the props
23 | mixpanel.init(MIX_PANEL_TOKEN, {
24 | // using a cloudfront to skip ad blockers, see:
25 | // https://blog.pranavp.com.np/prevent-ad-blockers-from-blocking-mixpanel-without-nginx
26 | api_host: "https://drblx6b8i77l.cloudfront.net",
27 | track_pageview: true,
28 | persistence: "localStorage",
29 | });
30 | this.StartKeepAlive(KEEP_ALIVE_INTERVAL_MS);
31 | }
32 |
33 | /**
34 |    * Sends a keep-alive event every interval while the tab is focused, in order to keep the Mixpanel session "alive"
35 | * @param interval keep alive interval in ms
36 | */
37 | static StartKeepAlive(interval: number): void {
38 |     if (!this.ShouldTrack()) return;
39 |
40 | setInterval(() => {
41 | if (document.hidden) {
42 | // skip keep alive when tab is not in focus
43 | return;
44 | }
45 |
46 | this.Track(MixpanelEvents.KeepAlive, baseProperties);
47 | }, interval);
48 | }
49 |
50 | static Track(
51 | event: MixpanelEvents,
52 | properties?: { [key: string]: any },
53 | ): void {
54 | if (!this.ShouldTrack()) return;
55 |
56 | mixpanel.track(event, { ...baseProperties, ...properties });
57 | }
58 |
59 | static TrackPageView(properties?: { [key: string]: any }): void {
60 | if (!this.ShouldTrack()) return;
61 |
62 | mixpanel.track_pageview({ ...baseProperties, ...properties });
63 | }
64 |
65 | static ShouldTrack(): boolean {
66 | return (
67 | process.env.NODE_ENV !== "development" &&
68 | localStorage.getItem("SKIP_MIXPANEL") !== "true" &&
69 | !MixpanelService.mixpanelTelemetryConfigDisabled
70 | );
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/spark-ui/src/services/TabsService.tsx:
--------------------------------------------------------------------------------
1 | import AdjustIcon from "@mui/icons-material/Adjust";
2 | import AssessmentIcon from "@mui/icons-material/Assessment";
3 | import PrecisionManufacturingIcon from "@mui/icons-material/PrecisionManufacturing";
4 | import ReportIcon from "@mui/icons-material/Report";
5 | import SettingsApplicationsIcon from "@mui/icons-material/SettingsApplications";
6 | import React from "react";
7 | import { isHistoryServer } from "../utils/UrlUtils";
8 |
9 | export enum Tab {
10 | Status = "Status",
11 | Summary = "Summary",
12 | Resources = "Resources",
13 | Configuration = "Configuration",
14 | Alerts = "Alerts",
15 | }
16 |
17 | export const TabToUrl = {
18 | [Tab.Status]: "/status",
19 | [Tab.Summary]: "/summary",
20 | [Tab.Configuration]: "/config",
21 | [Tab.Alerts]: "/alerts",
22 | [Tab.Resources]: "/resources",
23 | };
24 |
25 | export const getTabByUrl = (path: string) => {
26 | switch (path) {
27 | case TabToUrl[Tab.Status]:
28 | return Tab.Status;
29 | case TabToUrl[Tab.Summary]:
30 | return Tab.Summary;
31 | case TabToUrl[Tab.Configuration]:
32 | return Tab.Configuration;
33 | case TabToUrl[Tab.Alerts]:
34 | return Tab.Alerts;
35 | case TabToUrl[Tab.Resources]:
36 | return Tab.Resources;
37 | default:
38 | return isHistoryServer() ? Tab.Summary : Tab.Status;
39 | }
40 | };
41 |
42 | export function renderTabIcon(selectedTab: Tab): JSX.Element {
43 | switch (selectedTab) {
44 |     case Tab.Status:
45 |       return <AdjustIcon />; // icon-to-tab mapping assumed from the imports above
46 |     case Tab.Configuration:
47 |       return <SettingsApplicationsIcon />;
48 |     case Tab.Summary:
49 |       return <AssessmentIcon />;
50 |     case Tab.Alerts:
51 |       return <ReportIcon />;
52 |     case Tab.Resources:
53 |       return <PrecisionManufacturingIcon />;
54 |     default:
55 |       return <AdjustIcon />;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/spark-ui/src/tabs/AlertsTab.tsx:
--------------------------------------------------------------------------------
1 | import { Alert, AlertTitle } from "@mui/material";
2 | import { Stack } from "@mui/system";
3 | import React, { FC } from "react";
4 | import { useAppSelector } from "../Hooks";
5 |
6 | export const AlertsTab: FC<{}> = (): JSX.Element => {
7 | const alerts = useAppSelector((state) => state.spark.alerts);
8 | const errorsCount = alerts?.alerts.filter((alert) => alert.type === "error")
9 | .length;
10 | const warningsCount = alerts?.alerts.filter(
11 | (alert) => alert.type === "warning",
12 | ).length;
13 |
14 |   return (
15 |     <>
16 |       {alerts?.alerts.length === 0 ? (
17 |         /* empty-state presentation assumed; original props unknown */
18 |         <Alert severity="success">
19 |           <AlertTitle>No alerts</AlertTitle>
20 |         </Alert>
21 |       ) : (
22 |         /* Stack/Alert layout props assumed */
23 |         <Stack spacing={2}>
24 |           <Stack direction="row" spacing={2}>
25 |             <Alert severity="error">{`Errors - ${errorsCount}`}</Alert>
26 |             <Alert severity="warning">{`Alerts - ${warningsCount}`}</Alert>
27 |           </Stack>
28 |           <Stack spacing={1}>
29 |             {alerts?.alerts.map((alert) => {
30 |               return (
31 |                 <Alert key={alert.id} severity={alert.type}>
32 |                   <AlertTitle>{alert.title}</AlertTitle>
33 |                   {alert.message}
34 |                   {"\n"}
35 |                   {alert.location}
36 |                   {"\n"}
37 |                   {`Suggestions: ${alert.suggestion}`}
38 |                 </Alert>
39 |               );
40 |             })}
41 |           </Stack>
42 |         </Stack>
43 |       )}
44 |     </>
45 |   );
76 | };
77 |
--------------------------------------------------------------------------------
/spark-ui/src/tabs/ConfigurationTab.tsx:
--------------------------------------------------------------------------------
1 | import { Box } from "@mui/material";
2 | import * as React from "react";
3 | import ConfigTable from "../components/ConfigTable";
4 | import { useAppSelector } from "../Hooks";
5 | import { MixpanelService } from "../services/MixpanelService";
6 |
7 | export default function ConfigurationTab() {
8 | const configs = useAppSelector(
9 | (state) => state.spark.config?.configs,
10 | )?.filter(
11 | (row) => row.category === "general" || row.category === "executor-memory",
12 | );
13 |
14 | React.useEffect(() => {
15 | MixpanelService.TrackPageView();
16 | }, []);
17 |
18 |   return (
19 |     /* Box layout props assumed */
20 |     <Box>
21 |       {/* ConfigTable prop name assumed */}
22 |       {!!configs && <ConfigTable config={configs} />}
23 |     </Box>
24 |   );
32 | }
33 |
--------------------------------------------------------------------------------
/spark-ui/src/tabs/StatusTab.tsx:
--------------------------------------------------------------------------------
1 | import * as React from "react";
2 | import NoQuery from "../components/NoQuery/NoQuery";
3 | import SqlContainer from "../components/SqlContainer";
4 | import StatusBar from "../components/StatusBar";
5 | import { useAppSelector } from "../Hooks";
6 | import { MixpanelService } from "../services/MixpanelService";
7 |
8 | export default function StatusTab() {
9 | const sql = useAppSelector((state) => state.spark.sql);
10 | const isIdle =
11 | useAppSelector((state) => state.spark.status?.stages?.status) == "idle";
12 |
13 | React.useEffect(() => {
14 | MixpanelService.TrackPageView();
15 | }, []);
16 |
17 |   return (
18 |     /* wrapper elements and layout props assumed */
19 |     <div>
20 |       {sql === undefined || sql.sqls.length === 0 || isIdle ? (
21 |         <NoQuery />
22 |       ) : (
23 |         <div>
24 |           <StatusBar />
25 |           <div>{sql.sqls.slice(-1)[0].description}</div>
26 |           <SqlContainer />
27 |         </div>
28 |       )}
29 |     </div>
30 |   );
51 | }
52 |
--------------------------------------------------------------------------------
/spark-ui/src/theme.ts:
--------------------------------------------------------------------------------
1 | import { red } from "@mui/material/colors";
2 | import { createTheme } from "@mui/material/styles";
3 |
4 | // A custom theme for this app
5 | const theme = createTheme({
6 | palette: {
7 | mode: "dark",
8 | primary: {
9 | main: "#3f51b5",
10 | },
11 | secondary: {
12 | main: "#19857b",
13 | },
14 | error: {
15 | main: red.A400,
16 | },
17 | },
18 | components: {
19 | MuiCssBaseline: {
20 | styleOverrides: {
21 | body: {
22 | scrollbarColor: "#6b6b6b #2b2b2b",
23 | "&::-webkit-scrollbar, & *::-webkit-scrollbar": {
24 | backgroundColor: "#2b2b2b",
25 | width: "0.8em",
26 | height: "0.8em",
27 | },
28 | "&::-webkit-scrollbar-thumb, & *::-webkit-scrollbar-thumb": {
29 | borderRadius: 8,
30 | backgroundColor: "#6b6b6b",
31 | minHeight: 24,
32 | border: "3px solid #2b2b2b",
33 | },
34 | "&::-webkit-scrollbar-thumb:focus, & *::-webkit-scrollbar-thumb:focus":
35 | {
36 | backgroundColor: "#959595",
37 | },
38 | "&::-webkit-scrollbar-thumb:active, & *::-webkit-scrollbar-thumb:active":
39 | {
40 | backgroundColor: "#959595",
41 | },
42 | "&::-webkit-scrollbar-thumb:hover, & *::-webkit-scrollbar-thumb:hover":
43 | {
44 | backgroundColor: "#959595",
45 | },
46 | "&::-webkit-scrollbar-corner, & *::-webkit-scrollbar-corner": {
47 | backgroundColor: "#2b2b2b",
48 | },
49 | },
50 | },
51 | },
52 | },
53 | });
54 |
55 | export default theme;
56 |
--------------------------------------------------------------------------------
/spark-ui/src/utils/ConfigParser.ts:
--------------------------------------------------------------------------------
1 | // Utility to parse the spark.dataflint.alert.disabled config
2 | export function parseAlertDisabledConfig(config: string | undefined): Set<string> {
3 | if (!config) return new Set();
4 | return new Set(config.split(',').map(x => x.trim()).filter(Boolean));
5 | }
6 |
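
For example, with spark.dataflint.alert.disabled set to "partitionSkew, smallTasks":

import { parseAlertDisabledConfig } from "./ConfigParser";

const disabled = parseAlertDisabledConfig("partitionSkew, smallTasks");
disabled.has("partitionSkew");      // true (whitespace around entries is trimmed)
disabled.has("idleCoresTooHigh");   // false
parseAlertDisabledConfig(undefined).size; // 0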
--------------------------------------------------------------------------------
/spark-ui/src/utils/FormatUtils.ts:
--------------------------------------------------------------------------------
1 | import { format, parse } from "bytes";
2 | import { Duration, duration } from "moment";
3 |
4 | export function humanFileSize(bytes: number): string {
5 | if (Number.isNaN(bytes)) return "NaN";
6 | const formatted = format(bytes, { unitSeparator: " " });
7 | return formatted
8 | .replace("KB", "KiB")
9 | .replace("MB", "MiB")
10 | .replace("GB", "GiB")
11 | .replace("TB", "TiB");
12 | }
13 |
14 | export function parseBytesString(str: string): number {
15 | return parse(
16 | str
17 | .replace("KiB", "KB")
18 | .replace("MiB", "MB")
19 | .replace("GiB", "GB")
20 | .replace("TiB", "TB"),
21 | );
22 | }
23 |
24 | export function humanFileSizeSparkConfigFormat(bytes: number): string {
25 | if (Number.isNaN(bytes)) return "NaN";
26 | const formatted = format(bytes);
27 | return formatted
28 | .replace("KB", "k")
29 | .replace("MB", "m")
30 | .replace("GB", "g")
31 | .replace("TB", "t");
32 | }
33 |
34 | export function humanizeTimeDiff(
35 | duration: Duration,
36 | roundSeconds: boolean = false,
37 | ): string {
38 | if (duration.asDays() >= 1) {
39 | return duration.asDays().toFixed(1) + "d";
40 | }
41 | if (duration.asHours() >= 1) {
42 | return duration.asHours().toFixed(1) + "h";
43 | }
44 | if (duration.asMinutes() >= 1) {
45 | return duration.asMinutes().toFixed(1) + "m";
46 | }
47 | if (duration.asSeconds() >= 1 || roundSeconds) {
48 | return roundSeconds
49 | ? duration.asSeconds().toFixed(0) + "s"
50 | : duration.asSeconds().toFixed(1) + "s";
51 | }
52 | return duration.asMilliseconds().toFixed(0) + "ms";
53 | }
54 |
55 | export function msToHours(ms: number): number {
56 | return ms / 1000 / 60 / 60;
57 | }
58 |
59 | export function hoursToMS(hours: number): number {
60 |   return hours * 1000 * 60 * 60;
61 | }
62 |
63 | export function timeStrToEpocTime(time: string): number {
64 | const addTimeMoment = new Date(time.replace("GMT", "Z"));
65 | return addTimeMoment.getTime();
66 | }
67 |
68 | export function timeStringToMilliseconds(
69 | timeString: string | undefined,
70 | ): number | undefined {
71 | if (timeString === undefined) {
72 | return undefined;
73 | }
74 | const unit = timeString.slice(-2).trim();
75 | const value = parseFloat(timeString.slice(0, -2).trim());
76 |
77 | switch (unit) {
78 | case "ms":
79 | return value;
80 | case "s":
81 | return duration(value, "seconds").asMilliseconds();
82 | case "m":
83 | return duration(value, "minutes").asMilliseconds();
84 | case "h":
85 | return duration(value, "hours").asMilliseconds();
86 | default:
87 | throw new Error(`Unsupported time unit: ${unit}`);
88 | }
89 | }
90 |
91 | export function calculatePercentage(value: number, total: number): number {
92 | if (total === undefined || value === undefined || total === 0) {
93 | return 0;
94 | }
95 | const percentage = (value / total) * 100;
96 | return Math.min(Math.max(percentage, 0), 100);
97 | }
98 |
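
A few example calls for the helpers above, with results that follow from the bytes/moment conversions as used here:

import { duration } from "moment";
import { humanFileSize, humanizeTimeDiff, timeStringToMilliseconds } from "./FormatUtils";

humanFileSize(1048576);                    // "1 MiB"
humanizeTimeDiff(duration(90, "seconds")); // "1.5m"
humanizeTimeDiff(duration(500));           // "500ms"
timeStringToMilliseconds("500ms");         // 500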
--------------------------------------------------------------------------------
/spark-ui/src/utils/UrlConsts.ts:
--------------------------------------------------------------------------------
1 | import {
2 | getProxyBasePath,
3 | hrefWithoutEndSlash,
4 | isDataFlintSaaSUI,
5 | isHistoryServer,
6 | isProxyMode,
7 | } from "./UrlUtils";
8 |
9 | const IS_HISTORY_SERVER_MODE = isHistoryServer();
10 |
11 | let BASE_PATH = "";
12 | let BASE_CURRENT_PAGE = hrefWithoutEndSlash();
13 | if (process.env.NODE_ENV === "development") {
14 | BASE_PATH = process.env.REACT_APP_BASE_PATH ?? "";
15 | BASE_CURRENT_PAGE = `${BASE_PATH}/dataflint`;
16 | } else if (isProxyMode()) {
17 | BASE_PATH = getProxyBasePath();
18 | } else if (isDataFlintSaaSUI()) {
19 | BASE_PATH = "/dataflint-spark-ui";
20 | }
21 |
22 | export { BASE_CURRENT_PAGE, BASE_PATH, IS_HISTORY_SERVER_MODE };
23 |
--------------------------------------------------------------------------------
/spark-ui/src/utils/UrlUtils.ts:
--------------------------------------------------------------------------------
1 | export const isHistoryServer = (): boolean =>
2 | window.location.href.includes("history");
3 |
4 | export const isProxyMode = (): boolean =>
5 | !(
6 | window.location.pathname === "/dataflint" ||
7 | window.location.pathname === "/dataflint/"
8 | );
9 |
10 | export const isDataFlintSaaSUI = (): boolean =>
11 | window.location.href.includes("dataflint-spark-ui");
12 |
13 | export function hrefWithoutEndSlash(): string {
14 | const href = window.location.href;
15 | let fixedUrl = href.split("/#/")[0];
16 |
17 | // We are using a HashRouter so we split by #
18 | if (fixedUrl.endsWith("index.html")) {
19 | fixedUrl = fixedUrl.substring(0, fixedUrl.length - "index.html".length);
20 | }
21 | if (fixedUrl.includes("?o=")) {
22 | fixedUrl = fixedUrl.split("dataflint")[0] + "dataflint";
23 | }
24 | if (fixedUrl.endsWith("/")) {
25 | fixedUrl = fixedUrl.substring(0, fixedUrl.length - 1);
26 | }
27 | return fixedUrl;
28 | }
29 |
30 | export const getProxyBasePath = (): string => {
31 | if (isHistoryServer()) {
32 | // in cases where we are in history server mode, the API should be before the last /history part
33 | // For example, for: http://localhost:18080/history//dataflint/
34 | // the api is in http://localhost:18080/api/
35 | // when the path is https://gateway/sparkhistory/history//1/dataflint/
36 | // the api is in https://gateway/sparkhistory/api/
37 | const url = new URL(window.location.href);
38 | const pathToBase = url.pathname.match(/^(.*)\/history\//);
39 |
40 | if (pathToBase && pathToBase[1]) {
41 | return `${url.origin}${pathToBase[1]}`;
42 | }
43 |
44 | // If the pattern isn't found or pathToBase[1] = '', assume it's in the root
45 | return "";
46 | } else {
47 | // in cases where we are not in history server mode, the API should be before the last /dataflint part
48 | // for example, for: http://localhost:18080/dataflint/
49 | // the api is in http://localhost:18080/api
50 | // when the path is https://gateway/mysparkapp/dataflint/
51 | // the api is in https://gateway/mysparkapp/api/
52 | return hrefWithoutEndSlash().substring(
53 | 0,
54 | hrefWithoutEndSlash().lastIndexOf("/dataflint"),
55 | );
56 | }
57 | };
58 |
59 | export function getHistoryServerCurrentAppId(): string {
60 | const urlSegments = hrefWithoutEndSlash().split("/");
61 | try {
62 | const historyIndex = urlSegments.findIndex(
63 | (segment) => segment === "history",
64 | );
65 | const appId = urlSegments[historyIndex + 1];
66 | return appId;
67 | } catch {
68 | throw new Error("Invalid history server app id");
69 | }
70 | }
71 |
72 | export const getBaseAppUrl = (appPath: string): string => {
73 | return appPath.substring(0, hrefWithoutEndSlash().lastIndexOf("/dataflint"));
74 | };
75 |
--------------------------------------------------------------------------------
/spark-ui/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es5",
4 | "lib": ["dom", "dom.iterable", "esnext"],
5 | "allowJs": true,
6 | "skipLibCheck": true,
7 | "esModuleInterop": true,
8 | "allowSyntheticDefaultImports": true,
9 | "strict": true,
10 | "forceConsistentCasingInFileNames": true,
11 | "module": "esnext",
12 | "moduleResolution": "node",
13 | "resolveJsonModule": true,
14 | "isolatedModules": true,
15 | "noEmit": true,
16 | "jsx": "react"
17 | },
18 | "include": ["src"]
19 | }
20 |
--------------------------------------------------------------------------------