├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── cd.yml │ └── ci.yml ├── .gitignore ├── .vscode └── launch.json ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── THIRD-PARTY-LICENSES.txt ├── documentation └── resources │ ├── demo.gif │ ├── howitworks.png │ ├── logo.png │ └── usage.png ├── spark-plugin ├── .gitignore ├── build.sbt ├── example_3_1_3 │ └── src │ │ └── main │ │ └── scala │ │ └── io │ │ └── dataflint │ │ └── example │ │ └── ShakespeareSpark313.scala ├── example_3_2_4 │ └── src │ │ └── main │ │ └── scala │ │ └── io │ │ └── dataflint │ │ └── example │ │ ├── Shakespeare324Exported.scala │ │ └── ShakespeareSpark324.scala ├── example_3_3_3 │ └── src │ │ └── main │ │ └── scala │ │ └── io │ │ └── dataflint │ │ └── example │ │ ├── IcebergExample333.scala │ │ ├── Shakespeare333Exported.scala │ │ └── ShakespeareSpark333.scala ├── example_3_4_1 │ └── src │ │ └── main │ │ └── scala │ │ └── io │ │ └── dataflint │ │ └── example │ │ ├── IcebergExample.scala │ │ ├── SalesFilterer.scala │ │ ├── SalesFiltererFixed.scala │ │ ├── Shakespeare341.scala │ │ ├── Shakespeare341Exported.scala │ │ ├── ShakespearePartitionedWriter.scala │ │ ├── ShakespearePartitionedWriterFixed.scala │ │ ├── ShakespeareUnpartitionedWriter.scala │ │ ├── ShakespeareUnpartitionedWriterFixed.scala │ │ └── SimpleStreaming.scala ├── example_3_4_1_remote │ └── src │ │ └── main │ │ └── scala │ │ └── io │ │ └── dataflint │ │ └── example │ │ └── Shakespeare341Remote.scala ├── example_3_5_1 │ └── src │ │ └── main │ │ └── scala │ │ ├── io │ │ └── dataflint │ │ │ └── example │ │ │ ├── AccessPatternExample.scala │ │ │ ├── CacheExample.scala │ │ │ ├── DataFusionCometExample.scala │ │ │ ├── DeltaLakeExample.scala │ │ │ ├── DeltaLakeStreaming.scala │ │ │ ├── IcebergExample.scala │ │ │ ├── JobGroupExample.scala │ │ │ ├── JobGroupExportedLocal.scala │ │ │ ├── JoinExample.scala │ │ │ ├── KafkaStreaming.scala │ │ │ ├── LargeBroadcastExample.scala │ │ │ ├── LargeFilterCondition.scala │ │ │ ├── PartitionSkewExample.scala │ │ │ ├── SchedulingSmallTasks.scala │ │ │ ├── SchedulingSmallTasksSkipAlerts.scala │ │ │ ├── SetJobDescriptionAndUDFName.scala │ │ │ ├── Shakespeare351.scala │ │ │ ├── Shakespeare351Exported.scala │ │ │ ├── Shakespeare351ExportedLocal.scala │ │ │ └── Shakespeare351ExportedLocal2.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── dataflint │ │ └── jobgroup │ │ └── tests │ │ └── JobGroupTests.scala ├── plugin │ └── src │ │ └── main │ │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.status.AppHistoryServerPlugin │ │ └── scala │ │ ├── io │ │ └── dataflint │ │ │ └── spark │ │ │ ├── SparkDataflint.scala │ │ │ └── SparkDataflintPlugin.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ ├── dataflint │ │ ├── DataflintSparkUILoader.scala │ │ ├── api │ │ │ ├── DataFlintTab.scala │ │ │ ├── DataflintApplicationInfoPage.scala │ │ │ ├── DataflintCachedStoragePage.scala │ │ │ ├── DataflintIcebergPage.scala │ │ │ ├── DataflintJettyUtils.scala │ │ │ ├── DataflintSQLMetricsPage.scala │ │ │ ├── DataflintSQLPlanPage.scala │ │ │ ├── DataflintSQLStagesRddPage.scala │ │ │ └── api.scala │ │ ├── iceberg │ │ │ ├── ClassLoaderChecker.scala │ │ │ └── DataflintIcebergMetricsReporter.scala │ │ ├── jobgroup │ │ │ └── JobGroupExtractor.scala │ │ ├── listener │ │ │ ├── DataflintDatabricksLiveListener.scala │ │ │ ├── DataflintListener.scala │ │ │ ├── DataflintStore.scala │ │ │ ├── LiveRDDsListener.scala │ │ │ └── model.scala │ │ ├── package.scala │ │ └── saas │ │ │ ├── DataflintRunExporterListener.scala │ 
│ │ ├── EnumSerializer.scala │ │ │ ├── ExecutorsMetricsSerializer.scala │ │ │ ├── GZipUtils.scala │ │ │ ├── JavaEnumNameSerializer.scala │ │ │ ├── S3Uploader.scala │ │ │ ├── SparkMetadataSerializer.scala │ │ │ ├── SparkMetadataStore.scala │ │ │ ├── SparkRunSerializer.scala │ │ │ ├── SparkRunStore.scala │ │ │ ├── StageTaskSummary.scala │ │ │ ├── StoreDataExtractor.scala │ │ │ └── StoreMetadataExtractor.scala │ │ └── deploy │ │ └── history │ │ ├── DataFlintHistoryServerPlugin.scala │ │ └── FsDataflintHistoryProvider.scala ├── project │ ├── build.properties │ └── publish.sbt ├── sonatype.sbt └── test_data │ └── will_play_text.csv └── spark-ui ├── .env ├── .generatelicensefile.yaml ├── .gitignore ├── gulpfile.js ├── package-lock.json ├── package.json ├── public ├── favicon.ico ├── icon.png ├── index.html ├── logo-grey.png ├── logo.png └── manifest.json ├── src ├── App.tsx ├── Hooks.ts ├── Router.tsx ├── Store.ts ├── components │ ├── AlertBadge │ │ ├── AlertBadge.tsx │ │ └── MultiAlertsBadge.tsx │ ├── AppDrawer │ │ ├── AppDrawer.tsx │ │ └── DrawerFooter.tsx │ ├── ColumnPicker │ │ └── ColumnPicker.tsx │ ├── ConfigTable.tsx │ ├── ExceptionIcon.tsx │ ├── InfoBox │ │ ├── InfoBox.module.css │ │ └── InfoBox.tsx │ ├── Modals │ │ └── DisconnectedModal.tsx │ ├── NoQuery │ │ └── NoQuery.tsx │ ├── Progress.tsx │ ├── ResourceBar.tsx │ ├── ResourceGraph │ │ ├── ColorsOutput.ts │ │ └── ResourceGraph.tsx │ ├── SqlContainer.tsx │ ├── SqlFlow │ │ ├── BytesDistributionChart.tsx │ │ ├── DurationDistributionChart.tsx │ │ ├── NumbersDistributionChart.tsx │ │ ├── SqlFlow.tsx │ │ ├── SqlLayoutService.ts │ │ ├── StageIcon.tsx │ │ ├── StageIconDrawer.tsx │ │ ├── StageNode.tsx │ │ └── node-style.module.css │ ├── SqlTable │ │ ├── SqlTable.tsx │ │ ├── TableTypes.tsx │ │ └── TableUtils.tsx │ ├── StatusBar.tsx │ └── SummaryBar.tsx ├── index.tsx ├── interfaces │ ├── AppStore.ts │ ├── ApplicationInfo.ts │ ├── CachedStorage.ts │ ├── IcebergInfo.ts │ ├── Mixpanel.ts │ ├── SQLPlan.ts │ ├── SparkApplications.ts │ ├── SparkConfiguration.ts │ ├── SparkExecutors.ts │ ├── SparkJobs.ts │ ├── SparkSQLs.ts │ ├── SparkStages.ts │ ├── SqlMetrics.ts │ └── StagesRdd.ts ├── react-app-env.d.ts ├── reducers │ ├── Alerts │ │ ├── BroadcastTooLargeAlert.ts │ │ ├── IcebergReplacesReducer.ts │ │ ├── JoinToBroadcastAlert.ts │ │ ├── LargeCrossJoinScanAlert.ts │ │ ├── LongFilterConditions.ts │ │ ├── MaxPartitionToBigAlert.ts │ │ ├── MemoryAlertsReducer.ts │ │ ├── MemorySQLInputOutputAlerts.ts │ │ ├── PartitionSkewAlert.ts │ │ ├── SmallTasksAlert.ts │ │ └── WastedCoresAlertsReducer.ts │ ├── AlertsReducer.ts │ ├── ChatSlice.ts │ ├── ConfigReducer.ts │ ├── ExecutorsReducer.ts │ ├── GeneralSlice.ts │ ├── JobsColumnSlice.ts │ ├── MetricsReducer.ts │ ├── PlanGraphUtils.ts │ ├── PlanParsers │ │ ├── CoalesceParser.test.ts │ │ ├── CoalesceParser.ts │ │ ├── CollectLimitParser.ts │ │ ├── ExchangeParser.spec.ts │ │ ├── ExchangeParser.ts │ │ ├── FilterParser.ts │ │ ├── JoinParser.spec.ts │ │ ├── JoinParser.ts │ │ ├── PlanParserUtils.ts │ │ ├── ProjectParser.ts │ │ ├── ScanFileParser.spec.ts │ │ ├── ScanFileParser.ts │ │ ├── SortParser.ts │ │ ├── SortParset.spec.ts │ │ ├── TakeOrderedAndProjectParser.spec.ts │ │ ├── TakeOrderedAndProjectParser.ts │ │ ├── WindowParser.spec.ts │ │ ├── WindowParser.ts │ │ ├── WriteToHDFSParser.spec.ts │ │ ├── WriteToHDFSParser.ts │ │ ├── hashAggregateParser.spec.ts │ │ └── hashAggregateParser.ts │ ├── SQLNodeStageReducer.ts │ ├── SparkSlice.ts │ ├── SqlReducer.ts │ ├── SqlReducerUtils.ts │ └── StatusReducer.ts ├── services │ ├── 
MixpanelService.tsx │ ├── SparkApi.tsx │ └── TabsService.tsx ├── tabs │ ├── AlertsTab.tsx │ ├── ChatTab.tsx │ ├── ConfigurationTab.tsx │ ├── ResourcesTab.tsx │ ├── StatusTab.tsx │ └── SummaryTab.tsx ├── theme.ts └── utils │ ├── ConfigParser.ts │ ├── FormatUtils.ts │ ├── UrlConsts.ts │ └── UrlUtils.ts └── tsconfig.json /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: menishmueli 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Environment** 14 | spark version: 3.2/3.3/3.4/3.5 15 | platform: EMR/DataProc/K8s/standalone/Databricks 16 | 17 | **To Reproduce** 18 | Steps to reproduce the behavior: 19 | 1. Go to '...' 20 | 2. Click on '....' 21 | 3. Scroll down to '....' 22 | 4. See error 23 | 24 | **Expected behavior** 25 | A clear and concise description of what you expected to happen. 26 | 27 | **Screenshots** 28 | If applicable, add screenshots to help explain your problem. 29 | 30 | **Additional context** 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | on: 3 | workflow_run: 4 | workflows: [CI] 5 | branches: [main] 6 | types: 7 | - completed 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | fetch-depth: 0 15 | - uses: actions/setup-java@v4 16 | with: 17 | distribution: temurin 18 | java-version: 8 19 | cache: sbt 20 | 21 | - name: Set up Node.js 22 | uses: actions/setup-node@v2 23 | with: 24 | node-version: 20 25 | 26 | - name: Install npm dependencies 27 | run: npm ci 28 | working-directory: ./spark-ui 29 | 30 | - name: build frontend 31 | run: npm run deploy 32 | working-directory: ./spark-ui 33 | 34 | - name: package plugin 35 | run: sbt package 36 | working-directory: ./spark-plugin 37 | 38 | - name: publish to maven staging 39 | run: sbt ci-release 40 | working-directory: ./spark-plugin 41 | env: 42 | PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} 43 | PGP_SECRET: ${{ secrets.PGP_SECRET }} 44 | SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} 45 | SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} 46 | 47 | - name: Changelog 48 | uses: scottbrenner/generate-changelog-action@master 49 | if: startsWith(github.ref, 'refs/tags/v') 50 | id: Changelog 51 | env: 52 | REPO: ${{ github.repository }} 53 | 54 | - name: Create Release 55 | id: create_release 56 | uses: actions/create-release@latest 57 | if: startsWith(github.ref, 'refs/tags/v') 58 | env: 59 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 60 | with: 61 | tag_name: ${{ github.ref }} 62 | release_name: Release ${{ github.ref }} 63 | body: | 64 | See: https://dataflint.gitbook.io/dataflint-for-spark/overview/release-notes#version-${{ github.ref_name }} 65 | 66 | commits change log: 67 | ${{ steps.Changelog.outputs.changelog }} 68 | draft: false 69 | prerelease: false -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by sbt-github-actions using the 2 | # githubWorkflowGenerate task.
You should add and commit this file to 3 | # your git repository. It goes without saying that you shouldn't edit 4 | # this file by hand! Instead, if you wish to make changes, you should 5 | # change your sbt build configuration to revise the workflow description 6 | # to meet your needs, then regenerate this file. 7 | 8 | name: CI 9 | 10 | on: 11 | pull_request: 12 | branches: ['**'] 13 | push: 14 | branches: ['**'] 15 | 16 | jobs: 17 | build: 18 | name: Build and Test 19 | strategy: 20 | matrix: 21 | os: [ubuntu-latest] 22 | java: [temurin@8] 23 | runs-on: ubuntu-latest 24 | steps: 25 | - name: Checkout current branch (full) 26 | uses: actions/checkout@v4 27 | with: 28 | fetch-depth: 0 29 | 30 | - name: Setup Java (temurin@8) 31 | if: matrix.java == 'temurin@8' 32 | uses: actions/setup-java@v3 33 | with: 34 | distribution: temurin 35 | java-version: 8 36 | cache: sbt 37 | 38 | - name: Set up Node.js 39 | uses: actions/setup-node@v2 40 | with: 41 | node-version: 20 42 | 43 | - name: Install dependencies 44 | run: npm ci 45 | working-directory: ./spark-ui 46 | 47 | - name: Run frontend unit tests 48 | run: npm run test 49 | working-directory: ./spark-ui 50 | 51 | - name: Build and test plugin 52 | run: sbt +test 53 | working-directory: ./spark-plugin 54 | 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.2", 6 | "configurations": [ 7 | { 8 | "type": "chrome", 9 | "request": "launch", 10 | "name": "Launch Chrome against localhost", 11 | "url": "http://localhost:4000", 12 | "webRoot": "${workspaceFolder}" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to DataFlint 2 | 3 | ## Getting started with development 4 | 5 | ### Setup 6 | 7 | Requirements: 8 | 1. Node v21.5.0 9 | 2. Java 8 or 11 10 | 3. Scala 2.12 11 | 4. SBT 1.3.13 12 | 5. IntelliJ IDEA with Scala and SBT plugins 13 | 6. Visual Studio Code 14 | 15 | ### Installation Steps 16 | 17 | 1. Clone the repository: 18 | ``` 19 | git clone https://github.com/dataflint/spark.git 20 | cd spark 21 | ``` 22 | 23 | 2. Set up the Spark Plugin: 24 | - Open the `spark-plugin` folder with IntelliJ IDEA 25 | - Ensure Scala and SBT plugins are installed in IntelliJ 26 | 27 | 3. Set up the UI: 28 | - Open the repository with Visual Studio Code 29 | - Install UI dependencies: 30 | ``` 31 | cd spark-ui 32 | npm install 33 | ``` 34 | 35 | 4. Build the UI for the plugin: 36 | ``` 37 | cd spark-ui 38 | npm run deploy 39 | ``` 40 | 41 | 5. (Optional) Install Local CORS Proxy (LCP) for local development: 42 | ``` 43 | brew install lcp 44 | ``` 45 | 46 | ### Running the Project 47 | 48 | 1. Run one of the examples in the `spark-examples-351` project using IntelliJ 49 | 50 | 2. 
Access the Spark UI: 51 | - Browse to `http://localhost:10000` 52 | - Verify that DataFlint opens successfully 53 | 54 | ### Live Frontend Development 55 | 56 | For live frontend development, follow these steps: 57 | 58 | 1. Start the development server and proxy: 59 | ``` 60 | cd spark-ui 61 | npm run start 62 | npm run proxy 63 | ``` 64 | 65 | 2. Access the development UI: 66 | - Browse to `http://localhost:4000` 67 | - This should run the DataFlint UI with live reloading 68 | 69 | ## Contributing Guidelines 70 | 71 | - Please ensure your code follows the project's coding standards 72 | - Submit pull requests for review 73 | 74 | Thank you for contributing to DataFlint! 75 | -------------------------------------------------------------------------------- /documentation/resources/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/demo.gif -------------------------------------------------------------------------------- /documentation/resources/howitworks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/howitworks.png -------------------------------------------------------------------------------- /documentation/resources/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/logo.png -------------------------------------------------------------------------------- /documentation/resources/usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/documentation/resources/usage.png -------------------------------------------------------------------------------- /spark-plugin/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # From https://github.com/github/gitignore/blob/master/Scala.gitignore 3 | *.class 4 | *.log 5 | 6 | # From https://github.com/github/gitignore/blob/master/Global/SBT.gitignore 7 | dist/* 8 | target/ 9 | lib_managed/ 10 | src_managed/ 11 | project/boot/ 12 | project/plugins/project/ 13 | .history 14 | .cache 15 | .lib/ 16 | 17 | # From https://github.com/github/gitignore/blob/master/Global/Eclipse.gitignore 18 | .metadata 19 | bin/ 20 | tmp/ 21 | *.tmp 22 | *.bak 23 | *.swp 24 | *~.nib 25 | local.properties 26 | .settings/ 27 | .loadpath 28 | .recommenders 29 | 30 | .externalToolBuilders/ 31 | *.launch 32 | *.pydevproject 33 | .cproject 34 | .factorypath 35 | .buildpath 36 | .target 37 | .tern-project 38 | .texlipse 39 | .springBeans 40 | .recommenders/ 41 | 42 | # Scala IDE specific (Scala & Java development for Eclipse) 43 | .cache-main 44 | .cache-tests 45 | .classpath 46 | .project 47 | .scala_dependencies 48 | .worksheet 49 | 50 | .idea 51 | 52 | # custom 53 | null/ 54 | plugin/src/main/resources/io/ 55 | 56 | # custom 57 | .bsp 58 | -------------------------------------------------------------------------------- /spark-plugin/example_3_1_3/src/main/scala/io/dataflint/example/ShakespeareSpark313.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import
org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | import java.nio.file.Paths 7 | 8 | object ShakespeareSpark313 extends App { 9 | def df(spark: SparkSession): DataFrame = spark.read 10 | .format("csv") 11 | .option("sep", ";") 12 | .option("inferSchema", true) 13 | .load("./test_data/will_play_text.csv") 14 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 15 | .repartition(1000) 16 | 17 | val spark = SparkSession 18 | .builder() 19 | .appName("Shakespeare Statistics") 20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 21 | .config("spark.dataflint.telemetry.enabled", false) 22 | .config("spark.ui.port", "10000") 23 | .master("local[*]") 24 | .getOrCreate() 25 | 26 | import spark.implicits._ 27 | 28 | val shakespeareText = df(spark) 29 | 30 | shakespeareText.printSchema() 31 | 32 | val count = shakespeareText.count() 33 | println(s"number of records : $count") 34 | 35 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count() 36 | println(s"number of unique speakers : $uniqueSpeakers") 37 | 38 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count() 39 | 40 | println(s"number of unique words : $uniqueWords") 41 | 42 | scala.io.StdIn.readLine() 43 | spark.stop() 44 | } 45 | -------------------------------------------------------------------------------- /spark-plugin/example_3_2_4/src/main/scala/io/dataflint/example/Shakespeare324Exported.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object Shakespeare324Exported extends App { 6 | def df(spark: SparkSession): DataFrame = spark.read 7 | .format("csv") 8 | .option("sep", ";") 9 | .option("inferSchema", true) 10 | .load("./test_data/will_play_text.csv") 11 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 12 | .repartition(1000) 13 | 14 | val spark = SparkSession 15 | .builder 16 | .appName("Shakespeare Statistics Exported") 17 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 18 | .config("spark.dataflint.telemetry.enabled", false) 19 | .config("spark.ui.port", "10000") 20 | .config("spark.sql.maxMetadataStringLength", "10000") 21 | .config("spark.eventLog.enabled", "true") 22 | .master("local[*]") 23 | .getOrCreate() 24 | 25 | val shakespeareText = df(spark) 26 | 27 | val count = shakespeareText.count() 28 | println(s"number of records : $count") 29 | 30 | spark.stop() 31 | } 32 | -------------------------------------------------------------------------------- /spark-plugin/example_3_2_4/src/main/scala/io/dataflint/example/ShakespeareSpark324.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | import java.nio.file.Paths 7 | 8 | object ShakespeareSpark324 extends App { 9 | def fsPath(resource: String): String = 10 | Paths.get(this.getClass.getResource(resource).toURI).toString 11 | 12 | def df(spark: SparkSession): DataFrame = spark.read 13 | .format("csv") 14 | .option("sep", ";") 15 | .option("inferSchema", true) 16 | .load("./test_data/will_play_text.csv") 17 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 18 | .repartition(1000) 19 | 20 | val spark = 
SparkSession 21 | .builder() 22 | .appName("Shakespeare Statistics") 23 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 24 | .config("spark.dataflint.telemetry.enabled", false) 25 | .config("spark.ui.port", "10000") 26 | .master("local[*]") 27 | .getOrCreate() 28 | 29 | import spark.implicits._ 30 | 31 | val shakespeareText = df(spark) 32 | 33 | shakespeareText.printSchema() 34 | 35 | val count = shakespeareText.count() 36 | println(s"number of records : $count") 37 | 38 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count() 39 | println(s"number of unique speakers : $uniqueSpeakers") 40 | 41 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count() 42 | 43 | println(s"number of unique words : $uniqueWords") 44 | 45 | scala.io.StdIn.readLine() 46 | spark.stop() 47 | } 48 | -------------------------------------------------------------------------------- /spark-plugin/example_3_3_3/src/main/scala/io/dataflint/example/Shakespeare333Exported.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | object Shakespeare333Exported extends App { 8 | def df(spark: SparkSession): DataFrame = spark.read 9 | .format("csv") 10 | .option("sep", ";") 11 | .option("inferSchema", true) 12 | .load("./test_data/will_play_text.csv") 13 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 14 | .repartition(1000) 15 | 16 | val spark = SparkSession 17 | .builder 18 | .appName("Shakespeare Statistics Exported") 19 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 20 | .config("spark.dataflint.telemetry.enabled", false) 21 | .config("spark.ui.port", "10000") 22 | .config("spark.sql.maxMetadataStringLength", "10000") 23 | .config("spark.eventLog.enabled", "true") 24 | .master("local[*]") 25 | .getOrCreate() 26 | 27 | val shakespeareText = df(spark) 28 | 29 | val count = shakespeareText.count() 30 | println(s"number of records : $count") 31 | 32 | spark.stop() 33 | } 34 | -------------------------------------------------------------------------------- /spark-plugin/example_3_3_3/src/main/scala/io/dataflint/example/ShakespeareSpark333.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | object ShakespeareSpark333 extends App { 7 | def df(spark: SparkSession): DataFrame = spark.read 8 | .format("csv") 9 | .option("sep", ";") 10 | .option("inferSchema", true) 11 | .load("./test_data/will_play_text.csv") 12 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 13 | .repartition(1000) 14 | 15 | val spark = SparkSession 16 | .builder() 17 | .appName("Shakespeare Statistics") 18 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 19 | .config("spark.dataflint.telemetry.enabled", false) 20 | .config("spark.ui.port", "10000") 21 | .master("local[*]") 22 | .getOrCreate() 23 | 24 | import spark.implicits._ 25 | 26 | val shakespeareText = df(spark) 27 | 28 | shakespeareText.printSchema() 29 | 30 | val count = shakespeareText.count() 31 | println(s"number of records : $count") 32 | 33 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count() 34 | println(s"number of unique 
speakers : $uniqueSpeakers") 35 | 36 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count() 37 | 38 | println(s"number of unique words : $uniqueWords") 39 | 40 | scala.io.StdIn.readLine() 41 | spark.stop() 42 | } 43 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/IcebergExample.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object IcebergExample extends App{ 6 | val spark = SparkSession 7 | .builder() 8 | .appName("Iceberg Example") 9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 10 | .config("spark.dataflint.telemetry.enabled", false) 11 | .config("spark.ui.port", "10000") 12 | .config("spark.sql.maxMetadataStringLength", "10000") 13 | .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") 14 | .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") 15 | .config("spark.sql.catalog.spark_catalog.type", "hive") 16 | .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") 17 | .config("spark.sql.catalog.local.type", "hadoop") 18 | .config("spark.sql.catalog.local.warehouse", "/tmp/iceberg-example/warehouse") 19 | .config("spark.sql.defaultCatalog", "local") 20 | .config("spark.sql.catalog.local.metrics-reporter-impl", "org.apache.spark.dataflint.iceberg.DataflintIcebergMetricsReporter") 21 | .master("local[*]") 22 | .getOrCreate() 23 | 24 | spark.sparkContext.setJobDescription("Drop table if exists") 25 | spark.sql("DROP TABLE IF EXISTS demo.nyc.taxis PURGE") 26 | 27 | spark.sparkContext.setJobDescription("Create table") 28 | spark.sql( 29 | """ 30 | |CREATE TABLE demo.nyc.taxis 31 | |( 32 | | vendor_id bigint, 33 | | trip_id bigint, 34 | | trip_distance float, 35 | | fare_amount double, 36 | | store_and_fwd_flag string 37 | |) 38 | |PARTITIONED BY (vendor_id); 39 | |""".stripMargin) 40 | 41 | spark.sparkContext.setJobDescription("Insert 2 records to table") 42 | spark.sql( 43 | """ 44 | |INSERT INTO demo.nyc.taxis 45 | |VALUES (1, 1000371, 1.8, 15.32, 'N'), (2, 1000372, 2.5, 22.15, 'N'); 46 | |""".stripMargin) 47 | 48 | spark.sparkContext.setJobDescription("Select from table") 49 | spark.sql("SELECT * FROM demo.nyc.taxis").show() 50 | 51 | scala.io.StdIn.readLine() 52 | } 53 | 54 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/SalesFilterer.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | object SalesFilterer extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("Sales Filterer") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.ui.port", "10000") 13 | .config("spark.eventLog.enabled", true) 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .config("spark.dataflint.telemetry.enabled", false) 16 | .master("local[1]") 17 | .getOrCreate() 18 | 19 | import spark.implicits._ 20 | 21 | spark.read 22 | .load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales") 23 | .filter($"ss_quantity" > 1) 24 | .write 25 | .mode(SaveMode.Overwrite) 26 | 
.partitionBy("ss_quantity") 27 | .parquet("/tmp/store_sales") 28 | 29 | // scala.io.StdIn.readLine() 30 | spark.stop() 31 | } 32 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/SalesFiltererFixed.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | object SalesFiltererFixed extends App { 6 | val spark = SparkSession 7 | .builder() 8 | .appName("Sales Filterer Fixed") 9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 10 | .config("spark.ui.port", "10000") 11 | .config("spark.eventLog.enabled", true) 12 | .config("spark.sql.maxMetadataStringLength", "10000") 13 | .config("spark.dataflint.telemetry.enabled", false) 14 | .master("local[1]") 15 | .getOrCreate() 16 | 17 | import spark.implicits._ 18 | 19 | spark.read 20 | .load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales") 21 | .filter($"ss_quantity" > 1) 22 | .repartition($"ss_quantity") 23 | .write 24 | .mode(SaveMode.Overwrite) 25 | .partitionBy("ss_quantity") 26 | .parquet("/tmp/store_sales") 27 | 28 | // scala.io.StdIn.readLine() 29 | spark.stop() 30 | } 31 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/Shakespeare341.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | import java.nio.file.Paths 7 | 8 | object Shakespeare341 extends App { 9 | def df(spark: SparkSession): DataFrame = spark.read 10 | .format("csv") 11 | .option("sep", ";") 12 | .option("inferSchema", true) 13 | .load("./test_data/will_play_text.csv") 14 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 15 | .repartition(1000) 16 | 17 | val spark = SparkSession 18 | .builder() 19 | .appName("Shakespeare Statistics") 20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 21 | .config("spark.dataflint.telemetry.enabled", false) 22 | .config("spark.ui.port", "10000") 23 | .config("spark.sql.maxMetadataStringLength", "10000") 24 | .master("local[*]") 25 | .getOrCreate() 26 | 27 | import spark.implicits._ 28 | 29 | val shakespeareText = df(spark) 30 | 31 | shakespeareText.printSchema() 32 | 33 | val count = shakespeareText.count() 34 | println(s"number of records : $count") 35 | 36 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count() 37 | println(s"number of unique speakers : $uniqueSpeakers") 38 | 39 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count() 40 | 41 | println(s"number of unique words : $uniqueWords") 42 | 43 | scala.io.StdIn.readLine() 44 | spark.stop() 45 | } 46 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/Shakespeare341Exported.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | 8 | object Shakespeare341Exported extends App { 9 | def df(spark: SparkSession): DataFrame = spark.read 10 | .format("csv") 11 | 
.option("sep", ";") 12 | .option("inferSchema", true) 13 | .load("./test_data/will_play_text.csv") 14 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 15 | .repartition(1000) 16 | 17 | val spark = SparkSession 18 | .builder 19 | .appName("Shakespeare Statistics Exported") 20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 21 | .config("spark.dataflint.telemetry.enabled", false) 22 | .config("spark.sql.maxMetadataStringLength", "10000") 23 | .config("spark.eventLog.enabled", "true") 24 | .master("local[*]") 25 | .getOrCreate() 26 | 27 | val shakespeareText = df(spark) 28 | 29 | val count = shakespeareText.count() 30 | println(s"number of records : $count") 31 | 32 | spark.stop() 33 | } 34 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespearePartitionedWriter.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | object ShakespearePartitionedWriter extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("Shakespeare Partitioned Writer") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.dataflint.telemetry.enabled", false) 13 | .config("spark.ui.port", "10000") 14 | .config("spark.eventLog.enabled", true) 15 | .config("spark.sql.maxMetadataStringLength", "10000") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | spark.read 20 | .format("csv") 21 | .option("sep", ";") 22 | .option("inferSchema", true) 23 | .load("./test_data/will_play_text.csv") 24 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 25 | .repartition(200) 26 | .write 27 | .mode(SaveMode.Overwrite) 28 | .partitionBy("play_name") 29 | .parquet("/tmp/shakespear_partitioned") 30 | 31 | spark.stop() 32 | } 33 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespearePartitionedWriterFixed.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | object ShakespearePartitionedWriterFixed extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("Shakespeare Partitioned Writer Fixed") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.dataflint.telemetry.enabled", false) 13 | .config("spark.ui.port", "10000") 14 | .config("spark.eventLog.enabled", true) 15 | .config("spark.sql.maxMetadataStringLength", "10000") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | import spark.implicits._ 20 | 21 | spark.read 22 | .format("csv") 23 | .option("sep", ";") 24 | .option("inferSchema", true) 25 | .load("./test_data/will_play_text.csv") 26 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 27 | .repartition(200) 28 | .repartition($"play_name") 29 | .write 30 | .mode(SaveMode.Overwrite) 31 | .partitionBy("play_name") 32 | .parquet("/tmp/shakespear_partitioned") 33 | 34 | spark.stop() 35 | } 36 | -------------------------------------------------------------------------------- 
/spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespeareUnpartitionedWriter.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | object ShakespeareUnpartitionedWriter extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("Shakespeare Unpartitioned Writer") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.dataflint.telemetry.enabled", false) 13 | .config("spark.ui.port", "10000") 14 | .config("spark.eventLog.enabled", true) 15 | .config("spark.sql.maxMetadataStringLength", "10000") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | val shakespeareDF = spark.read 20 | .format("csv") 21 | .option("sep", ";") 22 | .option("inferSchema", true) 23 | .load("./test_data/will_play_text.csv") 24 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 25 | .repartition(200) 26 | 27 | shakespeareDF 28 | .mapPartitions(itr => { 29 | // simulate slow write like in S3 30 | Thread.sleep(200) 31 | itr 32 | })(shakespeareDF.encoder) 33 | .write.mode(SaveMode.Overwrite).parquet("/tmp/shakespear") 34 | 35 | scala.io.StdIn.readLine() 36 | spark.stop() 37 | } 38 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/ShakespeareUnpartitionedWriterFixed.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | object ShakespeareUnpartitionedWriterFixed extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("Shakespeare Unpartitioned Writer Fixed") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.dataflint.telemetry.enabled", false) 13 | .config("spark.ui.port", "10000") 14 | .config("spark.eventLog.enabled", true) 15 | .config("spark.sql.maxMetadataStringLength", "10000") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | val shakespeareDF = spark.read 20 | .format("csv") 21 | .option("sep", ";") 22 | .option("inferSchema", true) 23 | .load("./test_data/will_play_text.csv") 24 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 25 | .repartition(200) 26 | 27 | shakespeareDF 28 | .repartition(1) 29 | .mapPartitions(itr => { 30 | // simulate slow write like in S3 31 | Thread.sleep(200) 32 | itr 33 | })(shakespeareDF.encoder) 34 | .write.mode(SaveMode.Overwrite).parquet("/tmp/shakespear") 35 | 36 | spark.stop() 37 | } 38 | -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1/src/main/scala/io/dataflint/example/SimpleStreaming.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | import java.sql.Timestamp 7 | 8 | object SimpleStreaming extends App { 9 | val spark = SparkSession 10 | .builder() 11 | .appName("Simple Streaming") 12 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 13 | .config("spark.dataflint.telemetry.enabled", false) 14 | .config("spark.ui.port", "10000") 15 | .config("spark.sql.maxMetadataStringLength", 
"10000") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | import spark.implicits._ 20 | 21 | val numbers = (1 to 10).toList 22 | val numbersDF = numbers.toDF("number") 23 | 24 | // Create a streaming DataFrame 25 | val streamingNumbers = spark.readStream 26 | .format("rate") 27 | .option("rowsPerSecond", "1") 28 | .load() 29 | .as[(Long, Timestamp)] 30 | .flatMap(_ => numbers) 31 | .toDF("number") 32 | 33 | // Filter numbers divisible by 2 34 | val filteredStream = streamingNumbers 35 | .mapPartitions(i => { 36 | Thread.sleep(10000) 37 | i 38 | })(streamingNumbers.encoder) 39 | .filter($"number" % 2 === 0) 40 | 41 | // Output the result to the console 42 | val query = filteredStream.writeStream 43 | .outputMode("append") 44 | .format("console") 45 | .trigger(Trigger.ProcessingTime("1 second")) 46 | .start() 47 | 48 | // Wait for the streaming query to finish 49 | query.awaitTermination() 50 | 51 | scala.io.StdIn.readLine() 52 | spark.stop() 53 | } -------------------------------------------------------------------------------- /spark-plugin/example_3_4_1_remote/src/main/scala/io/dataflint/example/Shakespeare341Remote.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.ivy.Ivy 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.sql.{DataFrame, SparkSession} 6 | import org.apache.spark.sql.functions._ 7 | 8 | import java.nio.file.Paths 9 | 10 | object Shakespeare341Remote extends App { 11 | def df(spark: SparkSession): DataFrame = spark.read 12 | .format("csv") 13 | .option("sep", ";") 14 | .option("inferSchema", true) 15 | .load("./test_data/will_play_text.csv") 16 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 17 | .repartition(1000) 18 | 19 | val spark = SparkSession 20 | .builder() 21 | .appName("Shakespeare Statistics") 22 | .config("spark.dataflint.telemetry.enabled", false) 23 | .config("spark.ui.port", "10000") 24 | .master("local[*]") 25 | .getOrCreate() 26 | 27 | import spark.implicits._ 28 | 29 | val shakespeareText = df(spark) 30 | 31 | shakespeareText.printSchema() 32 | 33 | val count = shakespeareText.count() 34 | println(s"number of records : $count") 35 | 36 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count() 37 | println(s"number of unique speakers : $uniqueSpeakers") 38 | 39 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count() 40 | 41 | println(s"number of unique words : $uniqueWords") 42 | 43 | scala.io.StdIn.readLine() 44 | spark.stop() 45 | } 46 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/AccessPatternExample.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | object AccessPatternExample extends App { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("AccessPatternExample") 10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 11 | .config("spark.dataflint.telemetry.enabled", false) 12 | .config("spark.ui.port", "10000") 13 | .config("spark.dataflint.telemetry.enabled", value = false) 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | import spark.implicits._ 19 | 20 | val 
salesDF = spark.read.load(sys.env("SALES_FILES_LOCATION")) 21 | 22 | spark.sparkContext.setJobDescription("full scan of store_sales") 23 | salesDF.count() 24 | 25 | spark.sparkContext.setJobDescription("scan of store_sales, filter by partition") 26 | salesDF 27 | .where($"ss_sold_date_sk" > 2450858) 28 | .count() 29 | 30 | spark.sparkContext.setJobDescription("scan of store_sales, filter by field") 31 | salesDF 32 | .where($"ss_quantity" > 1) 33 | .count() 34 | 35 | spark.sparkContext.setJobDescription("scan of store_sales, filter by partition and field") 36 | salesDF 37 | .where($"ss_sold_date_sk" > 2450858) 38 | .where($"ss_quantity" > 1) 39 | .count() 40 | 41 | spark.sparkContext.setJobDescription("scan of store_sales, filter by field condition") 42 | salesDF 43 | .where($"ss_sold_date_sk" > 2450858) 44 | .where($"ss_quantity" * 2 > 2) 45 | .count() 46 | 47 | spark.sparkContext.setJobDescription("scan of store_sales, filter by partition and field and field condition") 48 | salesDF 49 | .where($"ss_sold_date_sk" > 2450858) 50 | .where($"ss_store_sk" > 0) 51 | .where($"ss_quantity" * 2 > 2) 52 | .count() 53 | 54 | spark.sparkContext.setJobDescription("scan store_sales by partition, select 3 fields: ss_cdemo_sk, ss_net_paid, ss_net_profit") 55 | salesDF 56 | .where($"ss_sold_date_sk" > 2450858) 57 | .select($"ss_cdemo_sk", $"ss_net_paid", $"ss_net_profit") 58 | .show() 59 | 60 | spark.sparkContext.setJobDescription("scan store_sales by partition, select all fields") 61 | salesDF 62 | .where($"ss_sold_date_sk" > 2450858) 63 | .show() 64 | 65 | scala.io.StdIn.readLine() 66 | spark.stop() 67 | } 68 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/CacheExample.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import io.dataflint.example.SchedulingSmallTasks.spark 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object CacheExample extends App { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("JobGroupExample") 10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 11 | .config("spark.ui.port", "10000") 12 | .config("spark.dataflint.telemetry.enabled", value = false) 13 | .config("spark.eventLog.enabled", "true") 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | import spark.implicits._ 19 | 20 | val df = spark.range(0, 10).cache() 21 | val secondCache = df.select($"id" * 2).persist() 22 | secondCache.count() 23 | df.unpersist() 24 | secondCache.unpersist() 25 | 26 | val df2 = spark.range(0, 10000000L).repartition(100).cache() 27 | df2.count() 28 | df.unpersist() 29 | 30 | scala.io.StdIn.readLine() 31 | 32 | spark.stop() 33 | } 34 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DataFusionCometExample.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | object DataFusionCometExample extends App { 7 | def df(spark: SparkSession): DataFrame = spark.read 8 | .format("csv") 9 | .option("sep", ";") 10 | .option("inferSchema", true) 11 | .load("./test_data/will_play_text.csv") 12 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", 
"text_entry") 13 | .repartition(1000) 14 | 15 | val spark = SparkSession 16 | .builder() 17 | .appName("DataFusionCometExample") 18 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin,org.apache.spark.CometPlugin") 19 | .config("spark.shuffle.manager", "org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager") 20 | .config("spark.comet.explainFallback.enabled", "true") 21 | .config("spark.memory.offHeap.enabled", "true") 22 | .config("spark.memory.offHeap.size", "16g") 23 | .config("spark.ui.port", "10000") 24 | .config("spark.dataflint.telemetry.enabled", value = false) 25 | .config("spark.sql.maxMetadataStringLength", "10000") 26 | .master("local[*]") 27 | .getOrCreate() 28 | 29 | import spark.implicits._ 30 | 31 | val shakespeareText = df(spark) 32 | 33 | shakespeareText.printSchema() 34 | 35 | val count = shakespeareText.count() 36 | println(s"number of records : $count") 37 | 38 | val uniqueSpeakers = shakespeareText.select($"speaker").filter($"line_id".isNotNull).distinct().count() 39 | println(s"number of unique speakers : $uniqueSpeakers") 40 | 41 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count() 42 | 43 | println(s"number of unique words : $uniqueWords") 44 | 45 | 46 | spark.read.load("/Users/menishmueli/Documents/GitHub/spark-sql-perf/data/store_sales").filter($"ss_quantity" > 1).count() 47 | 48 | scala.io.StdIn.readLine() 49 | spark.stop() 50 | } 51 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DeltaLakeExample.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | import java.sql.Timestamp 6 | 7 | object DeltaLakeExample extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("DeltaLakeExample") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.dataflint.telemetry.enabled", false) 13 | .config("spark.ui.port", "10000") 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 16 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 17 | .master("local[*]") 18 | .getOrCreate() 19 | 20 | import spark.implicits._ 21 | spark.sparkContext.setJobDescription("Create Table") 22 | spark.sql("CREATE TABLE IF NOT EXISTS delta.`/tmp/delta-table` USING DELTA AS SELECT col1 as id FROM VALUES 0,1,2,3,4;") 23 | 24 | spark.sparkContext.setJobDescription("Insert data to table") 25 | spark.sql("INSERT OVERWRITE delta.`/tmp/delta-table` SELECT col1 as id FROM VALUES 5,6,7,8,9;") 26 | 27 | spark.sparkContext.setJobDescription("Select data from table") 28 | spark.sql("SELECT * FROM delta.`/tmp/delta-table`;").show() 29 | 30 | spark.sparkContext.setJobDescription("Insert overwrite data to table") 31 | spark.sql("INSERT OVERWRITE delta.`/tmp/delta-table` SELECT col1 as id FROM VALUES 5,6,7,8,9;") 32 | 33 | spark.sparkContext.setJobDescription("Update data from table") 34 | spark.sql("UPDATE delta.`/tmp/delta-table` SET id = id + 100 WHERE id % 2 == 0;") 35 | 36 | spark.sparkContext.setJobDescription("Delete data from table") 37 | spark.sql("DELETE FROM delta.`/tmp/delta-table` WHERE id % 2 == 0;") 38 | 39 | spark.sparkContext.setJobDescription("Create view from table") 40 | spark.sql("CREATE TEMP VIEW newData AS SELECT col1 AS id 
FROM VALUES 1,3,5,7,9,11,13,15,17,19;") 41 | 42 | spark.sparkContext.setJobDescription("Merge data to table") 43 | spark.sql( 44 | """MERGE INTO delta.`/tmp/delta-table` AS oldData 45 | |USING newData 46 | |ON oldData.id = newData.id 47 | |WHEN MATCHED 48 | | THEN UPDATE SET id = newData.id 49 | |WHEN NOT MATCHED 50 | | THEN INSERT (id) VALUES (newData.id); 51 | |""".stripMargin) 52 | 53 | spark.sparkContext.setJobDescription("Select data from table") 54 | spark.sql("SELECT * FROM delta.`/tmp/delta-table`;").show() 55 | 56 | spark.sparkContext.setJobDescription("Select data from table by version") 57 | spark.sql("SELECT * FROM delta.`/tmp/delta-table` VERSION AS OF 0;").show() 58 | 59 | scala.io.StdIn.readLine() 60 | spark.stop() 61 | } 62 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/DeltaLakeStreaming.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | import java.sql.Timestamp 6 | 7 | object DeltaLakeStreaming extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("Simple Streaming") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.dataflint.telemetry.enabled", false) 13 | .config("spark.ui.port", "10000") 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 16 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 17 | .master("local[*]") 18 | .getOrCreate() 19 | 20 | import spark.implicits._ 21 | 22 | val numbers = (1 to 10).toList 23 | 24 | // Create a streaming DataFrame 25 | val streamingNumbers = spark.readStream 26 | .format("rate") 27 | .option("rowsPerSecond", "1") 28 | .load() 29 | .as[(Long, Timestamp)] 30 | .flatMap(_ => numbers) 31 | .toDF("number") 32 | 33 | // Filter numbers divisible by 2 34 | val filteredStream = streamingNumbers 35 | .mapPartitions(i => { 36 | Thread.sleep(10000) 37 | i 38 | })(streamingNumbers.encoder) 39 | .filter($"number" % 2 === 0) 40 | 41 | // Output the result to the console 42 | val query = filteredStream.writeStream 43 | .format("delta") 44 | .outputMode("append") 45 | .option("checkpointLocation", "/tmp/delta/events/_checkpoints/") 46 | .start("/tmp/delta/eventsByCustomer") 47 | 48 | // Wait for the streaming query to finish 49 | query.awaitTermination() 50 | 51 | scala.io.StdIn.readLine() 52 | spark.stop() 53 | } -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/JobGroupExample.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | object JobGroupExample extends App { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("JobGroupExample") 10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 11 | .config("spark.ui.port", "10000") 12 | .config("spark.dataflint.telemetry.enabled", value = false) 13 | .config("spark.eventLog.enabled", "true") 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | import spark.implicits._ 19 | 20 | val data = Seq( 21 | ("Alice", "Math", 85), 22 | 
("Alice", "Physics", 95), 23 | ("Bob", "Math", 78), 24 | ("Bob", "Physics", 88), 25 | ("Charlie", "Math", 92), 26 | ("Charlie", "Physics", 80) 27 | ).toDF("name", "subject", "score") 28 | 29 | data.createOrReplaceTempView("student_scores") 30 | 31 | // Set up and run the first query with a specific group ID 32 | spark.sparkContext.setJobGroup("queryGroup1", "Group 1: Math Scores") 33 | val mathScores = spark.sql("SELECT name, score FROM student_scores WHERE subject = 'Math'") 34 | mathScores.count() 35 | 36 | // Set up and run the second query with a different group ID 37 | spark.sparkContext.setJobGroup("queryGroup2", "Group 2: Average Scores") 38 | val avgScores = spark.sql("SELECT name, AVG(score) as avg_score FROM student_scores GROUP BY name") 39 | avgScores.count() 40 | 41 | scala.io.StdIn.readLine() 42 | 43 | spark.stop() 44 | } 45 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/JobGroupExportedLocal.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object JobGroupExportedLocal extends App { 6 | val spark = SparkSession 7 | .builder() 8 | .appName("JobGroupExample") 9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 10 | .config("spark.ui.port", "10000") 11 | .config("spark.dataflint.telemetry.enabled", value = false) 12 | .config("spark.eventLog.enabled", "true") 13 | .config("spark.dataflint.mode", "local") 14 | .config("spark.dataflint.token", "AKIAZEUOHHYMKVUKYYZB-1234") 15 | .config("spark.sql.maxMetadataStringLength", "10000") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | import spark.implicits._ 20 | 21 | val data = Seq( 22 | ("Alice", "Math", 85), 23 | ("Alice", "Physics", 95), 24 | ("Bob", "Math", 78), 25 | ("Bob", "Physics", 88), 26 | ("Charlie", "Math", 92), 27 | ("Charlie", "Physics", 80) 28 | ).toDF("name", "subject", "score") 29 | 30 | data.createOrReplaceTempView("student_scores") 31 | 32 | // Set up and run the first query with a specific group ID 33 | spark.sparkContext.setJobGroup("queryGroup1", "Group 1: Math Scores") 34 | val mathScores = spark.sql("SELECT name, score FROM student_scores WHERE subject = 'Math'") 35 | mathScores.count() 36 | 37 | // Set up and run the second query with a different group ID 38 | spark.sparkContext.setJobGroup("queryGroup2", "Group 2: Average Scores") 39 | val avgScores = spark.sql("SELECT name, AVG(score) as avg_score FROM student_scores GROUP BY name") 40 | avgScores.count() 41 | 42 | spark.stop() 43 | } 44 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/JoinExample.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.broadcast 5 | 6 | object JoinExample extends App { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("JoinExample") 10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 11 | .config("spark.ui.port", "10000") 12 | .config("spark.dataflint.telemetry.enabled", value = false) 13 | .config("spark.eventLog.enabled", "true") 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | import spark.implicits._ 19 | 20 | val df1 = Seq(1, 
2).toDF("id1") 21 | val df2 = Seq(1, 2, 3, 4).toDF("id2") 22 | 23 | spark.sparkContext.setJobDescription("Cross Join Broadcast Nested Loop Join") 24 | val result2 = df1.join(broadcast(df2), $"id1" > $"id2") 25 | result2.show() 26 | 27 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 28 | 29 | spark.sparkContext.setJobDescription("Cross Join Broadcast Cartesian Product") 30 | val result = df1.repartition(2).crossJoin(df2.repartition(2)) 31 | result.show() 32 | 33 | // INNER JOIN EXAMPLE (Reduces rows) 34 | val df3 = Seq((1, "a"), (2, "b"), (3, "c"), (4, "d")).toDF("id", "value") 35 | val df4 = Seq((2, "x"), (4, "y")).toDF("id", "desc") 36 | 37 | spark.sparkContext.setJobDescription("Inner Join (reduces rows)") 38 | val innerJoinResult = df3.join(df4, Seq("id"), "inner") 39 | innerJoinResult.show() 40 | 41 | // LEFT OUTER JOIN EXAMPLE (Increases rows) 42 | val df5 = Seq((1, "a"), (1, "a")).toDF("id", "value") 43 | val df6 = Seq((1, "x"), (1, "y"), (1, "a"), (1, "b")).toDF("id", "desc") 44 | 45 | spark.sparkContext.setJobDescription("Left Outer Join (increases rows)") 46 | val leftJoinResult = df5.join(df6, Seq("id"), "left_outer") 47 | leftJoinResult.show() 48 | 49 | scala.io.StdIn.readLine() 50 | spark.stop() 51 | } 52 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/KafkaStreaming.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object KafkaStreaming extends App { 6 | 7 | val spark = SparkSession 8 | .builder() 9 | .appName("Simple Streaming") 10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 11 | .config("spark.dataflint.telemetry.enabled", false) 12 | .config("spark.ui.port", "10000") 13 | .config("spark.sql.maxMetadataStringLength", "10000") 14 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 15 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 16 | .master("local[*]") 17 | .getOrCreate() 18 | 19 | val df = spark 20 | .readStream 21 | .format("kafka") 22 | .option("kafka.bootstrap.servers", "localhost:9092") 23 | .option("subscribe", "testtopic") 24 | .option("startingOffsets", "earliest") 25 | .load() 26 | 27 | val dfSelected = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 28 | 29 | // Filter numbers divisible by 2 30 | val filteredStream = dfSelected 31 | .mapPartitions(i => { 32 | Thread.sleep(10000) 33 | i 34 | })(dfSelected.encoder) 35 | 36 | // Output the result to the console 37 | val query = filteredStream.writeStream 38 | .format("delta") 39 | .outputMode("append") 40 | .option("checkpointLocation", "/tmp/delta/kafka2delta/_checkpoints/") 41 | .start("/tmp/delta/kafka2delta") 42 | 43 | // Wait for the streaming query to finish 44 | query.awaitTermination() 45 | 46 | scala.io.StdIn.readLine() 47 | spark.stop() 48 | } 49 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/LargeBroadcastExample.scala: -------------------------------------------------------------------------------- 1 | package main.scala.io.dataflint.example 2 | 3 | import io.dataflint.example.LargeFilterCondition.spark 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.{DataFrame, SparkSession} 6 | 7 | object LargeBroadcastExample extends App { 8 | 
val spark = SparkSession 9 | .builder() 10 | .appName("LargeBroadcastExample") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.dataflint.telemetry.enabled", false) 13 | .config("spark.ui.port", "10000") 14 | .config("spark.dataflint.telemetry.enabled", value = false) 15 | .config("spark.sql.maxMetadataStringLength", "10000") 16 | .config("spark.driver.maxResultSize", "10g") 17 | .master("local[*]") 18 | .getOrCreate() 19 | 20 | spark.sparkContext.setJobDescription("Join with large broadcast") 21 | 22 | val smallDfSize = sys.env.get("SMALL_DF_SIZE").map(_.toLong).getOrElse((40 * 1000 * 1000).toLong) 23 | val largeDFSize = sys.env.get("LARGE_DF_SIZE").map(_.toLong).getOrElse((100 * 1000 * 1000).toLong) 24 | 25 | val smallDF = spark.range(1L, smallDfSize).toDF("id") 26 | val largeDF = spark.range(1L, largeDFSize).toDF("item_sk") 27 | 28 | val joinedDF = largeDF.join(broadcast(smallDF), largeDF("item_sk") === smallDF("id")) 29 | 30 | joinedDF.count() 31 | 32 | spark.sparkContext.setJobDescription("Join with shuffle") 33 | val joinedWithShuffleDF = largeDF.join(smallDF, largeDF("item_sk") === smallDF("id")) 34 | 35 | joinedWithShuffleDF.count() 36 | 37 | scala.io.StdIn.readLine() 38 | spark.stop() 39 | } -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/LargeFilterCondition.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object LargeFilterCondition extends App { 6 | val spark = SparkSession 7 | .builder() 8 | .appName("Large Filter Condition") 9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 10 | .config("spark.dataflint.telemetry.enabled", false) 11 | .config("spark.ui.port", "10000") 12 | .config("spark.dataflint.telemetry.enabled", value = false) 13 | .config("spark.sql.maxMetadataStringLength", "10000") 14 | .master("local[*]") 15 | .getOrCreate() 16 | 17 | import spark.implicits._ 18 | 19 | val numOfConditions = 1000 20 | val sizeOfDF = 10000000 21 | 22 | spark.sparkContext.setJobDescription("Filter with long filter condition") 23 | 24 | val filterConditions = Range(0, numOfConditions).map($"id".equalTo(_)).reduce(_ || _) 25 | 26 | val countAfterLongFilter = spark.range(0, sizeOfDF) 27 | .filter(filterConditions) 28 | .count() 29 | 30 | println(s"count after long filter condition: ${countAfterLongFilter}") 31 | 32 | spark.sparkContext.setJobDescription("Filter with long regex condition") 33 | 34 | val regexPattern = Range(0, numOfConditions).map(_.toString).mkString("|") 35 | 36 | val countAfterLongRegexFilter = spark.range(0, sizeOfDF) 37 | .withColumn("num_str", $"id".cast("string")) 38 | .filter($"num_str".rlike(s"^($regexPattern)$$")) 39 | .count() 40 | 41 | println(s"count after long regex filter: ${countAfterLongRegexFilter}") 42 | 43 | spark.sparkContext.setJobDescription("Filter using join") 44 | 45 | val filterTable = spark.range(0, numOfConditions).toDF("id") 46 | 47 | val countAfterJoinFilter = spark.range(0, sizeOfDF) 48 | .join(filterTable, "id") 49 | .count() 50 | 51 | println(s"count after filter using join: ${countAfterJoinFilter}") 52 | 53 | scala.io.StdIn.readLine() 54 | spark.stop() 55 | } 56 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/PartitionSkewExample.scala: 
-------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | object PartitionSkewExample extends App { 7 | val spark = SparkSession 8 | .builder() 9 | .appName("Partition Skew Example") 10 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 11 | .config("spark.dataflint.telemetry.enabled", false) 12 | .config("spark.ui.port", "10000") 13 | .config("spark.dataflint.telemetry.enabled", value = false) 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | import spark.implicits._ 19 | 20 | val numbers = spark.range(1, 100).repartition(100) 21 | 22 | val count = numbers.mapPartitions(i => { 23 | i.map(i => { 24 | if (i == 50L) { 25 | Thread.sleep(10000) 26 | } 27 | i 28 | }); 29 | })(numbers.encoder) 30 | .count() 31 | 32 | println(s"count numbers: $count") 33 | 34 | scala.io.StdIn.readLine() 35 | spark.stop() 36 | } 37 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/SchedulingSmallTasks.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SchedulingSmallTasks extends App { 6 | val spark = SparkSession 7 | .builder() 8 | .appName("SchedulingSmallTasks") 9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 10 | .config("spark.dataflint.telemetry.enabled", false) 11 | .config("spark.ui.port", "10000") 12 | .config("spark.dataflint.telemetry.enabled", value = false) 13 | .config("spark.sql.maxMetadataStringLength", "10000") 14 | .master("local[*]") 15 | .getOrCreate() 16 | 17 | val numbers = spark.range(0, 10000).repartition(10000).count() 18 | 19 | println(s"count numbers to 10000: $numbers") 20 | 21 | scala.io.StdIn.readLine() 22 | spark.stop() 23 | } 24 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/SchedulingSmallTasksSkipAlerts.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object SchedulingSmallTasksSkipAlerts extends App { 6 | val spark = SparkSession 7 | .builder() 8 | .appName("SchedulingSmallTasks") 9 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 10 | .config("spark.dataflint.telemetry.enabled", false) 11 | .config("spark.ui.port", "10000") 12 | .config("spark.dataflint.telemetry.enabled", value = false) 13 | .config("spark.sql.maxMetadataStringLength", "10000") 14 | .config("spark.dataflint.alert.disabled", "smallTasks,idleCoresTooHigh") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | val numbers = spark.range(0, 10000).repartition(10000).count() 19 | 20 | println(s"count numbers to 10000: $numbers") 21 | 22 | scala.io.StdIn.readLine() 23 | spark.stop() 24 | } 25 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/SetJobDescriptionAndUDFName.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import main.scala.io.dataflint.example.LargeBroadcastExample.{smallDfSize, spark} 4 
| import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.{DataFrame, SparkSession} 6 | 7 | object SetJobDescriptionAndUDFName extends App { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("SetJobDescriptionAndUDFName") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.ui.port", "10000") 13 | .config("spark.dataflint.telemetry.enabled", value = false) 14 | .config("spark.sql.maxMetadataStringLength", "10000") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | import spark.implicits._ 19 | 20 | val df = spark.range(1L, 1000).toDF("id") 21 | val plusOne = udf((x: Int) => x + 1) 22 | 23 | df.filter(plusOne($"id") =!= 5).count() 24 | 25 | spark.sparkContext.setJobDescription("Range 1 to 1000 and then filter plus one not equal to 5") 26 | df.filter(plusOne($"id") =!= 5).count() 27 | 28 | val plusOneNamed = udf((x: Int) => x + 1).withName("plusOne") 29 | spark.sparkContext.setJobDescription("Range 1 to 1000 and then filter plus one not equal to 5, named") 30 | df.filter(plusOneNamed($"id") =!= 5).count() 31 | 32 | scala.io.StdIn.readLine() 33 | spark.stop() 34 | } 35 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | object Shakespeare351 extends App { 7 | def df(spark: SparkSession): DataFrame = spark.read 8 | .format("csv") 9 | .option("sep", ";") 10 | .option("inferSchema", true) 11 | .load("./test_data/will_play_text.csv") 12 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 13 | .repartition(1000) 14 | 15 | val spark = SparkSession 16 | .builder() 17 | .appName("Shakespeare Statistics") 18 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 19 | .config("spark.ui.port", "10000") 20 | .config("spark.dataflint.telemetry.enabled", value = false) 21 | .config("spark.sql.maxMetadataStringLength", "10000") 22 | .master("local[*]") 23 | .getOrCreate() 24 | 25 | import spark.implicits._ 26 | 27 | val shakespeareText = df(spark) 28 | 29 | shakespeareText.printSchema() 30 | 31 | val count = shakespeareText.count() 32 | println(s"number of records : $count") 33 | 34 | val uniqueSpeakers = shakespeareText.select($"speaker").distinct().count() 35 | println(s"number of unique speakers : $uniqueSpeakers") 36 | 37 | val uniqueWords = shakespeareText.select(explode(split($"text_entry", " "))).distinct().count() 38 | 39 | println(s"number of unique words : $uniqueWords") 40 | 41 | scala.io.StdIn.readLine() 42 | spark.stop() 43 | } 44 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351Exported.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | import java.nio.file.Paths 7 | 8 | object Shakespeare351Exported extends App { 9 | def df(spark: SparkSession): DataFrame = spark.read 10 | .format("csv") 11 | .option("sep", ";") 12 | .option("inferSchema", true) 13 | .load("./test_data/will_play_text.csv") 14 | .toDF("line_id", "play_name", "speech_number", 
"line_number", "speaker", "text_entry") 15 | .repartition(1000) 16 | 17 | val spark = SparkSession 18 | .builder() 19 | .appName("Shakespeare Statistics Exported") 20 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 21 | .config("spark.dataflint.telemetry.enabled", false) 22 | .config("spark.ui.port", "10000") 23 | .config("spark.sql.maxMetadataStringLength", "10000") 24 | .config("spark.eventLog.enabled", "true") 25 | .master("local[*]") 26 | .getOrCreate() 27 | 28 | val shakespeareText = df(spark) 29 | 30 | val count = shakespeareText.count() 31 | println(s"number of records : $count") 32 | 33 | spark.stop() 34 | } 35 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351ExportedLocal.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | import java.nio.file.Paths 6 | 7 | object Shakespeare351ExportedLocal extends App { 8 | def df(spark: SparkSession): DataFrame = spark.read 9 | .format("csv") 10 | .option("sep", ";") 11 | .option("inferSchema", true) 12 | .load("./test_data/will_play_text.csv") 13 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 14 | .repartition(1000) 15 | 16 | val spark = SparkSession 17 | .builder 18 | .appName("Shakespeare Statistics Exported") 19 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 20 | .config("spark.dataflint.telemetry.enabled", false) 21 | .config("spark.ui.port", "10000") 22 | .config("spark.dataflint.mode", "local") 23 | .config("spark.dataflint.token", "AKIAZEUOHHYMKVUKYYZB-1234") 24 | .config("spark.sql.maxMetadataStringLength", "10000") 25 | .config("spark.eventLog.enabled", "true") 26 | .master("local[*]") 27 | .getOrCreate() 28 | 29 | val shakespeareText = df(spark) 30 | 31 | val count = shakespeareText.count() 32 | println(s"number of records : $count") 33 | 34 | spark.stop() 35 | } 36 | -------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/io/dataflint/example/Shakespeare351ExportedLocal2.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.example 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | 5 | object Shakespeare351ExportedLocal2 extends App { 6 | def df(spark: SparkSession): DataFrame = spark.read 7 | .format("csv") 8 | .option("sep", ";") 9 | .option("inferSchema", true) 10 | .load("./test_data/will_play_text.csv") 11 | .toDF("line_id", "play_name", "speech_number", "line_number", "speaker", "text_entry") 12 | .repartition(1000) 13 | 14 | val spark = SparkSession 15 | .builder 16 | .appName("Shakespeare Statistics Exported General") 17 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 18 | .config("spark.dataflint.telemetry.enabled", false) 19 | .config("spark.ui.port", "10000") 20 | .config("spark.dataflint.mode", "local") 21 | .config("spark.dataflint.token", "CKIAZEUOHHYMKVUKYYZC-1234") 22 | .config("spark.sql.maxMetadataStringLength", "10000") 23 | .config("spark.eventLog.enabled", "true") 24 | .master("local[*]") 25 | .getOrCreate() 26 | 27 | val shakespeareText = df(spark) 28 | 29 | val count = shakespeareText.count() 30 | println(s"number of records : $count") 31 | 32 | spark.stop() 33 | } 34 | 
-------------------------------------------------------------------------------- /spark-plugin/example_3_5_1/src/main/scala/org/apache/spark/dataflint/jobgroup/tests/JobGroupTests.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.jobgroup.tests 2 | 3 | import org.apache.spark.dataflint.jobgroup.JobGroupExtractor 4 | import org.apache.spark.sql.SparkSession 5 | 6 | class JobGroupTests extends org.scalatest.funsuite.AnyFunSuiteLike { 7 | test("test job group extractor with 2 groups") { 8 | val spark = SparkSession 9 | .builder() 10 | .appName("JobGroupExample") 11 | .config("spark.plugins", "io.dataflint.spark.SparkDataflintPlugin") 12 | .config("spark.ui.port", "10000") 13 | .config("spark.sql.maxMetadataStringLength", "10000") 14 | .master("local[*]") 15 | .getOrCreate() 16 | 17 | import spark.implicits._ 18 | 19 | val data = Seq( 20 | ("Alice", "Math", 85), 21 | ("Alice", "Physics", 95), 22 | ("Bob", "Math", 78), 23 | ("Bob", "Physics", 88), 24 | ("Charlie", "Math", 92), 25 | ("Charlie", "Physics", 80) 26 | ).toDF("name", "subject", "score") 27 | 28 | data.createOrReplaceTempView("student_scores") 29 | 30 | // Set up and run the first query with a specific group ID 31 | spark.sparkContext.setJobGroup("queryGroup1", "Group 1: Math Scores") 32 | val mathScores = spark.sql("SELECT name, score FROM student_scores WHERE subject = 'Math'") 33 | mathScores.count() 34 | 35 | spark.sparkContext.clearJobGroup() 36 | 37 | // Set up and run the second query with a different group ID 38 | spark.sparkContext.setJobGroup("queryGroup2", "Group 2: Average Scores") 39 | val avgScores = spark.sql("SELECT name, AVG(score) as avg_score FROM student_scores GROUP BY name") 40 | avgScores.count() 41 | 42 | // Optionally, clear job group if needed 43 | spark.sparkContext.clearJobGroup() 44 | 45 | Thread.sleep(1000) 46 | 47 | val extractor = new JobGroupExtractor(spark.sparkContext.ui.get.store, spark.sharedState.statusStore) 48 | val queryGroup1Store = extractor.extract("queryGroup1") 49 | val queryGroup2Store = extractor.extract("queryGroup2") 50 | 51 | assert(queryGroup1Store._2.executionsList().length == 1) 52 | assert(queryGroup2Store._2.executionsList().length == 1) 53 | spark.stop() 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin: -------------------------------------------------------------------------------- 1 | org.apache.spark.deploy.history.DataFlintHistoryServerPlugin -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/io/dataflint/spark/SparkDataflint.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.spark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.dataflint.DataflintSparkUILoader 5 | 6 | object SparkDataflint { 7 | def install(context: SparkContext): Unit = { 8 | DataflintSparkUILoader.install(context) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/io/dataflint/spark/SparkDataflintPlugin.scala: -------------------------------------------------------------------------------- 1 | package io.dataflint.spark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, 
PluginContext, SparkPlugin} 5 | import org.apache.spark.dataflint.DataflintSparkUILoader 6 | import org.apache.spark.internal.Logging 7 | 8 | import java.util 9 | import scala.collection.JavaConverters.mapAsJavaMapConverter 10 | 11 | class SparkDataflintPlugin extends SparkPlugin { 12 | override def driverPlugin(): DriverPlugin = new SparkDataflintDriverPlugin() 13 | 14 | override def executorPlugin(): ExecutorPlugin = null 15 | } 16 | 17 | class SparkDataflintDriverPlugin extends DriverPlugin with Logging { 18 | var sc: SparkContext = null 19 | 20 | override def init(sc: SparkContext, pluginContext: PluginContext): util.Map[String, String] = { 21 | this.sc = sc 22 | Map[String, String]().asJava 23 | } 24 | 25 | override def registerMetrics(appId: String, pluginContext: PluginContext): Unit = { 26 | var webUrl = DataflintSparkUILoader.install(sc) 27 | logInfo(s"spark dataflint url is $webUrl/dataflint") 28 | super.registerMetrics(appId, pluginContext) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataFlintTab.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.api 2 | 3 | import org.apache.spark.ui.{SparkUI, UIUtils, WebUITab} 4 | 5 | import javax.servlet.http.HttpServletRequest 6 | import scala.xml.Node 7 | 8 | class DataFlintTab(parent: SparkUI) extends WebUITab(parent,"dataflint") { 9 | override val name: String = "DataFlint" 10 | def render(request: HttpServletRequest): Seq[Node] = { 11 | val content = 12 |
<div> 13 | </div>
14 | UIUtils.basicSparkPage(request, content, "DataFlint", true) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintApplicationInfoPage.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.api 2 | 3 | import org.apache.spark.dataflint.listener.DataflintStore 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.ui.{SparkUI, WebUIPage} 6 | import org.json4s.{Extraction, JObject} 7 | 8 | import javax.servlet.http.HttpServletRequest 9 | import scala.xml.Node 10 | 11 | class DataflintApplicationInfoPage(ui: SparkUI, dataflintStore: DataflintStore) 12 | extends WebUIPage("applicationinfo") with Logging { 13 | override def renderJson(request: HttpServletRequest) = { 14 | try { 15 | val runIdConfigFromStore = ui.store.environmentInfo().sparkProperties.find(_._1 == "spark.dataflint.runId").map(_._2) 16 | val runIdPotentiallyFromConfig = if (runIdConfigFromStore.isEmpty) ui.conf.getOption("spark.dataflint.runId") else runIdConfigFromStore 17 | val applicationInfo = ui.store.applicationInfo() 18 | val environmentInfo = dataflintStore.environmentInfo() 19 | val dataFlintApplicationInfo = DataFlintApplicationInfo(runIdPotentiallyFromConfig, applicationInfo, environmentInfo) 20 | val jsonValue = Extraction.decompose(dataFlintApplicationInfo)(org.json4s.DefaultFormats) 21 | jsonValue 22 | } 23 | catch { 24 | case e: Throwable => { 25 | logError("failed to serve dataflint application info", e) 26 | JObject() 27 | } 28 | } 29 | } 30 | 31 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]() 32 | } -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintCachedStoragePage.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.api 2 | 3 | import org.apache.spark.dataflint.listener.{DataflintExecutorStorageInfo, DataflintRDDStorageInfo, DataflintStore} 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.status.AppStatusStore 6 | import org.apache.spark.ui.{SparkUI, WebUIPage} 7 | import org.json4s.{Extraction, JObject} 8 | 9 | import javax.servlet.http.HttpServletRequest 10 | import scala.xml.Node 11 | 12 | class DataflintCachedStoragePage(ui: SparkUI, dataflintStore: DataflintStore) 13 | extends WebUIPage("cachedstorage") with Logging { 14 | override def renderJson(request: HttpServletRequest) = { 15 | try { 16 | val liveRddStorage = ui.store.rddList() 17 | val rddStorage = dataflintStore.rddStorageInfo() 18 | val graphs = ui.store.stageList(null) 19 | .filter(_.submissionTime.isDefined) // filter skipped or pending stages 20 | .map(stage => Tuple2(stage.stageId, 21 | ui.store.operationGraphForStage(stage.stageId).rootCluster.childClusters.flatMap(_.childNodes) 22 | .filter(_.cached) 23 | .map(rdd => { 24 | 25 | val liveCached = liveRddStorage.find(_.id == rdd.id).map( 26 | rdd => { 27 | val maxUsageExecutor = rdd.dataDistribution.map(executors => executors.maxBy(_.memoryUsed)) 28 | val maxExecutorUsage = maxUsageExecutor.map(executor => 29 | DataflintExecutorStorageInfo( 30 | executor.memoryUsed, 31 | executor.memoryRemaining, 32 | if(executor.memoryUsed + executor.memoryRemaining != 0) executor.memoryUsed.toDouble / (executor.memoryUsed + executor.memoryRemaining) * 100 else 0 33 
| )) 34 | DataflintRDDStorageInfo(rdd.id, 35 | rdd.memoryUsed, 36 | rdd.diskUsed, 37 | rdd.numPartitions, 38 | rdd.storageLevel, 39 | maxExecutorUsage 40 | )} 41 | ) 42 | val cached = rddStorage.find(_.rddId == rdd.id) 43 | liveCached.getOrElse(cached) 44 | }))).toMap 45 | val jsonValue = Extraction.decompose(graphs)(org.json4s.DefaultFormats) 46 | jsonValue 47 | } 48 | catch { 49 | case e: Throwable => { 50 | logError("failed to serve dataflint Jobs RDD", e) 51 | JObject() 52 | } 53 | } 54 | } 55 | 56 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]() 57 | } 58 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintIcebergPage.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.api 2 | 3 | import org.apache.spark.dataflint.listener.DataflintStore 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.ui.{SparkUI, WebUIPage} 6 | import org.json4s.{Extraction, JObject} 7 | 8 | import javax.servlet.http.HttpServletRequest 9 | import scala.xml.Node 10 | 11 | class DataflintIcebergPage(ui: SparkUI, dataflintStore: DataflintStore) 12 | extends WebUIPage("iceberg") with Logging { 13 | override def renderJson(request: HttpServletRequest) = { 14 | try { 15 | val offset = request.getParameter("offset") 16 | val length = request.getParameter("length") 17 | if (offset == null || length == null) { 18 | JObject() 19 | } else { 20 | val commits = dataflintStore.icebergCommits(offset.toInt, length.toInt) 21 | val icebergInfo = IcebergInfo(commitsInfo = commits) 22 | val jsonValue = Extraction.decompose(icebergInfo)(org.json4s.DefaultFormats) 23 | jsonValue 24 | } 25 | } 26 | catch { 27 | case e: Throwable => { 28 | logError("failed to serve dataflint iceberg", e) 29 | JObject() 30 | } 31 | } 32 | } 33 | 34 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]() 35 | } 36 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintSQLMetricsPage.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.api 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SparkPlanGraph} 5 | import org.apache.spark.ui.{SparkUI, WebUIPage} 6 | import org.json4s.{Extraction, JObject} 7 | 8 | import javax.servlet.http.HttpServletRequest 9 | import scala.xml.Node 10 | 11 | class DataflintSQLMetricsPage(ui: SparkUI, sqlListener: () => Option[SQLAppStatusListener]) 12 | extends WebUIPage("sqlmetrics") with Logging { 13 | private var sqlListenerCache: Option[SQLAppStatusListener] = None 14 | 15 | override def renderJson(request: HttpServletRequest) = { 16 | try { 17 | if (sqlListenerCache.isEmpty) { 18 | sqlListenerCache = sqlListener() 19 | } 20 | 21 | val sqlStore = new SQLAppStatusStore(ui.store.store, sqlListenerCache) 22 | val executionId = request.getParameter("executionId") 23 | if (executionId == null) { 24 | JObject() 25 | } else { 26 | val executionIdLong = executionId.toLong 27 | val metrics = sqlStore.executionMetrics(executionIdLong) 28 | val isDatabricks = ui.conf.getOption("spark.databricks.clusterUsageTags.cloudProvider").isDefined 29 | val graph = if (isDatabricks) { 30 | val exec = sqlStore.execution(executionIdLong).get 31 
| val planVersion = exec.getClass.getMethod("latestVersion").invoke(exec).asInstanceOf[Long] 32 | sqlStore.getClass.getMethods.filter(_.getName == "planGraph").head.invoke(sqlStore, executionIdLong.asInstanceOf[Object], planVersion.asInstanceOf[Object]).asInstanceOf[SparkPlanGraph] 33 | } else 34 | sqlStore.planGraph(executionIdLong) 35 | val nodesMetrics = graph.allNodes.map(node => NodeMetrics(node.id, node.name, node.metrics.map(metric => { 36 | NodeMetric(metric.name, metrics.get(metric.accumulatorId)) 37 | }).toSeq)) 38 | // filter nodes without metrics 39 | .filter(nodeMetrics => !nodeMetrics.metrics.forall(_.value.isEmpty)) 40 | val jValue = Extraction.decompose(nodesMetrics)(org.json4s.DefaultFormats) 41 | jValue 42 | } 43 | } catch { 44 | case e: Throwable => { 45 | logError("failed to serve dataflint SQL metrics", e) 46 | JObject() 47 | } 48 | } 49 | } 50 | 51 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]() 52 | } 53 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/DataflintSQLStagesRddPage.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.api 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.ui.{SparkUI, WebUIPage} 5 | import org.json4s.{Extraction, JObject} 6 | 7 | import javax.servlet.http.HttpServletRequest 8 | import scala.xml.Node 9 | 10 | class DataflintSQLStagesRddPage(ui: SparkUI) 11 | extends WebUIPage("stagesrdd") with Logging { 12 | override def renderJson(request: HttpServletRequest) = { 13 | try { 14 | val graphs = ui.store.stageList(null) 15 | .filter(_.submissionTime.isDefined) // filter skipped or pending stages 16 | .map(stage => Tuple2(stage.stageId, 17 | ui.store.operationGraphForStage(stage.stageId).rootCluster.childClusters 18 | .map(rdd => Tuple2(rdd.id, rdd.name)).toMap)) 19 | .toMap 20 | val jsonValue = Extraction.decompose(graphs)(org.json4s.DefaultFormats) 21 | jsonValue 22 | } 23 | catch { 24 | case e: Throwable => { 25 | logError("failed to serve dataflint Jobs RDD", e) 26 | JObject() 27 | } 28 | } 29 | } 30 | 31 | override def render(request: HttpServletRequest): Seq[Node] = Seq[Node]() 32 | } 33 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/api/api.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.spark.dataflint.api 18 | 19 | import org.apache.spark.dataflint.listener.{IcebergCommitInfo, DataflintEnvironmentInfo} 20 | import org.apache.spark.status.api.v1.ApplicationInfo 21 | 22 | case class NodeMetric(name: String, value: Option[String]) 23 | 24 | case class NodeMetrics(id: Long, name: String, metrics: Seq[NodeMetric]) 25 | 26 | case class SqlEnrichedData(executionId: Long, numOfNodes:Int, rddScopesToStages: Option[Map[String, Set[Object]]], nodesPlan: Seq[NodePlan]) 27 | 28 | case class NodePlan(id: Long, planDescription: String, rddScopeId: Option[String]) 29 | 30 | case class DataFlintApplicationInfo(runId: Option[String], info: ApplicationInfo, environmentInfo: Option[DataflintEnvironmentInfo]) 31 | 32 | case class IcebergInfo(commitsInfo: Seq[IcebergCommitInfo]) 33 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/iceberg/ClassLoaderChecker.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.iceberg 2 | 3 | import org.apache.iceberg.CatalogUtil 4 | import org.apache.spark.internal.Logging 5 | 6 | object ClassLoaderChecker extends Logging { 7 | def isMetricLoaderInRightClassLoader(): Boolean = { 8 | val metricReporterClass = classOf[DataflintIcebergMetricsReporter] 9 | val classLoaderMetricReporter = metricReporterClass.getClassLoader.toString 10 | val classLoaderIcebergCatalog = classOf[CatalogUtil].getClassLoader.toString 11 | try { 12 | Class.forName(metricReporterClass.getCanonicalName, false, classOf[CatalogUtil].getClassLoader) 13 | } 14 | catch { 15 | case _: NoClassDefFoundError => 16 | logWarning(s"Cannot load iceberg MetricsReporter from dataflint classloader, which prevents dataflint iceberg observability support. iceberg classloader: ${classOf[CatalogUtil].getClassLoader.toString}") 17 | return false 18 | case _: ClassNotFoundException => 19 | logWarning(s"Cannot load DataflintIcebergMetricsReporter from iceberg classloader, which prevents dataflint iceberg observability support. iceberg classloader: ${classLoaderIcebergCatalog} metric reporter classloader: ${classLoaderMetricReporter}") 20 | return false 21 | case error: Throwable => 22 | logError(s"Unexpected error while trying to load, can use DataflintIcebergMetricsReporter. 
iceberg classloader: ${classLoaderIcebergCatalog} metric reporter classloader: ${classLoaderMetricReporter}", error) 23 | return false 24 | } 25 | true 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/listener/DataflintDatabricksLiveListener.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.listener 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerEvent} 5 | import org.apache.spark.sql.execution.SparkPlanInfo 6 | import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLExecutionStart, SparkPlanGraph, SparkPlanGraphNode} 7 | 8 | import java.lang.reflect.Method 9 | import java.util.concurrent.ConcurrentHashMap 10 | import java.util.concurrent.atomic.AtomicInteger 11 | 12 | object DataflintDatabricksLiveListener { 13 | def apply(listenerBus: LiveListenerBus): DataflintDatabricksLiveListener = { 14 | val rddScopeIdReader = classOf[SparkPlanGraphNode].getMethod("rddScopeId") 15 | new DataflintDatabricksLiveListener(listenerBus, rddScopeIdReader) 16 | } 17 | } 18 | 19 | class DataflintDatabricksLiveListener(listenerBus: LiveListenerBus, rddScopeIdReader: Method) extends SparkListener with Logging { 20 | private val executionToLatestVersion = new ConcurrentHashMap[Long, AtomicInteger]() 21 | 22 | private def publishDatabricksAdditionalSQLEvent(sparkPlanInfo: SparkPlanInfo, executionId: Long, version: Long): Unit = { 23 | val planGraph = SparkPlanGraph(sparkPlanInfo) 24 | val nodesToRddScopeId = planGraph.allNodes.map(node => { 25 | val rddScopeId = rddScopeIdReader.invoke(node).asInstanceOf[String] 26 | node.id -> rddScopeId 27 | }).toMap 28 | val executionInfo = DatabricksAdditionalExecutionInfo(executionId, version, nodesToRddScopeId) 29 | val event = DatabricksAdditionalExecutionEvent(executionInfo) 30 | listenerBus.post(event) 31 | } 32 | 33 | def onExecutionStart(e: SparkListenerSQLExecutionStart): Unit = { 34 | executionToLatestVersion.put(e.executionId, new AtomicInteger(0)) 35 | publishDatabricksAdditionalSQLEvent(e.sparkPlanInfo, e.executionId, 0L) 36 | } 37 | 38 | def onAdaptiveExecutionUpdate(e: SparkListenerSQLAdaptiveExecutionUpdate): Unit = { 39 | val version = executionToLatestVersion.get(e.executionId).incrementAndGet() 40 | publishDatabricksAdditionalSQLEvent(e.sparkPlanInfo, e.executionId, version) 41 | } 42 | 43 | override def onOtherEvent(event: SparkListenerEvent): Unit = { 44 | try { 45 | event match { 46 | case e: SparkListenerSQLExecutionStart => onExecutionStart(e) 47 | case e: SparkListenerSQLAdaptiveExecutionUpdate => onAdaptiveExecutionUpdate(e) 48 | case _ => {} 49 | } 50 | } catch { 51 | case e: Exception => logError("Error while processing events in DataflintDatabricksLiveListener", e) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/listener/DataflintListener.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.listener 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent} 5 | import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLExecutionStart} 6 | 
import org.apache.spark.status.ElementTrackingStore 7 | 8 | class DataflintListener(store: ElementTrackingStore) extends SparkListener with Logging { 9 | 10 | override def onOtherEvent(event: SparkListenerEvent): Unit = { 11 | try { 12 | event match { 13 | case icebergCommitEvent: IcebergCommitEvent => { 14 | val commitInfo = new IcebergCommitWrapper(icebergCommitEvent.icebergCommit) 15 | store.write(commitInfo) 16 | } 17 | case e: DatabricksAdditionalExecutionEvent => { 18 | val executionInfo = new DatabricksAdditionalExecutionWrapper(e.databricksAdditionalExecutionInfo) 19 | store.write(executionInfo) 20 | } 21 | case e: DataflintEnvironmentInfoEvent => { 22 | val wrapper = new DataflintEnvironmentInfoWrapper(e.environmentInfo) 23 | store.write(wrapper) 24 | } 25 | case _ => {} 26 | } 27 | } catch { 28 | case e: Exception => logError("Error while processing events in DataflintListener", e) 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/listener/DataflintStore.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.listener 2 | 3 | import scala.collection.JavaConverters._ 4 | import org.apache.spark.util.Utils 5 | import org.apache.spark.util.kvstore.{KVStore, KVStoreView} 6 | 7 | 8 | class DataflintStore(val store: KVStore) { 9 | // mapToSeq copied from KVUtils because it does not exists in spark 3.3 10 | def mapToSeq[T, B](view: KVStoreView[T])(mapFunc: T => B): Seq[B] = { 11 | Utils.tryWithResource(view.closeableIterator()) { iter => 12 | iter.asScala.map(mapFunc).toList 13 | } 14 | } 15 | 16 | def icebergCommits(offset: Int, length: Int): Seq[IcebergCommitInfo] = { 17 | mapToSeq(store.view(classOf[IcebergCommitWrapper]))(_.info).filter(_.executionId >= offset).take(length).sortBy(_.executionId) 18 | } 19 | 20 | def databricksAdditionalExecutionInfo(offset: Int, length: Int): Seq[DatabricksAdditionalExecutionInfo] = { 21 | mapToSeq(store.view(classOf[DatabricksAdditionalExecutionWrapper]))(_.info).filter(_.executionId >= offset).take(length).sortBy(_.executionId) 22 | } 23 | 24 | def environmentInfo(): Option[DataflintEnvironmentInfo] = { 25 | mapToSeq(store.view(classOf[DataflintEnvironmentInfoWrapper]))(_.info).headOption 26 | } 27 | 28 | def rddStorageInfo(): Seq[DataflintRDDStorageInfo] = { 29 | mapToSeq(store.view(classOf[DataflintRDDStorageInfoWrapper]))(_.info) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/package.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark 2 | 3 | package object dataflint { 4 | private[dataflint] type EnumValue[A <: Enumeration] = A#Value 5 | } 6 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/EnumSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.spark.dataflint.EnumValue 4 | import org.json4s.{JInt, JString} 5 | import org.json4s.reflect.TypeInfo 6 | import org.json4s.{Formats, JValue, MappingException, Serializer} 7 | 8 | import scala.reflect.ClassTag 9 | 10 | // copied from json4s source code, because some spark version depends on json4s versions without this 
class 11 | class EnumSerializer[E <: Enumeration: ClassTag](enumeration: E) extends Serializer[EnumValue[E]] { 12 | import org.json4s.JsonDSL._ 13 | 14 | private[this] val EnumerationClass = classOf[Enumeration#Value] 15 | 16 | private[this] def isValid(json: JValue) = json match { 17 | case JInt(value) => enumeration.values.toSeq.map(_.id).contains(value.toInt) 18 | case _ => false 19 | } 20 | 21 | private[this] def enumerationValueToEnumValueOfE(value: enumeration.Value): EnumValue[E] = 22 | value.asInstanceOf[EnumValue[E]] 23 | 24 | def deserialize(implicit format: Formats): PartialFunction[(TypeInfo, JValue), EnumValue[E]] = { 25 | case (TypeInfo(EnumerationClass, _), json) if isValid(json) => 26 | json match { 27 | case JInt(value) => enumerationValueToEnumValueOfE(enumeration(value.toInt)) 28 | case value => throw new MappingException(s"Can't convert $value to $EnumerationClass") 29 | } 30 | } 31 | 32 | def serialize(implicit format: Formats): PartialFunction[Any, JValue] = { 33 | case i: Enumeration#Value if enumeration.values.exists(_ == i) => i.id 34 | } 35 | } 36 | 37 | class EnumNameSerializer[E <: Enumeration: ClassTag](enumeration: E) extends Serializer[EnumValue[E]] { 38 | import org.json4s.JsonDSL._ 39 | 40 | private[this] val EnumerationClass = classOf[Enumeration#Value] 41 | 42 | private[this] def enumerationValueToEnumValueOfE(value: enumeration.Value): EnumValue[E] = 43 | value.asInstanceOf[EnumValue[E]] 44 | 45 | def deserialize(implicit format: Formats): PartialFunction[(TypeInfo, JValue), EnumValue[E]] = { 46 | case (_ @TypeInfo(EnumerationClass, _), json) if isValid(json) => { 47 | json match { 48 | case JString(value) => enumerationValueToEnumValueOfE(enumeration.withName(value)) 49 | case value => throw new MappingException(s"Can't convert $value to $EnumerationClass") 50 | } 51 | } 52 | } 53 | 54 | private[this] def isValid(json: JValue) = json match { 55 | case JString(value) if enumeration.values.exists(_.toString == value) => true 56 | case _ => false 57 | } 58 | 59 | def serialize(implicit format: Formats): PartialFunction[Any, JValue] = { 60 | case i: Enumeration#Value if enumeration.values.exists(_ == i) => i.toString 61 | } 62 | } -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/ExecutorsMetricsSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.spark.executor.ExecutorMetrics 4 | import org.apache.spark.metrics.ExecutorMetricType 5 | import org.json4s.{CustomSerializer, JLong, JNull, JObject} 6 | import org.json4s.JValue 7 | 8 | class ExecutorsMetricsSerializer extends CustomSerializer[ExecutorMetrics](implicit format => ( 9 | { 10 | case json: JValue => 11 | val metricsMap = json.extract[Map[String, Long]] 12 | val metrics = new ExecutorMetrics(metricsMap) 13 | metrics 14 | }, 15 | { 16 | case Some(metrics: ExecutorMetrics) => 17 | val metricsMap = ExecutorMetricType.metricToOffset.map { case (metric, _) => 18 | metric -> metrics.getMetricValue(metric) 19 | } 20 | val metricsObj = JObject(metricsMap.map { case (k, v) => k -> JLong(v) }.toList) 21 | metricsObj 22 | case None => JNull 23 | } 24 | )) 25 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/GZipUtils.scala: 
-------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.commons.io.output.ByteArrayOutputStream 4 | 5 | import java.util.zip.GZIPOutputStream 6 | 7 | object GZipUtils { 8 | def compressString(inputString: String): Array[Byte] = { 9 | val input = inputString.getBytes("UTF-8") 10 | val bos = new ByteArrayOutputStream(input.length) 11 | val gzip = new GZIPOutputStream(bos) 12 | gzip.write(input) 13 | gzip.close() 14 | val compressed = bos.toByteArray 15 | bos.close() 16 | compressed 17 | } 18 | 19 | def decompressString(compressed: Array[Byte]): String = { 20 | val bis = new java.io.ByteArrayInputStream(compressed) 21 | val gzip = new java.util.zip.GZIPInputStream(bis) 22 | val bos = new java.io.ByteArrayOutputStream() 23 | val buffer = new Array[Byte](1024) 24 | var len = gzip.read(buffer) 25 | while (len > 0) { 26 | bos.write(buffer, 0, len) 27 | len = gzip.read(buffer) 28 | } 29 | gzip.close() 30 | bis.close() 31 | bos.close() 32 | new String(bos.toByteArray(), "UTF-8") 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/JavaEnumNameSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.json4s.CustomSerializer 4 | import org.json4s.JString 5 | 6 | // copied from json4s source code, because some spark version depends on json4s versions without this class 7 | class JavaEnumNameSerializer[E <: Enum[E]](implicit 8 | ct: Manifest[E] 9 | ) extends CustomSerializer[E](_ => 10 | ( { 11 | case JString(name) => 12 | Enum.valueOf(ct.runtimeClass.asInstanceOf[Class[E]], name) 13 | }, { 14 | case dt: E => 15 | JString(dt.name()) 16 | } 17 | ) 18 | ) 19 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/S3Uploader.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials} 4 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration 5 | import com.amazonaws.regions.Regions 6 | import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} 7 | import com.amazonaws.services.s3.model.ObjectMetadata 8 | import org.apache.spark.internal.Logging 9 | 10 | import java.io.ByteArrayInputStream 11 | 12 | class S3Uploader(accessKeyId: String, secretAccessKey: String, mode: String) extends Logging { 13 | val credentials = new BasicAWSCredentials(accessKeyId, secretAccessKey) 14 | private val bucketName = "dataflint-upload-" + mode 15 | 16 | val s3client: AmazonS3 = { 17 | var builder = AmazonS3ClientBuilder.standard() 18 | .withCredentials(new AWSStaticCredentialsProvider(credentials)) 19 | 20 | if(mode == "local") { 21 | logInfo(s"Uploading to S3 with localstack") 22 | builder = builder.withEndpointConfiguration(new EndpointConfiguration("s3.localhost.localstack.cloud:4566", Regions.US_EAST_1.getName)) 23 | } else { 24 | builder = builder.enableAccelerateMode() 25 | 26 | } 27 | 28 | builder.build() 29 | } 30 | 31 | def uploadToS3(jsonContent: String, fileKey: String, shouldGzip: Boolean): Unit = { 32 | try { 33 | val startTimeMillis = System.currentTimeMillis() 34 | 35 | val metadata = new ObjectMetadata() 36 | val jsonToSend = 
if(shouldGzip) GZipUtils.compressString(jsonContent) else jsonContent.getBytes("UTF-8") 37 | if(shouldGzip) { 38 | metadata.setContentType("application/x-gzip") 39 | } else { 40 | metadata.setContentType("application/json") 41 | } 42 | metadata.setContentLength(jsonToSend.length) 43 | 44 | val inputStream = new ByteArrayInputStream(jsonToSend) 45 | 46 | s3client.putObject(bucketName, fileKey, inputStream, metadata) 47 | val endTimeMillis = System.currentTimeMillis() 48 | val durationMs = endTimeMillis - startTimeMillis 49 | logDebug(s"Upload file $fileKey took ${durationMs}ms") 50 | } catch { 51 | case e: Exception => e.printStackTrace() 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkMetadataSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.spark.JobExecutionStatus 4 | import org.apache.spark.rdd.DeterministicLevel 5 | import org.apache.spark.status.api.v1.StageStatus 6 | import org.json4s.jackson.{JsonMethods, Serialization} 7 | import org.json4s.{Formats, NoTypeHints} 8 | 9 | import java.io.{File, PrintWriter} 10 | 11 | object SparkMetadataSerializer { 12 | implicit val formats: Formats = Serialization.formats(NoTypeHints) + new JavaEnumNameSerializer[JobExecutionStatus]() + new JavaEnumNameSerializer[StageStatus]() + new EnumSerializer(DeterministicLevel) 13 | 14 | def serialize(data: SparkMetadataStore): String = { 15 | Serialization.write(data) 16 | } 17 | 18 | def deserialize(json: String): SparkMetadataStore = { 19 | JsonMethods.parse(json).extract[SparkMetadataStore] 20 | } 21 | 22 | def serializeAndSave(data: SparkMetadataStore, filePath: String): Unit = { 23 | val jsonData = serialize(data) 24 | val writer = new PrintWriter(new File(filePath)) 25 | try { 26 | writer.write(jsonData) 27 | } finally { 28 | writer.close() 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkMetadataStore.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.spark.status.api.v1 4 | 5 | case class SparkMetadataMetrics( 6 | containerMemoryGb: Double, 7 | executorJvmMemoryGb: Double, 8 | totalInputBytes: Long, 9 | totalOutputBytes: Long, 10 | totalSpillBytes: Long, 11 | totalShuffleWriteBytes: Long, 12 | totalShuffleReadBytes: Long, 13 | totalCachedMemoryBytes: Long, 14 | totalCachedDiskBytes: Long, 15 | maxExecutorCachedMemoryUsagePercentage: Double, executorPeakMemoryBytes: Long, 16 | containerPeakMemoryBytes: Long, 17 | executorJvmMemoryUsage: Double, 18 | driverJvmPeakMemoryBytes: Long, 19 | driverJvmMemoryUsage: Double, 20 | containerMemoryUsage: Double, 21 | totalDCU: Double, 22 | coreHourUsage: Double, 23 | memoryGbHour: Double, 24 | isAnySqlQueryFailed: Boolean, 25 | taskErrorRate: Double, 26 | idleCoresRatio: Double, 27 | CoresWastedRatio: Double, 28 | executorsDurationMs: Long, 29 | driverDurationMs: Long, 30 | 31 | ) 32 | 33 | case class SparkMetadataStore(version: String, 34 | runId: String, 35 | accessKey: String, 36 | applicationInfo: v1.ApplicationInfo, 37 | metrics: SparkMetadataMetrics, 38 | conf: Map[String, String]) 39 | -------------------------------------------------------------------------------- 
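GZipUtils, which S3Uploader above uses to optionally gzip payloads before upload, is a plain gzip compress/decompress pair over UTF-8 strings, so a round trip returns the original input. A small usage sketch, assuming the plugin classes are on the classpath; the sample payload string is made up for illustration:

import org.apache.spark.dataflint.saas.GZipUtils

object GZipRoundTrip extends App {
  val original = """{"version":"1","runId":"example"}""" // made-up sample payload
  val compressed: Array[Byte] = GZipUtils.compressString(original)
  val restored: String = GZipUtils.decompressString(compressed)
  assert(restored == original, "gzip round trip should preserve the input")
  println(s"${original.length} chars -> ${compressed.length} compressed bytes")
}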
/spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkRunSerializer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.spark.JobExecutionStatus 4 | import org.apache.spark.rdd.DeterministicLevel 5 | import org.apache.spark.status.api.v1.StageStatus 6 | import org.json4s.jackson.{JsonMethods, Serialization} 7 | import org.json4s.{Formats, NoTypeHints} 8 | 9 | import java.io.{File, PrintWriter} 10 | 11 | object SparkRunSerializer { 12 | implicit val formats: Formats = Serialization.formats(NoTypeHints) + new JavaEnumNameSerializer[JobExecutionStatus]() + new JavaEnumNameSerializer[StageStatus]() + new EnumSerializer(DeterministicLevel) + new ExecutorsMetricsSerializer() 13 | 14 | def serialize(data: SparkRunStore): String = { 15 | Serialization.write(data) 16 | } 17 | 18 | def deserialize(json: String): SparkRunStore = { 19 | JsonMethods.parse(json).extract[SparkRunStore] 20 | } 21 | 22 | def serializeAndSave(data: SparkRunStore, filePath: String): Unit = { 23 | val jsonData = serialize(data) 24 | val writer = new PrintWriter(new File(filePath)) 25 | try { 26 | writer.write(jsonData) 27 | } finally { 28 | writer.close() 29 | } 30 | } 31 | } 32 | 33 | 34 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/SparkRunStore.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.spark.dataflint.listener.{DatabricksAdditionalExecutionWrapper, DataflintEnvironmentInfoEvent, DataflintEnvironmentInfoWrapper, DataflintRDDStorageInfoWrapper, IcebergCommitWrapper} 4 | import org.apache.spark.sql.execution.ui.{SQLExecutionUIData, SparkPlanGraphWrapper} 5 | import org.apache.spark.status._ 6 | 7 | case class SparkRunStore( 8 | version: String, 9 | applicationInfos: Seq[ApplicationInfoWrapper], 10 | applicationEnvironmentInfo: Seq[ApplicationEnvironmentInfoWrapper], 11 | resourceProfiles: Seq[ResourceProfileWrapper], 12 | jobDatas: Seq[JobDataWrapper], 13 | stageDatas: Seq[StageDataWrapper], 14 | executorSummaries: Seq[ExecutorSummaryWrapper], 15 | taskDatas: Seq[TaskDataWrapper], 16 | rddStorageInfos: Seq[RDDStorageInfoWrapper], 17 | streamBlockDatas: Seq[StreamBlockData], 18 | rddOperationGraphs: Seq[RDDOperationGraphWrapper], 19 | poolDatas: Seq[PoolData], 20 | appSummaries: Seq[AppSummary], 21 | executorStageSummaries: Seq[ExecutorStageSummaryWrapper], 22 | speculationStageSummaries: Seq[SpeculationStageSummaryWrapper], 23 | sparkPlanGraphWrapper: Seq[SparkPlanGraphWrapper], 24 | sqlExecutionUIData: Seq[SQLExecutionUIData], 25 | stageTaskSummary: Seq[StageTaskSummary], 26 | databricksAdditionalExecutionInfo: Seq[DatabricksAdditionalExecutionWrapper], 27 | icebergCommit: Seq[IcebergCommitWrapper], 28 | dataflintEnvironmentInfo: Seq[DataflintEnvironmentInfoWrapper], 29 | dataflintRDDStorageInfo: Seq[DataflintRDDStorageInfoWrapper] 30 | ) 31 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/StageTaskSummary.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import com.fasterxml.jackson.annotation.JsonIgnore 4 | import org.apache.spark.status.api.v1 5 | import org.apache.spark.util.kvstore.KVIndex 6 
| 7 | case class StageTaskSummary( 8 | stageId: Int, 9 | stageAttemptId: Int, 10 | summary: v1.TaskMetricDistributions) { 11 | @KVIndex 12 | @JsonIgnore 13 | def id: Array[Any] = Array(stageId, stageAttemptId) 14 | } 15 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/dataflint/saas/StoreDataExtractor.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.dataflint.saas 2 | 3 | import org.apache.spark.dataflint.listener.{DatabricksAdditionalExecutionWrapper, DataflintEnvironmentInfoWrapper, DataflintRDDStorageInfoWrapper, IcebergCommitWrapper} 4 | import org.apache.spark.sql.execution.ui.{SQLExecutionUIData, SparkPlanGraphWrapper} 5 | import org.apache.spark.status._ 6 | 7 | import scala.collection.convert.ImplicitConversions.`iterator asScala` 8 | import scala.reflect.{ClassTag, classTag} 9 | 10 | class StoreDataExtractor(store: AppStatusStore) { 11 | private val version: String = "1" 12 | private val kvStore = store.store.asInstanceOf[ElementTrackingStore] 13 | 14 | def extract(): SparkRunStore = { 15 | SparkRunStore( 16 | version = version, 17 | applicationInfos = readAll[ApplicationInfoWrapper], 18 | applicationEnvironmentInfo = readAll[ApplicationEnvironmentInfoWrapper], 19 | resourceProfiles = readAll[ResourceProfileWrapper], 20 | jobDatas = readAll[JobDataWrapper], 21 | stageDatas = readAll[StageDataWrapper], 22 | executorSummaries = readAll[ExecutorSummaryWrapper], 23 | taskDatas = readAll[TaskDataWrapper], 24 | rddStorageInfos = readAll[RDDStorageInfoWrapper], 25 | streamBlockDatas = readAll[StreamBlockData], 26 | rddOperationGraphs = readAll[RDDOperationGraphWrapper], 27 | poolDatas = readAll[PoolData], 28 | appSummaries = readAll[AppSummary], 29 | executorStageSummaries = readAll[ExecutorStageSummaryWrapper], 30 | speculationStageSummaries = readAll[SpeculationStageSummaryWrapper], 31 | sparkPlanGraphWrapper = readAll[SparkPlanGraphWrapper], 32 | sqlExecutionUIData = readAll[SQLExecutionUIData], 33 | stageTaskSummary = calculateTaskSummary(), 34 | databricksAdditionalExecutionInfo = readAll[DatabricksAdditionalExecutionWrapper], 35 | icebergCommit = readAll[IcebergCommitWrapper], 36 | dataflintEnvironmentInfo = readAll[DataflintEnvironmentInfoWrapper], 37 | dataflintRDDStorageInfo = readAll[DataflintRDDStorageInfoWrapper] 38 | ) 39 | } 40 | 41 | private def calculateTaskSummary(): Seq[StageTaskSummary] = { 42 | val quantiles = Array(0.0, 0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99, 1.0) 43 | store.stageList(null).map(stage => { 44 | store.taskSummary(stage.stageId, stage.attemptId, quantiles).map( 45 | StageTaskSummary(stage.stageId, stage.attemptId, _) 46 | ) 47 | }).filter(_.isDefined).map(_.get) 48 | } 49 | 50 | private def readAll[T: ClassTag]: Seq[T] = { 51 | val view = kvStore.view(classTag[T].runtimeClass) 52 | val it = view.closeableIterator() 53 | try { 54 | it.toSeq.asInstanceOf[Seq[T]] 55 | } finally { 56 | it.close() 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/deploy/history/DataFlintHistoryServerPlugin.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.deploy.history 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.dataflint.DataflintSparkUILoader 5 | import 
org.apache.spark.dataflint.listener.DataflintListener 6 | import org.apache.spark.scheduler.SparkListener 7 | import org.apache.spark.status.{AppHistoryServerPlugin, ElementTrackingStore, LiveRDDsListener} 8 | import org.apache.spark.ui.SparkUI 9 | 10 | class DataFlintHistoryServerPlugin extends AppHistoryServerPlugin { 11 | override def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] = { 12 | Seq(new DataflintListener(store), new LiveRDDsListener(store)) 13 | } 14 | 15 | override def setupUI(ui: SparkUI): Unit = { 16 | DataflintSparkUILoader.loadUI(ui) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /spark-plugin/plugin/src/main/scala/org/apache/spark/deploy/history/FsDataflintHistoryProvider.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.deploy.history 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.dataflint.DataflintSparkUILoader 5 | import org.apache.spark.status.AppHistoryServerPlugin 6 | import org.apache.spark.util.{Clock, SystemClock, Utils} 7 | 8 | import java.util.ServiceLoader 9 | import scala.collection.JavaConverters._ 10 | 11 | // This class is no longer needed, since history server loading is now done via DataFlintListenerHistoryServerPlugin. 12 | // It will be removed in the future, but because users may have already configured it as their history provider, removing it now would break those deployments. 13 | class FsDataflintHistoryProvider(conf: SparkConf, clock: Clock) extends FsHistoryProvider(conf, clock) { 14 | def this(conf: SparkConf) = { 15 | this(conf, new SystemClock()) 16 | } 17 | 18 | override def getAppUI(appId: String, attemptId: Option[String]): Option[LoadedAppUI] = { 19 | super.getAppUI(appId, attemptId) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /spark-plugin/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.9.8 2 | -------------------------------------------------------------------------------- /spark-plugin/project/publish.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.11.2") 2 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") 3 | addSbtPlugin("com.github.sbt" % "sbt-git" % "2.0.1") 4 | addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.6.0") -------------------------------------------------------------------------------- /spark-plugin/sonatype.sbt: -------------------------------------------------------------------------------- 1 | 2 | import xerial.sbt.Sonatype._ 3 | 4 | ThisBuild / sonatypeCredentialHost := "s01.oss.sonatype.org" 5 | 6 | sonatypeProfileName := "io.dataflint" 7 | 8 | ThisBuild / sonatypeProfileName := "io.dataflint" 9 | 10 | ThisBuild / publishMavenStyle := true 11 | 12 | ThisBuild / licenses := Seq("APL2" -> url("http://www.apache.org/licenses/LICENSE-2.0.txt")) 13 | 14 | ThisBuild / sonatypeProjectHosting := Some(GitHubHosting("menishmueli", "dataflint/spark", "menishmueli@gmail.com")) 15 | 16 | ThisBuild / description := "Open Source Data-Application Performance Monitoring for Apache Spark" 17 | 18 | ThisBuild / homepage := Some(url("https://github.com/dataflint/spark")) 19 | ThisBuild / scmInfo := Some( 20 | ScmInfo( 21 | url("https://github.com/dataflint/spark"), 22 | "scm:git@github.com:dataflint/spark.git" 23 | ) 24 | ) 25 | ThisBuild /
developers := List( 26 | Developer( 27 | id = "menishmueli", 28 | name = "Meni Shmueli", 29 | email = "menishmueli@gmail.com", 30 | url = url("http://dataflint.io") 31 | ) 32 | ) 33 | -------------------------------------------------------------------------------- /spark-ui/.env: -------------------------------------------------------------------------------- 1 | REACT_APP_VERSION=$npm_package_version 2 | INLINE_RUNTIME_CHUNK=false 3 | GENERATE_SOURCEMAP=false 4 | SKIP_PREFLIGHT_CHECK=true 5 | -------------------------------------------------------------------------------- /spark-ui/.generatelicensefile.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | - ./package.json 3 | output: ./../THIRD-PARTY-LICENSES.txt 4 | overwrite: true 5 | eol: lf 6 | ci: true 7 | no-spinner: true 8 | replace: 9 | '@bcoe/v8-coverage@0.2.3': https://raw.githubusercontent.com/demurgos/v8-coverage/refs/heads/master/ts/LICENSE.md 10 | doctrine@3.0.0: https://raw.githubusercontent.com/eslint/doctrine/refs/heads/master/LICENSE 11 | doctrine@2.1.0: https://raw.githubusercontent.com/eslint/doctrine/refs/heads/master/LICENSE 12 | harmony-reflect@1.6.2: https://raw.githubusercontent.com/tvcutsem/harmony-reflect/refs/heads/master/LICENSE 13 | sockjs@0.3.24: https://raw.githubusercontent.com/sockjs/sockjs-node/refs/heads/main/LICENSE 14 | 15 | -------------------------------------------------------------------------------- /spark-ui/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # production 12 | /build 13 | 14 | # IDEs and editors 15 | /.idea 16 | /.vscode 17 | 18 | # misc 19 | .DS_Store 20 | .env.local 21 | .env.development.local 22 | .env.test.local 23 | .env.production.local 24 | 25 | npm-debug.log* 26 | yarn-debug.log* 27 | yarn-error.log* 28 | -------------------------------------------------------------------------------- /spark-ui/gulpfile.js: -------------------------------------------------------------------------------- 1 | const gulp = require('gulp'); 2 | const inlinesource = require('gulp-inline-source'); 3 | const replace = require('gulp-replace'); 4 | 5 | gulp.task('default', () => { 6 | return gulp 7 | .src('./build/*.html') 8 | .pipe(replace('.js">', '.js" inline>')) 9 | .pipe(replace('', '')) 10 | .pipe(replace('', '')) 11 | .pipe(replace('rel="stylesheet">', 'rel="stylesheet" inline>')) 12 | .pipe( 13 | inlinesource({ 14 | compress: false 15 | }) 16 | ) 17 | .pipe(gulp.dest('./build')); 18 | }); 19 | -------------------------------------------------------------------------------- /spark-ui/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/favicon.ico -------------------------------------------------------------------------------- /spark-ui/public/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/icon.png -------------------------------------------------------------------------------- /spark-ui/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
7 | 8 | 12 | 13 | 22 | DataFlint 23 | 24 | 28 | 29 | 30 | 31 |
32 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /spark-ui/public/logo-grey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/logo-grey.png -------------------------------------------------------------------------------- /spark-ui/public/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataflint/spark/ae8495c730f9b27c88cd26d5072b1d59f28fefe2/spark-ui/public/logo.png -------------------------------------------------------------------------------- /spark-ui/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "Your Orders", 3 | "name": "Your Orders", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | } 10 | ], 11 | "start_url": ".", 12 | "display": "standalone", 13 | "theme_color": "#000000", 14 | "background_color": "#ffffff" 15 | } 16 | -------------------------------------------------------------------------------- /spark-ui/src/App.tsx: -------------------------------------------------------------------------------- 1 | import Box from "@mui/material/Box"; 2 | import * as React from "react"; 3 | import { Outlet, useLocation, useNavigate } from "react-router-dom"; 4 | import { AppDrawer } from "./components/AppDrawer/AppDrawer"; 5 | import DisconnectedModal from "./components/Modals/DisconnectedModal"; 6 | import Progress from "./components/Progress"; 7 | import { useAppDispatch, useAppSelector } from "./Hooks"; 8 | import SparkAPI from "./services/SparkApi"; 9 | import { getTabByUrl, Tab, TabToUrl } from "./services/TabsService"; 10 | import { 11 | BASE_CURRENT_PAGE, 12 | BASE_PATH, 13 | IS_HISTORY_SERVER_MODE, 14 | } from "./utils/UrlConsts"; 15 | 16 | const DOCUMENT_TITLE_PREFIX = "DataFlint - "; 17 | 18 | export default function App() { 19 | const location = useLocation(); 20 | const navigate = useNavigate(); 21 | 22 | const dispatch = useAppDispatch(); 23 | const store = useAppSelector((state) => state.spark); 24 | const [selectedTab, setSelectedTab] = React.useState(Tab.Status); 25 | 26 | React.useEffect(() => { 27 | const sparkAPI = new SparkAPI( 28 | BASE_PATH, 29 | BASE_CURRENT_PAGE, 30 | dispatch, 31 | IS_HISTORY_SERVER_MODE, 32 | ); 33 | const cleanerFunc = sparkAPI.start(); 34 | return cleanerFunc; 35 | }, []); 36 | 37 | React.useEffect(() => { 38 | if (store.runMetadata?.appName) { 39 | document.title = DOCUMENT_TITLE_PREFIX + store.runMetadata.appName; 40 | } 41 | }, [store.runMetadata?.appName]); 42 | 43 | React.useEffect(() => { 44 | if (!location || !location.pathname) return; 45 | 46 | setSelectedTab(getTabByUrl(location.pathname)); 47 | }, [location]); 48 | 49 | const onTabChanged = (tab: Tab): void => { 50 | setSelectedTab(tab); 51 | navigate(TabToUrl[tab]); 52 | }; 53 | 54 | return !store.isInitialized ? ( 55 | 56 | ) : ( 57 | 58 | 59 | 64 | 68 | theme.palette.mode === "light" 69 | ? 
theme.palette.grey[100] 70 | : theme.palette.grey[900], 71 | flexGrow: 1, 72 | height: "100vh", 73 | overflow: "hidden", 74 | }} 75 | > 76 | 77 | 78 | 79 | ); 80 | } 81 | -------------------------------------------------------------------------------- /spark-ui/src/Hooks.ts: -------------------------------------------------------------------------------- 1 | import { TypedUseSelectorHook, useDispatch, useSelector } from "react-redux"; 2 | import type { AppDispatch, RootState } from "./Store"; 3 | 4 | // Use throughout your app instead of plain `useDispatch` and `useSelector` 5 | export const useAppDispatch: () => AppDispatch = useDispatch; 6 | export const useAppSelector: TypedUseSelectorHook = useSelector; 7 | -------------------------------------------------------------------------------- /spark-ui/src/Router.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { createHashRouter } from "react-router-dom"; 3 | import App from "./App"; 4 | import { AlertsTab } from "./tabs/AlertsTab"; 5 | import ConfigurationTab from "./tabs/ConfigurationTab"; 6 | import { ResourcesTab } from "./tabs/ResourcesTab"; 7 | import StatusTab from "./tabs/StatusTab"; 8 | import SummaryTab from "./tabs/SummaryTab"; 9 | import { isHistoryServer } from "./utils/UrlUtils"; 10 | 11 | const isHistoryServerMode = isHistoryServer(); 12 | 13 | export const reactRouter = createHashRouter([ 14 | { 15 | path: "/", 16 | element: , 17 | children: [ 18 | { 19 | index: true, 20 | element: isHistoryServerMode ? : , 21 | }, 22 | { 23 | path: "/status", 24 | element: , 25 | }, 26 | { 27 | path: "/config", 28 | element: , 29 | }, 30 | { 31 | path: "/alerts", 32 | element: , 33 | }, 34 | { 35 | path: "/summary", 36 | element: , 37 | }, 38 | { 39 | path: "/resources", 40 | element: , 41 | }, 42 | ], 43 | }, 44 | ]); 45 | -------------------------------------------------------------------------------- /spark-ui/src/Store.ts: -------------------------------------------------------------------------------- 1 | import { configureStore } from "@reduxjs/toolkit"; 2 | import ChatSlice from "./reducers/ChatSlice"; 3 | import GeneralSlice from "./reducers/GeneralSlice"; 4 | import jobsColumnsReducer from "./reducers/JobsColumnSlice"; 5 | import SparkSlice from "./reducers/SparkSlice"; 6 | 7 | const store = configureStore({ 8 | reducer: { 9 | spark: SparkSlice, 10 | chat: ChatSlice, 11 | general: GeneralSlice, 12 | jobsColumns: jobsColumnsReducer, 13 | }, 14 | }); 15 | 16 | // Infer the `RootState` and `AppDispatch` types from the store itself 17 | export type RootState = ReturnType; 18 | // Inferred type: {posts: PostsState, comments: CommentsState, users: UsersState} 19 | export type AppDispatch = typeof store.dispatch; 20 | 21 | export default store; 22 | -------------------------------------------------------------------------------- /spark-ui/src/components/AlertBadge/AlertBadge.tsx: -------------------------------------------------------------------------------- 1 | import ErrorIcon from "@mui/icons-material/Error"; 2 | import WarningIcon from "@mui/icons-material/Warning"; 3 | import { Alert, AlertTitle, styled } from "@mui/material"; 4 | import Tooltip, { tooltipClasses, TooltipProps } from "@mui/material/Tooltip"; 5 | import * as React from "react"; 6 | import { Alert as DataflintAlert } from "../../interfaces/AppStore"; 7 | 8 | type InfoBoxProps = { 9 | alert?: DataflintAlert; 10 | margin?: string; 11 | placement?: 12 | | "top" 13 | | "right" 14 | | "bottom" 
15 | | "left" 16 | | "bottom-end" 17 | | "bottom-start" 18 | | "left-end" 19 | | "left-start" 20 | | "right-end" 21 | | "right-start" 22 | | "top-end" 23 | | "top-start"; 24 | }; 25 | 26 | export const TransperantTooltip = styled( 27 | ({ className, ...props }: TooltipProps) => ( 28 | 29 | ), 30 | )(({ theme }) => ({ 31 | [`& .${tooltipClasses.tooltip}`]: { 32 | backgroundColor: "transparent", 33 | }, 34 | })); 35 | 36 | export default function AlertBadge({ alert, margin, placement }: InfoBoxProps) { 37 | return alert !== undefined ? ( 38 | 42 | : } 45 | > 46 | {alert.title} 47 | {alert.message} 48 | {alert.shortSuggestion !== undefined && ( 49 | <> 50 |
51 | Recommended Fix: 52 |
53 | {alert.shortSuggestion} 54 | 55 | )} 56 |
57 | 58 | } 59 | > 60 | {alert.type === "warning" ? ( 61 | 71 | ) : ( 72 | 82 | )} 83 |
84 | ) : null; 85 | } 86 | -------------------------------------------------------------------------------- /spark-ui/src/components/AlertBadge/MultiAlertsBadge.tsx: -------------------------------------------------------------------------------- 1 | import ErrorIcon from "@mui/icons-material/Error"; 2 | import WarningIcon from "@mui/icons-material/Warning"; 3 | import { Alert, AlertTitle } from "@mui/material"; 4 | import * as React from "react"; 5 | import { Alert as DataflintAlert } from "../../interfaces/AppStore"; 6 | import { TransperantTooltip } from "./AlertBadge"; 7 | 8 | type ToggableAlertProps = { 9 | alerts: DataflintAlert[]; 10 | }; 11 | 12 | export default function MultiAlertBadge({ alerts }: ToggableAlertProps) { 13 | const alert = alerts.length > 0 ? alerts[0] : undefined; 14 | return alert !== undefined ? ( 15 | 19 | : } 22 | > 23 | {alert.title} 24 | {alert.message} 25 | {alerts.length > 1 ? ( 26 | 27 |
+ {alerts.length} additional alerts 28 |
29 | ) : ( 30 | "" 31 | )} 32 |
33 | 34 | } 35 | > 36 | {alert.type === "warning" ? ( 37 | 42 | ) : ( 43 | 48 | )} 49 |
50 | ) : null; 51 | } 52 | -------------------------------------------------------------------------------- /spark-ui/src/components/AppDrawer/DrawerFooter.tsx: -------------------------------------------------------------------------------- 1 | import Button from "@mui/material/Button"; 2 | import * as React from "react"; 3 | import { 4 | BASE_CURRENT_PAGE, 5 | IS_HISTORY_SERVER_MODE 6 | } from "../../utils/UrlConsts"; 7 | import { getBaseAppUrl, getProxyBasePath, isDataFlintSaaSUI } from "../../utils/UrlUtils"; 8 | 9 | export default function DrawerFooter({ version }: { version?: string }) { 10 | const onSparkUiClick = (): void => { 11 | window.location.href = `${getBaseAppUrl(BASE_CURRENT_PAGE)}/jobs/`; 12 | }; 13 | 14 | const onHistoryServerClick = (): void => { 15 | const basePath = getProxyBasePath(); 16 | window.location.href = basePath + "/history/"; 17 | }; 18 | 19 | return ( 20 |
29 | 32 | {IS_HISTORY_SERVER_MODE && !isDataFlintSaaSUI() ? ( 33 | 36 | ) : null} 37 | {`Version ${version}`} 38 |
39 | ); 40 | } 41 | -------------------------------------------------------------------------------- /spark-ui/src/components/ColumnPicker/ColumnPicker.tsx: -------------------------------------------------------------------------------- 1 | import { 2 | Checkbox, 3 | FormControl, 4 | InputLabel, 5 | ListItemText, 6 | MenuItem, 7 | OutlinedInput, 8 | Select, 9 | } from "@mui/material"; 10 | import React from "react"; 11 | 12 | const ITEM_HEIGHT = 48; 13 | const ITEM_PADDING_TOP = 8; 14 | const MenuProps = { 15 | PaperProps: { 16 | style: { 17 | maxHeight: ITEM_HEIGHT * 4.5 + ITEM_PADDING_TOP, 18 | width: 300, 19 | }, 20 | }, 21 | }; 22 | 23 | interface ColumnPickerProps { 24 | headCells: { id: string; label: string }[]; 25 | visibleColumns: string[]; 26 | onToggleColumn: (columnId: string[]) => void; 27 | } 28 | 29 | const ColumnPicker: React.FC = ({ 30 | headCells, 31 | visibleColumns, 32 | onToggleColumn, 33 | }) => { 34 | const handleChange = (event: any) => { 35 | const { 36 | target: { value }, 37 | } = event; 38 | // Update visible columns 39 | onToggleColumn(typeof value === "string" ? value.split(",") : value); 40 | }; 41 | 42 | return ( 43 | 44 | Columns 45 | 67 | 68 | ); 69 | }; 70 | 71 | export default ColumnPicker; -------------------------------------------------------------------------------- /spark-ui/src/components/ConfigTable.tsx: -------------------------------------------------------------------------------- 1 | import Paper from "@mui/material/Paper"; 2 | import Table from "@mui/material/Table"; 3 | import TableBody from "@mui/material/TableBody"; 4 | import TableCell from "@mui/material/TableCell"; 5 | import TableContainer from "@mui/material/TableContainer"; 6 | import TableHead from "@mui/material/TableHead"; 7 | import TableRow from "@mui/material/TableRow"; 8 | import * as React from "react"; 9 | import { ConfigEntries } from "../interfaces/AppStore"; 10 | 11 | type ConfigTableProps = { 12 | config: ConfigEntries; 13 | }; 14 | 15 | export default function ConfigTable({ config }: ConfigTableProps) { 16 | return ( 17 | 18 | 19 | 20 | 21 | Name 22 | Value 23 | Documentation 24 | 25 | 26 | 27 | {config.map((row) => ( 28 | 29 | {row.name} 30 | 31 | {row.value ?? row.default} 32 | {row.value === undefined || row.value === row.default 33 | ? " (default)" 34 | : ""} 35 | 36 | 37 | {row.documentation} 38 | 39 | 40 | ))} 41 | 42 |
43 |
44 | ); 45 | } 46 | -------------------------------------------------------------------------------- /spark-ui/src/components/ExceptionIcon.tsx: -------------------------------------------------------------------------------- 1 | import ErrorOutlineIcon from "@mui/icons-material/ErrorOutline"; 2 | import { Fade, Snackbar } from "@mui/material"; 3 | import { styled } from "@mui/material/styles"; 4 | import Tooltip, { tooltipClasses, TooltipProps } from "@mui/material/Tooltip"; 5 | import * as React from "react"; 6 | 7 | const CustomWidthTooltip = styled(({ className, ...props }: TooltipProps) => ( 8 | 9 | ))({ 10 | [`& .${tooltipClasses.tooltip}`]: { 11 | maxWidth: 500, 12 | maxHeight: 300, 13 | overflow: "auto", 14 | whiteSpace: "pre", 15 | }, 16 | }); 17 | 18 | const onTooltipClick = ( 19 | event: React.MouseEvent, 20 | failureReason: string, 21 | setOpenSnackbar: React.Dispatch>, 22 | ) => { 23 | event.stopPropagation(); 24 | setOpenSnackbar(true); 25 | navigator.clipboard.writeText(failureReason); 26 | }; 27 | 28 | const formatFailureReason = (failureReason: string) => { 29 | const regex = /(Caused by:.*?)(?=\n)/s; 30 | const match = regex.exec(failureReason); 31 | 32 | if (match) { 33 | const causedByText = match[1].trim(); 34 | return `${causedByText}\nFull stacktrace:\n${failureReason}`; 35 | } 36 | 37 | return failureReason; 38 | }; 39 | 40 | const ExceptionIcon: React.FC<{ failureReason: string }> = ({ 41 | failureReason, 42 | }): JSX.Element => { 43 | const [openSnackbar, setOpenSnackbar] = React.useState(false); 44 | 45 | const handleClose = ( 46 | event: React.SyntheticEvent | Event, 47 | reason?: string, 48 | ) => { 49 | if (reason === "clickaway") { 50 | return; 51 | } 52 | 53 | setOpenSnackbar(false); 54 | }; 55 | 56 | const formatedFailureReason = formatFailureReason(failureReason); 57 | return ( 58 |
onTooltipClick(event, failureReason, setOpenSnackbar)} 60 | > 61 | 69 | 73 | 74 | 80 |
81 | ); 82 | }; 83 | 84 | export default ExceptionIcon; 85 | -------------------------------------------------------------------------------- /spark-ui/src/components/InfoBox/InfoBox.module.css: -------------------------------------------------------------------------------- 1 | @keyframes blinkAnimation { 2 | 0% { 3 | opacity: 1; 4 | } 5 | 50% { 6 | opacity: 0.2; 7 | } 8 | 100% { 9 | opacity: 1; 10 | } 11 | } 12 | 13 | .blink { 14 | animation: blinkAnimation 0.5s linear; 15 | } 16 | -------------------------------------------------------------------------------- /spark-ui/src/components/Modals/DisconnectedModal.tsx: -------------------------------------------------------------------------------- 1 | import { Box, Fade, Modal, Typography } from "@mui/material"; 2 | import Backdrop from "@mui/material/Backdrop"; 3 | import React, { FC } from "react"; 4 | import { useAppSelector } from "../../Hooks"; 5 | 6 | const style = { 7 | position: "absolute" as "absolute", 8 | top: "50%", 9 | left: "50%", 10 | transform: "translate(-50%, -50%)", 11 | width: 400, 12 | bgcolor: "#383838", 13 | outline: "none", 14 | borderRadius: "4px", 15 | boxShadow: 24, 16 | p: 4, 17 | }; 18 | 19 | const DisconnectedModal: FC = (): JSX.Element => { 20 | const { isConnected, isInitialized } = useAppSelector((state) => state.spark); 21 | 22 | const open = !isConnected && isInitialized; 23 | 24 | return ( 25 | 36 | 37 | 38 | 39 | Server disconnected 40 | 41 | 42 | Trying to reconnect... 43 | 44 | 45 | 46 | 47 | ); 48 | }; 49 | 50 | export default DisconnectedModal; 51 | -------------------------------------------------------------------------------- /spark-ui/src/components/NoQuery/NoQuery.tsx: -------------------------------------------------------------------------------- 1 | import { Alert } from "@mui/material"; 2 | import React from "react"; 3 | 4 | const NoQuery = () => { 5 | return No Spark SQL query currently running; 6 | }; 7 | 8 | export default NoQuery; 9 | -------------------------------------------------------------------------------- /spark-ui/src/components/Progress.tsx: -------------------------------------------------------------------------------- 1 | import { Box, CircularProgress } from "@mui/material"; 2 | import React, { FC } from "react"; 3 | import "reactflow/dist/style.css"; 4 | 5 | const Progress: FC = ({}): JSX.Element => { 6 | return ( 7 | 13 | 14 | 15 | ); 16 | }; 17 | 18 | export default Progress; 19 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlContainer.tsx: -------------------------------------------------------------------------------- 1 | import React, { FC } from "react"; 2 | import "reactflow/dist/style.css"; 3 | import { useAppSelector } from "../Hooks"; 4 | import Progress from "./Progress"; 5 | import SqlFlow from "./SqlFlow/SqlFlow"; 6 | 7 | const SqlContainer: FC = (): JSX.Element => { 8 | const sql = useAppSelector((state) => state.spark.sql); 9 | return sql === undefined ? ( 10 | 11 | ) : ( 12 |
13 | 14 |
15 | ); 16 | }; 17 | 18 | export default SqlContainer; 19 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlFlow/BytesDistributionChart.tsx: -------------------------------------------------------------------------------- 1 | import { ApexOptions } from "apexcharts"; 2 | import React from "react"; 3 | import ReactApexChart from "react-apexcharts"; 4 | import { humanFileSize } from "../../utils/FormatUtils"; 5 | 6 | export default function BytesDistributionChart({ 7 | bytesDist, 8 | title 9 | }: { 10 | bytesDist: number[]; 11 | title: string, 12 | }): JSX.Element { 13 | const series = [ 14 | { 15 | name: title, 16 | data: bytesDist, 17 | }, 18 | ]; 19 | 20 | const options: ApexOptions = { 21 | plotOptions: { 22 | bar: { 23 | horizontal: false, 24 | }, 25 | }, 26 | chart: { 27 | animations: { 28 | enabled: false, 29 | }, 30 | toolbar: { 31 | show: false, 32 | }, 33 | zoom: { 34 | enabled: false, 35 | }, 36 | }, 37 | dataLabels: { 38 | enabled: false, 39 | }, 40 | stroke: { 41 | show: true, 42 | width: 2, 43 | colors: ["transparent"], 44 | }, 45 | xaxis: { 46 | categories: [ 47 | "min", 48 | "0.1", 49 | "0.2", 50 | "0.3", 51 | "0.4", 52 | "0.5", 53 | "0.6", 54 | "0.7", 55 | "0.8", 56 | "0.9", 57 | "max", 58 | ], 59 | }, 60 | yaxis: { 61 | title: { 62 | text: title, 63 | }, 64 | labels: { 65 | formatter: (value: number, timestamp?: number, opts?: any) => 66 | humanFileSize(value), 67 | }, 68 | }, 69 | theme: { 70 | mode: "dark", 71 | }, 72 | }; 73 | 74 | return ( 75 | 76 | ); 77 | } 78 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlFlow/DurationDistributionChart.tsx: -------------------------------------------------------------------------------- 1 | import { ApexOptions } from "apexcharts"; 2 | import { duration } from "moment"; 3 | import React from "react"; 4 | import ReactApexChart from "react-apexcharts"; 5 | import { humanizeTimeDiff } from "../../utils/FormatUtils"; 6 | 7 | export default function DurationDistributionChart({ 8 | durationDist, 9 | }: { 10 | durationDist: number[]; 11 | }): JSX.Element { 12 | const series = [ 13 | { 14 | name: "Duration", 15 | data: durationDist, 16 | }, 17 | ]; 18 | 19 | const options: ApexOptions = { 20 | plotOptions: { 21 | bar: { 22 | horizontal: false, 23 | }, 24 | }, 25 | chart: { 26 | animations: { 27 | enabled: false, 28 | }, 29 | toolbar: { 30 | show: false, 31 | }, 32 | zoom: { 33 | enabled: false, 34 | }, 35 | }, 36 | dataLabels: { 37 | enabled: false, 38 | }, 39 | stroke: { 40 | show: true, 41 | width: 2, 42 | colors: ["transparent"], 43 | }, 44 | xaxis: { 45 | categories: [ 46 | "min", 47 | "0.1", 48 | "0.2", 49 | "0.3", 50 | "0.4", 51 | "0.5", 52 | "0.6", 53 | "0.7", 54 | "0.8", 55 | "0.9", 56 | "max", 57 | ], 58 | }, 59 | yaxis: { 60 | title: { 61 | text: "Duration", 62 | }, 63 | labels: { 64 | formatter: (value: number, timestamp?: number, opts?: any) => 65 | humanizeTimeDiff(duration(value)), 66 | }, 67 | }, 68 | theme: { 69 | mode: "dark", 70 | }, 71 | }; 72 | 73 | return ( 74 | 75 | ); 76 | } 77 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlFlow/NumbersDistributionChart.tsx: -------------------------------------------------------------------------------- 1 | import { ApexOptions } from "apexcharts"; 2 | import React from "react"; 3 | import ReactApexChart from "react-apexcharts"; 4 | 5 | export default function NumbersDistributionChart({ 6 | 
numbersDist, 7 | title 8 | }: { 9 | numbersDist: number[]; 10 | title: string, 11 | }): JSX.Element { 12 | const series = [ 13 | { 14 | name: title, 15 | data: numbersDist, 16 | }, 17 | ]; 18 | 19 | const options: ApexOptions = { 20 | plotOptions: { 21 | bar: { 22 | horizontal: false, 23 | }, 24 | }, 25 | chart: { 26 | animations: { 27 | enabled: false, 28 | }, 29 | toolbar: { 30 | show: false, 31 | }, 32 | zoom: { 33 | enabled: false, 34 | }, 35 | }, 36 | dataLabels: { 37 | enabled: false, 38 | }, 39 | stroke: { 40 | show: true, 41 | width: 2, 42 | colors: ["transparent"], 43 | }, 44 | xaxis: { 45 | categories: [ 46 | "min", 47 | "0.1", 48 | "0.2", 49 | "0.3", 50 | "0.4", 51 | "0.5", 52 | "0.6", 53 | "0.7", 54 | "0.8", 55 | "0.9", 56 | "max", 57 | ], 58 | }, 59 | yaxis: { 60 | title: { 61 | text: title, 62 | }, 63 | }, 64 | theme: { 65 | mode: "dark", 66 | }, 67 | }; 68 | 69 | return ( 70 | 71 | ); 72 | } 73 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlFlow/SqlLayoutService.ts: -------------------------------------------------------------------------------- 1 | import dagre from "dagre"; 2 | import { Edge, Node, Position } from "reactflow"; 3 | import { v4 as uuidv4 } from "uuid"; 4 | import { 5 | EnrichedSparkSQL, 6 | EnrichedSqlEdge, 7 | EnrichedSqlNode, 8 | GraphFilter, 9 | } from "../../interfaces/AppStore"; 10 | import { StageNodeName } from "./StageNode"; 11 | 12 | const nodeWidth = 280; 13 | const nodeHeight = 280; 14 | 15 | const getLayoutedElements = ( 16 | nodes: Node[], 17 | edges: Edge[], 18 | ): { layoutNodes: Node[]; layoutEdges: Edge[] } => { 19 | const dagreGraph = new dagre.graphlib.Graph(); 20 | dagreGraph.setDefaultEdgeLabel(() => ({})); 21 | dagreGraph.setGraph({ rankdir: "LR" }); 22 | 23 | nodes.forEach((node) => { 24 | dagreGraph.setNode(node.id, { width: nodeWidth, height: nodeHeight }); 25 | }); 26 | 27 | edges.forEach((edge) => { 28 | dagreGraph.setEdge(edge.source, edge.target); 29 | }); 30 | 31 | dagre.layout(dagreGraph); 32 | 33 | nodes.forEach((node) => { 34 | const nodeWithPosition = dagreGraph.node(node.id); 35 | node.targetPosition = Position.Left; 36 | node.sourcePosition = Position.Right; 37 | 38 | // We are shifting the dagre node position (anchor=center center) to the top left 39 | // so it matches the React Flow node anchor point (top left). 
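    // For example (illustrative numbers): a 280x280 node whose dagre center is (140, 140) ends up
    // at React Flow position (140 - 280 / 2, 140 - 280 / 2) = (0, 0).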
40 | node.position = { 41 | x: nodeWithPosition.x - nodeWidth / 2, 42 | y: nodeWithPosition.y - nodeHeight / 2, 43 | }; 44 | 45 | return node; 46 | }); 47 | 48 | return { layoutNodes: nodes, layoutEdges: edges }; 49 | }; 50 | 51 | class SqlLayoutService { 52 | static SqlElementsToLayout( 53 | sql: EnrichedSparkSQL, 54 | graphFilter: GraphFilter, 55 | ): { layoutNodes: Node[]; layoutEdges: Edge[] } { 56 | const { nodesIds, edges } = sql.filters[graphFilter]; 57 | 58 | const flowNodes: Node[] = sql.nodes 59 | .filter((node) => nodesIds.includes(node.nodeId)) 60 | .map((node: EnrichedSqlNode) => { 61 | return { 62 | id: node.nodeId.toString(), 63 | data: { sqlId: sql.id, node: node }, 64 | type: StageNodeName, 65 | position: { x: 0, y: 0 }, 66 | }; 67 | }); 68 | const flowEdges: Edge[] = edges.map((edge: EnrichedSqlEdge) => { 69 | return { 70 | id: uuidv4(), 71 | source: edge.fromId.toString(), 72 | animated: true, 73 | target: edge.toId.toString(), 74 | }; 75 | }); 76 | 77 | const { layoutNodes, layoutEdges } = getLayoutedElements( 78 | flowNodes, 79 | flowEdges, 80 | ); 81 | return { layoutNodes: layoutNodes, layoutEdges: layoutEdges }; 82 | } 83 | } 84 | 85 | export default SqlLayoutService; 86 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlFlow/node-style.module.css: -------------------------------------------------------------------------------- 1 | .node { 2 | display: flex; 3 | justify-content: center; 4 | width: 280px; 5 | height: 280px; 6 | color: black; 7 | padding: 8px; 8 | background: white; 9 | border: 3px solid lightblue; 10 | border-radius: 10px; 11 | } 12 | 13 | .textWrapper { 14 | display: flex; 15 | flex-direction: column; 16 | justify-content: flex-start; 17 | height: 190px; 18 | } 19 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlTable/TableTypes.tsx: -------------------------------------------------------------------------------- 1 | export type Order = "asc" | "desc"; 2 | 3 | export interface EnhancedTableProps { 4 | onRequestSort: ( 5 | event: React.MouseEvent, 6 | property: keyof Data, 7 | ) => void; 8 | order: Order; 9 | orderBy: string; 10 | headCells: HeadCell[]; 11 | visibleColumns: string[]; 12 | } 13 | 14 | export interface Data { 15 | id: string; 16 | status: string; 17 | description: string; 18 | duration: number; 19 | durationPercentage: number; 20 | dcu: number; 21 | dcuPercentage: number; 22 | input: number; 23 | output: number; 24 | idleCores: number; 25 | spill: number; 26 | totalTasks: number; 27 | shuffleReadBytes: number; 28 | shuffleWriteBytes: number; 29 | executorRunTime: number; 30 | failureReason: string; 31 | } 32 | 33 | export interface HeadCell { 34 | disablePadding: boolean; 35 | id: keyof Data; 36 | label: string; 37 | numeric: boolean; 38 | initiallyVisible: boolean; 39 | } 40 | -------------------------------------------------------------------------------- /spark-ui/src/components/SqlTable/TableUtils.tsx: -------------------------------------------------------------------------------- 1 | import { Order } from "./TableTypes"; 2 | 3 | export function descendingComparator(a: T, b: T, orderBy: keyof T) { 4 | if (b[orderBy] < a[orderBy]) { 5 | return -1; 6 | } 7 | if (b[orderBy] > a[orderBy]) { 8 | return 1; 9 | } 10 | return 0; 11 | } 12 | 13 | export function getComparator( 14 | order: Order, 15 | orderBy: Key, 16 | ): ( 17 | a: { [key in Key]: number | string }, 18 | b: { [key in Key]: number | string }, 19 | ) => 
number { 20 | return order === "desc" 21 | ? (a, b) => descendingComparator(a, b, orderBy) 22 | : (a, b) => -descendingComparator(a, b, orderBy); 23 | } 24 | 25 | // Since 2020 all major browsers ensure sort stability with Array.prototype.sort(). 26 | // stableSort() brings sort stability to non-modern browsers (notably IE11). If you 27 | // only support modern browsers you can replace stableSort(exampleArray, exampleComparator) 28 | // with exampleArray.slice().sort(exampleComparator) 29 | export function stableSort( 30 | array: readonly T[], 31 | comparator: (a: T, b: T) => number, 32 | ) { 33 | const stabilizedThis = array.map((el, index) => [el, index] as [T, number]); 34 | stabilizedThis.sort((a, b) => { 35 | const order = comparator(a[0], b[0]); 36 | if (order !== 0) { 37 | return order; 38 | } 39 | return a[1] - b[1]; 40 | }); 41 | return stabilizedThis.map((el) => el[0]); 42 | } 43 | -------------------------------------------------------------------------------- /spark-ui/src/index.tsx: -------------------------------------------------------------------------------- 1 | import CssBaseline from "@mui/material/CssBaseline"; 2 | import { ThemeProvider } from "@mui/material/styles"; 3 | import * as React from "react"; 4 | import * as ReactDOM from "react-dom/client"; 5 | import { Provider } from "react-redux"; 6 | import { RouterProvider } from "react-router-dom"; 7 | import { reactRouter } from "./Router"; 8 | import store from "./Store"; 9 | import theme from "./theme"; 10 | 11 | document.addEventListener("DOMContentLoaded", (event) => { 12 | const rootElement = document.getElementById("root"); 13 | const root = ReactDOM.createRoot(rootElement!); 14 | 15 | if (location.protocol === "https:") { 16 | var meta = document.createElement("meta"); 17 | meta.httpEquiv = "Content-Security-Policy"; 18 | meta.content = "upgrade-insecure-requests"; 19 | document.getElementsByTagName("head")[0].appendChild(meta); 20 | } 21 | 22 | root.render( 23 | 24 | 25 | {/* CssBaseline kickstart an elegant, consistent, and simple baseline to build upon. 
*/} 26 | 27 | 28 | 29 | , 30 | ); 31 | }); 32 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/ApplicationInfo.ts: -------------------------------------------------------------------------------- 1 | import { SparkApplication } from "./SparkApplications"; 2 | 3 | export interface EnvironmentInfo { 4 | driverXmxBytes?: number; 5 | } 6 | 7 | export interface ApplicationInfo { 8 | runId?: string; 9 | info: SparkApplication; 10 | environmentInfo?: EnvironmentInfo; 11 | } -------------------------------------------------------------------------------- /spark-ui/src/interfaces/CachedStorage.ts: -------------------------------------------------------------------------------- 1 | export interface DataflintExecutorStorageInfo { 2 | memoryUsed: number; 3 | memoryRemaining: number; 4 | memoryUsagePercentage: number; 5 | } 6 | 7 | export interface RddStorageInfo { 8 | rddId: number; 9 | memoryUsed: number; 10 | diskUsed: number; 11 | numOfPartitions: number; 12 | storageLevel: string; 13 | maxMemoryExecutorInfo: DataflintExecutorStorageInfo | undefined; 14 | } 15 | 16 | export interface CachedStorage { 17 | [stageId: string]: RddStorageInfo[]; 18 | } 19 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/IcebergInfo.ts: -------------------------------------------------------------------------------- 1 | export interface IcebergInfo { 2 | commitsInfo: IcebergCommitsInfo[]; 3 | } 4 | 5 | export interface IcebergCommitsInfo { 6 | executionId: number; 7 | tableName: string; 8 | commitId: number; 9 | operation: string; 10 | metrics: IcebergCommitMetrics; 11 | } 12 | 13 | export interface IcebergCommitMetrics { 14 | durationMS: number; 15 | attempts: number; 16 | addedDataFiles: number; 17 | removedDataFiles: number; 18 | totalDataFiles: number; 19 | addedDeleteFiles: number; 20 | addedEqualityDeleteFiles: number; 21 | addedPositionalDeleteFiles: number; 22 | removedDeleteFiles: number; 23 | removedEqualityDeleteFiles: number; 24 | removedPositionalDeleteFiles: number; 25 | totalDeleteFiles: number; 26 | addedRecords: number; 27 | removedRecords: number; 28 | totalRecords: number; 29 | addedFilesSizeInBytes: number; 30 | removedFilesSizeInBytes: number; 31 | totalFilesSizeInBytes: number; 32 | addedPositionalDeletes: number; 33 | removedPositionalDeletes: number; 34 | totalPositionalDeletes: number; 35 | addedEqualityDeletes: number; 36 | removedEqualityDeletes: number; 37 | totalEqualityDeletes: number; 38 | } 39 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/Mixpanel.ts: -------------------------------------------------------------------------------- 1 | export enum MixpanelEvents { 2 | SparkAppInitilized = "Spark App initilized", 3 | SqlSummarySelected = "Sql Summary Selected", 4 | KeepAlive = "Keep Alive", 5 | } 6 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/SQLPlan.ts: -------------------------------------------------------------------------------- 1 | export type SQLPlans = SQLPlan[]; 2 | 3 | export interface SQLPlan { 4 | executionId: number; 5 | numOfNodes: number; 6 | nodesPlan: SQLNodePlan[]; 7 | rddScopesToStages?: Record; 8 | } 9 | 10 | export interface StartAndAttempt { 11 | stageId: string; 12 | attemptId: string; 13 | } 14 | 15 | export interface SQLNodePlan { 16 | id: number; 17 | planDescription: string; 18 | rddScopeId?: string; 19 | } 20 | 
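// A minimal illustrative value (hypothetical, not taken from the codebase) showing how these
// interfaces fit together:
// const examplePlan: SQLPlan = {
//   executionId: 3,
//   numOfNodes: 1,
//   nodesPlan: [{ id: 0, planDescription: "FileScan parquet ...", rddScopeId: "12" }],
// };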
-------------------------------------------------------------------------------- /spark-ui/src/interfaces/SparkApplications.ts: -------------------------------------------------------------------------------- 1 | export type SparkApplications = SparkApplication[]; 2 | 3 | export interface SparkApplication { 4 | id: string; 5 | name: string; 6 | attempts: Attempt[]; 7 | } 8 | 9 | export interface Attempt { 10 | attemptId?: string; 11 | startTime: string; 12 | endTime: string; 13 | lastUpdated: string; 14 | duration: number; 15 | sparkUser: string; 16 | completed: boolean; 17 | appSparkVersion: string; 18 | startTimeEpoch: number; 19 | endTimeEpoch: number; 20 | lastUpdatedEpoch: number; 21 | } 22 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/SparkConfiguration.ts: -------------------------------------------------------------------------------- 1 | export interface SparkConfiguration { 2 | runtime: Runtime; 3 | sparkProperties: string[][]; 4 | hadoopProperties: string[][]; 5 | systemProperties: string[][]; 6 | metricsProperties: string[][]; 7 | classpathEntries: string[][]; 8 | resourceProfiles: ResourceProfile[]; 9 | } 10 | 11 | export interface Runtime { 12 | javaVersion: string; 13 | javaHome: string; 14 | scalaVersion: string; 15 | } 16 | 17 | export interface ResourceProfile { 18 | id: number; 19 | executorResources: ExecutorResources; 20 | taskResources: TaskResources; 21 | } 22 | 23 | export interface ExecutorResources { 24 | cores: Cores; 25 | memory: Memory; 26 | offHeap: OffHeap; 27 | } 28 | 29 | export interface Cores { 30 | resourceName: string; 31 | amount: number; 32 | discoveryScript: string; 33 | vendor: string; 34 | } 35 | 36 | export interface Memory { 37 | resourceName: string; 38 | amount: number; 39 | discoveryScript: string; 40 | vendor: string; 41 | } 42 | 43 | export interface OffHeap { 44 | resourceName: string; 45 | amount: number; 46 | discoveryScript: string; 47 | vendor: string; 48 | } 49 | 50 | export interface TaskResources { 51 | cpus: Cpus; 52 | } 53 | 54 | export interface Cpus { 55 | resourceName: string; 56 | amount: number; 57 | } 58 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/SparkExecutors.ts: -------------------------------------------------------------------------------- 1 | export type SparkExecutors = SparkExecutor[]; 2 | 3 | export interface SparkExecutor { 4 | id: string; 5 | hostPort: string; 6 | isActive: boolean; 7 | rddBlocks: number; 8 | memoryUsed: number; 9 | diskUsed: number; 10 | totalCores: number; 11 | maxTasks: number; 12 | activeTasks: number; 13 | failedTasks: number; 14 | completedTasks: number; 15 | totalTasks: number; 16 | totalDuration: number; 17 | totalGCTime: number; 18 | totalInputBytes: number; 19 | totalShuffleRead: number; 20 | totalShuffleWrite: number; 21 | isBlacklisted: boolean; 22 | maxMemory: number; 23 | addTime: string; 24 | executorLogs: ExecutorLogs; 25 | memoryMetrics: MemoryMetrics; 26 | blacklistedInStages: any[]; 27 | peakMemoryMetrics?: PeakMemoryMetrics; 28 | attributes: Attributes; 29 | resources: Resources; 30 | resourceProfileId: number; 31 | isExcluded: boolean; 32 | excludedInStages: any[]; 33 | removeTime: string; 34 | removeReason: string; 35 | } 36 | 37 | export interface ExecutorLogs { 38 | stdout?: string; 39 | stderr?: string; 40 | } 41 | 42 | export interface MemoryMetrics { 43 | usedOnHeapStorageMemory: number; 44 | usedOffHeapStorageMemory: number; 45 | 
totalOnHeapStorageMemory: number; 46 | totalOffHeapStorageMemory: number; 47 | } 48 | 49 | export interface PeakMemoryMetrics { 50 | JVMHeapMemory: number; 51 | JVMOffHeapMemory: number; 52 | OnHeapExecutionMemory: number; 53 | OffHeapExecutionMemory: number; 54 | OnHeapStorageMemory: number; 55 | OffHeapStorageMemory: number; 56 | OnHeapUnifiedMemory: number; 57 | OffHeapUnifiedMemory: number; 58 | DirectPoolMemory: number; 59 | MappedPoolMemory: number; 60 | ProcessTreeJVMVMemory: number; 61 | ProcessTreeJVMRSSMemory: number; 62 | ProcessTreePythonVMemory: number; 63 | ProcessTreePythonRSSMemory: number; 64 | ProcessTreeOtherVMemory: number; 65 | ProcessTreeOtherRSSMemory: number; 66 | MinorGCCount: number; 67 | MinorGCTime: number; 68 | MajorGCCount: number; 69 | MajorGCTime: number; 70 | TotalGCTime: number; 71 | } 72 | 73 | export interface Attributes {} 74 | 75 | export interface Resources {} 76 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/SparkJobs.ts: -------------------------------------------------------------------------------- 1 | export type SparkJobs = SparkJob[]; 2 | 3 | export interface SparkJob { 4 | jobId: number; 5 | name: string; 6 | description: string; 7 | submissionTime: string; 8 | completionTime: string; 9 | stageIds: number[]; 10 | status: string; 11 | numTasks: number; 12 | numActiveTasks: number; 13 | numCompletedTasks: number; 14 | numSkippedTasks: number; 15 | numFailedTasks: number; 16 | numKilledTasks: number; 17 | numCompletedIndices: number; 18 | numActiveStages: number; 19 | numCompletedStages: number; 20 | numSkippedStages: number; 21 | numFailedStages: number; 22 | killedTasksSummary: KilledTasksSummary; 23 | jobGroup?: string; 24 | } 25 | 26 | export interface KilledTasksSummary {} 27 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/SparkSQLs.ts: -------------------------------------------------------------------------------- 1 | export type SparkSQLs = SparkSQL[]; 2 | 3 | export interface SparkSQL { 4 | id: string; 5 | status: string; 6 | description: string; 7 | planDescription: string; 8 | submissionTime: string; 9 | duration: number; 10 | runningJobIds: number[]; 11 | successJobIds: number[]; 12 | failedJobIds: number[]; 13 | nodes: SqlNode[]; 14 | edges: SqlEdge[]; 15 | } 16 | 17 | export interface SqlNode { 18 | nodeId: number; 19 | nodeName: string; 20 | metrics: SqlMetric[]; 21 | wholeStageCodegenId?: number; 22 | } 23 | 24 | export interface SqlMetric { 25 | name: string; 26 | value: string; 27 | } 28 | 29 | export interface SqlEdge { 30 | fromId: number; 31 | toId: number; 32 | } 33 | 34 | export enum SqlStatus { 35 | Running = "RUNNING", 36 | Completed = "COMPLETED", 37 | Failed = "FAILED", 38 | } 39 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/SqlMetrics.ts: -------------------------------------------------------------------------------- 1 | export type NodesMetrics = NodeMetrics[]; 2 | 3 | export interface NodeMetrics { 4 | id: number; 5 | name: string; 6 | metrics: Metric[]; 7 | } 8 | 9 | export interface Metric { 10 | name: string; 11 | value: string; 12 | } 13 | -------------------------------------------------------------------------------- /spark-ui/src/interfaces/StagesRdd.ts: -------------------------------------------------------------------------------- 1 | export type StagesRdd = Record>; 2 | 
-------------------------------------------------------------------------------- /spark-ui/src/react-app-env.d.ts: -------------------------------------------------------------------------------- 1 | /// <reference types="react-scripts" /> 2 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/Alerts/BroadcastTooLargeAlert.ts: -------------------------------------------------------------------------------- 1 | import { Alerts, SparkSQLStore } from "../../interfaces/AppStore"; 2 | import { humanFileSize, parseBytesString } from "../../utils/FormatUtils"; 3 | 4 | const BROADCAST_SIZE_THRESHOLD = 1 * 1024 * 1024 * 1024; 5 | 6 | export function reduceBroadcastTooLargeAlert(sql: SparkSQLStore, alerts: Alerts) { 7 | sql.sqls.forEach((sql) => { 8 | sql.nodes.forEach((node) => { 9 | if (node.nodeName === "BroadcastExchange" || (node.nodeName === "Exchange" && node.parsedPlan?.type === "Exchange" && node.parsedPlan?.plan.isBroadcast)) { 10 | const broadcastSizeMetric = parseBytesString( 11 | node.metrics.find((metric) => metric.name === "data size")?.value ?? "0", 12 | ); 13 | 14 | if (broadcastSizeMetric > BROADCAST_SIZE_THRESHOLD) { 15 | const broadcastSizeString = humanFileSize(broadcastSizeMetric); 16 | alerts.push({ 17 | id: `largeBroadcast_${sql.id}_${node.nodeId}_${broadcastSizeString}`, 18 | name: "largeBroadcast", 19 | title: "Large data Broadcast", 20 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`, 21 | message: `The data broadcast size is ${broadcastSizeString}, which exceeds the 1GB threshold and can cause performance issues`, 22 | suggestion: ` 23 | 1. spark.sql.autoBroadcastJoinThreshold config might be set to a large number which is not optimal 24 | 2. The broadcast hint is applied on a large dataframe which is not optimal`, 25 | type: "warning", 26 | source: { 27 | type: "sql", 28 | sqlId: sql.id, 29 | sqlNodeId: node.nodeId, 30 | }, 31 | }); 32 | } 33 | } 34 | }); 35 | }); 36 | } -------------------------------------------------------------------------------- /spark-ui/src/reducers/Alerts/LargeCrossJoinScanAlert.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Alerts, 3 | SparkSQLStore, 4 | } from "../../interfaces/AppStore"; 5 | 6 | // 10 billion scanned rows threshold 7 | const CROSS_JOIN_SCANNED_ROWS_THRESHOLD = 10_000_000_000; 8 | 9 | export function reduceLargeCrossJoinScanAlert( 10 | sql: SparkSQLStore, 11 | alerts: Alerts, 12 | ) { 13 | sql.sqls.forEach((sql) => { 14 | sql.nodes.forEach((node) => { 15 | // Check if this is a cross join node (BroadcastNestedLoopJoin or CartesianProduct) 16 | if (node.nodeName === "BroadcastNestedLoopJoin" || node.nodeName === "CartesianProduct") { 17 | // Find the Cross Join Scanned Rows metric 18 | const crossJoinScannedRowsMetric = node.metrics.find( 19 | (metric) => metric.name === "Cross Join Scanned Rows" 20 | ); 21 | 22 | if (crossJoinScannedRowsMetric !== undefined) { 23 | // Parse the value as a number 24 | const scannedRows = parseFloat(crossJoinScannedRowsMetric.value.replace(/,/g, "")); 25 | 26 | // Check if the scanned rows exceeds the threshold 27 | if (!isNaN(scannedRows) && scannedRows > CROSS_JOIN_SCANNED_ROWS_THRESHOLD) { 28 | // Format the number with commas for thousands separators 29 | const formattedScannedRows = scannedRows.toLocaleString(); 30 | 31 | alerts.push({ 32 | id: `largeCrossJoinScan_${sql.id}_${node.nodeId}`, 33 | name: "largeCrossJoinScan", 34 | title: "Large Cross Join Scan", 35 | location: `In: SQL query
"${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`, 36 | message: `Cross join is scanning ${formattedScannedRows} rows, which is too large and can cause performance issues or query failure`, 37 | suggestion: ` 38 | 1. Add specific join conditions to convert the cross join to a more efficient join type 39 | 2. Avoid using cross joins for large datasets 40 | `, 41 | type: "error", 42 | source: { 43 | type: "sql", 44 | sqlId: sql.id, 45 | sqlNodeId: node.nodeId, 46 | }, 47 | }); 48 | } 49 | } 50 | } 51 | }); 52 | }); 53 | } -------------------------------------------------------------------------------- /spark-ui/src/reducers/Alerts/LongFilterConditions.ts: -------------------------------------------------------------------------------- 1 | import { Alerts, SparkSQLStore } from "../../interfaces/AppStore"; 2 | 3 | const FILTER_CONDITION_TOO_LONG_CHARACTERS_THRESHOLF = 1000; 4 | 5 | export function reduceLongFilterConditions(sql: SparkSQLStore, alerts: Alerts) { 6 | sql.sqls.forEach((sql) => { 7 | sql.nodes.forEach((node) => { 8 | const filterCondition = node.nodeName === "Filter" && node.parsedPlan?.type === "Filter" ? node.parsedPlan.plan.condition : undefined; 9 | if (filterCondition !== undefined && filterCondition.length > FILTER_CONDITION_TOO_LONG_CHARACTERS_THRESHOLF) { 10 | const filterConditionLength = filterCondition.length; 11 | alerts.push({ 12 | id: `longFilterCondition${sql.id}_${node.nodeId}_${filterConditionLength}`, 13 | name: "longFilterCondition", 14 | title: "Long Filter Condition", 15 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`, 16 | message: `Condition length is ${filterConditionLength}, which is too long and can cause performance issues`, 17 | suggestion: ` 18 | 1. Try to convert your filter condition to a join statement, by creating a DF of your filter condition and inner joining it with your main DF 19 | 2. Consider rewriting your filter condition to be shorter 20 | `, 21 | type: "warning", 22 | source: { 23 | type: "sql", 24 | sqlId: sql.id, 25 | sqlNodeId: node.nodeId, 26 | }, 27 | }); 28 | } 29 | 30 | }); 31 | }); 32 | } -------------------------------------------------------------------------------- /spark-ui/src/reducers/Alerts/PartitionSkewAlert.ts: -------------------------------------------------------------------------------- 1 | import { duration } from "moment"; 2 | import { 3 | Alerts, 4 | SparkSQLStore, 5 | SparkStagesStore, 6 | } from "../../interfaces/AppStore"; 7 | import { humanizeTimeDiff } from "../../utils/FormatUtils"; 8 | 9 | export function reducePartitionSkewAlert( 10 | sql: SparkSQLStore, 11 | stages: SparkStagesStore, 12 | alerts: Alerts, 13 | ) { 14 | sql.sqls.forEach((sql) => { 15 | sql.nodes.forEach((node) => { 16 | const stageInfo = node.stage; 17 | if (stageInfo === undefined || stageInfo.type !== "onestage") { 18 | return; 19 | } 20 | const stageData = stages.find( 21 | (stage) => stage.stageId === stageInfo.stageId, 22 | ); 23 | 24 | if (stageData?.hasPartitionSkew === true) { 25 | const maxTaskDurationTxt = 26 | stageData.maxTaskDuration === undefined 27 | ? "" 28 | : humanizeTimeDiff(duration(stageData.maxTaskDuration)); 29 | const medianTaskDurationTxt = 30 | stageData.mediumTaskDuration === undefined 31 | ? "" 32 | : humanizeTimeDiff(duration(stageData.mediumTaskDuration)); 33 | const skewRatio = 34 | stageData.maxTaskDuration === 0 35 | ? 0 36 | : (stageData.maxTaskDuration ?? 0) / 37 | (stageData.mediumTaskDuration ?? 
1); 38 | 39 | alerts.push({ 40 | id: `partitionSkew_${sql.id}_${node.nodeId}`, 41 | name: "partitionSkew", 42 | title: "Partition Skew", 43 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`, 44 | message: `Partition skew ratio of ${skewRatio.toFixed( 45 | 1, 46 | )}X, median task duration is ${medianTaskDurationTxt} and max task duration is ${maxTaskDurationTxt}`, 47 | suggestion: ` 48 | 1. Fix the partition skew by repartitioning your data differently 49 | 2. Do not fix the partition skew, and instead decrease the number of executors/cores, so you have less resource waste 50 | `, 51 | type: "warning", 52 | source: { 53 | type: "sql", 54 | sqlId: sql.id, 55 | sqlNodeId: node.nodeId, 56 | }, 57 | }); 58 | } 59 | }); 60 | }); 61 | } 62 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/Alerts/SmallTasksAlert.ts: -------------------------------------------------------------------------------- 1 | import { duration } from "moment"; 2 | import { 3 | Alerts, 4 | SparkSQLStore, 5 | SparkStagesStore, 6 | } from "../../interfaces/AppStore"; 7 | import { humanizeTimeDiff } from "../../utils/FormatUtils"; 8 | 9 | const LARGE_TASKS_NUM_THRESHOLD = 5000; 10 | const MEDIAN_TASK_TIME_THRESHOLD_MS = 500; 11 | const TASKS_RECOMMENDED_DECREASE_RATIO = 10; 12 | 13 | export function reduceSmallTasksAlert( 14 | sql: SparkSQLStore, 15 | stages: SparkStagesStore, 16 | alerts: Alerts, 17 | ) { 18 | sql.sqls.forEach((sql) => { 19 | sql.nodes.forEach((node) => { 20 | const stageInfo = node.stage; 21 | if (stageInfo === undefined || stageInfo.type !== "onestage") { 22 | return; 23 | } 24 | const stageData = stages.find( 25 | (stage) => stage.stageId === stageInfo.stageId, 26 | ); 27 | 28 | if (stageData !== undefined && 29 | stageData.numTasks > LARGE_TASKS_NUM_THRESHOLD && 30 | stageData.mediumTaskDuration !== undefined && 31 | stageData.mediumTaskDuration < MEDIAN_TASK_TIME_THRESHOLD_MS) { 32 | const medianTaskDurationTxt = 33 | stageData.mediumTaskDuration === undefined 34 | ? "" 35 | : humanizeTimeDiff(duration(stageData.mediumTaskDuration)); 36 | const recommendedTaskNum = Math.ceil(stageData.numTasks / TASKS_RECOMMENDED_DECREASE_RATIO); 37 | 38 | alerts.push({ 39 | id: `SmallTasks_${sql.id}_${node.nodeId}`, 40 | name: "smallTasks", 41 | title: "Large Number Of Small Tasks", 42 | location: `In: SQL query "${sql.description}" (id: ${sql.id}) and node "${node.nodeName}"`, 43 | message: `${stageData.numTasks} tasks with median task duration of ${medianTaskDurationTxt}, which causes large scheduling overhead for Spark`, 44 | suggestion: ` 45 | 1. Repartition to fewer tasks, so you have less overhead, by running .repartition(${recommendedTaskNum}) 46 | 2. Instead of repartitioning, you can run .coalesce(${recommendedTaskNum}) to decrease the number of tasks without shuffling, at the expense of less parallelism 47 | 3.
If you need to hash-partition, call repartition like this: .repartition(${recommendedTaskNum}, "hash_key1", "hash_key2") 48 | `, 49 | shortSuggestion: `.repartition(${recommendedTaskNum}) before this transformation`, 50 | type: "warning", 51 | source: { 52 | type: "sql", 53 | sqlId: sql.id, 54 | sqlNodeId: node.nodeId, 55 | }, 56 | }); 57 | } 58 | }); 59 | }); 60 | } 61 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/Alerts/WastedCoresAlertsReducer.ts: -------------------------------------------------------------------------------- 1 | import { Alerts, ConfigStore, StatusStore } from "../../interfaces/AppStore"; 2 | 3 | const WASTED_CORES_RATIO_THRESHOLD = 50.0; 4 | 5 | export function reduceWastedCoresAlerts( 6 | statusStore: StatusStore, 7 | config: ConfigStore, 8 | alerts: Alerts, 9 | ) { 10 | if ( 11 | statusStore.executors !== undefined && 12 | statusStore.executors.idleCoresRate > WASTED_CORES_RATIO_THRESHOLD 13 | ) { 14 | const idleCores = statusStore.executors.idleCoresRate; 15 | 16 | let suggestionMessage = "decrease amount of cores or executors"; 17 | if (config.resourceControlType === "databricks") { 18 | suggestionMessage = 19 | "Reduce your cluster size or machine type via databricks cluster UI"; 20 | } else if (config.resourceControlType === "static") { 21 | suggestionMessage = `1. decrease amount of cores per executor by lowering spark.executor.cores 22 | 2. decrease amount of executors by lowering spark.executor.instances OR if using dynamic allocation by tuning .`; 23 | } else if (config.resourceControlType === "dynamic") { 24 | suggestionMessage = `1. decrease amount of cores per executor by lowering spark.executor.cores 25 | 2. tune your Dynamic Allocation config, specifically lower spark.dynamicAllocation.executorAllocationRatio or increase spark.dynamicAllocation.schedulerBacklogTimeout`; 26 | } 27 | 28 | alerts.push({ 29 | id: `idleCoresTooHigh${idleCores.toFixed(2)}`, 30 | name: "idleCoresTooHigh", 31 | title: "Idle Cores Too High", 32 | location: "In: Summary Page -> Idle Cores", 33 | message: `Idle Cores is ${idleCores.toFixed( 34 | 2, 35 | )}% which is too high and suggests your cluster is over-provisioned on cores or executors`, 36 | suggestion: suggestionMessage, 37 | type: "warning", 38 | source: { 39 | type: "status", 40 | metric: "idleCores", 41 | }, 42 | }); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/AlertsReducer.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Alerts, 3 | AlertsStore, 4 | ConfigStore, 5 | SparkExecutorsStore, 6 | SparkSQLStore, 7 | SparkStagesStore, 8 | StatusStore, 9 | } from "../interfaces/AppStore"; 10 | import { reduceBroadcastTooLargeAlert } from "./Alerts/BroadcastTooLargeAlert"; 11 | import { reduceIcebergReplaces } from "./Alerts/IcebergReplacesReducer"; 12 | import { reduceJoinToBroadcastAlert } from "./Alerts/JoinToBroadcastAlert"; 13 | import { reduceLargeCrossJoinScanAlert } from "./Alerts/LargeCrossJoinScanAlert"; 14 | import { reduceLongFilterConditions } from "./Alerts/LongFilterConditions"; 15 | import { reduceMaxPartitionToBigAlert } from "./Alerts/MaxPartitionToBigAlert"; 16 | import { reduceMemoryAlerts } from "./Alerts/MemoryAlertsReducer"; 17 | import { reduceSQLInputOutputAlerts } from "./Alerts/MemorySQLInputOutputAlerts"; 18 | import { reducePartitionSkewAlert } from "./Alerts/PartitionSkewAlert"; 19 | import {
reduceSmallTasksAlert } from "./Alerts/SmallTasksAlert"; 20 | import { reduceWastedCoresAlerts } from "./Alerts/WastedCoresAlertsReducer"; 21 | import { parseAlertDisabledConfig } from "../utils/ConfigParser"; 22 | 23 | export function reduceAlerts( 24 | sqlStore: SparkSQLStore, 25 | statusStore: StatusStore, 26 | stageStore: SparkStagesStore, 27 | config: ConfigStore, 28 | executors: SparkExecutorsStore, 29 | environmentInfo: any 30 | ): AlertsStore { 31 | const alerts: Alerts = []; 32 | reduceMemoryAlerts(statusStore, config, environmentInfo, executors, alerts); 33 | reduceWastedCoresAlerts(statusStore, config, alerts); 34 | reduceSQLInputOutputAlerts(sqlStore, alerts); 35 | reducePartitionSkewAlert(sqlStore, stageStore, alerts); 36 | reduceSmallTasksAlert(sqlStore, stageStore, alerts); 37 | reduceIcebergReplaces(sqlStore, alerts); 38 | reduceLongFilterConditions(sqlStore, alerts); 39 | reduceBroadcastTooLargeAlert(sqlStore, alerts); 40 | reduceJoinToBroadcastAlert(sqlStore, alerts); 41 | reduceLargeCrossJoinScanAlert(sqlStore, alerts); 42 | reduceMaxPartitionToBigAlert(sqlStore, stageStore, alerts); 43 | const disabledAlerts = parseAlertDisabledConfig(config.alertDisabled); 44 | const filteredAlerts = alerts.filter(alert => !disabledAlerts.has(alert.name)); 45 | return { 46 | alerts: filteredAlerts, 47 | }; 48 | } 49 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/ChatSlice.ts: -------------------------------------------------------------------------------- 1 | import { MessageModel } from "@chatscope/chat-ui-kit-react"; 2 | import { createSlice, PayloadAction } from "@reduxjs/toolkit"; 3 | 4 | export const initialState: { 5 | messages: MessageModel[]; 6 | isTyping: boolean; 7 | inputText: string; 8 | apiKey: string | undefined; 9 | } = { 10 | messages: [ 11 | { 12 | message: "Hello, ask me anything about your spark job!", 13 | sentTime: "just now", 14 | sender: "ChatGPT", 15 | direction: "outgoing", 16 | position: "normal", 17 | }, 18 | ], 19 | isTyping: false, 20 | apiKey: undefined, 21 | inputText: "", 22 | }; 23 | 24 | const chatSlice = createSlice({ 25 | name: "chat", 26 | initialState, 27 | reducers: { 28 | addMessage: ( 29 | state, 30 | action: PayloadAction<{ 31 | message: MessageModel; 32 | }>, 33 | ) => { 34 | state.messages.push(action.payload.message); 35 | }, 36 | setIsTyping: ( 37 | state, 38 | action: PayloadAction<{ 39 | isTyping: boolean; 40 | }>, 41 | ) => { 42 | state.isTyping = action.payload.isTyping; 43 | }, 44 | setApiKey: ( 45 | state, 46 | action: PayloadAction<{ 47 | apiKey: string; 48 | }>, 49 | ) => { 50 | state.apiKey = 51 | action.payload.apiKey === "" ? 
undefined : action.payload.apiKey; 52 | }, 53 | setInputText: ( 54 | state, 55 | action: PayloadAction<{ 56 | inputText: string; 57 | }>, 58 | ) => { 59 | state.inputText = action.payload.inputText; 60 | }, 61 | }, 62 | }); 63 | 64 | // Export the action creators and the reducer 65 | export const { addMessage, setIsTyping, setApiKey, setInputText } = 66 | chatSlice.actions; 67 | 68 | export default chatSlice.reducer; 69 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/GeneralSlice.ts: -------------------------------------------------------------------------------- 1 | import { createSlice, PayloadAction } from "@reduxjs/toolkit"; 2 | import { GraphFilter, SQLNodeExchangeStageData, SQLNodeStageData } from "../interfaces/AppStore"; 3 | 4 | export const initialState: { 5 | sqlMode: GraphFilter; 6 | selectedStage: SQLNodeStageData | SQLNodeExchangeStageData | undefined; 7 | } = { 8 | sqlMode: "advanced", 9 | selectedStage: undefined 10 | }; 11 | 12 | const generalSlice = createSlice({ 13 | name: "general", 14 | initialState, 15 | reducers: { 16 | setSQLMode: ( 17 | state, 18 | action: PayloadAction<{ 19 | newMode: GraphFilter; 20 | }>, 21 | ) => { 22 | state.sqlMode = action.payload.newMode; 23 | }, 24 | setSelectedStage: ( 25 | state, 26 | action: PayloadAction<{ 27 | selectedStage: SQLNodeStageData | SQLNodeExchangeStageData | undefined; 28 | }>, 29 | ) => { 30 | state.selectedStage = action.payload.selectedStage; 31 | }, 32 | }, 33 | }); 34 | 35 | // Export the action creators and the reducer 36 | export const { setSQLMode, setSelectedStage } = generalSlice.actions; 37 | 38 | export default generalSlice.reducer; 39 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/CoalesceParser.test.ts: -------------------------------------------------------------------------------- 1 | import { ParsedCoalescePlan } from '../../interfaces/AppStore'; 2 | import { parseCoalesce } from './CoalesceParser'; 3 | 4 | describe('CoalesceParser', () => { 5 | it('should parse the partition number from a Coalesce plan', () => { 6 | const input = 'Coalesce 10'; 7 | const expected: ParsedCoalescePlan = { partitionNum: 10 }; 8 | const result = parseCoalesce(input); 9 | expect(result).toEqual(expected); 10 | }); 11 | }); -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/CoalesceParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedCoalescePlan } from "../../interfaces/AppStore"; 2 | 3 | export function parseCoalesce(input: string): ParsedCoalescePlan { 4 | return { 5 | partitionNum: parseInt(input.split(" ")[1]), 6 | }; 7 | } -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/CollectLimitParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedCollectLimitPlan } from "../../interfaces/AppStore"; 2 | 3 | export function parseCollectLimit(input: string): ParsedCollectLimitPlan { 4 | return { 5 | limit: parseInt(input.split(" ")[1]), 6 | }; 7 | } 8 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/ExchangeParser.spec.ts: -------------------------------------------------------------------------------- 1 | import { parseExchange } from "./ExchangeParser"; 2 | 3 | describe("parseExchange", () => { 4 | 
test("parses hash partitioning correctly", () => { 5 | const input = 6 | "Exchange hashpartitioning(ss_quantity#9, 200), REPARTITION_BY_COL, [plan_id=40]"; 7 | expect(parseExchange(input)).toEqual({ 8 | type: "hashpartitioning", 9 | fields: ["ss_quantity"], 10 | isBroadcast: false, 11 | }); 12 | }); 13 | 14 | test("parses single partition correctly", () => { 15 | const input = 16 | "Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=14514]"; 17 | expect(parseExchange(input)).toEqual({ 18 | type: "SinglePartition", 19 | fields: [], 20 | isBroadcast: false, 21 | }); 22 | }); 23 | 24 | test("parses range partitioning correctly", () => { 25 | const input = 26 | "Exchange rangepartitioning(ca_county#787 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=83408]"; 27 | expect(parseExchange(input)).toEqual({ 28 | type: "rangepartitioning", 29 | fields: ["ca_county ASC NULLS FIRST"], 30 | isBroadcast: false, 31 | }); 32 | }); 33 | 34 | test("parses broadcast correctly", () => { 35 | const input = 36 | "Exchange SinglePartition, EXECUTOR_BROADCAST, [plan_id=270]"; 37 | expect(parseExchange(input)).toEqual({ 38 | type: "SinglePartition", 39 | fields: [], 40 | isBroadcast: true, 41 | }); 42 | }); 43 | 44 | // Add more test cases as necessary 45 | }); 46 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/ExchangeParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedExchangePlan } from "../../interfaces/AppStore"; 2 | import { bracedSplit, hashNumbersRemover } from "./PlanParserUtils"; 3 | 4 | export function parseExchange(input: string): ParsedExchangePlan { 5 | const typeRegex = /Exchange (\w+)/; 6 | 7 | const typeMatch = input.match(typeRegex); 8 | 9 | const parenthesisContent = input.match(/\(([^)]+)\)/)?.[1] ?? ""; 10 | const allFields = bracedSplit(parenthesisContent).map((field) => 11 | hashNumbersRemover(field.trim()), 12 | ); 13 | // Remove the last element if it is a number (partition number) 14 | if (allFields.length > 0 && !isNaN(Number(allFields[allFields.length - 1]))) { 15 | allFields.pop(); 16 | } 17 | 18 | const type = typeMatch ? 
typeMatch[1] : ""; 19 | 20 | const isBroadcast = input.includes("EXECUTOR_BROADCAST"); 21 | 22 | return { type, fields: allFields, isBroadcast: isBroadcast }; 23 | } 24 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/FilterParser.ts: -------------------------------------------------------------------------------- 1 | import { ParseFilterPlan } from "../../interfaces/AppStore"; 2 | import { 3 | hashNumbersRemover, 4 | removeFromEnd, 5 | removeFromStart, 6 | } from "./PlanParserUtils"; 7 | 8 | export function parseFilter(input: string): ParseFilterPlan { 9 | let filterStr = input; 10 | filterStr = removeFromStart(filterStr, "Filter "); 11 | filterStr = removeFromStart(filterStr, "PhotonFilter "); 12 | filterStr = removeFromStart(filterStr, "GpuFilter "); 13 | filterStr = removeFromStart(filterStr, "CometFilter "); 14 | 15 | if (filterStr.startsWith("(")) { 16 | filterStr = removeFromStart(filterStr, "("); 17 | filterStr = removeFromEnd(filterStr, ")"); 18 | } 19 | const condition = hashNumbersRemover(filterStr); 20 | return { condition: condition }; 21 | } 22 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/JoinParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedJoinPlan } from "../../interfaces/AppStore"; 2 | import { 3 | bracedSplit, 4 | hashNumbersRemover, 5 | removeFromEnd, 6 | removeFromStart, 7 | } from "./PlanParserUtils"; 8 | 9 | export function parseJoin(input: string): ParsedJoinPlan { 10 | if (input.startsWith("BroadcastNestedLoopJoin")) { 11 | const regex = /BroadcastNestedLoopJoin(?:\s+\w+)?,\s+(\w+)(?:,\s*\((.*)\))?/; 12 | const match = hashNumbersRemover(input).match(regex); 13 | if (!match) { 14 | throw new Error("Invalid input format"); 15 | } 16 | const [, , conditionStr] = match; 17 | let joinCondition = conditionStr ? 
conditionStr.trim() : undefined; 18 | return { joinType: "BroadcastNestedLoopJoin", joinSideType: "Cross", joinCondition }; 19 | } 20 | 21 | const regex = /^(\w+)\s+\[(.*?)\], \[(.*?)\], (\w+)(?:,\s+(.*))?$/; 22 | const match = hashNumbersRemover(input).match(regex); 23 | 24 | if (!match) { 25 | throw new Error("Invalid input format"); 26 | } 27 | 28 | const [, joinType, leftKeysStr, rightKeysStr, joinSideType, conditionStr] = 29 | match; 30 | let leftKeys: string[] | undefined; 31 | let rightKeys: string[] | undefined; 32 | let joinCondition: string | undefined; 33 | 34 | if (leftKeysStr) { 35 | leftKeys = bracedSplit(leftKeysStr); 36 | } 37 | 38 | if (rightKeysStr) { 39 | rightKeys = bracedSplit(rightKeysStr); 40 | } 41 | 42 | if (conditionStr) { 43 | joinCondition = conditionStr; 44 | joinCondition = removeFromEnd(joinCondition, ", false"); 45 | joinCondition = removeFromStart(joinCondition, "BuildRight, "); 46 | joinCondition = removeFromStart(joinCondition, "BuildLeft, "); 47 | } 48 | 49 | if (joinCondition === "BuildRight") { 50 | joinCondition = undefined; 51 | } 52 | if (joinCondition === "BuildLeft") { 53 | joinCondition = undefined; 54 | } 55 | 56 | return { 57 | joinType, 58 | leftKeys, 59 | rightKeys, 60 | joinCondition, 61 | joinSideType, 62 | }; 63 | } 64 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/PlanParserUtils.ts: -------------------------------------------------------------------------------- 1 | export function onlyUnique(value: string, index: number, array: string[]) { 2 | return array.indexOf(value) === index; 3 | } 4 | 5 | export function hashNumbersRemover(input: string): string { 6 | return input.replace(/#\d+L/g, "").replace(/#\d+/g, ""); 7 | } 8 | 9 | export function truncateString(str: string, num: number): string { 10 | if (str.length <= num) { 11 | return str; 12 | } 13 | return str.slice(0, num) + "..."; 14 | } 15 | 16 | export function truncateMiddle(str: string, maxLength: number): string { 17 | if (str.length <= maxLength) { 18 | return str; 19 | } 20 | 21 | const prefixLength = Math.ceil(maxLength / 2) - 1; // Subtract 1 for the '...' 
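// Illustrative example added for clarity (not in the original file): with
// str = "abcdefghijklmnop" and maxLength = 10, prefixLength is 4 and
// suffixLength is 5, so truncateMiddle returns "abcd...lmnop".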
22 | const suffixLength = Math.floor(maxLength / 2); 23 | 24 | const prefix = str.substring(0, prefixLength); 25 | const suffix = str.substring(str.length - suffixLength); 26 | 27 | return `${prefix}...${suffix}`; 28 | } 29 | 30 | export function removeFromStart(str: string, strToRemove: string): string { 31 | if (str.startsWith(strToRemove)) { 32 | return str.slice(strToRemove.length); 33 | } 34 | return str; 35 | } 36 | 37 | export function removeFromEnd(str: string, strToRemove: string) { 38 | if (str.endsWith(strToRemove)) { 39 | return str.slice(0, -strToRemove.length); 40 | } 41 | return str; 42 | } 43 | 44 | export function bracedSplit(input: string): string[] { 45 | const result: string[] = []; 46 | let buffer = ""; 47 | let bracketCount = 0; 48 | let inQuotes = false; 49 | 50 | for (let i = 0; i < input.length; i++) { 51 | const char = input[i]; 52 | 53 | if (char === "(") bracketCount++; 54 | if (char === ")") bracketCount--; 55 | if (char === '"') inQuotes = !inQuotes; 56 | 57 | if (char === "," && bracketCount === 0 && !inQuotes) { 58 | result.push(buffer.trim()); 59 | buffer = ""; 60 | } else { 61 | buffer += char; 62 | } 63 | } 64 | if (buffer) result.push(buffer.trim()); 65 | return result; 66 | } 67 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/ProjectParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedProjectPlan } from "../../interfaces/AppStore"; 2 | import { bracedSplit, hashNumbersRemover } from "./PlanParserUtils"; 3 | 4 | export function parseProject(input: string): ParsedProjectPlan { 5 | const fieldsStr = hashNumbersRemover( 6 | input.replace("Project [", "").replace("PhotonProject [", "").replace("GpuProject [", "").slice(0, -1), 7 | ); 8 | const fields = bracedSplit(fieldsStr).map((field) => field.trim()); 9 | return { fields: fields }; 10 | } 11 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/ScanFileParser.ts: -------------------------------------------------------------------------------- 1 | import { ParseFileScanPlan } from "../../interfaces/AppStore"; 2 | import { hashNumbersRemover } from "./PlanParserUtils"; 3 | 4 | export function parseFileScan( 5 | input: string, 6 | nodeName: string, 7 | ): ParseFileScanPlan { 8 | input = hashNumbersRemover(input); 9 | const result: ParseFileScanPlan = {}; 10 | const matches = { 11 | format: /Format: (\w+),/.exec(input), 12 | Location: /Location: InMemoryFileIndex\([\w\s]+\)\[(.*?)\]/.exec(input), 13 | PartitionFilters: /PartitionFilters: \[(.*?)\]/.exec(input), 14 | PushedFilters: /PushedFilters: \[(.*?)\]/.exec(input), 15 | ReadSchema: /ReadSchema: struct<([\w\W]+)>/.exec(input), 16 | }; 17 | 18 | if (matches.format) result.format = matches.format[1]; 19 | if (matches.Location && matches.Location[1].includes("...")) { 20 | const paths = matches.Location[1].split(","); 21 | result.Location = paths.length ? 
paths[0] : undefined; 22 | } else if (matches.Location) { 23 | result.Location = matches.Location[1]; 24 | } 25 | 26 | if (matches.PartitionFilters) { 27 | if (matches.PartitionFilters[1].includes("...")) { 28 | result.PartitionFilters = undefined; 29 | } else { 30 | result.PartitionFilters = matches.PartitionFilters[1] 31 | .split(",") 32 | .map((filter) => filter.trim()) 33 | .filter(Boolean); 34 | } 35 | } 36 | 37 | if (matches.PushedFilters) { 38 | if (matches.PushedFilters[1].includes("...")) { 39 | result.PushedFilters = undefined; 40 | } else { 41 | result.PushedFilters = matches.PushedFilters[1] 42 | .split(",") 43 | .map((filter) => filter.trim()) 44 | .filter(Boolean); 45 | } 46 | } 47 | 48 | if (matches.ReadSchema) { 49 | if (matches.ReadSchema[1].includes("...")) { 50 | result.ReadSchema = undefined; 51 | } else { 52 | const fields = matches.ReadSchema[1].split(/,(?![^()]*\))/); 53 | const schema: { [key: string]: string } = {}; 54 | fields.forEach((field) => { 55 | const [name, type] = field.split(":"); 56 | if (name !== undefined && type !== undefined) { 57 | schema[name.trim()] = type.trim(); 58 | } 59 | }); 60 | result.ReadSchema = schema; 61 | } 62 | } 63 | if (nodeName.split(" ").length === 3) { 64 | result.tableName = nodeName.split(" ")[2]; 65 | } 66 | 67 | return result; 68 | } 69 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/SortParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedSortPlan } from "../../interfaces/AppStore"; 2 | import { hashNumbersRemover } from "./PlanParserUtils"; 3 | 4 | export function parseSort(input: string): ParsedSortPlan { 5 | const match = hashNumbersRemover(input).match(/\[(.*?)\]/); 6 | if (!match) { 7 | return { fields: [] }; 8 | } 9 | const fields = match[1].split(",").map((field) => field.trim()); 10 | return { fields: fields }; 11 | } 12 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/SortParset.spec.ts: -------------------------------------------------------------------------------- 1 | import { parseSort } from "./SortParser"; 2 | 3 | describe("parseSort", () => { 4 | it("should correctly parse the example input", () => { 5 | const input = 6 | "Sort [supplier_count#2941L DESC NULLS LAST, p_brand#267 ASC NULLS FIRST, p_type#268 ASC NULLS FIRST, p_size#269L ASC NULLS FIRST], true, 0"; 7 | const expected = { 8 | fields: [ 9 | "supplier_count DESC NULLS LAST", 10 | "p_brand ASC NULLS FIRST", 11 | "p_type ASC NULLS FIRST", 12 | "p_size ASC NULLS FIRST", 13 | ], 14 | }; 15 | expect(parseSort(input)).toEqual(expected); 16 | }); 17 | }); 18 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/TakeOrderedAndProjectParser.spec.ts: -------------------------------------------------------------------------------- 1 | import { ParsedTakeOrderedAndProjectPlan } from "../../interfaces/AppStore"; 2 | import { parseTakeOrderedAndProject } from "./TakeOrderedAndProjectParser"; 3 | 4 | // Parametrized Unit Tests using Jest 5 | describe("parseTakeOrderedAndProject", () => { 6 | const testCases: { 7 | input: string; 8 | expected: ParsedTakeOrderedAndProjectPlan; 9 | }[] = [ 10 | { 11 | input: 12 | "TakeOrderedAndProject(limit=100, orderBy=[s_store_name#1001 ASC NULLS FIRST], output=[s_store_name#1001,sum(ss_net_profit)#26850])", 13 | expected: { 14 | output: ["s_store_name", "sum(ss_net_profit)"], 
15 | orderBy: ["s_store_name ASC NULLS FIRST"], 16 | limit: 100, 17 | }, 18 | }, 19 | // ... add other test cases here 20 | ]; 21 | 22 | testCases.forEach(({ input, expected }) => { 23 | it(`should parse "${input}" correctly`, () => { 24 | expect(parseTakeOrderedAndProject(input)).toEqual(expected); 25 | }); 26 | }); 27 | }); 28 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/TakeOrderedAndProjectParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedTakeOrderedAndProjectPlan } from "../../interfaces/AppStore"; 2 | import { hashNumbersRemover } from "./PlanParserUtils"; 3 | 4 | export function parseTakeOrderedAndProject( 5 | input: string, 6 | ): ParsedTakeOrderedAndProjectPlan { 7 | const cleanInput = hashNumbersRemover(input); 8 | const outputMatch = cleanInput.match(/output=\[([^\]]+)\]/); 9 | const orderByMatch = cleanInput.match(/orderBy=\[([^\]]+)\]/); 10 | const limitMatch = cleanInput.match(/limit=(\d+)/); 11 | 12 | return { 13 | output: outputMatch ? outputMatch[1].split(",") : [], 14 | orderBy: orderByMatch ? orderByMatch[1].split(",") : [], 15 | limit: limitMatch ? parseInt(limitMatch[1], 10) : 0, 16 | }; 17 | } 18 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/WindowParser.spec.ts: -------------------------------------------------------------------------------- 1 | import { ParsedWindowPlan } from "../../interfaces/AppStore"; 2 | import { parseWindow } from "./WindowParser"; 3 | 4 | describe("parseWindow", () => { 5 | it("should parse simple Window", () => { 6 | const input = 7 | "Window [approx_count_distinct(user_id#0, 0.05, 0, 0) windowspecdefinition(category#2, day#3, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS distinct_users#113L], [category#2, day#3]"; 8 | const expected: ParsedWindowPlan = { 9 | selectFields: ["approx_count_distinct(user_id, 0.05, 0, 0) windowspecdefinition(category, day, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS distinct_users"], 10 | partitionFields: ["category", "day"], 11 | sortFields: [], 12 | }; 13 | expect(parseWindow(input)).toEqual(expected); 14 | }); 15 | 16 | it("should parse window with sort field", () => { 17 | const input = 18 | "Window [row_number() windowspecdefinition(category#2, day#3 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#63004], [category#2], [day#3 ASC NULLS FIRST]"; 19 | const expected: ParsedWindowPlan = { 20 | selectFields: ["row_number() windowspecdefinition(category, day ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number"], 21 | partitionFields: ["category"], 22 | sortFields: ["day ASC NULLS FIRST"], 23 | }; 24 | expect(parseWindow(input)).toEqual(expected); 25 | }); 26 | 27 | it("should parse window with sort field", () => { 28 | const input = "Window [row_number() windowspecdefinition(category#2, day#3 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number#63004], [category#2], [day#3 ASC NULLS FIRST]" 29 | const expected: ParsedWindowPlan = { 30 | selectFields: ["row_number() windowspecdefinition(category, day ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number"], 31 | partitionFields: ["category"], 32 | sortFields: ["day ASC NULLS FIRST"], 33 | }; 34 | 
expect(parseWindow(input)).toEqual(expected); 35 | }); 36 | 37 | 38 | 39 | }) -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/WindowParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedWindowPlan } from "../../interfaces/AppStore"; 2 | import { 3 | bracedSplit, 4 | hashNumbersRemover 5 | } from "./PlanParserUtils"; 6 | 7 | export function parseWindow(input: string): ParsedWindowPlan { 8 | // Improved regex to correctly capture each part of the window specification 9 | const regex = /Window \[(.*?)\](?:,\s*\[(.*?)\])?(?:,\s*\[(.*?)\])?/; 10 | 11 | // Remove any unwanted hash numbers 12 | const sanitizedInput = hashNumbersRemover(input); 13 | 14 | // Match the input string with the regex 15 | const match = sanitizedInput.match(regex); 16 | 17 | if (!match) { 18 | return { partitionFields: [], selectFields: [], sortFields: [] }; 19 | } 20 | 21 | // Extract the matched groups (select, partition, sort) 22 | const selectFields = bracedSplit(match[1]); 23 | 24 | // Handle case when there are no partition or sort fields 25 | const partitionFields = match[2] ? bracedSplit(match[2]) : []; 26 | const sortFields = match[3] ? bracedSplit(match[3]) : []; 27 | 28 | return { partitionFields, selectFields, sortFields }; 29 | } 30 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/WriteToHDFSParser.spec.ts: -------------------------------------------------------------------------------- 1 | import { parseWriteToHDFS } from "./WriteToHDFSParser"; // Ensure to export functions from parser.ts 2 | 3 | const testData = [ 4 | { 5 | input: 6 | "Execute InsertIntoHadoopFsRelationCommand file:/Users/menishmueli/Documents/GitHub/tpch-spark/dbgen/output/Q01, false, CSV, [header=true, path=file:///Users/menishmueli/Documents/GitHub/tpch-spark/dbgen/output/Q01], Overwrite, [l_returnflag, l_linestatus, sum(l_quantity), sum(l_extendedprice), sum(UDF(l_extendedprice, l_discount)), sum(UDF(UDF(l_extendedprice, l_discount), l_tax)), avg(l_quantity), avg(l_extendedprice), avg(l_discount), count(l_quantity)]", 7 | expected: { 8 | location: 9 | "file:/Users/menishmueli/Documents/GitHub/tpch-spark/dbgen/output/Q01", 10 | format: "CSV", 11 | mode: "Overwrite", 12 | }, 13 | }, 14 | { 15 | input: 16 | 'Execute InsertIntoHadoopFsRelationCommand file:/tmp/output/partitiondata, false, [speaker#76], Parquet, [__partition_columns=["speaker"], path=/tmp/output/partitiondata], Append, [line_id, play_name, speech_number, line_number, speaker, text_entry]', 17 | expected: { 18 | location: "file:/tmp/output/partitiondata", 19 | format: "Parquet", 20 | partitionKeys: ["speaker"], 21 | mode: "Append", 22 | }, 23 | }, 24 | { 25 | input: 26 | "Execute InsertIntoHadoopFsRelationCommand file:/tmp/data, false, [speaker#94], Parquet, [path=file:/tmp/data], Append, `spark_catalog`.`local_catalog`.`my_table`, org.apache.spark.sql.execution.datasources.CatalogFileIndex(file:/tmp/data), [line_id, play_name, speech_number, line_number, text_entry, speaker]", 27 | expected: { 28 | location: "file:/tmp/data", 29 | format: "Parquet", 30 | partitionKeys: ["speaker"], 31 | mode: "Append", 32 | tableName: "`spark_catalog`.`local_catalog`.`my_table`", 33 | }, 34 | }, 35 | { 36 | input: 37 | "Execute InsertIntoHadoopFsRelationCommand file:/tmp/data2, false, Parquet, [path=file:/tmp/data2], Append, `spark_catalog`.`local_catalog`.`my_table`, 
org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:/tmp/data2), [line_id, play_name, speech_number, line_number, text_entry, speaker]", 38 | expected: { 39 | location: "file:/tmp/data2", 40 | format: "Parquet", 41 | mode: "Append", 42 | tableName: "`spark_catalog`.`local_catalog`.`my_table`", 43 | }, 44 | }, 45 | ]; 46 | 47 | describe("parseWriteToHDFS", () => { 48 | testData.forEach((data, idx) => { 49 | it(`parses string ${idx + 1} correctly`, () => { 50 | const result = parseWriteToHDFS(data.input); 51 | expect(result).toEqual(data.expected); 52 | }); 53 | }); 54 | }); 55 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/WriteToHDFSParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedWriteToHDFSPlan } from "../../interfaces/AppStore"; 2 | import { hashNumbersRemover } from "./PlanParserUtils"; 3 | 4 | export function specialSplit(input: string): string[] { 5 | const result: string[] = []; 6 | let buffer = ""; 7 | let bracketCount = 0; 8 | let inQuotes = false; 9 | 10 | for (let i = 0; i < input.length; i++) { 11 | const char = input[i]; 12 | 13 | if (char === "[") bracketCount++; 14 | if (char === "]") bracketCount--; 15 | if (char === '"') inQuotes = !inQuotes; 16 | 17 | if (char === "," && bracketCount === 0 && !inQuotes) { 18 | result.push(buffer.trim()); 19 | buffer = ""; 20 | } else { 21 | buffer += char; 22 | } 23 | } 24 | 25 | if (buffer) result.push(buffer.trim()); 26 | return result; 27 | } 28 | 29 | export function parseWriteToHDFS(input: string): ParsedWriteToHDFSPlan { 30 | input = input.replace("Execute InsertIntoHadoopFsRelationCommand", "").trim(); 31 | const parts = specialSplit(input); 32 | 33 | let parsed: ParsedWriteToHDFSPlan = { 34 | location: parts[0], 35 | format: "unknown", 36 | mode: "unknown", 37 | tableName: undefined, 38 | partitionKeys: undefined, 39 | }; 40 | 41 | if (parts[2].includes("[")) { 42 | (parsed.partitionKeys = hashNumbersRemover(parts[2].slice(1, -1)).split( 43 | ",", 44 | )), 45 | (parsed.format = parts[3]); 46 | parsed.mode = parts[5]; 47 | } else { 48 | parsed.format = parts[2]; 49 | parsed.mode = parts[4]; 50 | } 51 | 52 | if (parts[4].includes("`")) { 53 | parsed.tableName = parts[4]; 54 | } else if (parts[5].includes("`")) { 55 | parsed.tableName = parts[5]; 56 | } else if (parts.length > 6 && parts[6].includes("`")) { 57 | parsed.tableName = parts[6]; 58 | } 59 | 60 | return parsed; 61 | } 62 | -------------------------------------------------------------------------------- /spark-ui/src/reducers/PlanParsers/hashAggregateParser.ts: -------------------------------------------------------------------------------- 1 | import { ParsedHashAggregatePlan } from "../../interfaces/AppStore"; 2 | import { bracedSplit, hashNumbersRemover, onlyUnique } from "./PlanParserUtils"; 3 | 4 | export function parseHashAggregate(input: string): ParsedHashAggregatePlan { 5 | const cleanInput = hashNumbersRemover(input); 6 | const keysMatch = cleanInput.match(/keys=\[([^\]]+)\]/); 7 | const functionsMatch = cleanInput.match(/functions=\[([^\]]+)\]/); 8 | 9 | let keys: string[] = []; 10 | let functions: string[] = []; 11 | let operations: string[] = []; 12 | 13 | if (keysMatch && keysMatch[1]) { 14 | keys = bracedSplit(keysMatch[1]).map((key) => key.trim()); 15 | } 16 | 17 | if (functionsMatch && functionsMatch[1]) { 18 | functions = bracedSplit(functionsMatch[1]).map((func) => func.trim()); 19 | 20 | // Extracting only the 
outermost operation 21 | operations = functions 22 | .map((func) => { 23 | if (func.includes("count(distinct")) { 24 | return "count_distinct"; 25 | } 26 | const match = func.match(/^\w+/); 27 | return match ? match[0] : ""; 28 | }) 29 | .filter(Boolean) 30 | .filter(onlyUnique); 31 | } 32 | 33 | return { 34 | keys, 35 | functions, 36 | operations, 37 | }; 38 | } 39 | -------------------------------------------------------------------------------- /spark-ui/src/services/MixpanelService.tsx: -------------------------------------------------------------------------------- 1 | import mixpanel from "mixpanel-browser"; 2 | import { MixpanelEvents } from "../interfaces/Mixpanel"; 3 | 4 | const KEEP_ALIVE_INTERVAL_MS = 60 * 1000; 5 | 6 | const baseProperties = { 7 | dataflintVersion: process.env.REACT_APP_VERSION ?? "unknown-version", 8 | }; 9 | 10 | export class MixpanelService { 11 | static mixpanelTelemetryConfigDisabled = false; 12 | 13 | static setMixpanelTelemetryConfigDisabled(): void { 14 | MixpanelService.mixpanelTelemetryConfigDisabled = true; 15 | } 16 | 17 | static InitMixpanel(): void { 18 | if (!this.ShouldTrack()) return; 19 | 20 | const MIX_PANEL_TOKEN = "114c37f7dc10c79978b850277136c232"; 21 | 22 | // For debugging add debug: true to the props 23 | mixpanel.init(MIX_PANEL_TOKEN, { 24 | // using a cloudfront to skip ad blockers, see: 25 | // https://blog.pranavp.com.np/prevent-ad-blockers-from-blocking-mixpanel-without-nginx 26 | api_host: "https://drblx6b8i77l.cloudfront.net", 27 | track_pageview: true, 28 | persistence: "localStorage", 29 | }); 30 | this.StartKeepAlive(KEEP_ALIVE_INTERVAL_MS); 31 | } 32 | 33 | /** 34 | * Sends keep alive every interval if the tab is focused, in order to keep the mixpanel sessions "alive" 35 | * @param interval keep alive interval in ms 36 | */ 37 | static StartKeepAlive(interval: number): void { 38 | if (!this.ShouldTrack()) return; 39 | 40 | setInterval(() => { 41 | if (document.hidden) { 42 | // skip keep alive when tab is not in focus 43 | return; 44 | } 45 | 46 | this.Track(MixpanelEvents.KeepAlive, baseProperties); 47 | }, interval); 48 | } 49 | 50 | static Track( 51 | event: MixpanelEvents, 52 | properties?: { [key: string]: any }, 53 | ): void { 54 | if (!this.ShouldTrack()) return; 55 | 56 | mixpanel.track(event, { ...baseProperties, ...properties }); 57 | } 58 | 59 | static TrackPageView(properties?: { [key: string]: any }): void { 60 | if (!this.ShouldTrack()) return; 61 | 62 | mixpanel.track_pageview({ ...baseProperties, ...properties }); 63 | } 64 | 65 | static ShouldTrack(): boolean { 66 | return ( 67 | process.env.NODE_ENV !== "development" && 68 | localStorage.getItem("SKIP_MIXPANEL") !== "true" && 69 | !MixpanelService.mixpanelTelemetryConfigDisabled 70 | ); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /spark-ui/src/services/TabsService.tsx: -------------------------------------------------------------------------------- 1 | import AdjustIcon from "@mui/icons-material/Adjust"; 2 | import AssessmentIcon from "@mui/icons-material/Assessment"; 3 | import PrecisionManufacturingIcon from "@mui/icons-material/PrecisionManufacturing"; 4 | import ReportIcon from "@mui/icons-material/Report"; 5 | import SettingsApplicationsIcon from "@mui/icons-material/SettingsApplications"; 6 | import React from "react"; 7 | import { isHistoryServer } from "../utils/UrlUtils"; 8 | 9 | export enum Tab { 10 | Status = "Status", 11 | Summary = "Summary", 12 | Resources = "Resources", 13 | Configuration = 
"Configuration", 14 | Alerts = "Alerts", 15 | } 16 | 17 | export const TabToUrl = { 18 | [Tab.Status]: "/status", 19 | [Tab.Summary]: "/summary", 20 | [Tab.Configuration]: "/config", 21 | [Tab.Alerts]: "/alerts", 22 | [Tab.Resources]: "/resources", 23 | }; 24 | 25 | export const getTabByUrl = (path: string) => { 26 | switch (path) { 27 | case TabToUrl[Tab.Status]: 28 | return Tab.Status; 29 | case TabToUrl[Tab.Summary]: 30 | return Tab.Summary; 31 | case TabToUrl[Tab.Configuration]: 32 | return Tab.Configuration; 33 | case TabToUrl[Tab.Alerts]: 34 | return Tab.Alerts; 35 | case TabToUrl[Tab.Resources]: 36 | return Tab.Resources; 37 | default: 38 | return isHistoryServer() ? Tab.Summary : Tab.Status; 39 | } 40 | }; 41 | 42 | export function renderTabIcon(selectedTab: Tab): JSX.Element { 43 | switch (selectedTab) { 44 | case Tab.Status: 45 | return ; 46 | case Tab.Configuration: 47 | return ; 48 | case Tab.Summary: 49 | return ; 50 | case Tab.Alerts: 51 | return ; 52 | case Tab.Resources: 53 | return ; 54 | default: 55 | return
; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /spark-ui/src/tabs/AlertsTab.tsx: -------------------------------------------------------------------------------- 1 | import { Alert, AlertTitle } from "@mui/material"; 2 | import { Stack } from "@mui/system"; 3 | import React, { FC } from "react"; 4 | import { useAppSelector } from "../Hooks"; 5 | 6 | export const AlertsTab: FC<{}> = (): JSX.Element => { 7 | const alerts = useAppSelector((state) => state.spark.alerts); 8 | const errorsCount = alerts?.alerts.filter((alert) => alert.type === "error") 9 | .length; 10 | const warningsCount = alerts?.alerts.filter( 11 | (alert) => alert.type === "warning", 12 | ).length; 13 | 14 | return ( 15 | <> 16 | {alerts?.alerts.length === 0 ? ( 17 |
25 | No alerts 😎 26 |
27 | ) : ( 28 |
36 |
37 | {`Errors - ${errorsCount}`} 41 | {`Alerts - ${warningsCount}`} 45 |
46 |
54 | 58 | {alerts?.alerts.map((alert) => { 59 | return ( 60 | 61 | {alert.title} 62 | {alert.message} 63 | {"\n"} 64 | {alert.location} 65 | {"\n"} 66 | {`Suggestions: ${alert.suggestion}`} 67 | 68 | ); 69 | })} 70 | 71 |
72 |
73 | )} 74 | 75 | ); 76 | }; 77 | -------------------------------------------------------------------------------- /spark-ui/src/tabs/ConfigurationTab.tsx: -------------------------------------------------------------------------------- 1 | import { Box } from "@mui/material"; 2 | import * as React from "react"; 3 | import ConfigTable from "../components/ConfigTable"; 4 | import { useAppSelector } from "../Hooks"; 5 | import { MixpanelService } from "../services/MixpanelService"; 6 | 7 | export default function ConfigurationTab() { 8 | const configs = useAppSelector( 9 | (state) => state.spark.config?.configs, 10 | )?.filter( 11 | (row) => row.category === "general" || row.category === "executor-memory", 12 | ); 13 | 14 | React.useEffect(() => { 15 | MixpanelService.TrackPageView(); 16 | }, []); 17 | 18 | return ( 19 |
27 | 28 | {!!configs && } 29 | 30 |
31 | ); 32 | } 33 | -------------------------------------------------------------------------------- /spark-ui/src/tabs/StatusTab.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react"; 2 | import NoQuery from "../components/NoQuery/NoQuery"; 3 | import SqlContainer from "../components/SqlContainer"; 4 | import StatusBar from "../components/StatusBar"; 5 | import { useAppSelector } from "../Hooks"; 6 | import { MixpanelService } from "../services/MixpanelService"; 7 | 8 | export default function StatusTab() { 9 | const sql = useAppSelector((state) => state.spark.sql); 10 | const isIdle = 11 | useAppSelector((state) => state.spark.status?.stages?.status) == "idle"; 12 | 13 | React.useEffect(() => { 14 | MixpanelService.TrackPageView(); 15 | }, []); 16 | 17 | return ( 18 |
19 | 20 | {sql === undefined || sql.sqls.length === 0 || isIdle ? ( 21 |
29 | 30 |
31 | ) : ( 32 |
35 |
44 | {sql.sqls.slice(-1)[0].description} 45 |
46 | 47 |
48 | )} 49 |
50 | ); 51 | } 52 | -------------------------------------------------------------------------------- /spark-ui/src/theme.ts: -------------------------------------------------------------------------------- 1 | import { red } from "@mui/material/colors"; 2 | import { createTheme } from "@mui/material/styles"; 3 | 4 | // A custom theme for this app 5 | const theme = createTheme({ 6 | palette: { 7 | mode: "dark", 8 | primary: { 9 | main: "#3f51b5", 10 | }, 11 | secondary: { 12 | main: "#19857b", 13 | }, 14 | error: { 15 | main: red.A400, 16 | }, 17 | }, 18 | components: { 19 | MuiCssBaseline: { 20 | styleOverrides: { 21 | body: { 22 | scrollbarColor: "#6b6b6b #2b2b2b", 23 | "&::-webkit-scrollbar, & *::-webkit-scrollbar": { 24 | backgroundColor: "#2b2b2b", 25 | width: "0.8em", 26 | height: "0.8em", 27 | }, 28 | "&::-webkit-scrollbar-thumb, & *::-webkit-scrollbar-thumb": { 29 | borderRadius: 8, 30 | backgroundColor: "#6b6b6b", 31 | minHeight: 24, 32 | border: "3px solid #2b2b2b", 33 | }, 34 | "&::-webkit-scrollbar-thumb:focus, & *::-webkit-scrollbar-thumb:focus": 35 | { 36 | backgroundColor: "#959595", 37 | }, 38 | "&::-webkit-scrollbar-thumb:active, & *::-webkit-scrollbar-thumb:active": 39 | { 40 | backgroundColor: "#959595", 41 | }, 42 | "&::-webkit-scrollbar-thumb:hover, & *::-webkit-scrollbar-thumb:hover": 43 | { 44 | backgroundColor: "#959595", 45 | }, 46 | "&::-webkit-scrollbar-corner, & *::-webkit-scrollbar-corner": { 47 | backgroundColor: "#2b2b2b", 48 | }, 49 | }, 50 | }, 51 | }, 52 | }, 53 | }); 54 | 55 | export default theme; 56 | -------------------------------------------------------------------------------- /spark-ui/src/utils/ConfigParser.ts: -------------------------------------------------------------------------------- 1 | // Utility to parse the spark.dataflint.alert.disabled config 2 | export function parseAlertDisabledConfig(config: string | undefined): Set { 3 | if (!config) return new Set(); 4 | return new Set(config.split(',').map(x => x.trim()).filter(Boolean)); 5 | } 6 | -------------------------------------------------------------------------------- /spark-ui/src/utils/FormatUtils.ts: -------------------------------------------------------------------------------- 1 | import { format, parse } from "bytes"; 2 | import { Duration, duration } from "moment"; 3 | 4 | export function humanFileSize(bytes: number): string { 5 | if (Number.isNaN(bytes)) return "NaN"; 6 | const formatted = format(bytes, { unitSeparator: " " }); 7 | return formatted 8 | .replace("KB", "KiB") 9 | .replace("MB", "MiB") 10 | .replace("GB", "GiB") 11 | .replace("TB", "TiB"); 12 | } 13 | 14 | export function parseBytesString(str: string): number { 15 | return parse( 16 | str 17 | .replace("KiB", "KB") 18 | .replace("MiB", "MB") 19 | .replace("GiB", "GB") 20 | .replace("TiB", "TB"), 21 | ); 22 | } 23 | 24 | export function humanFileSizeSparkConfigFormat(bytes: number): string { 25 | if (Number.isNaN(bytes)) return "NaN"; 26 | const formatted = format(bytes); 27 | return formatted 28 | .replace("KB", "k") 29 | .replace("MB", "m") 30 | .replace("GB", "g") 31 | .replace("TB", "t"); 32 | } 33 | 34 | export function humanizeTimeDiff( 35 | duration: Duration, 36 | roundSeconds: boolean = false, 37 | ): string { 38 | if (duration.asDays() >= 1) { 39 | return duration.asDays().toFixed(1) + "d"; 40 | } 41 | if (duration.asHours() >= 1) { 42 | return duration.asHours().toFixed(1) + "h"; 43 | } 44 | if (duration.asMinutes() >= 1) { 45 | return duration.asMinutes().toFixed(1) + "m"; 46 | } 47 | if 
(duration.asSeconds() >= 1 || roundSeconds) { 48 | return roundSeconds 49 | ? duration.asSeconds().toFixed(0) + "s" 50 | : duration.asSeconds().toFixed(1) + "s"; 51 | } 52 | return duration.asMilliseconds().toFixed(0) + "ms"; 53 | } 54 | 55 | export function msToHours(ms: number): number { 56 | return ms / 1000 / 60 / 60; 57 | } 58 | 59 | export function hoursToMS(ms: number): number { 60 | return ms * 1000 * 60 * 60; 61 | } 62 | 63 | export function timeStrToEpocTime(time: string): number { 64 | const addTimeMoment = new Date(time.replace("GMT", "Z")); 65 | return addTimeMoment.getTime(); 66 | } 67 | 68 | export function timeStringToMilliseconds( 69 | timeString: string | undefined, 70 | ): number | undefined { 71 | if (timeString === undefined) { 72 | return undefined; 73 | } 74 | const unit = timeString.slice(-2).trim(); 75 | const value = parseFloat(timeString.slice(0, -2).trim()); 76 | 77 | switch (unit) { 78 | case "ms": 79 | return value; 80 | case "s": 81 | return duration(value, "seconds").asMilliseconds(); 82 | case "m": 83 | return duration(value, "minutes").asMilliseconds(); 84 | case "h": 85 | return duration(value, "hours").asMilliseconds(); 86 | default: 87 | throw new Error(`Unsupported time unit: ${unit}`); 88 | } 89 | } 90 | 91 | export function calculatePercentage(value: number, total: number): number { 92 | if (total === undefined || value === undefined || total === 0) { 93 | return 0; 94 | } 95 | const percentage = (value / total) * 100; 96 | return Math.min(Math.max(percentage, 0), 100); 97 | } 98 | -------------------------------------------------------------------------------- /spark-ui/src/utils/UrlConsts.ts: -------------------------------------------------------------------------------- 1 | import { 2 | getProxyBasePath, 3 | hrefWithoutEndSlash, 4 | isDataFlintSaaSUI, 5 | isHistoryServer, 6 | isProxyMode, 7 | } from "./UrlUtils"; 8 | 9 | const IS_HISTORY_SERVER_MODE = isHistoryServer(); 10 | 11 | let BASE_PATH = ""; 12 | let BASE_CURRENT_PAGE = hrefWithoutEndSlash(); 13 | if (process.env.NODE_ENV === "development") { 14 | BASE_PATH = process.env.REACT_APP_BASE_PATH ?? 
""; 15 | BASE_CURRENT_PAGE = `${BASE_PATH}/dataflint`; 16 | } else if (isProxyMode()) { 17 | BASE_PATH = getProxyBasePath(); 18 | } else if (isDataFlintSaaSUI()) { 19 | BASE_PATH = "/dataflint-spark-ui"; 20 | } 21 | 22 | export { BASE_CURRENT_PAGE, BASE_PATH, IS_HISTORY_SERVER_MODE }; 23 | -------------------------------------------------------------------------------- /spark-ui/src/utils/UrlUtils.ts: -------------------------------------------------------------------------------- 1 | export const isHistoryServer = (): boolean => 2 | window.location.href.includes("history"); 3 | 4 | export const isProxyMode = (): boolean => 5 | !( 6 | window.location.pathname === "/dataflint" || 7 | window.location.pathname === "/dataflint/" 8 | ); 9 | 10 | export const isDataFlintSaaSUI = (): boolean => 11 | window.location.href.includes("dataflint-spark-ui"); 12 | 13 | export function hrefWithoutEndSlash(): string { 14 | const href = window.location.href; 15 | let fixedUrl = href.split("/#/")[0]; 16 | 17 | // We are using a HashRouter so we split by # 18 | if (fixedUrl.endsWith("index.html")) { 19 | fixedUrl = fixedUrl.substring(0, fixedUrl.length - "index.html".length); 20 | } 21 | if (fixedUrl.includes("?o=")) { 22 | fixedUrl = fixedUrl.split("dataflint")[0] + "dataflint"; 23 | } 24 | if (fixedUrl.endsWith("/")) { 25 | fixedUrl = fixedUrl.substring(0, fixedUrl.length - 1); 26 | } 27 | return fixedUrl; 28 | } 29 | 30 | export const getProxyBasePath = (): string => { 31 | if (isHistoryServer()) { 32 | // in cases where we are in history server mode, the API should be before the last /history part 33 | // For example, for: http://localhost:18080/history//dataflint/ 34 | // the api is in http://localhost:18080/api/ 35 | // when the path is https://gateway/sparkhistory/history//1/dataflint/ 36 | // the api is in https://gateway/sparkhistory/api/ 37 | const url = new URL(window.location.href); 38 | const pathToBase = url.pathname.match(/^(.*)\/history\//); 39 | 40 | if (pathToBase && pathToBase[1]) { 41 | return `${url.origin}${pathToBase[1]}`; 42 | } 43 | 44 | // If the pattern isn't found or pathToBase[1] = '', assume it's in the root 45 | return ""; 46 | } else { 47 | // in cases where we are not in history server mode, the API should be before the last /dataflint part 48 | // for example, for: http://localhost:18080/dataflint/ 49 | // the api is in http://localhost:18080/api 50 | // when the path is https://gateway/mysparkapp/dataflint/ 51 | // the api is in https://gateway/mysparkapp/api/ 52 | return hrefWithoutEndSlash().substring( 53 | 0, 54 | hrefWithoutEndSlash().lastIndexOf("/dataflint"), 55 | ); 56 | } 57 | }; 58 | 59 | export function getHistoryServerCurrentAppId(): string { 60 | const urlSegments = hrefWithoutEndSlash().split("/"); 61 | try { 62 | const historyIndex = urlSegments.findIndex( 63 | (segment) => segment === "history", 64 | ); 65 | const appId = urlSegments[historyIndex + 1]; 66 | return appId; 67 | } catch { 68 | throw new Error("Invalid history server app id"); 69 | } 70 | } 71 | 72 | export const getBaseAppUrl = (appPath: string): string => { 73 | return appPath.substring(0, hrefWithoutEndSlash().lastIndexOf("/dataflint")); 74 | }; 75 | -------------------------------------------------------------------------------- /spark-ui/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | 
"esModuleInterop": true, 8 | "allowSyntheticDefaultImports": true, 9 | "strict": true, 10 | "forceConsistentCasingInFileNames": true, 11 | "module": "esnext", 12 | "moduleResolution": "node", 13 | "resolveJsonModule": true, 14 | "isolatedModules": true, 15 | "noEmit": true, 16 | "jsx": "react" 17 | }, 18 | "include": ["src"] 19 | } 20 | --------------------------------------------------------------------------------