├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── build.yml │ ├── generate_docs.yml │ ├── publish_dev_version.yml │ └── publish_release_version.yml ├── .gitignore ├── .grenrc.yml ├── .travis.yml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── build.gradle.kts ├── buildSrc ├── build.gradle.kts └── src │ └── main │ └── kotlin │ ├── Dependencies.kt │ ├── Helpers.kt │ ├── Plugins.kt │ ├── Projects.kt │ └── Versions.kt ├── core ├── build.gradle.kts └── src │ └── main │ └── scala │ └── org │ ├── apache │ └── spark │ │ └── sql │ │ ├── KotlinReflection.scala │ │ ├── KotlinWrappers.scala │ │ └── catalyst │ │ └── CatalystTypeConverters.scala │ └── jetbrains │ └── kotlinx │ └── spark │ └── extensions │ ├── DemoCaseClass.scala │ ├── KSparkExtensions.scala │ └── VarargUnwrapper.scala ├── docs ├── _config.yml └── quick-start-guide.md ├── examples ├── build.gradle.kts └── src │ └── main │ ├── kotlin │ └── org │ │ └── jetbrains │ │ └── kotlinx │ │ └── spark │ │ └── examples │ │ ├── Broadcasting.kt │ │ ├── CachedOperations.kt │ │ ├── Collect.kt │ │ ├── Group.kt │ │ ├── Join.kt │ │ ├── JupyterExample.ipynb │ │ ├── MLlib.kt │ │ ├── Main.kt │ │ ├── MapAndListOperations.kt │ │ ├── RddGroupCalculation.kt │ │ ├── UDFs.kt │ │ ├── UdtRegistration.kt │ │ ├── WordCount.kt │ │ └── streaming │ │ ├── JupyterStreamingExample.ipynb │ │ ├── KotlinDirectKafkaWordCount.kt │ │ ├── KotlinRecoverableNetworkWordCount.kt │ │ ├── KotlinSqlNetworkWordCount.kt │ │ ├── KotlinStatefulNetworkCount.kt │ │ └── Streaming.kt │ └── resources │ └── the-catcher-in-the-rye.txt ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── gradlew_all_versions ├── jupyter ├── build.gradle.kts └── src │ ├── main │ ├── kotlin │ │ └── org │ │ │ └── jetbrains │ │ │ └── kotlinx │ │ │ └── spark │ │ │ └── api │ │ │ └── jupyter │ │ │ ├── HtmlRendering.kt │ │ │ ├── Integration.kt │ │ │ ├── JupyterConfiguration.kt │ │ │ ├── Properties.kt │ │ │ ├── SparkIntegration.kt │ │ │ └── SparkStreamingIntegration.kt │ └── resources │ │ └── table.css │ └── test │ └── kotlin │ └── org │ └── jetbrains │ └── kotlinx │ └── spark │ └── api │ └── jupyter │ └── JupyterTests.kt ├── kotlin-spark-api ├── build.gradle.kts └── src │ ├── main │ └── kotlin │ │ └── org │ │ └── jetbrains │ │ └── kotlinx │ │ └── spark │ │ └── api │ │ ├── Arities.kt │ │ ├── Column.kt │ │ ├── Conversions.kt │ │ ├── DataStreamWriter.kt │ │ ├── Dataset.kt │ │ ├── Encoding.kt │ │ ├── GroupState.kt │ │ ├── Iterators.kt │ │ ├── KeyValueGroupedDataset.kt │ │ ├── Rdd.kt │ │ ├── RddDouble.kt │ │ ├── RddKeyValue.kt │ │ ├── Seq.kt │ │ ├── SparkSession.kt │ │ ├── StreamingKeyValues.kt │ │ ├── UDFRegister.kt │ │ ├── UserDefinedAggregateFunction.kt │ │ ├── UserDefinedFunction.kt │ │ ├── UserDefinedFunctionVararg.kt │ │ └── UserDefinedFunctions.kt │ └── test │ └── kotlin │ └── org │ └── jetbrains │ └── kotlinx │ └── spark │ └── api │ ├── ApiTest.kt │ ├── DatasetFunctionTest.kt │ ├── EncodingTest.kt │ ├── KafkaStreamingTest.kt │ ├── ProjectConfig.kt │ ├── RddTest.kt │ ├── StreamingTest.kt │ ├── TypeInferenceTest.kt │ ├── UDFTest.kt │ ├── UdtTest.kt │ └── struct │ └── model │ └── models.kt ├── qodana.yaml ├── scala-tuples-in-kotlin ├── build.gradle.kts └── src │ ├── main │ └── kotlin │ │ └── org │ │ └── jetbrains │ │ └── kotlinx │ │ └── spark │ │ └── api │ │ ├── Conversions.kt │ │ └── tuples │ │ ├── DestructuredTupleBuilders.kt │ │ ├── DropFunctions.kt │ │ ├── EmptyTuple.kt │ │ ├── MapTuples.kt │ │ ├── 
ProductDestructuring.kt │ │ ├── ProductExtensions.kt │ │ ├── ProductTextualAccessors.kt │ │ ├── TupleBuilders.kt │ │ ├── TupleConcatenation.kt │ │ ├── TupleCopy.kt │ │ ├── TupleDrop.kt │ │ ├── TupleExtending.kt │ │ ├── TupleSplit.kt │ │ ├── TupleTake.kt │ │ ├── TupleZip.kt │ │ └── TypedProductExtensions.kt │ └── test │ └── kotlin │ └── org │ └── jetbrains │ └── kotlinx │ └── spark │ └── api │ └── tuples │ └── TuplesTest.kt └── settings.gradle.kts /.gitattributes: -------------------------------------------------------------------------------- 1 | kotlin-spark-api/src/main/kotlin/org/jetbrains/spark/api/VarArities.kt linguist-generated 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "maven" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build and test 2 | 3 | on: 4 | push: 5 | branches: '*' 6 | # pull_request: 7 | # branches: '*' 8 | 9 | jobs: 10 | build-all-versions: 11 | timeout-minutes: 30 12 | strategy: 13 | matrix: 14 | scala: [ "2.12.17", "2.13.10" ] 15 | spark: [ "3.3.2", "3.3.1", "3.3.0", "3.2.3", "3.2.2", "3.2.1", "3.2.0", "3.1.3", "3.1.2", "3.1.1", "3.1.0", "3.0.3", "3.0.2", "3.0.1", "3.0.0" ] 16 | exclude: 17 | - scala: "2.13.10" 18 | spark: "3.1.3" 19 | - scala: "2.13.10" 20 | spark: "3.1.2" 21 | - scala: "2.13.10" 22 | spark: "3.1.1" 23 | - scala: "2.13.10" 24 | spark: "3.1.0" 25 | - scala: "2.13.10" 26 | spark: "3.0.3" 27 | - scala: "2.13.10" 28 | spark: "3.0.2" 29 | - scala: "2.13.10" 30 | spark: "3.0.1" 31 | - scala: "2.13.10" 32 | spark: "3.0.0" 33 | runs-on: ubuntu-latest 34 | 35 | steps: 36 | - uses: actions/checkout@v3 37 | 38 | - name: Set up JDK 11 39 | uses: actions/setup-java@v3 40 | with: 41 | distribution: adopt 42 | java-version: 11 43 | check-latest: true 44 | 45 | - name: Cache Gradle packages 46 | uses: actions/cache@v3 47 | with: 48 | path: | 49 | ~/.gradle/caches 50 | ~/.gradle/wrapper 51 | ~/.gradle/jdks 52 | key: ${{ runner.os }}-gradle-spark-${{ matrix.spark }}-${{ matrix.scala }} 53 | restore-keys: | 54 | ${{ runner.os }}-gradle- 55 | 56 | - name: Build with Gradle 57 | uses: gradle/gradle-build-action@v2 58 | with: 59 | arguments: | 60 | -Pspark=${{ matrix.spark }} 61 | -Pscala=${{ matrix.scala }} 62 | clean 63 | test 64 | --scan 65 | 66 | # qodana: 67 | # runs-on: ubuntu-latest 68 | # steps: 69 | # - uses: actions/checkout@v3 70 | # - name: 'Qodana Scan' 71 | # uses: JetBrains/qodana-action@v5.0.2 72 | 73 | 74 | 75 | # vim: ts=2:sts=2:sw=2:expandtab 76 | -------------------------------------------------------------------------------- /.github/workflows/generate_docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate and publish docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - "release" 7 | 8 
| jobs: 9 | generate-and-publish-docs: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | 15 | - name: Set up JDK 11 16 | uses: actions/setup-java@v3 17 | with: 18 | distribution: adopt 19 | java-version: 11 20 | check-latest: true 21 | 22 | - name: Cache Gradle packages 23 | uses: actions/cache@v3 24 | with: 25 | path: | 26 | ~/.gradle/caches 27 | ~/.gradle/wrapper 28 | ~/.gradle/jdks 29 | key: ${{ runner.os }}-gradle-spark-${{ matrix.spark }}-${{ matrix.scala }} 30 | restore-keys: | 31 | ${{ runner.os }}-gradle- 32 | 33 | - name: Set Swap Space 34 | uses: pierotofy/set-swap-space@master 35 | with: 36 | swap-size-gb: 12 37 | 38 | - name: Generate docs with Gradle 39 | uses: gradle/gradle-build-action@v2 40 | with: 41 | arguments: | 42 | clean 43 | build 44 | dokkaHtmlMultiModule 45 | --scan 46 | 47 | - name: Copy docs to "docs" branch 48 | uses: peaceiris/actions-gh-pages@v3 49 | with: 50 | github_token: ${{ secrets.GITHUB_TOKEN }} 51 | publish_branch: docs 52 | publish_dir: ./build/dokka/htmlMultiModule 53 | force_orphan: true 54 | 55 | 56 | -------------------------------------------------------------------------------- /.github/workflows/publish_dev_version.yml: -------------------------------------------------------------------------------- 1 | name: Deploy dev version to GH packages 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | 8 | jobs: 9 | build-and-deploy: 10 | strategy: 11 | matrix: 12 | scala: [ "2.12.17", "2.13.10" ] 13 | spark: [ "3.3.2", "3.3.1", "3.3.0", "3.2.3", "3.2.2", "3.2.1", "3.2.0", "3.1.3", "3.1.2", "3.1.1", "3.1.0", "3.0.3", "3.0.2", "3.0.1", "3.0.0" ] 14 | exclude: 15 | - scala: "2.13.10" 16 | spark: "3.1.3" 17 | - scala: "2.13.10" 18 | spark: "3.1.2" 19 | - scala: "2.13.10" 20 | spark: "3.1.1" 21 | - scala: "2.13.10" 22 | spark: "3.1.0" 23 | - scala: "2.13.10" 24 | spark: "3.0.3" 25 | - scala: "2.13.10" 26 | spark: "3.0.2" 27 | - scala: "2.13.10" 28 | spark: "3.0.1" 29 | - scala: "2.13.10" 30 | spark: "3.0.0" 31 | runs-on: ubuntu-latest 32 | permissions: 33 | contents: read 34 | packages: write 35 | 36 | steps: 37 | - uses: actions/checkout@v3 38 | 39 | - name: Set up JDK 11 40 | uses: actions/setup-java@v3 41 | with: 42 | distribution: adopt 43 | java-version: 11 44 | check-latest: true 45 | 46 | - name: Cache Gradle packages 47 | uses: actions/cache@v3 48 | with: 49 | path: | 50 | ~/.gradle/caches 51 | ~/.gradle/wrapper 52 | ~/.gradle/jdks 53 | key: ${{ runner.os }}-gradle-spark-${{ matrix.spark }}-${{ matrix.scala }} 54 | restore-keys: | 55 | ${{ runner.os }}-gradle- 56 | 57 | - name: Validate Gradle wrapper 58 | uses: gradle/wrapper-validation-action@v1 59 | 60 | - name: Setup Gradle 61 | uses: gradle/gradle-build-action@v2 62 | 63 | - name: Set Swap Space 64 | uses: pierotofy/set-swap-space@master 65 | with: 66 | swap-size-gb: 12 67 | 68 | - name: Deploy to GH Packages with Gradle 69 | env: 70 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 71 | run: > 72 | ./gradlew 73 | -Pspark=${{ matrix.spark }} 74 | -Pscala=${{ matrix.scala }} 75 | -PskipScalaTuplesInKotlin=${{ !(matrix.spark == '3.0.0' || matrix.scala == '2.13.10' && matrix.spark == '3.2.0') }} 76 | clean 77 | publishMavenPublicationToGitHubPackagesRepository 78 | --scan 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /.github/workflows/publish_release_version.yml: -------------------------------------------------------------------------------- 1 | name: Deploy release version to Maven Central 2 | 3 | on: 4 | 
release: 5 | types: [created] 6 | 7 | jobs: 8 | build-and-deploy-mvn-central: 9 | strategy: 10 | matrix: 11 | scala: [ "2.12.17", "2.13.10" ] 12 | spark: [ "3.3.2", "3.3.1", "3.3.0", "3.2.3", "3.2.2", "3.2.1", "3.2.0", "3.1.3", "3.1.2", "3.1.1", "3.1.0", "3.0.3", "3.0.2", "3.0.1", "3.0.0" ] 13 | exclude: 14 | - scala: "2.13.10" 15 | spark: "3.1.3" 16 | - scala: "2.13.10" 17 | spark: "3.1.2" 18 | - scala: "2.13.10" 19 | spark: "3.1.1" 20 | - scala: "2.13.10" 21 | spark: "3.1.0" 22 | - scala: "2.13.10" 23 | spark: "3.0.3" 24 | - scala: "2.13.10" 25 | spark: "3.0.2" 26 | - scala: "2.13.10" 27 | spark: "3.0.1" 28 | - scala: "2.13.10" 29 | spark: "3.0.0" 30 | runs-on: ubuntu-latest 31 | permissions: 32 | contents: read 33 | packages: write 34 | 35 | steps: 36 | - uses: actions/checkout@v3 37 | 38 | - name: Set up JDK 11 39 | uses: actions/setup-java@v3 40 | with: 41 | distribution: adopt 42 | java-version: 11 43 | check-latest: true 44 | 45 | - name: Cache Gradle packages 46 | uses: actions/cache@v3 47 | with: 48 | path: | 49 | ~/.gradle/caches 50 | ~/.gradle/wrapper 51 | ~/.gradle/jdks 52 | key: ${{ runner.os }}-gradle-spark-${{ matrix.spark }}-${{ matrix.scala }} 53 | restore-keys: | 54 | ${{ runner.os }}-gradle- 55 | 56 | - name: Validate Gradle wrapper 57 | uses: gradle/wrapper-validation-action@v1 58 | 59 | - name: Setup Gradle 60 | uses: gradle/gradle-build-action@v2 61 | 62 | - name: Set Swap Space 63 | uses: pierotofy/set-swap-space@master 64 | with: 65 | swap-size-gb: 12 66 | 67 | - name: Upload to Maven Central with Gradle 68 | env: 69 | ORG_GRADLE_PROJECT_mavenCentralUsername: ${{ secrets.OSSRH_USERNAME }} 70 | ORG_GRADLE_PROJECT_mavenCentralPassword: ${{ secrets.OSSRH_TOKEN }} 71 | ORG_GRADLE_PROJECT_signingInMemoryKey: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }} 72 | ORG_GRADLE_PROJECT_signingInMemoryKeyPassword: ${{ secrets.MAVEN_GPG_PASSPHRASE }} 73 | run: > 74 | ./gradlew 75 | -Pspark=${{ matrix.spark }} 76 | -Pscala=${{ matrix.scala }} 77 | -PskipScalaTuplesInKotlin=${{ !(matrix.spark == '3.0.0' || matrix.scala == '2.13.10' && matrix.spark == '3.2.0') }} 78 | clean 79 | publishMavenPublicationToMavenCentralRepository 80 | --scan 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/git,vim,maven,emacs,spark,kotlin,jetbrains+all,visualstudiocode 3 | # Edit at https://www.gitignore.io/?templates=git,vim,maven,emacs,spark,kotlin,jetbrains+all,visualstudiocode 4 | 5 | ### Emacs ### 6 | # -*- mode: gitignore; -*- 7 | *~ 8 | \#*\# 9 | /.emacs.desktop 10 | /.emacs.desktop.lock 11 | *.elc 12 | auto-save-list 13 | tramp 14 | .\#* 15 | 16 | # Org-mode 17 | .org-id-locations 18 | *_archive 19 | 20 | # flymake-mode 21 | *_flymake.* 22 | 23 | # eshell files 24 | /eshell/history 25 | /eshell/lastdir 26 | 27 | # elpa packages 28 | /elpa/ 29 | 30 | # reftex files 31 | *.rel 32 | 33 | # AUCTeX auto folder 34 | /auto/ 35 | 36 | # cask packages 37 | .cask/ 38 | dist/ 39 | 40 | # Flycheck 41 | flycheck_*.el 42 | 43 | # server auth directory 44 | /server/ 45 | 46 | # projectiles files 47 | .projectile 48 | 49 | # directory configuration 50 | .dir-locals.el 51 | 52 | # network security 53 | /network-security.data 54 | 55 | 56 | ### Git ### 57 | # Created by git for backups. 
To disable backups in Git: 58 | # $ git config --global mergetool.keepBackup false 59 | *.orig 60 | 61 | # Created by git when using merge tools for conflicts 62 | *.BACKUP.* 63 | *.BASE.* 64 | *.LOCAL.* 65 | *.REMOTE.* 66 | *_BACKUP_*.txt 67 | *_BASE_*.txt 68 | *_LOCAL_*.txt 69 | *_REMOTE_*.txt 70 | 71 | ### JetBrains+all ### 72 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 73 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 74 | 75 | # User-specific stuff 76 | .idea/**/workspace.xml 77 | .idea/**/tasks.xml 78 | .idea/**/usage.statistics.xml 79 | .idea/**/dictionaries 80 | .idea/**/shelf 81 | 82 | # Generated files 83 | .idea/**/contentModel.xml 84 | 85 | # Sensitive or high-churn files 86 | .idea/**/dataSources/ 87 | .idea/**/dataSources.ids 88 | .idea/**/dataSources.local.xml 89 | .idea/**/sqlDataSources.xml 90 | .idea/**/dynamic.xml 91 | .idea/**/uiDesigner.xml 92 | .idea/**/dbnavigator.xml 93 | 94 | # Gradle 95 | .idea/**/gradle.xml 96 | .idea/**/libraries 97 | 98 | # Gradle and Maven with auto-import 99 | # When using Gradle or Maven with auto-import, you should exclude module files, 100 | # since they will be recreated, and may cause churn. Uncomment if using 101 | # auto-import. 102 | # .idea/modules.xml 103 | # .idea/*.iml 104 | # .idea/modules 105 | # *.iml 106 | # *.ipr 107 | 108 | # CMake 109 | cmake-build-*/ 110 | 111 | # Mongo Explorer plugin 112 | .idea/**/mongoSettings.xml 113 | 114 | # File-based project format 115 | *.iws 116 | 117 | # IntelliJ 118 | out/ 119 | 120 | # mpeltonen/sbt-idea plugin 121 | .idea_modules/ 122 | 123 | # JIRA plugin 124 | atlassian-ide-plugin.xml 125 | 126 | # Cursive Clojure plugin 127 | .idea/replstate.xml 128 | 129 | # Crashlytics plugin (for Android Studio and IntelliJ) 130 | com_crashlytics_export_strings.xml 131 | crashlytics.properties 132 | crashlytics-build.properties 133 | fabric.properties 134 | 135 | # Editor-based Rest Client 136 | .idea/httpRequests 137 | 138 | # Android studio 3.1+ serialized cache file 139 | .idea/caches/build_file_checksums.ser 140 | 141 | ### JetBrains+all Patch ### 142 | # Ignores the whole .idea folder and all .iml files 143 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 144 | 145 | .idea/ 146 | 147 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 148 | 149 | *.iml 150 | modules.xml 151 | .idea/misc.xml 152 | *.ipr 153 | 154 | # Sonarlint plugin 155 | .idea/sonarlint 156 | 157 | ### Kotlin ### 158 | # Compiled class file 159 | *.class 160 | 161 | # Log file 162 | *.log 163 | 164 | # BlueJ files 165 | *.ctxt 166 | 167 | # Mobile Tools for Java (J2ME) 168 | .mtj.tmp/ 169 | 170 | # Package Files # 171 | *.jar 172 | *.war 173 | *.nar 174 | *.ear 175 | *.zip 176 | *.tar.gz 177 | *.rar 178 | 179 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 180 | hs_err_pid* 181 | 182 | ### Maven ### 183 | target/ 184 | pom*.xml.tag 185 | pom*.xml.bak 186 | pom*.xml.releaseBackup 187 | pom*.xml.versionsBackup 188 | pom*.xml.next 189 | release.properties 190 | dependency-reduced-pom.xml 191 | buildNumber.properties 192 | .mvn/timing.properties 193 | .mvn/wrapper/maven-wrapper.jar 194 | .flattened-pom.xml 195 | 196 | ### Spark ### 197 | *#*# 198 | *.#* 199 | *.pyc 200 | *.pyo 201 | *.swp 202 | .DS_Store 203 | .cache 204 | .classpath 205 | .ensime 206 | .ensime_cache/ 207 | .ensime_lucene 208 | 
.generated-mima* 209 | .project 210 | .pydevproject 211 | .scala_dependencies 212 | .settings 213 | /lib/ 214 | R-unit-tests.log 215 | R/unit-tests.out 216 | R/cran-check.out 217 | R/pkg/vignettes/sparkr-vignettes.html 218 | R/pkg/tests/fulltests/Rplots.pdf 219 | build/*.jar 220 | build/apache-maven* 221 | build/scala* 222 | build/zinc* 223 | cache 224 | checkpoint 225 | conf/*.cmd 226 | conf/*.conf 227 | conf/*.properties 228 | conf/*.sh 229 | conf/*.xml 230 | conf/java-opts 231 | conf/slaves 232 | derby.log 233 | dev/create-release/*final 234 | dev/create-release/*txt 235 | dev/pr-deps/ 236 | docs/_site 237 | docs/api 238 | sql/docs 239 | sql/site 240 | lib_managed/ 241 | lint-r-report.log 242 | log/ 243 | logs/ 244 | project/boot/ 245 | project/build/target/ 246 | project/plugins/lib_managed/ 247 | project/plugins/project/build.properties 248 | project/plugins/src_managed/ 249 | project/plugins/target/ 250 | python/lib/pyspark.zip 251 | python/deps 252 | python/test_coverage/coverage_data 253 | python/test_coverage/htmlcov 254 | python/pyspark/python 255 | reports/ 256 | scalastyle-on-compile.generated.xml 257 | scalastyle-output.xml 258 | scalastyle.txt 259 | spark-*-bin-*.tgz 260 | spark-tests.log 261 | src_managed/ 262 | streaming-tests.log 263 | unit-tests.log 264 | work/ 265 | docs/.jekyll-metadata 266 | 267 | # For Hive 268 | TempStatsStore/ 269 | metastore/ 270 | metastore_db/ 271 | sql/hive-thriftserver/test_warehouses 272 | warehouse/ 273 | spark-warehouse/ 274 | 275 | # For R session data 276 | .RData 277 | .RHistory 278 | .Rhistory 279 | *.Rproj 280 | *.Rproj.* 281 | 282 | .Rproj.user 283 | 284 | # For SBT 285 | .jvmopts 286 | 287 | 288 | ### Vim ### 289 | # Swap 290 | [._]*.s[a-v][a-z] 291 | [._]*.sw[a-p] 292 | [._]s[a-rt-v][a-z] 293 | [._]ss[a-gi-z] 294 | [._]sw[a-p] 295 | 296 | # Session 297 | Session.vim 298 | Sessionx.vim 299 | 300 | # Temporary 301 | .netrwhist 302 | # Auto-generated tag files 303 | tags 304 | # Persistent undo 305 | [._]*.un~ 306 | 307 | ### VisualStudioCode ### 308 | .vscode/* 309 | !.vscode/settings.json 310 | !.vscode/tasks.json 311 | !.vscode/launch.json 312 | !.vscode/extensions.json 313 | 314 | ### VisualStudioCode Patch ### 315 | # Ignore all local history of files 316 | .history 317 | 318 | # End of https://www.gitignore.io/api/git,vim,maven,emacs,spark,kotlin,jetbrains+all,visualstudiocode 319 | 320 | 321 | # Created by https://www.gitignore.io/api/scala,gradle,kotlin 322 | # Edit at https://www.gitignore.io/?templates=scala,gradle,kotlin 323 | 324 | ### Kotlin ### 325 | # Compiled class file 326 | *.class 327 | 328 | # Log file 329 | *.log 330 | 331 | # BlueJ files 332 | *.ctxt 333 | 334 | # Mobile Tools for Java (J2ME) 335 | .mtj.tmp/ 336 | 337 | # Package Files # 338 | *.jar 339 | *.war 340 | *.nar 341 | *.ear 342 | *.zip 343 | *.tar.gz 344 | *.rar 345 | 346 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 347 | hs_err_pid* 348 | 349 | ### Scala ### 350 | *.metals 351 | 352 | ### Gradle ### 353 | .gradle 354 | build/ 355 | 356 | # Ignore Gradle GUI config 357 | gradle-app.setting 358 | 359 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 360 | !gradle-wrapper.jar 361 | 362 | # Cache of project 363 | .gradletasknamecache 364 | 365 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898 366 | # gradle/wrapper/gradle-wrapper.properties 367 | 368 | ### Gradle Patch ### 369 | **/build/ 370 | 371 | # End of https://www.gitignore.io/api/scala,gradle,kotlin 372 | 373 | 
csvpath/ 374 | orcpath/ 375 | 376 | .env 377 | **/.allure/ 378 | **/allure-results/ 379 | /generated_* 380 | -------------------------------------------------------------------------------- /.grenrc.yml: -------------------------------------------------------------------------------- 1 | --- 2 | dataSource: "commits" 3 | prefix: "" 4 | includeMessages: "commits" 5 | changelogFilename: "CHANGELOG.md" 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | openjdk8 4 | cache: 5 | directories: 6 | - $HOME/.m2 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.3.1 (05/08/2020) 4 | - [chore: 🤖 Version bump to 0.3.1](https://github.com/JetBrains/kotlin-spark-api/commit/88d4e31cd6d76fe1fc0b9c10a078658aa6178e36) - @asm0dey 5 | - [test: 💍 Adds integration and more unit tests](https://github.com/JetBrains/kotlin-spark-api/commit/bca98f77b743c9456882f49702f9cea55d153320) - @asm0dey 6 | - [fix: 🐛 Fixes incorrect order of fields in data class](https://github.com/JetBrains/kotlin-spark-api/commit/56bc82f9ff0de6948386e8c4a330ba715285ad55) - @asm0dey 7 | - [fix: 🐛 Fixes incorrect handling of primitive types](https://github.com/JetBrains/kotlin-spark-api/commit/868e47919d41309e1ebebc327acdd53e3df094ea) - @asm0dey 8 | - [Bump klaxon from 5.2 to 5.3](https://github.com/JetBrains/kotlin-spark-api/commit/edfd06d4895ded7772a42d7e13e2eb2aed8031bf) - @dependabot[bot] 9 | - [Updated README.md (#32)](https://github.com/JetBrains/kotlin-spark-api/commit/c2cc6fe7e7a4d5e0d501ab0a75e371c6cc041d77) - @MKhalusova 10 | - [Bump kotest.version from 4.1.1 to 4.1.3](https://github.com/JetBrains/kotlin-spark-api/commit/a1ca5fe8c11c7aeeb70a8335c2252549e1cc7f81) - @dependabot[bot] 11 | 12 | --- 13 | 14 | ## 0.3.0 (08/07/2020) 15 | - [Version bump to 0.3.0](https://github.com/JetBrains/kotlin-spark-api/commit/0d5bcf57575d8906a219b4143a67df8939c46b0c) - @asm0dey 16 | - [Inproves README — adds information on collection of data from Dataset](https://github.com/JetBrains/kotlin-spark-api/commit/d81b98622ac816c9224f980d92407249333cf6d0) - @asm0dey 17 | - [Fixes #27](https://github.com/JetBrains/kotlin-spark-api/commit/5c05b6f02e30289535fe5f2d45dc99ede3c1eff3) - @asm0dey 18 | - [Fixes #26](https://github.com/JetBrains/kotlin-spark-api/commit/ffdb41d418b53e9336b73d51f3e5237d09c06ef2) - @asm0dey 19 | - [Bump maven-site-plugin from 3.9.0 to 3.9.1](https://github.com/JetBrains/kotlin-spark-api/commit/ee5a4ea1512d756e19549bc50f5c73ecf61108d2) - @dependabot[bot] 20 | - [Create CODE_OF_CONDUCT.md](https://github.com/JetBrains/kotlin-spark-api/commit/96dcfbbf8882de0e8446db3c4485e9febccaf8c5) - @asm0dey 21 | - [Fixes changelog](https://github.com/JetBrains/kotlin-spark-api/commit/1121550089cbec91b8ac915260a9d3593bb0138b) - @asm0dey 22 | 23 | --- 24 | 25 | ## 0.2.3 (23/06/2020) 26 | - [Updates version to 0.2.3](https://github.com/JetBrains/kotlin-spark-api/commit/91ee4faf392792642be5a8c58800b343df02da5b) - @asm0dey 27 | - [Fixes #21](https://github.com/JetBrains/kotlin-spark-api/commit/e8c1c5973087b3dd3f755d9d408893d3d2f19c94) - @asm0dey 28 | - [Updates kotest to 4.1.0 and moves it to property](https://github.com/JetBrains/kotlin-spark-api/commit/c26ad2e514421c4a1e8eaa10a76c035d8c0a0f11) - @asm0dey 29 | - [Fixes 
#20](https://github.com/JetBrains/kotlin-spark-api/commit/0b1bd9875cbb9ea85f1ccb66250d434ba5384c06) - @asm0dey 30 | - [Fixes #16](https://github.com/JetBrains/kotlin-spark-api/commit/875709459df946542bd133c2a3164deda5909fbc) - @asm0dey 31 | - [Bump kotest-assertions-core-jvm from 4.0.6 to 4.1.0](https://github.com/JetBrains/kotlin-spark-api/commit/b072f8fc2b4b30d40c8fee08f598941c896175bd) - @dependabot[bot] 32 | 33 | --- 34 | 35 | ## Fixes for #16 and #17 (22/06/2020) 36 | 37 | --- 38 | 39 | ## Fixes #15 (22/06/2020) 40 | 41 | --- 42 | 43 | ## Update to Spark 3.0.0 release (18/06/2020) 44 | 45 | --- 46 | 47 | ## 0.1.0 (01/06/2020) 48 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # JetBrains Open Source and Community Code of Conduct 2 | 3 | This code of conduct outlines our expectations for all those who participate in our open source projects and communities (community programs), as well as the consequences for unacceptable behaviour. We invite all those who participate to help us create safe and positive experiences for everyone. Communities mirror the societies in which they exist and positive action is essential to counteract the many forms of inequality and abuses of power that exist in society. 4 | 5 | ## How to behave 6 | 7 | The following behaviours are expected and requested of all community members: 8 | 9 | * Participate in an authentic and active way. In doing so, you contribute to the health and longevity of this community. 10 | * Exercise consideration, respect and empathy in your speech and actions. Remember, we have all been through different stages of learning when adopting technologies. 11 | * Refrain from demeaning, discriminatory, or harassing behaviour and speech. 12 | * Disagreements on things are fine, argumentative behaviour or trolling are not. 13 | 14 | ## How not to behave 15 | 16 | * Do not perform threats of violence or use violent language directed against another person. 17 | * Do not make jokes of sexist, racist, homophobic, transphobic, ableist or otherwise discriminatory nature, or use language of this nature. 18 | * Do not post or display sexually explicit or violent material. 19 | * Do not post or threaten to post other people’s personally identifying information ("doxing"). 20 | * Do not make personal insults, particularly those related to gender, sexual orientation, race, religion, or disability. 21 | * Do not engage in sexual attention. This includes, sexualised comments or jokes and sexual advances. 22 | * Do not advocate for, or encourage, any of the above behaviour. 23 | 24 | 25 | Please take into account that online communities bring together people from many different cultures and backgrounds. It's important to understand that sometimes the combination of cultural differences and online interaction can lead to misunderstandings. That is why having empathy is very important. 26 | 27 | ## How to report issues 28 | 29 | If someone is acting inappropriately or violating this Code of Conduct in any shape or form, and they are not receptive to your feedback or you prefer not to confront them, please reach out to JetBrains via codeofconduct@jetbrains.com 30 | 31 | ## Consequences of Unacceptable Behaviour 32 | 33 | Unacceptable behaviour from any community member will not be tolerated. Anyone asked to stop unacceptable behaviour is expected to comply immediately. 
If a community member engages in unacceptable behaviour, JetBrains and/or community organisers may take any action they deem appropriate, up to and including a temporary ban or permanent expulsion from the community without warning. 34 | 35 | ## License and attribution 36 | 37 | The license is based off of The Citizen Code of Conduct is distributed by [Stumptown Syndicate](http://stumptownsyndicate.org/) under a [Creative Commons Attribution-ShareAlike license](http://creativecommons.org/licenses/by-sa/3.0/). 38 | -------------------------------------------------------------------------------- /build.gradle.kts: -------------------------------------------------------------------------------- 1 | @file:Suppress("UnstableApiUsage") 2 | 3 | buildscript { 4 | repositories { 5 | mavenCentral() 6 | } 7 | dependencies { 8 | classpath(jcp) 9 | classpath(mavenPublish) 10 | } 11 | } 12 | 13 | 14 | plugins { 15 | mavenPublish version Versions.mavenPublish 16 | dokka version Versions.dokka 17 | idea 18 | kotlin version Versions.kotlin apply false 19 | } 20 | 21 | group = Versions.groupID 22 | version = Versions.project 23 | 24 | tasks.withType().configureEach { 25 | useJUnitPlatform() 26 | } 27 | 28 | repositories { 29 | mavenCentral() 30 | } 31 | 32 | allprojects { 33 | plugins.withId(mavenPublishBase) { 34 | group = Versions.groupID 35 | version = Versions.project 36 | 37 | publishing { 38 | repositories { 39 | maven { 40 | name = "GitHubPackages" 41 | url = uri("https://maven.pkg.github.com/Kotlin/kotlin-spark-api") 42 | credentials { 43 | username = project.findProperty("gpr.user") as String? 44 | ?: System.getenv("GITHUB_ACTOR") 45 | password = project.findProperty("gpr.key") as String? 46 | ?: System.getenv("GITHUB_TOKEN") 47 | } 48 | } 49 | } 50 | } 51 | 52 | mavenPublishing { 53 | pomFromGradleProperties() 54 | publishToMavenCentral() 55 | // The username and password for Sonatype OSS can be provided as Gradle properties 56 | // called mavenCentralUsername and mavenCentralPassword to avoid having to commit them. 57 | // You can also supply them as environment variables called 58 | // ORG_GRADLE_PROJECT_mavenCentralUsername and 59 | // ORG_GRADLE_PROJECT_mavenCentralPassword. 
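// For example, a hypothetical ~/.gradle/gradle.properties could contain (illustrative values only, not real credentials):
//   mavenCentralUsername=sonatype-user
//   mavenCentralPassword=sonatype-token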
60 | 61 | // also ORG_GRADLE_PROJECT_signingInMemoryKey=exported_ascii_armored_key 62 | // # optional 63 | // ORG_GRADLE_PROJECT_signingInMemoryKeyId=24875D73 64 | // # if key was created with a password 65 | // ORG_GRADLE_PROJECT_signingInMemoryKeyPassword=secret 66 | 67 | signAllPublications() 68 | pom { 69 | name.set("Kotlin Spark API") 70 | description.set("Kotlin for Apache Spark") 71 | packaging = "pom" 72 | 73 | url.set("https://maven.apache.org") 74 | inceptionYear.set("2019") 75 | 76 | organization { 77 | name.set("JetBrains") 78 | url.set("https://www.jetbrains.com/") 79 | } 80 | 81 | licenses { 82 | license { 83 | name.set("Apache License, Version 2.0") 84 | url.set("https://www.apache.org/licenses/LICENSE-2.0.txt") 85 | } 86 | } 87 | 88 | developers { 89 | developer { 90 | id.set("asm0dey") 91 | name.set("Pasha Finkelshteyn") 92 | email.set("asm0dey@jetbrains.com") 93 | timezone.set("GMT+3") 94 | } 95 | developer { 96 | id.set("vitaly.khudobakhshov") 97 | name.set("Vitaly Khudobakhshov") 98 | email.set("vitaly.khudobakhshov@jetbrains.com") 99 | timezone.set("GMT+3") 100 | } 101 | developer { 102 | id.set("Jolanrensen") 103 | name.set("Jolan Rensen") 104 | email.set("jolan.rensen@jetbrains.com") 105 | timezone.set("GMT+1") 106 | } 107 | } 108 | 109 | scm { 110 | connection.set("scm:git:https://github.com/Kotlin/kotlin-spark-api.git") 111 | url.set("https://github.com/Kotlin/kotlin-spark-api") 112 | tag.set("HEAD") 113 | } 114 | } 115 | } 116 | } 117 | } -------------------------------------------------------------------------------- /buildSrc/build.gradle.kts: -------------------------------------------------------------------------------- 1 | import org.gradle.kotlin.dsl.`kotlin-dsl` 2 | import org.jetbrains.kotlin.gradle.tasks.KotlinCompile 3 | 4 | plugins { 5 | `kotlin-dsl` 6 | } 7 | 8 | repositories { 9 | mavenCentral() 10 | } 11 | -------------------------------------------------------------------------------- /buildSrc/src/main/kotlin/Dependencies.kt: -------------------------------------------------------------------------------- 1 | object Dependencies { 2 | inline val kotlinStdLib get() = "org.jetbrains.kotlin:kotlin-stdlib-jdk8:${Versions.kotlin}" 3 | inline val reflect get() = "org.jetbrains.kotlin:kotlin-reflect:${Versions.kotlin}" 4 | inline val scalaLibrary get() = "org.scala-lang:scala-library:${Versions.scala}" 5 | inline val kotlinxHtml get() = "org.jetbrains.kotlinx:kotlinx-html-jvm:${Versions.kotlinxHtml}" 6 | inline val sparkSql get() = "org.apache.spark:spark-sql_${Versions.scalaCompat}:${Versions.spark}" 7 | inline val sparkMl get() = "org.apache.spark:spark-mllib_${Versions.scalaCompat}:${Versions.spark}" 8 | inline val sparkStreaming get() = "org.apache.spark:spark-streaming_${Versions.scalaCompat}:${Versions.spark}" 9 | inline val hadoopClient get() = "org.apache.hadoop:hadoop-client:${Versions.hadoop}" 10 | inline val sparkRepl get() = "org.apache.spark:spark-repl_${Versions.scalaCompat}:${Versions.spark}" 11 | inline val jupyter get() = "org.jetbrains.kotlinx:kotlin-jupyter-api:${Versions.jupyter}" 12 | inline val junit get() = "org.junit.jupiter:junit-jupiter-engine:5.8.1" 13 | inline val sparkStreamingKafka get() = "org.apache.spark:spark-streaming-kafka-0-10_${Versions.scalaCompat}:${Versions.spark}" 14 | inline val kotest get() = "io.kotest:kotest-runner-junit5:${Versions.kotest}" 15 | inline val kotestTestcontainers get() = "io.kotest.extensions:kotest-extensions-testcontainers:${Versions.kotestTestContainers}" 16 | inline val klaxon get() = 
"com.beust:klaxon:${Versions.klaxon}" 17 | inline val atrium get() = "ch.tutteli.atrium:atrium-fluent-en_GB:${Versions.atrium}" 18 | inline val kafkaStreamsTestUtils get() = "org.apache.kafka:kafka-streams-test-utils:${Versions.kafkaStreamsTestUtils}" 19 | inline val jupyterTest get() = "org.jetbrains.kotlinx:kotlin-jupyter-test-kit:${Versions.jupyter}" 20 | inline val kotlinTest get() = "org.jetbrains.kotlin:kotlin-test:${Versions.kotlin}" 21 | inline val kotlinScriptingCommon get() = "org.jetbrains.kotlin:kotlin-scripting-common" 22 | inline val kotlinScriptingJvm get() = "org.jetbrains.kotlin:kotlin-scripting-jvm" 23 | inline val jacksonDatabind get() = "com.fasterxml.jackson.core:jackson-databind:${Versions.jacksonDatabind}" 24 | } 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /buildSrc/src/main/kotlin/Helpers.kt: -------------------------------------------------------------------------------- 1 | import org.gradle.api.artifacts.Dependency 2 | import org.gradle.api.artifacts.ProjectDependency 3 | import org.gradle.api.artifacts.dsl.DependencyHandler 4 | 5 | fun DependencyHandler.testApi(vararg dependencyNotations: Any): List = 6 | dependencyNotations.map { 7 | add("testApi", it) 8 | } 9 | 10 | fun DependencyHandler.api(vararg dependencyNotations: Any): List = 11 | dependencyNotations.map { 12 | add("api", it) 13 | } 14 | 15 | 16 | fun DependencyHandler.testImplementation(vararg dependencyNotations: Any): List = 17 | dependencyNotations.map { 18 | add("testImplementation", it) 19 | } 20 | 21 | fun DependencyHandler.implementation(vararg dependencyNotations: Any): List = 22 | dependencyNotations.map { 23 | add("implementation", it) 24 | } 25 | 26 | fun DependencyHandler.runtimeOnly(vararg dependencyNotations: Any): List = 27 | dependencyNotations.map { 28 | add("runtimeOnly", it) 29 | } 30 | 31 | fun DependencyHandler.project( 32 | path: String, 33 | configuration: String? 
= null 34 | ): ProjectDependency = project( 35 | if (configuration != null) mapOf("path" to path, "configuration" to configuration) 36 | else mapOf("path" to path) 37 | ) as ProjectDependency 38 | -------------------------------------------------------------------------------- /buildSrc/src/main/kotlin/Plugins.kt: -------------------------------------------------------------------------------- 1 | import org.gradle.api.Project 2 | import org.gradle.kotlin.dsl.* 3 | import org.gradle.plugin.use.PluginDependenciesSpec 4 | 5 | 6 | inline val PluginDependenciesSpec.kotlin 7 | get() = kotlin("jvm") 8 | 9 | inline val PluginDependenciesSpec.dokka 10 | get() = id("org.jetbrains.dokka") 11 | 12 | inline val PluginDependenciesSpec.license 13 | get() = id("com.github.hierynomus.license") version Versions.licenseGradlePluginVersion 14 | 15 | inline val PluginDependenciesSpec.jcp 16 | get() = id("com.igormaznitsa.jcp") 17 | 18 | inline val DependencyHandlerScope.jcp 19 | get() = "com.igormaznitsa:jcp:${Versions.jcp}" 20 | 21 | inline val DependencyHandlerScope.mavenPublish 22 | get() = "com.vanniktech:gradle-maven-publish-plugin:${Versions.mavenPublish}" 23 | 24 | inline val PluginDependenciesSpec.mavenPublish 25 | get() = id("com.vanniktech.maven.publish") 26 | 27 | inline val PluginDependenciesSpec.mavenPublishBase 28 | get() = id("com.vanniktech.maven.publish.base") 29 | 30 | inline val Project.mavenPublishBase 31 | get() = "com.vanniktech.maven.publish.base" 32 | 33 | inline val PluginDependenciesSpec.jupyter 34 | get() = kotlin("jupyter.api") version Versions.jupyter 35 | 36 | -------------------------------------------------------------------------------- /buildSrc/src/main/kotlin/Projects.kt: -------------------------------------------------------------------------------- 1 | @file:Suppress("NOTHING_TO_INLINE") 2 | 3 | import org.gradle.api.Project 4 | import org.gradle.api.artifacts.dsl.DependencyHandler 5 | import org.gradle.kotlin.dsl.support.delegates.ProjectDelegate 6 | 7 | object Projects { 8 | 9 | inline fun Project.searchProject(name: String): Project = 10 | rootProject 11 | .childProjects 12 | .filterKeys { name in it } 13 | .entries 14 | .singleOrNull() 15 | ?.value ?: error("Project $name not found") 16 | 17 | inline val Project.kotlinSparkApi 18 | get() = searchProject("kotlin-spark-api") 19 | 20 | inline val Project.core 21 | get() = searchProject("core") 22 | 23 | inline val Project.examples 24 | get() = searchProject("examples") 25 | 26 | inline val Project.jupyter 27 | get() = searchProject("jupyter") 28 | 29 | inline val Project.scalaTuplesInKotlin 30 | get() = searchProject("scala-tuples-in-kotlin") 31 | } -------------------------------------------------------------------------------- /buildSrc/src/main/kotlin/Versions.kt: -------------------------------------------------------------------------------- 1 | object Versions { 2 | const val project = "1.2.4" 3 | const val groupID = "org.jetbrains.kotlinx.spark" 4 | const val kotlin = "1.8.20" 5 | const val jvmTarget = "8" 6 | const val jupyterJvmTarget = "8" 7 | 8 | inline val spark get() = System.getProperty("spark") as String 9 | inline val scala get() = System.getProperty("scala") as String 10 | inline val sparkMinor get() = spark.substringBeforeLast('.') 11 | inline val scalaCompat get() = scala.substringBeforeLast('.') 12 | 13 | const val jupyter = "0.12.0-32-1" 14 | const val kotest = "5.5.4" 15 | const val kotestTestContainers = "1.3.3" 16 | const val dokka = "1.8.20" 17 | const val jcp = "7.0.5" 18 | const val 
mavenPublish = "0.20.0" 19 | const val atrium = "0.17.0" 20 | const val licenseGradlePluginVersion = "0.15.0" 21 | const val kafkaStreamsTestUtils = "3.1.0" 22 | const val hadoop = "3.3.6" 23 | const val kotlinxHtml = "0.7.5" 24 | const val klaxon = "5.5" 25 | const val jacksonDatabind = "2.13.4.2" 26 | 27 | inline val versionMap 28 | get() = mapOf( 29 | "kotlin" to kotlin, 30 | "scala" to scala, 31 | "scalaCompat" to scalaCompat, 32 | "spark" to spark, 33 | "sparkMinor" to sparkMinor, 34 | "version" to project, 35 | ) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /core/build.gradle.kts: -------------------------------------------------------------------------------- 1 | @file:Suppress("UnstableApiUsage", "NOTHING_TO_INLINE") 2 | 3 | import com.igormaznitsa.jcp.gradle.JcpTask 4 | import com.vanniktech.maven.publish.JavaLibrary 5 | import com.vanniktech.maven.publish.JavadocJar.Javadoc 6 | 7 | plugins { 8 | scala 9 | `java-library` 10 | jcp 11 | mavenPublishBase 12 | } 13 | 14 | group = Versions.groupID 15 | version = Versions.project 16 | 17 | repositories { 18 | mavenCentral() 19 | } 20 | 21 | dependencies { 22 | 23 | with(Dependencies) { 24 | api( 25 | scalaLibrary, 26 | reflect, 27 | ) 28 | 29 | // https://github.com/FasterXML/jackson-bom/issues/52 30 | if (Versions.spark == "3.3.1") implementation(jacksonDatabind) 31 | 32 | implementation( 33 | sparkSql, 34 | ) 35 | } 36 | } 37 | 38 | 39 | java { 40 | toolchain { 41 | if (Versions.scalaCompat.toDouble() > 2.12) { // scala 2.12 will always target java 8 42 | languageVersion.set( 43 | JavaLanguageVersion.of(Versions.jvmTarget) 44 | ) 45 | } else if (Versions.jvmTarget == "1.8" || Versions.jvmTarget == "8") { 46 | languageVersion.set( 47 | JavaLanguageVersion.of(8) 48 | ) 49 | } 50 | } 51 | } 52 | 53 | tasks.withType { 54 | if (Versions.scalaCompat.toDouble() > 2.12) { // scala 2.12 will always target java 8 55 | targetCompatibility = Versions.jvmTarget 56 | } else if (Versions.jvmTarget == "1.8" || Versions.jvmTarget == "8") { 57 | targetCompatibility = "1.8" 58 | } 59 | } 60 | 61 | val scalaMainSources = sourceSets.main.get().scala.sourceDirectories 62 | 63 | val preprocessMain by tasks.creating(JcpTask::class) { 64 | sources.set(scalaMainSources) 65 | clearTarget.set(true) 66 | fileExtensions.set(listOf("scala")) 67 | vars.set(Versions.versionMap) 68 | outputs.upToDateWhen { target.get().exists() } 69 | } 70 | 71 | tasks.compileScala { 72 | dependsOn(preprocessMain) 73 | outputs.upToDateWhen { 74 | preprocessMain.outcomingFiles.files.isEmpty() 75 | } 76 | 77 | doFirst { 78 | scala { 79 | sourceSets { 80 | main { 81 | scala.setSrcDirs(listOf(preprocessMain.target.get())) 82 | } 83 | } 84 | } 85 | } 86 | 87 | doLast { 88 | scala { 89 | sourceSets { 90 | main { 91 | scala.setSrcDirs(scalaMainSources) 92 | } 93 | } 94 | } 95 | } 96 | } 97 | 98 | mavenPublishing { 99 | configure(JavaLibrary(Javadoc())) 100 | } 101 | 102 | -------------------------------------------------------------------------------- /core/src/main/scala/org/jetbrains/kotlinx/spark/extensions/DemoCaseClass.scala: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.extensions 2 | 3 | case class DemoCaseClass[T](a: Int, b: T) 4 | -------------------------------------------------------------------------------- /core/src/main/scala/org/jetbrains/kotlinx/spark/extensions/KSparkExtensions.scala: 
-------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.extensions 21 | 22 | import org.apache.spark.SparkContext 23 | import org.apache.spark.sql._ 24 | import java.util 25 | import scala.reflect.ClassTag 26 | 27 | object KSparkExtensions { 28 | 29 | val kotlinVersion = /*$"\""+kotlin+"\""$*/ /*-*/ "" 30 | val scalaVersion = /*$"\""+scala+"\""$*/ /*-*/ "" 31 | val scalaCompatVersion = /*$"\""+scalaCompat+"\""$*/ /*-*/ "" 32 | val sparkVersion = /*$"\""+spark+"\""$*/ /*-*/ "" 33 | val sparkMinorVersion = /*$"\""+sparkMinor+"\""$*/ /*-*/ "" 34 | 35 | def col(d: Dataset[_], name: String): Column = d.col(name) 36 | 37 | def col(name: String): Column = functions.col(name) 38 | 39 | def lit(literal: Any): Column = functions.lit(literal) 40 | 41 | def collectAsList[T](ds: Dataset[T]): util.List[T] = { 42 | //#if scalaCompat >= 2.13 43 | scala.jdk.javaapi.CollectionConverters.asJava(ds.collect()) 44 | //#else 45 | //$scala.collection.JavaConverters.seqAsJavaList(ds.collect()) 46 | //#endif 47 | } 48 | 49 | 50 | def debugCodegen(df: Dataset[_]): Unit = { 51 | import org.apache.spark.sql.execution.debug._ 52 | df.debugCodegen() 53 | } 54 | 55 | def debug(df: Dataset[_]): Unit = { 56 | import org.apache.spark.sql.execution.debug._ 57 | df.debug() 58 | } 59 | 60 | def sparkContext(s: SparkSession): SparkContext = s.sparkContext 61 | 62 | /** 63 | * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef]. 64 | * 65 | * This method is used to keep ClassTags out of the external Java API, as the Java compiler 66 | * cannot produce them automatically. While this ClassTag-faking does please the compiler, 67 | * it can cause problems at runtime if the Scala API relies on ClassTags for correctness. 68 | * 69 | * Often, though, a ClassTag[AnyRef] will not lead to incorrect behavior, just worse performance 70 | * or security issues. For instance, an Array[AnyRef] can hold any type T, but may lose primitive 71 | * specialization. 72 | */ 73 | def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] 74 | } 75 | -------------------------------------------------------------------------------- /core/src/main/scala/org/jetbrains/kotlinx/spark/extensions/VarargUnwrapper.scala: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.extensions 2 | 3 | import org.apache.spark.sql.api.java.{UDF1, UDF2} 4 | 5 | /** 6 | * Allows any simple vararg function reference to be treated as 23 different Scala functions. 7 | * Used to make vararg UDFs for `ScalaUDF`. 
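 * A minimal construction sketch, showing how one vararg-style `UDF1` plus an array builder
 * covers every arity (hypothetical example, not part of the original file):
 * {{{
 *   val sum: UDF1[Array[Int], Int] = (xs: Array[Int]) => xs.sum
 *   val unwrapped = new VarargUnwrapper[Int, Array[Int], Int](
 *     sum,
 *     (size: Integer, get: UDF1[Integer, Int]) => Array.tabulate(size.intValue)(i => get.call(Int.box(i)))
 *   )
 *   unwrapped(1, 2, 3) // dispatches through Function3.apply and yields 6
 * }}}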
8 | * 9 | * @param varargFunc 10 | * @param newArray 11 | * @tparam T 12 | * @tparam Array 13 | * @tparam R 14 | */ 15 | class VarargUnwrapper[T, Array, R]( 16 | val varargFunc: UDF1[Array, R], 17 | val newArray: UDF2[Integer, UDF1[Integer, T], Array], 18 | ) extends Serializable 19 | with Function0[R] 20 | with Function1[T, R] 21 | with Function2[T, T, R] 22 | with Function3[T, T, T, R] 23 | with Function4[T, T, T, T, R] 24 | with Function5[T, T, T, T, T, R] 25 | with Function6[T, T, T, T, T, T, R] 26 | with Function7[T, T, T, T, T, T, T, R] 27 | with Function8[T, T, T, T, T, T, T, T, R] 28 | with Function9[T, T, T, T, T, T, T, T, T, R] 29 | with Function10[T, T, T, T, T, T, T, T, T, T, R] 30 | with Function11[T, T, T, T, T, T, T, T, T, T, T, R] 31 | with Function12[T, T, T, T, T, T, T, T, T, T, T, T, R] 32 | with Function13[T, T, T, T, T, T, T, T, T, T, T, T, T, R] 33 | with Function14[T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 34 | with Function15[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 35 | with Function16[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 36 | with Function17[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 37 | with Function18[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 38 | with Function19[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 39 | with Function20[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 40 | with Function21[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] 41 | with Function22[T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, R] { 42 | 43 | private def vararg(t: T*): R = varargFunc.call(newArray.call(t.size, { t(_) })) 44 | 45 | override def curried: Nothing = throw new UnsupportedOperationException() 46 | override def tupled: Nothing = throw new UnsupportedOperationException() 47 | 48 | override def apply(): R = vararg() 49 | 50 | override def apply(v0: T): R = vararg(v0) 51 | 52 | override def apply(v0: T, v1: T): R = vararg(v0, v1) 53 | 54 | override def apply(v0: T, v1: T, v2: T): R = vararg(v0, v1, v2) 55 | 56 | override def apply(v0: T, v1: T, v2: T, v3: T): R = vararg(v0, v1, v2, v3) 57 | 58 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T): R = vararg(v0, v1, v2, v3, v4) 59 | 60 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T): R = vararg(v0, v1, v2, v3, v4, v5) 61 | 62 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T): R = vararg(v0, v1, v2, v3, v4, v5, v6) 63 | 64 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7) 65 | 66 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8) 67 | 68 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) 69 | 70 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) 71 | 72 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) 73 | 74 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) 75 | 76 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: 
T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) 77 | 78 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) 79 | 80 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T, v15: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) 81 | 82 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T, v15: T, v16: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) 83 | 84 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T, v15: T, v16: T, v17: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) 85 | 86 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T, v15: T, v16: T, v17: T, v18: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) 87 | 88 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T, v15: T, v16: T, v17: T, v18: T, v19: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) 89 | 90 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T, v15: T, v16: T, v17: T, v18: T, v19: T, v20: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) 91 | 92 | override def apply(v0: T, v1: T, v2: T, v3: T, v4: T, v5: T, v6: T, v7: T, v8: T, v9: T, v10: T, v11: T, v12: T, v13: T, v14: T, v15: T, v16: T, v17: T, v18: T, v19: T, v20: T, v21: T): R = vararg(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) 93 | } 94 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-dinky -------------------------------------------------------------------------------- /docs/quick-start-guide.md: -------------------------------------------------------------------------------- 1 | # Quick Start Guide 2 | 3 | This tutorial provides instructions to help you get started with Kotlin Spark API. We use an example similar to the official [Apache Spark 4 | Quick Start Guide](https://spark.apache.org/docs/3.0.0/quick-start.html#self-contained-applications). 5 | You'll learn what you need to set up your environment, how to write, package and execute a simple self-contained application. 6 | 7 | Prerequisites: 8 | - You need to have Java installed and have the JAVA_HOME environment variable pointing to the Java installation. 9 | - You need to have Apache Spark installed and have SPARK_HOME environment variable pointing to the Spark installation. 10 | We recommend using Apache Spark 3.0.0 version. You can download it from the [Spark official website](https://spark.apache.org/downloads.html). 
11 | 12 | 13 | ## Self-contained application 14 | 15 | For the purposes of this tutorial, let's write a Kotlin program that counts the number of lines containing 'a', 16 | and the number containing 'b' in the Spark README. Note that you'll need to replace `YOUR_SPARK_HOME` with the 17 | location where Spark is installed: 18 | 19 | ```kotlin 20 | /* SimpleApp.kt */ 21 | @file:JvmName("SimpleApp") 22 | import org.jetbrains.kotlinx.spark.api.* 23 | 24 | fun main() { 25 | val logFile = "YOUR_SPARK_HOME/README.md" // Change to your Spark Home path 26 | withSpark { 27 | spark.read().textFile(logFile).withCached { 28 | val numAs = filter { it.contains("a") }.count() 29 | val numBs = filter { it.contains("b") }.count() 30 | println("Lines with a: $numAs, lines with b: $numBs") 31 | } 32 | } 33 | } 34 | ``` 35 | 36 | ## Building the application 37 | Because Kotlin Spark API is not part of the official Apache Spark distribution yet, it is not enough to add Spark 38 | as a dependency to your build file. 39 | You need to: 40 | - Add Spark as a dependency 41 | - Add Kotlin Spark API as a dependency 42 | - Add Kotlin Standard Library as a dependency 43 | 44 | When packaging your project into a jar file, you need to explicitly include Kotlin Spark API and Kotlin Standard Library 45 | dependencies. Here you can find an example of building your application with Maven, and with Gradle. 46 | 47 | ### Building the application with Maven 48 | 49 | Here's what the `pom.xml` looks like for this example: 50 | ```xml 51 | 52 | 4.0.0 53 | 54 | org.example 55 | kotlin-spark-example 56 | 1.0-SNAPSHOT 57 | 58 | Sample Project 59 | jar 60 | 61 | 62 | UTF-8 63 | 1.8.0 64 | official 65 | 66 | 67 | 68 | 69 | org.jetbrains.kotlin 70 | kotlin-stdlib 71 | ${kotlin.version} 72 | 73 | 74 | org.jetbrains.kotlinx.spark 75 | kotlin-spark-api_3.3.2_2.13 76 | 1.2.3 77 | 78 | 79 | org.apache.spark 80 | spark-sql_2.12 81 | 3.3.2 82 | 83 | 84 | 85 | 86 | 87 | 88 | org.apache.maven.plugins 89 | maven-shade-plugin 90 | 3.2.4 91 | 92 | 93 | package 94 | 95 | shade 96 | 97 | 98 | 99 | 100 | org.jetbrains.kotlinx.spark:* 101 | org.jetbrains.kotlin:* 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | org.jetbrains.kotlin 111 | kotlin-maven-plugin 112 | ${kotlin.version} 113 | 114 | src/main/kotlin 115 | 1.8 116 | true 117 | 118 | 119 | 120 | compile 121 | 122 | compile 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | ``` 131 | 132 | Here's what the project structure should look like: 133 | ```bash 134 | ./pom.xml 135 | ./src 136 | ./src/main 137 | ./src/main/kotlin 138 | ./src/main/kotlin/SimpleApp.kt 139 | ``` 140 | 141 | 142 | Now you can package the application using Maven: 143 | `mvn package` 144 | 145 | ### Building the application with Gradle 146 | 147 | Here's what the `build.gradle` looks like for this example: 148 | 149 | ```groovy 150 | plugins { 151 | id 'org.jetbrains.kotlin.jvm' version '1.4.0' 152 | id 'com.github.johnrengelman.shadow' version '5.2.0' 153 | } 154 | 155 | group = 'org.example' 156 | version = '1.0-SNAPSHOT' 157 | 158 | repositories { 159 | mavenCentral() 160 | } 161 | 162 | dependencies { 163 | // Kotlin stdlib 164 | implementation 'org.jetbrains.kotlin:kotlin-stdlib:1.8.0' 165 | // Kotlin Spark API 166 | implementation 'org.jetbrains.kotlinx.spark:kotlin-spark-api_3.3.2_2.13:1.2.3' // Apache Spark 167 | compileOnly 'org.apache.spark:spark-sql_2.12:3.3.2' 168 | } 169 | 170 | compileKotlin { 171 | kotlinOptions.jvmTarget = '1.8' 172 | } 173 | 174 | shadowJar { 175 | dependencies { 176 | exclude(dependency { 
177 | it.moduleGroup == 'org.apache.spark' || it.moduleGroup == "org.scala-lang" 178 | }) 179 | } 180 | } 181 | ``` 182 | 183 | build.gradle.kts (Kotlin DSL) 184 | ``` 185 | import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar 186 | import org.jetbrains.kotlin.gradle.tasks.KotlinCompile 187 | 188 | plugins { 189 | id ("org.jetbrains.kotlin.jvm") version "1.8.0" 190 | id ("com.github.johnrengelman.shadow") version "5.2.0" 191 | } 192 | 193 | repositories { 194 | mavenCentral() 195 | } 196 | 197 | dependencies { 198 | // Kotlin stdlib 199 | implementation ("org.jetbrains.kotlin:kotlin-stdlib:1.4.0") 200 | // Kotlin Spark API 201 | implementation ("org.jetbrains.kotlinx.spark:kotlin-spark-api_3.3.2_2.13:1.2.3") 202 | // Apache Spark 203 | compileOnly ("org.apache.spark:spark-sql_2.12:3.3.2") 204 | } 205 | 206 | compileKotlin.kotlinOptions.jvmTarget = "1.8" 207 | 208 | tasks { 209 | named("shadowJar") { 210 | dependencies { 211 | exclude{ 212 | it.moduleGroup == "org.apache.spark" || it.moduleGroup == "org.scala-lang" 213 | } 214 | } 215 | } 216 | } 217 | ``` 218 | 219 | 220 | Now you can package the application using Gradle: 221 | `gradle shadowJar` 222 | 223 | 224 | ## Executing the application with spark-submit 225 | 226 | Once you have your jar, you can execute the packaged application with `./bin/spark-submit`: 227 | 228 | `YOUR_SPARK_HOME/bin/spark-submit --class "SimpleApp" --master local [path to your jar]` 229 | 230 | This example is also available as a [GitHub repo](https://github.com/MKhalusova/kotlin-spark-example), feel free to give it a try. 231 | -------------------------------------------------------------------------------- /examples/build.gradle.kts: -------------------------------------------------------------------------------- 1 | import org.jetbrains.kotlin.gradle.tasks.KotlinCompile 2 | 3 | plugins { 4 | kotlin 5 | idea 6 | } 7 | 8 | group = Versions.groupID 9 | version = Versions.project 10 | 11 | repositories { 12 | mavenCentral() 13 | } 14 | 15 | dependencies { 16 | 17 | with(Projects) { 18 | implementation( 19 | kotlinSparkApi, 20 | ) 21 | } 22 | 23 | with(Dependencies) { 24 | 25 | // https://github.com/FasterXML/jackson-bom/issues/52 26 | if (Versions.spark == "3.3.1") implementation(jacksonDatabind) 27 | 28 | implementation( 29 | sparkSql, 30 | sparkMl, 31 | sparkStreaming, 32 | sparkStreamingKafka, 33 | ) 34 | 35 | } 36 | } 37 | 38 | kotlin { 39 | jvmToolchain { 40 | languageVersion.set( 41 | JavaLanguageVersion.of(Versions.jvmTarget) 42 | ) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/Broadcasting.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples for Spark 2.4+ (Scala 2.11) 4 | * ---------- 5 | * Copyright (C) 2019 - 2021 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.jetbrains.kotlinx.spark.api.broadcast 23 | import org.jetbrains.kotlinx.spark.api.map 24 | import org.jetbrains.kotlinx.spark.api.withSpark 25 | import java.io.Serializable 26 | 27 | // (data) class must be Serializable to be broadcast 28 | data class SomeClass(val a: IntArray, val b: Int) : Serializable 29 | 30 | fun main() = withSpark { 31 | val broadcastVariable = spark.broadcast(SomeClass(a = intArrayOf(5, 6), b = 3)) 32 | val result = listOf(1, 2, 3, 4, 5) 33 | .toDS() 34 | .map { 35 | val receivedBroadcast = broadcastVariable.value 36 | it + receivedBroadcast.a.first() 37 | } 38 | .collectAsList() 39 | 40 | println(result) 41 | } 42 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/CachedOperations.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.jetbrains.kotlinx.spark.api.* 23 | import org.jetbrains.kotlinx.spark.api.tuples.* 24 | 25 | fun main() { 26 | withSpark { 27 | dsOf(1, 2, 3, 4, 5) 28 | .map { it X (it + 2) } 29 | .withCached { 30 | showDS() 31 | 32 | filter { it._1 % 2 == 0 }.showDS() 33 | } 34 | .map { it.appendedBy(it._1 + it._2 * 2) } 35 | .show() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/Collect.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.apache.spark.sql.Row 23 | import org.jetbrains.kotlinx.spark.api.* 24 | import org.jetbrains.kotlinx.spark.api.tuples.* 25 | 26 | fun main() { 27 | withSpark { 28 | val sd = dsOf(1, 2, 3) 29 | sd.createOrReplaceTempView("ds") 30 | spark.sql("select * from ds") 31 | .withCached { 32 | println("asList: ${toList()}") 33 | println("asArray: ${toArray().contentToString()}") 34 | this 35 | } 36 | .to() 37 | .withCached { 38 | println("typed collect: " + (collect() as Array).contentToString()) 39 | println("type collectAsList: " + collectAsList()) 40 | } 41 | 42 | dsOf(1, 2, 3) 43 | .map { t(it, it + 1, it + 2) } 44 | .to() 45 | .select("_1") 46 | .collectAsList() 47 | .forEach { println(it) } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/Group.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples for Spark 2.4+ (Scala 2.11) 4 | * ---------- 5 | * Copyright (C) 2019 - 2021 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.jetbrains.kotlinx.spark.api.* 23 | import org.jetbrains.kotlinx.spark.api.tuples.* 24 | 25 | fun main() { 26 | withSpark { 27 | dsOf( 28 | 1 X "a", 29 | 1 X "b", 30 | 2 X "c", 31 | ) 32 | .groupByKey { it._1 } 33 | .reduceGroupsK { a, b -> 34 | tupleOf(a._1 + b._1, a._2 + b._2) 35 | } 36 | .show() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/Join.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.jetbrains.kotlinx.spark.api.* 23 | import org.jetbrains.kotlinx.spark.api.tuples.* 24 | 25 | 26 | data class Left(val id: Int, val name: String) 27 | 28 | data class Right(val id: Int, val value: Int) 29 | 30 | 31 | fun main() { 32 | withSpark(logLevel = SparkLogLevel.INFO) { 33 | val first = dsOf(Left(1, "a"), Left(2, "b")) 34 | val second = dsOf(Right(1, 100), Right(3, 300)) 35 | first 36 | .leftJoin(second, first.col("id") eq second.col("id")) 37 | .debugCodegen() 38 | .also { it.show() } 39 | .map { (left, right) -> 40 | left.id X left.name X right?.value 41 | } 42 | .show() 43 | 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/Main.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.apache.spark.sql.Dataset 23 | import org.jetbrains.kotlinx.spark.api.* 24 | import org.jetbrains.kotlinx.spark.api.tuples.* 25 | import scala.Tuple2 26 | import scala.Tuple3 27 | 28 | data class Q(val id: Int, val text: T) 29 | 30 | @Suppress("RedundantLambdaArrow", "UsePropertyAccessSyntax") 31 | object Main { 32 | 33 | @JvmStatic 34 | fun main(args: Array) { 35 | val spark = SparkSession 36 | .builder() 37 | .master("local[2]") 38 | .appName("Simple Application") 39 | .getOrCreate() 40 | 41 | val triples: Dataset> = spark 42 | .toDS( 43 | listOf( 44 | Q(1, 1 X null), 45 | Q(2, 2 X "22"), 46 | Q(3, 3 X "333"), 47 | ) 48 | ) 49 | .map { (a, b) -> t(a + b._1, b._2?.length) } 50 | .map { it: Tuple2 -> it + 1 } // add counter 51 | 52 | val pairs = spark 53 | .toDS( 54 | listOf( 55 | 2 X "hell", 56 | 4 X "moon", 57 | 6 X "berry", 58 | ) 59 | ) 60 | 61 | triples 62 | .leftJoin( 63 | right = pairs, 64 | col = triples.col("_1").multiply(2) eq pairs.col("_1"), 65 | ) 66 | // .also { it.printSchema() } 67 | .map { (triple, pair) -> Five(triple._1, triple._2, triple._3, pair?._1, pair?._2) } 68 | .groupByKey { it.a } 69 | .reduceGroupsK { v1, v2 -> v1.copy(a = v1.a + v2.a, b = v1.a + v2.a) } 70 | .map { it._2 } 71 | .repartition(1) 72 | .withCached { 73 | write() 74 | .also { it.csv("csvpath") } 75 | .also { it.orc("orcpath") } 76 | showDS() 77 | } 78 | 79 | 80 | 81 | spark.stop() 82 | } 83 | 84 | data class Five(val a: A, val b: B, val c: C, val d: D, val e: E) 85 | } 86 | 87 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/MapAndListOperations.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin 
Spark API: Examples 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.jetbrains.kotlinx.spark.api.* 23 | import org.jetbrains.kotlinx.spark.api.tuples.* 24 | 25 | fun main() { 26 | withSpark(props = mapOf("spark.sql.codegen.wholeStage" to true)) { 27 | dsOf( 28 | mapOf(1 to t(1, 2, 3), 2 to t(1, 2, 3)), 29 | mapOf(3 to t(1, 2, 3), 4 to t(1, 2, 3)), 30 | ) 31 | .flatMap { 32 | it.toList() 33 | .map { (first, tuple) -> (first + tuple).toList() } 34 | .iterator() 35 | } 36 | .flatten() 37 | .map { tupleOf(it) } 38 | .also { it.printSchema() } 39 | .distinct() 40 | .sort("_1") 41 | .debugCodegen() 42 | .show() 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/UdtRegistration.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 24 | import org.apache.spark.sql.types.* 25 | import org.apache.spark.unsafe.types.UTF8String 26 | import org.glassfish.jersey.internal.guava.MoreObjects 27 | import org.jetbrains.kotlinx.spark.api.* 28 | import org.jetbrains.kotlinx.spark.api.tuples.tupleOf 29 | 30 | class CityUserDefinedType : UserDefinedType() { 31 | 32 | override fun sqlType(): DataType = DATA_TYPE 33 | 34 | override fun serialize(city: City): InternalRow = GenericInternalRow(2).apply { 35 | setInt(DEPT_NUMBER_INDEX, city.departmentNumber) 36 | update(NAME_INDEX, UTF8String.fromString(city.name)) 37 | } 38 | 39 | override fun deserialize(datum: Any): City = 40 | if (datum is InternalRow) 41 | City( 42 | name = datum.getString(NAME_INDEX), 43 | departmentNumber = datum.getInt(DEPT_NUMBER_INDEX), 44 | ) 45 | else throw IllegalStateException("Unsupported conversion") 46 | 47 | override fun userClass(): Class = City::class.java 48 | 49 | companion object { 50 | private const val DEPT_NUMBER_INDEX = 0 51 | private const val NAME_INDEX = 1 52 | private val DATA_TYPE = DataTypes.createStructType( 53 | arrayOf( 54 | DataTypes.createStructField( 55 | "departmentNumber", 56 | DataTypes.IntegerType, 57 | false, 58 | MetadataBuilder().putLong("maxNumber", 99).build(), 59 | ), 60 | DataTypes.createStructField("name", DataTypes.StringType, false) 61 | ) 62 | ) 63 | } 64 | } 65 | 66 | @SQLUserDefinedType(udt = CityUserDefinedType::class) 67 | class City(val name: String, val departmentNumber: Int) { 68 | 69 | override fun toString(): String = 70 | MoreObjects 71 | .toStringHelper(this) 72 | .add("name", name) 73 | .add("departmentNumber", departmentNumber) 74 | .toString() 75 | } 76 | 77 | fun main() = withSpark { 78 | 79 | // Either use @SQLUserDefinedType or: 80 | // UDTRegistration.register(org.jetbrains.kotlinx.spark.examples.City::class.jvmName, org.jetbrains.kotlinx.spark.examples.CityUserDefinedType::class.jvmName) 81 | 82 | val items = listOf( 83 | City("Amsterdam", 1), 84 | City("Breda", 2), 85 | City("Oosterhout", 3), 86 | ) 87 | 88 | val ds = items.map(::tupleOf).toDS() 89 | ds.showDS() 90 | 91 | // Unlike in Scala, you can also directly encode UDT registered types to a Dataset! 92 | val ds2 = items.toDS() 93 | ds2.showDS() 94 | } 95 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/WordCount.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples 21 | 22 | import org.apache.spark.sql.Dataset 23 | import org.jetbrains.kotlinx.spark.api.* 24 | import org.jetbrains.kotlinx.spark.api.tuples.* 25 | 26 | const val MEANINGFUL_WORD_LENGTH = 4 27 | 28 | fun main() { 29 | withSpark { 30 | spark 31 | .read() 32 | .textFile(this::class.java.classLoader.getResource("the-catcher-in-the-rye.txt")?.path) 33 | .map { it.split(Regex("\\s")) } 34 | .flatten() 35 | .cleanup() 36 | .groupByKey { it } 37 | .mapGroups { k, iter -> k X iter.asSequence().count() } 38 | .sort { arrayOf(it.col("_2").desc()) } 39 | .limit(20) 40 | .map { it.swap() } 41 | .show(false) 42 | } 43 | } 44 | 45 | fun Dataset.cleanup(): Dataset = 46 | filter { it.isNotBlank() } 47 | .map { it.trim(',', ' ', '\n', ':', '.', ';', '?', '!', '"', '\'', '\t', ' ') } 48 | .filter { !it.endsWith("n’t") } 49 | .filter { it.length >= MEANINGFUL_WORD_LENGTH } 50 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/JupyterStreamingExample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "By default the latest version of the API and the latest supported Spark version is chosen. To specify your own: %use spark-streaming(spark=3.2, v=1.1.0)" 7 | ], 8 | "metadata": { 9 | "collapsed": false, 10 | "pycharm": { 11 | "name": "#%% md\n" 12 | } 13 | } 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "To start a spark streaming session, simply use `withSparkStreaming { }` inside a cell. To use Spark normally, use `withSpark { }` in a cell, or use `%use spark` to start a Spark session for the whole notebook.\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "%use spark-streaming" 29 | ], 30 | "metadata": { 31 | "collapsed": false, 32 | "pycharm": { 33 | "name": "#%%\n" 34 | } 35 | } 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "Let's define some data class to work with." 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "pycharm": { 45 | "name": "#%% md\n" 46 | } 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "outputs": [], 53 | "source": [ 54 | "data class TestRow(\n", 55 | " val word: String,\n", 56 | ")" 57 | ], 58 | "metadata": { 59 | "collapsed": false, 60 | "pycharm": { 61 | "name": "#%%\n" 62 | } 63 | } 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "source": [ 68 | "To run this on your local machine, you need to first run a Netcat server: `$ nc -lk 9999`.\n", 69 | "\n", 70 | "This example will collect the data from this stream for 10 seconds and 1 second intervals, splitting and counting the input per word." 
71 | ], 72 | "metadata": { 73 | "collapsed": false, 74 | "pycharm": { 75 | "name": "#%% md\n" 76 | } 77 | } 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "+---+--------+\n", 88 | "|key|count(1)|\n", 89 | "+---+--------+\n", 90 | "+---+--------+\n", 91 | "\n", 92 | "+-----+--------+\n", 93 | "| key|count(1)|\n", 94 | "+-----+--------+\n", 95 | "|hello| 8|\n", 96 | "|Hello| 6|\n", 97 | "|world| 3|\n", 98 | "| | 2|\n", 99 | "| test| 4|\n", 100 | "+-----+--------+\n", 101 | "\n", 102 | "+-----+--------+\n", 103 | "| key|count(1)|\n", 104 | "+-----+--------+\n", 105 | "|hello| 3|\n", 106 | "+-----+--------+\n", 107 | "\n", 108 | "+---+--------+\n", 109 | "|key|count(1)|\n", 110 | "+---+--------+\n", 111 | "+---+--------+\n", 112 | "\n", 113 | "+---+--------+\n", 114 | "|key|count(1)|\n", 115 | "+---+--------+\n", 116 | "+---+--------+\n", 117 | "\n", 118 | "+---+--------+\n", 119 | "|key|count(1)|\n", 120 | "+---+--------+\n", 121 | "+---+--------+\n", 122 | "\n", 123 | "+---+--------+\n", 124 | "|key|count(1)|\n", 125 | "+---+--------+\n", 126 | "+---+--------+\n", 127 | "\n", 128 | "+---+--------+\n", 129 | "|key|count(1)|\n", 130 | "+---+--------+\n", 131 | "+---+--------+\n", 132 | "\n", 133 | "+-----+--------+\n", 134 | "| key|count(1)|\n", 135 | "+-----+--------+\n", 136 | "|hello| 1|\n", 137 | "|world| 2|\n", 138 | "+-----+--------+\n", 139 | "\n", 140 | "+---+--------+\n", 141 | "|key|count(1)|\n", 142 | "+---+--------+\n", 143 | "+---+--------+\n", 144 | "\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "withSparkStreaming(batchDuration = Durations.seconds(1), timeout = 10_000) { // this: KSparkStreamingSession\n", 150 | "\n", 151 | " val lines: JavaReceiverInputDStream = ssc.socketTextStream(\"localhost\", 9999)\n", 152 | " val words: JavaDStream = lines.flatMap { it.split(\" \").iterator() }\n", 153 | "\n", 154 | " words.foreachRDD { rdd: JavaRDD, _: Time ->\n", 155 | " withSpark(rdd) { // this: KSparkSession\n", 156 | " val dataframe: Dataset = rdd.map { TestRow(it) }.toDS()\n", 157 | " dataframe\n", 158 | " .groupByKey { it.word }\n", 159 | " .count()\n", 160 | " .show()\n", 161 | " }\n", 162 | " }\n", 163 | "}" 164 | ], 165 | "metadata": { 166 | "collapsed": false, 167 | "pycharm": { 168 | "name": "#%%\n" 169 | } 170 | } 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Kotlin", 176 | "language": "kotlin", 177 | "name": "kotlin" 178 | }, 179 | "language_info": { 180 | "name": "kotlin", 181 | "version": "1.7.0-dev-1825", 182 | "mimetype": "text/x-kotlin", 183 | "file_extension": ".kt", 184 | "pygments_lexer": "kotlin", 185 | "codemirror_mode": "text/x-kotlin", 186 | "nbconvert_exporter": "" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 0 191 | } -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/KotlinDirectKafkaWordCount.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 
9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples.streaming 21 | 22 | import org.apache.kafka.clients.consumer.ConsumerConfig.* 23 | import org.apache.kafka.clients.consumer.ConsumerRecord 24 | import org.apache.kafka.common.serialization.StringDeserializer 25 | import org.apache.spark.streaming.Durations 26 | import org.apache.spark.streaming.api.java.JavaDStream 27 | import org.apache.spark.streaming.api.java.JavaInputDStream 28 | import org.apache.spark.streaming.kafka010.ConsumerStrategies 29 | import org.apache.spark.streaming.kafka010.KafkaUtils 30 | import org.apache.spark.streaming.kafka010.LocationStrategies 31 | import org.jetbrains.kotlinx.spark.api.reduceByKey 32 | import org.jetbrains.kotlinx.spark.api.tuples.* 33 | import org.jetbrains.kotlinx.spark.api.withSparkStreaming 34 | import scala.Tuple2 35 | import java.io.Serializable 36 | import java.util.regex.Pattern 37 | import kotlin.system.exitProcess 38 | 39 | 40 | /** 41 | * Src: https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaDirectKafkaWordCount.java 42 | * 43 | * Consumes messages from one or more topics in Kafka and does wordcount. 44 | * Usage: JavaDirectKafkaWordCount 45 | * is a list of one or more Kafka brokers 46 | * is a consumer group name to consume from topics 47 | * is a list of one or more kafka topics to consume from 48 | * 49 | * Example: 50 | * 51 | * First make sure you have a Kafka producer running. 
For instance, when running locally: 52 | * $ kafka-console-producer.sh --topic quickstart-events --bootstrap-server localhost:9092 53 | * 54 | * Then start the program normally or like this: 55 | * $ bin/run-example streaming.JavaDirectKafkaWordCount broker1-host:port,broker2-host:port \ 56 | * consumer-group topic1,topic2 57 | */ 58 | object KotlinDirectKafkaWordCount { 59 | 60 | private val SPACE = Pattern.compile(" ") 61 | 62 | private const val DEFAULT_BROKER = "localhost:9092" 63 | private const val DEFAULT_GROUP_ID = "consumer-group" 64 | private const val DEFAULT_TOPIC = "quickstart-events" 65 | 66 | @JvmStatic 67 | fun main(args: Array) { 68 | if (args.size < 3 && args.isNotEmpty()) { 69 | System.err.println( 70 | """Usage: JavaDirectKafkaWordCount 71 | is a list of one or more Kafka brokers 72 | is a consumer group name to consume from topics 73 | is a list of one or more kafka topics to consume from 74 | """.trimIndent() 75 | ) 76 | exitProcess(1) 77 | } 78 | 79 | val brokers: String = args.getOrElse(0) { DEFAULT_BROKER } 80 | val groupId: String = args.getOrElse(1) { DEFAULT_GROUP_ID } 81 | val topics: String = args.getOrElse(2) { DEFAULT_TOPIC } 82 | 83 | // Create context with a 2 seconds batch interval 84 | withSparkStreaming(batchDuration = Durations.seconds(2), appName = "KotlinDirectKafkaWordCount") { 85 | 86 | val topicsSet: Set = topics.split(',').toSet() 87 | 88 | val kafkaParams: Map = mapOf( 89 | BOOTSTRAP_SERVERS_CONFIG to brokers, 90 | GROUP_ID_CONFIG to groupId, 91 | KEY_DESERIALIZER_CLASS_CONFIG to StringDeserializer::class.java, 92 | VALUE_DESERIALIZER_CLASS_CONFIG to StringDeserializer::class.java, 93 | ) 94 | 95 | // Create direct kafka stream with brokers and topics 96 | val messages: JavaInputDStream> = KafkaUtils.createDirectStream( 97 | ssc, 98 | LocationStrategies.PreferConsistent(), 99 | ConsumerStrategies.Subscribe(topicsSet, kafkaParams), 100 | ) 101 | 102 | // Get the lines, split them into words, count the words and print 103 | val lines: JavaDStream = messages.map { it.value() } 104 | val words: JavaDStream = lines.flatMap { it.split(SPACE).iterator() } 105 | 106 | val wordCounts: JavaDStream> = words 107 | .map { it X 1 } 108 | .reduceByKey { a: Int, b: Int -> a + b } 109 | 110 | wordCounts.print() 111 | 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/KotlinSqlNetworkWordCount.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples.streaming 21 | 22 | import org.apache.spark.api.java.JavaRDD 23 | import org.apache.spark.api.java.StorageLevels 24 | import org.apache.spark.streaming.Durations 25 | import org.apache.spark.streaming.Time 26 | import org.jetbrains.kotlinx.spark.api.withSparkStreaming 27 | import java.io.Serializable 28 | import java.util.regex.Pattern 29 | import kotlin.system.exitProcess 30 | 31 | 32 | /** 33 | * Use DataFrames and SQL to count words in UTF8 encoded, '\n' delimited text received from the 34 | * network every second. 35 | * 36 | * Usage: KotlinSqlNetworkWordCount 37 | * and describe the TCP server that Spark Streaming would connect to receive data. 38 | * 39 | * To run this on your local machine, you need to first run a Netcat server 40 | * `$ nc -lk 9999` 41 | * and then run the example 42 | * `$ bin/run-example org.apache.spark.examples.streaming.KotlinSqlNetworkWordCount localhost 9999` 43 | */ 44 | object KotlinSqlNetworkWordCount { 45 | 46 | private val SPACE = Pattern.compile(" ") 47 | 48 | private const val DEFAULT_IP = "localhost" 49 | private const val DEFAULT_PORT = "9999" 50 | 51 | @Throws(Exception::class) 52 | @JvmStatic 53 | fun main(args: Array) { 54 | if (args.size < 2 && args.isNotEmpty()) { 55 | System.err.println("Usage: KotlinNetworkWordCount ") 56 | exitProcess(1) 57 | } 58 | 59 | // Create the context with a 1 second batch size 60 | withSparkStreaming( 61 | batchDuration = Durations.seconds(1), 62 | appName = "KotlinSqlNetworkWordCount", 63 | ) { 64 | 65 | 66 | // Create a KotlinReceiverInputDStream on target ip:port and count the 67 | // words in input stream of \n delimited text (e.g. generated by 'nc') 68 | // Note that no duplication in storage level only for running locally. 69 | // Replication necessary in distributed scenario for fault tolerance. 70 | val lines = ssc.socketTextStream( 71 | args.getOrElse(0) { DEFAULT_IP }, 72 | args.getOrElse(1) { DEFAULT_PORT }.toInt(), 73 | StorageLevels.MEMORY_AND_DISK_SER, 74 | ) 75 | val words = lines.flatMap { it.split(SPACE).iterator() } 76 | 77 | // Convert RDDs of the words DStream to DataFrame and run SQL query 78 | words.foreachRDD { rdd: JavaRDD, time: Time -> 79 | withSpark(rdd) { 80 | 81 | // Convert JavaRDD to JavaRDD to DataFrame (Dataset) 82 | val rowRDD = rdd.map(::KotlinRecord) 83 | val wordsDataFrame = rowRDD.toDF() 84 | 85 | // Creates a temporary view using the DataFrame 86 | wordsDataFrame.createOrReplaceTempView("words") 87 | 88 | // Do word count on table using SQL and print it 89 | val wordCountsDataFrame = 90 | spark.sql("select word, count(*) as total from words group by word") 91 | println("========= $time=========") 92 | wordCountsDataFrame.show() 93 | } 94 | } 95 | } 96 | } 97 | } 98 | 99 | data class KotlinRecord(val word: String): Serializable 100 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/KotlinStatefulNetworkCount.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 
9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples.streaming 21 | 22 | import org.apache.spark.api.java.Optional 23 | import org.apache.spark.api.java.StorageLevels 24 | import org.apache.spark.streaming.Durations 25 | import org.apache.spark.streaming.State 26 | import org.apache.spark.streaming.StateSpec 27 | import org.jetbrains.kotlinx.spark.api.* 28 | import org.jetbrains.kotlinx.spark.api.tuples.X 29 | import java.util.regex.Pattern 30 | import kotlin.system.exitProcess 31 | 32 | 33 | /** 34 | * Src: https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/streaming/JavaStatefulNetworkWordCount.java 35 | * 36 | * Counts words cumulatively in UTF8 encoded, '\n' delimited text received from the network every 37 | * second starting with initial value of word count. 38 | * Usage: JavaStatefulNetworkWordCount 39 | * and describe the TCP server that Spark Streaming would connect to receive 40 | * data. 41 | * 42 | * 43 | * To run this on your local machine, you need to first run a Netcat server 44 | * `$ nc -lk 9999` 45 | * and then run the example 46 | * `$ bin/run-example 47 | * org.apache.spark.examples.streaming.JavaStatefulNetworkWordCount localhost 9999` */ 48 | object KotlinStatefulNetworkCount { 49 | 50 | private val SPACE = Pattern.compile(" ") 51 | 52 | private const val DEFAULT_HOSTNAME = "localhost" 53 | private const val DEFAULT_PORT = "9999" 54 | 55 | @Throws(Exception::class) 56 | @JvmStatic 57 | fun main(args: Array) { 58 | if (args.size < 2 && args.isNotEmpty()) { 59 | System.err.println("Usage: JavaStatefulNetworkWordCount ") 60 | exitProcess(1) 61 | } 62 | 63 | // Create the context with a 1 second batch size 64 | withSparkStreaming( 65 | batchDuration = Durations.seconds(1), 66 | checkpointPath = ".", 67 | appName = "JavaStatefulNetworkWordCount", 68 | ) { 69 | 70 | // Initial state RDD input to mapWithState 71 | val tuples = arrayOf("hello" X 1, "world" X 1) 72 | val initialRDD = ssc.sparkContext().rddOf(*tuples) 73 | 74 | val lines = ssc.socketTextStream( 75 | args.getOrElse(0) { DEFAULT_HOSTNAME }, 76 | args.getOrElse(1) { DEFAULT_PORT }.toInt(), 77 | StorageLevels.MEMORY_AND_DISK_SER_2, 78 | ) 79 | val words = lines.flatMap { it.split(SPACE).iterator() } 80 | 81 | val wordsDstream = words.map { it X 1 } 82 | 83 | // Update the cumulative count function 84 | val mappingFunc = { word: String, one: Optional, state: State -> 85 | val sum = one.getOrElse(0) + state.getOrElse(0) 86 | val output = word X sum 87 | state.update(sum) 88 | output 89 | } 90 | 91 | // DStream made of get cumulative counts that get updated in every batch 92 | val stateDstream = wordsDstream.mapWithState( 93 | StateSpec 94 | .function(mappingFunc) 95 | .initialState(initialRDD.toJavaPairRDD()) 96 | ) 97 | 98 | stateDstream.print() 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /examples/src/main/kotlin/org/jetbrains/kotlinx/spark/examples/streaming/Streaming.kt: 
-------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: Examples for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.examples.streaming 21 | 22 | import org.apache.spark.api.java.JavaRDD 23 | import org.apache.spark.sql.Dataset 24 | import org.apache.spark.streaming.Durations 25 | import org.apache.spark.streaming.Time 26 | import org.apache.spark.streaming.api.java.JavaDStream 27 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream 28 | import org.jetbrains.kotlinx.spark.api.* 29 | 30 | data class TestRow( 31 | val word: String, 32 | ) 33 | 34 | /** 35 | * To run this on your local machine, you need to first run a Netcat server 36 | * 37 | * `$ nc -lk 9999` 38 | */ 39 | fun main() = withSparkStreaming(batchDuration = Durations.seconds(1), timeout = 10_000) { // this: KSparkStreamingSession 40 | 41 | val lines: JavaReceiverInputDStream = ssc.socketTextStream("localhost", 9999) 42 | val words: JavaDStream = lines.flatMap { it.split(" ").iterator() } 43 | 44 | words.foreachRDD { rdd: JavaRDD, _: Time -> 45 | withSpark(rdd) { // this: KSparkSession 46 | val dataframe: Dataset = rdd.map { TestRow(it) }.toDS() 47 | dataframe 48 | .groupByKey { it.word } 49 | .count() 50 | .show() 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.daemon.jvmargs=-Xmx8g 2 | org.gradle.jvmargs=-Xmx8g -XX:MaxMetaspaceSize=1g -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8 3 | mavenCentralUsername=dummy 4 | mavenCentralPassword=dummy 5 | 6 | GROUP=org.jetbrains.kotlinx.spark 7 | 8 | # Controls the spark and scala version for the entire project 9 | # can also be defined like ./gradlew -Pspark=X.X.X -Pscala=X.X.X build 10 | spark=3.3.2 11 | scala=2.13.10 12 | # scala=2.12.17 13 | skipScalaTuplesInKotlin=false 14 | 15 | org.gradle.caching=true 16 | org.gradle.parallel=false 17 | #kotlin.incremental.useClasspathSnapshot=true 18 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kotlin/kotlin-spark-api/470bcf4dd6a0318a1cd0e947670f921f8f62969e/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | 
zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 4 | # Copyright © 2015-2021 the original authors. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # https://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | ############################################################################## 20 | # 21 | # Gradle start up script for POSIX generated by Gradle. 22 | # 23 | # Important for running: 24 | # 25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is 26 | # noncompliant, but you have some other compliant shell such as ksh or 27 | # bash, then to run this script, type that shell name before the whole 28 | # command line, like: 29 | # 30 | # ksh Gradle 31 | # 32 | # Busybox and similar reduced shells will NOT work, because this script 33 | # requires all of these POSIX shell features: 34 | # * functions; 35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}», 36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»; 37 | # * compound commands having a testable exit status, especially «case»; 38 | # * various built-in commands including «command», «set», and «ulimit». 39 | # 40 | # Important for patching: 41 | # 42 | # (2) This script targets any POSIX shell, so it avoids extensions provided 43 | # by Bash, Ksh, etc; in particular arrays are avoided. 44 | # 45 | # The "traditional" practice of packing multiple parameters into a 46 | # space-separated string is a well documented source of bugs and security 47 | # problems, so this is (mostly) avoided, by progressively accumulating 48 | # options in "$@", and eventually passing that to Java. 49 | # 50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, 51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; 52 | # see the in-line comments for details. 53 | # 54 | # There are tweaks for specific operating systems such as AIX, CygWin, 55 | # Darwin, MinGW, and NonStop. 56 | # 57 | # (3) This script is generated from the Groovy template 58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt 59 | # within the Gradle project. 60 | # 61 | # You can find Gradle at https://github.com/gradle/gradle/. 62 | # 63 | ############################################################################## 64 | 65 | # Attempt to set APP_HOME 66 | 67 | # Resolve links: $0 may be a link 68 | app_path=$0 69 | 70 | # Need this for daisy-chained symlinks. 
71 | while 72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path 73 | [ -h "$app_path" ] 74 | do 75 | ls=$( ls -ld "$app_path" ) 76 | link=${ls#*' -> '} 77 | case $link in #( 78 | /*) app_path=$link ;; #( 79 | *) app_path=$APP_HOME$link ;; 80 | esac 81 | done 82 | 83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit 84 | 85 | APP_NAME="Gradle" 86 | APP_BASE_NAME=${0##*/} 87 | 88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' 90 | 91 | # Use the maximum available, or set MAX_FD != -1 to use that value. 92 | MAX_FD=maximum 93 | 94 | warn () { 95 | echo "$*" 96 | } >&2 97 | 98 | die () { 99 | echo 100 | echo "$*" 101 | echo 102 | exit 1 103 | } >&2 104 | 105 | # OS specific support (must be 'true' or 'false'). 106 | cygwin=false 107 | msys=false 108 | darwin=false 109 | nonstop=false 110 | case "$( uname )" in #( 111 | CYGWIN* ) cygwin=true ;; #( 112 | Darwin* ) darwin=true ;; #( 113 | MSYS* | MINGW* ) msys=true ;; #( 114 | NONSTOP* ) nonstop=true ;; 115 | esac 116 | 117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 118 | 119 | 120 | # Determine the Java command to use to start the JVM. 121 | if [ -n "$JAVA_HOME" ] ; then 122 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 123 | # IBM's JDK on AIX uses strange locations for the executables 124 | JAVACMD=$JAVA_HOME/jre/sh/java 125 | else 126 | JAVACMD=$JAVA_HOME/bin/java 127 | fi 128 | if [ ! -x "$JAVACMD" ] ; then 129 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 130 | 131 | Please set the JAVA_HOME variable in your environment to match the 132 | location of your Java installation." 133 | fi 134 | else 135 | JAVACMD=java 136 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 137 | 138 | Please set the JAVA_HOME variable in your environment to match the 139 | location of your Java installation." 140 | fi 141 | 142 | # Increase the maximum file descriptors if we can. 143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then 144 | case $MAX_FD in #( 145 | max*) 146 | MAX_FD=$( ulimit -H -n ) || 147 | warn "Could not query maximum file descriptor limit" 148 | esac 149 | case $MAX_FD in #( 150 | '' | soft) :;; #( 151 | *) 152 | ulimit -n "$MAX_FD" || 153 | warn "Could not set maximum file descriptor limit to $MAX_FD" 154 | esac 155 | fi 156 | 157 | # Collect all arguments for the java command, stacking in reverse order: 158 | # * args from the command line 159 | # * the main class name 160 | # * -classpath 161 | # * -D...appname settings 162 | # * --module-path (only if needed) 163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
164 | 165 | # For Cygwin or MSYS, switch paths to Windows format before running java 166 | if "$cygwin" || "$msys" ; then 167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) 168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) 169 | 170 | JAVACMD=$( cygpath --unix "$JAVACMD" ) 171 | 172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 173 | for arg do 174 | if 175 | case $arg in #( 176 | -*) false ;; # don't mess with options #( 177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath 178 | [ -e "$t" ] ;; #( 179 | *) false ;; 180 | esac 181 | then 182 | arg=$( cygpath --path --ignore --mixed "$arg" ) 183 | fi 184 | # Roll the args list around exactly as many times as the number of 185 | # args, so each arg winds up back in the position where it started, but 186 | # possibly modified. 187 | # 188 | # NB: a `for` loop captures its iteration list before it begins, so 189 | # changing the positional parameters here affects neither the number of 190 | # iterations, nor the values presented in `arg`. 191 | shift # remove old arg 192 | set -- "$@" "$arg" # push replacement arg 193 | done 194 | fi 195 | 196 | # Collect all arguments for the java command; 197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of 198 | # shell script including quotes and variable substitutions, so put them in 199 | # double quotes to make sure that they get re-expanded; and 200 | # * put everything else in single quotes, so that it's not re-expanded. 201 | 202 | set -- \ 203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \ 204 | -classpath "$CLASSPATH" \ 205 | org.gradle.wrapper.GradleWrapperMain \ 206 | "$@" 207 | 208 | # Use "xargs" to parse quoted args. 209 | # 210 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed. 211 | # 212 | # In Bash we could simply go: 213 | # 214 | # readarray ARGS < <( xargs -n1 <<<"$var" ) && 215 | # set -- "${ARGS[@]}" "$@" 216 | # 217 | # but POSIX shell has neither arrays nor command substitution, so instead we 218 | # post-process each arg (as a line of input to sed) to backslash-escape any 219 | # character that might be a shell metacharacter, then use eval to reverse 220 | # that process (while maintaining the separation between arguments), and wrap 221 | # the whole thing up as a single "set" statement. 222 | # 223 | # This will of course break if any of these variables contains a newline or 224 | # an unmatched quote. 225 | # 226 | 227 | eval "set -- $( 228 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | 229 | xargs -n1 | 230 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | 231 | tr '\n' ' ' 232 | )" '"$@"' 233 | 234 | exec "$JAVACMD" "$@" 235 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /gradlew_all_versions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Run like you would `./gradlew arguments` 5 | # but now like `./gradlew_all_versions arguments`. 6 | 7 | DRY_RUN=${DRY_RUN:-false} 8 | SCALA2_12VERSION="2.12.16" 9 | SCALA2_13VERSION="2.13.8" 10 | SparkVersionsForBothScalaVersions=("3.3.0" "3.2.1" "3.2.0") 11 | SparkVersionsForScala2_12=("3.1.3" "3.1.2" "3.1.1" "3.1.0" "3.0.3" "3.0.2" "3.0.1" "3.0.0") 12 | 13 | echo Running for "$(expr ${#SparkVersionsForBothScalaVersions[@]} \* 2 + ${#SparkVersionsForScala2_12[@]}) versions of the library." 14 | 15 | echo "Cleaning the project first." 
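# Honour DRY_RUN (defaults to false above): when DRY_RUN=true the Gradle commands are only echoed, nothing is executed.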
16 | if [ "$DRY_RUN" = false ]; then 17 | ./gradlew clean 18 | fi 19 | 20 | ARGS=("$@") 21 | execute() { 22 | echo "running ./gradlew -Pspark=$SPARK -Pscala=$SCALA -PskipScalaTuplesInKotlin=$SKIP_SCALA_TUPLES -PenforceCleanJCP=true ${ARGS[*]}" 23 | if [ "$DRY_RUN" = false ]; then 24 | ./gradlew -Pspark="$SPARK" -Pscala="$SCALA" -PskipScalaTuplesInKotlin="$SKIP_SCALA_TUPLES" "${ARGS[@]}" 25 | fi 26 | } 27 | 28 | SCALA="$SCALA2_12VERSION" 29 | SKIP_SCALA_TUPLES=false 30 | for spark in "${SparkVersionsForScala2_12[@]}"; do 31 | SPARK="$spark" 32 | execute 33 | SKIP_SCALA_TUPLES=true 34 | done 35 | 36 | 37 | execute_for_both_scala_versions() { 38 | for spark in "${SparkVersionsForBothScalaVersions[@]}"; do 39 | SPARK="$spark" 40 | execute 41 | SKIP_SCALA_TUPLES=true 42 | done 43 | } 44 | SCALA="$SCALA2_12VERSION" 45 | execute_for_both_scala_versions 46 | 47 | SCALA="$SCALA2_13VERSION" 48 | SKIP_SCALA_TUPLES=false 49 | execute_for_both_scala_versions 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /jupyter/build.gradle.kts: -------------------------------------------------------------------------------- 1 | @file:Suppress("UnstableApiUsage", "NOTHING_TO_INLINE") 2 | 3 | import com.igormaznitsa.jcp.gradle.JcpTask 4 | import com.vanniktech.maven.publish.JavadocJar.Dokka 5 | import com.vanniktech.maven.publish.KotlinJvm 6 | import org.jetbrains.dokka.gradle.AbstractDokkaLeafTask 7 | import org.jetbrains.kotlin.gradle.tasks.KotlinCompile 8 | 9 | plugins { 10 | scala 11 | kotlin 12 | dokka 13 | mavenPublishBase 14 | jupyter 15 | jcp 16 | } 17 | 18 | group = Versions.groupID 19 | version = Versions.project 20 | 21 | repositories { 22 | mavenCentral() 23 | maven(url = "https://maven.pkg.jetbrains.space/public/p/kotlinx-html/maven") 24 | maven(url = "https://maven.pkg.jetbrains.space/kotlin/p/kotlin/dev") 25 | } 26 | 27 | tasks.withType().configureEach { 28 | useJUnitPlatform() 29 | maxHeapSize = "2g" 30 | } 31 | 32 | tasks.processJupyterApiResources { 33 | libraryProducers = listOf( 34 | "org.jetbrains.kotlinx.spark.api.jupyter.SparkIntegration", 35 | "org.jetbrains.kotlinx.spark.api.jupyter.SparkStreamingIntegration", 36 | ) 37 | } 38 | 39 | dependencies { 40 | with(Projects) { 41 | api( 42 | kotlinSparkApi, 43 | ) 44 | } 45 | 46 | with(Dependencies) { 47 | 48 | // https://github.com/FasterXML/jackson-bom/issues/52 49 | if (Versions.spark == "3.3.1") implementation(jacksonDatabind) 50 | 51 | api( 52 | kotlinxHtml, 53 | sparkSql, 54 | sparkRepl, 55 | sparkStreaming, 56 | hadoopClient, 57 | ) 58 | 59 | implementation( 60 | kotlinStdLib, 61 | ) 62 | 63 | testImplementation( 64 | kotest, 65 | kotlinScriptingCommon, 66 | kotlinScriptingJvm, 67 | ) 68 | 69 | } 70 | } 71 | 72 | // Setup preprocessing with JCP for main sources 73 | 74 | val kotlinMainSources = kotlin.sourceSets.main.get().kotlin.sourceDirectories 75 | 76 | val preprocessMain by tasks.creating(JcpTask::class) { 77 | sources.set(kotlinMainSources) 78 | clearTarget.set(true) 79 | fileExtensions.set(listOf("kt")) 80 | vars.set(Versions.versionMap) 81 | outputs.upToDateWhen { target.get().exists() } 82 | } 83 | 84 | tasks.compileKotlin { 85 | dependsOn(preprocessMain) 86 | outputs.upToDateWhen { preprocessMain.outcomingFiles.files.isEmpty() } 87 | doFirst { 88 | kotlin { 89 | sourceSets { 90 | main { 91 | kotlin.setSrcDirs(listOf(preprocessMain.target.get())) 92 | } 93 | } 94 | } 95 | } 96 | 97 | doLast { 98 | kotlin { 99 | sourceSets { 100 | main { 101 | kotlin.setSrcDirs(kotlinMainSources) 
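                    // doLast: point the main source set back at the original, unpreprocessed sources once compilation is done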
102 | } 103 | } 104 | } 105 | } 106 | } 107 | 108 | // Setup preprocessing with JCP for test sources 109 | 110 | val kotlinTestSources = kotlin.sourceSets.test.get().kotlin.sourceDirectories 111 | 112 | val preprocessTest by tasks.creating(JcpTask::class) { 113 | sources.set(kotlinTestSources) 114 | clearTarget.set(true) 115 | fileExtensions.set(listOf("java", "kt")) 116 | vars.set(Versions.versionMap) 117 | outputs.upToDateWhen { target.get().exists() } 118 | } 119 | 120 | tasks.compileTestKotlin { 121 | dependsOn(preprocessTest) 122 | outputs.upToDateWhen { preprocessTest.outcomingFiles.files.isEmpty() } 123 | doFirst { 124 | kotlin { 125 | sourceSets { 126 | test { 127 | kotlin.setSrcDirs(listOf(preprocessTest.target.get())) 128 | } 129 | } 130 | } 131 | } 132 | 133 | doLast { 134 | kotlin { 135 | sourceSets { 136 | test { 137 | kotlin.setSrcDirs(kotlinTestSources) 138 | } 139 | } 140 | } 141 | } 142 | } 143 | 144 | kotlin { 145 | jvmToolchain { 146 | languageVersion.set( 147 | JavaLanguageVersion.of(Versions.jupyterJvmTarget) 148 | ) 149 | } 150 | } 151 | 152 | tasks.withType { 153 | dokkaSourceSets { 154 | all { 155 | sourceRoot(preprocessMain.target.get()) 156 | } 157 | } 158 | } 159 | 160 | 161 | mavenPublishing { 162 | configure(KotlinJvm(Dokka("dokkaHtml"))) 163 | } -------------------------------------------------------------------------------- /jupyter/src/main/kotlin/org/jetbrains/kotlinx/spark/api/jupyter/HtmlRendering.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api.jupyter 21 | 22 | import kotlinx.html.* 23 | import kotlinx.html.stream.appendHTML 24 | import org.apache.spark.SparkException 25 | import org.apache.spark.api.java.JavaRDDLike 26 | import org.apache.spark.sql.Dataset 27 | import org.apache.spark.unsafe.array.ByteArrayMethods 28 | import org.jetbrains.kotlinx.spark.api.asKotlinIterable 29 | import org.jetbrains.kotlinx.spark.api.asKotlinIterator 30 | import org.jetbrains.kotlinx.spark.api.asKotlinList 31 | import scala.Product 32 | import java.io.InputStreamReader 33 | import java.io.Serializable 34 | 35 | private fun createHtmlTable(fillTable: TABLE.() -> Unit): String = buildString { 36 | appendHTML().head { 37 | style("text/css") { 38 | unsafe { 39 | val resource = "/table.css" 40 | val res = SparkIntegration::class.java 41 | .getResourceAsStream(resource) ?: error("Resource '$resource' not found") 42 | val readRes = InputStreamReader(res).readText() 43 | raw("\n" + readRes) 44 | } 45 | } 46 | } 47 | 48 | appendHTML().table("dataset", fillTable) 49 | } 50 | 51 | 52 | internal fun JavaRDDLike.toHtml(limit: Int = 20, truncate: Int = 30): String = try { 53 | createHtmlTable { 54 | val numRows = limit.coerceIn(0 until ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) 55 | val tmpRows = take(numRows).toList() 56 | 57 | val hasMoreData = tmpRows.size - 1 > numRows 58 | val rows = tmpRows.take(numRows) 59 | 60 | tr { th { +"Values" } } 61 | 62 | for (row in rows) tr { 63 | td { 64 | val string = when (row) { 65 | is ByteArray -> row.joinToString(prefix = "[", postfix = "]") { "%02X".format(it) } 66 | 67 | is CharArray -> row.iterator().asSequence().toList().toString() 68 | is ShortArray -> row.iterator().asSequence().toList().toString() 69 | is IntArray -> row.iterator().asSequence().toList().toString() 70 | is LongArray -> row.iterator().asSequence().toList().toString() 71 | is FloatArray -> row.iterator().asSequence().toList().toString() 72 | is DoubleArray -> row.iterator().asSequence().toList().toString() 73 | is BooleanArray -> row.iterator().asSequence().toList().toString() 74 | is Array<*> -> row.iterator().asSequence().toList().toString() 75 | is Iterable<*> -> row.iterator().asSequence().toList().toString() 76 | is scala.collection.Iterable<*> -> row.asKotlinIterable().iterator().asSequence().toList().toString() 77 | is Iterator<*> -> row.asSequence().toList().toString() 78 | is scala.collection.Iterator<*> -> row.asKotlinIterator().asSequence().toList().toString() 79 | is Product -> row.productIterator().asKotlinIterator().asSequence().toList().toString() 80 | is Serializable -> row.toString() 81 | // maybe others? 82 | 83 | is Any? -> row.toString() 84 | else -> row.toString() 85 | } 86 | 87 | +if (truncate > 0 && string.length > truncate) { 88 | // do not show ellipses for strings shorter than 4 characters. 89 | if (truncate < 4) string.substring(0, truncate) 90 | else string.substring(0, truncate - 3) + "..." 91 | } else { 92 | string 93 | } 94 | } 95 | } 96 | 97 | if (hasMoreData) tr { 98 | +"only showing top $numRows ${if (numRows == 1) "row" else "rows"}" 99 | } 100 | } 101 | } catch (e: SparkException) { 102 | // Whenever toString() on the contents doesn't work, since the class might be unknown... 
103 | """${toString()} 104 | |Cannot render this RDD of this class.""".trimMargin() 105 | } 106 | 107 | internal fun Dataset.toHtml(limit: Int = 20, truncate: Int = 30): String = createHtmlTable { 108 | val numRows = limit.coerceIn(0 until ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) 109 | val tmpRows = getRows(numRows, truncate).asKotlinList().map { it.asKotlinList() } 110 | 111 | val hasMoreData = tmpRows.size - 1 > numRows 112 | val rows = tmpRows.take(numRows + 1) 113 | 114 | tr { 115 | for (header in rows.first()) th { 116 | +if (truncate > 0 && header.length > truncate) { 117 | // do not show ellipses for strings shorter than 4 characters. 118 | if (truncate < 4) header.substring(0, truncate) 119 | else header.substring(0, truncate - 3) + "..." 120 | } else { 121 | header 122 | } 123 | 124 | } 125 | } 126 | 127 | for (row in rows.drop(1)) tr { 128 | for (item in row) td { 129 | +item 130 | } 131 | } 132 | 133 | if (hasMoreData) tr { 134 | +"only showing top $numRows ${if (numRows == 1) "row" else "rows"}" 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /jupyter/src/main/kotlin/org/jetbrains/kotlinx/spark/api/jupyter/Integration.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api.jupyter 21 | 22 | import kotlinx.serialization.Serializable 23 | import kotlinx.serialization.json.* 24 | import org.apache.spark.api.java.JavaRDDLike 25 | import org.apache.spark.rdd.RDD 26 | import org.apache.spark.sql.Dataset 27 | import org.intellij.lang.annotations.Language 28 | import org.jetbrains.kotlinx.jupyter.api.* 29 | import org.jetbrains.kotlinx.jupyter.api.libraries.JupyterIntegration 30 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.displayLimitName 31 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.displayTruncateName 32 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.scalaName 33 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.sparkName 34 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.sparkPropertiesName 35 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.versionName 36 | import kotlin.reflect.KProperty1 37 | import kotlin.reflect.typeOf 38 | 39 | 40 | abstract class Integration(private val notebook: Notebook, private val options: MutableMap) : 41 | JupyterIntegration() { 42 | 43 | protected val kotlinVersion = /*$"\""+kotlin+"\""$*/ /*-*/ "" 44 | protected val scalaCompatVersion = /*$"\""+scalaCompat+"\""$*/ /*-*/ "" 45 | protected val scalaVersion = /*$"\""+scala+"\""$*/ /*-*/ "" 46 | protected val sparkVersion = /*$"\""+spark+"\""$*/ /*-*/ "" 47 | protected val version = /*$"\""+version+"\""$*/ /*-*/ "" 48 | 49 | protected val displayLimitOld = "DISPLAY_LIMIT" 50 | protected val displayTruncateOld = "DISPLAY_TRUNCATE" 51 | 52 | protected val properties: Properties 53 | get() = notebook 54 | .variablesState[sparkPropertiesName]!! 
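// (Added note: the shared Properties instance is looked up from the notebook's variable state on
//  every access, so options passed at %use time and later mutations of `sparkProperties` in cells
//  are both observed.)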
55 | .value 56 | .getOrThrow() as Properties 57 | 58 | 59 | protected open val usingProperties = arrayOf( 60 | displayLimitName, 61 | displayTruncateName, 62 | sparkName, 63 | scalaName, 64 | versionName, 65 | ) 66 | 67 | /** 68 | * Will be run after importing all dependencies 69 | */ 70 | open fun KotlinKernelHost.onLoaded() = Unit 71 | 72 | open fun KotlinKernelHost.onShutdown() = Unit 73 | 74 | open fun KotlinKernelHost.onInterrupt() = Unit 75 | 76 | open fun KotlinKernelHost.beforeCellExecution() = Unit 77 | 78 | open fun KotlinKernelHost.afterCellExecution(snippetInstance: Any, result: FieldValue) = Unit 79 | 80 | open fun Builder.onLoadedAlsoDo() = Unit 81 | 82 | open val dependencies: Array = arrayOf( 83 | "org.apache.spark:spark-repl_$scalaCompatVersion:$sparkVersion", 84 | "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlinVersion", 85 | "org.jetbrains.kotlin:kotlin-reflect:$kotlinVersion", 86 | "org.apache.spark:spark-sql_$scalaCompatVersion:$sparkVersion", 87 | "org.apache.spark:spark-yarn_$scalaCompatVersion:$sparkVersion", 88 | "org.apache.spark:spark-streaming_$scalaCompatVersion:$sparkVersion", 89 | "org.apache.spark:spark-mllib_$scalaCompatVersion:$sparkVersion", 90 | "org.apache.spark:spark-sql_$scalaCompatVersion:$sparkVersion", 91 | "org.apache.spark:spark-graphx_$scalaCompatVersion:$sparkVersion", 92 | "org.apache.spark:spark-launcher_$scalaCompatVersion:$sparkVersion", 93 | "org.apache.spark:spark-catalyst_$scalaCompatVersion:$sparkVersion", 94 | "org.apache.spark:spark-streaming_$scalaCompatVersion:$sparkVersion", 95 | "org.apache.spark:spark-core_$scalaCompatVersion:$sparkVersion", 96 | "org.scala-lang:scala-library:$scalaVersion", 97 | "org.scala-lang.modules:scala-xml_$scalaCompatVersion:2.0.1", 98 | "org.scala-lang:scala-reflect:$scalaVersion", 99 | "org.scala-lang:scala-compiler:$scalaVersion", 100 | "commons-io:commons-io:2.11.0", 101 | ) 102 | 103 | open val imports: Array = arrayOf( 104 | "org.jetbrains.kotlinx.spark.api.*", 105 | "org.jetbrains.kotlinx.spark.api.tuples.*", 106 | *(1..22).map { "scala.Tuple$it" }.toTypedArray(), 107 | "org.apache.spark.sql.functions.*", 108 | "org.apache.spark.*", 109 | "org.apache.spark.sql.*", 110 | "org.apache.spark.api.java.*", 111 | "scala.collection.Seq", 112 | "org.apache.spark.rdd.*", 113 | "java.io.Serializable", 114 | "org.apache.spark.streaming.api.java.*", 115 | "org.apache.spark.streaming.api.*", 116 | "org.apache.spark.streaming.*", 117 | ) 118 | 119 | override fun Builder.onLoaded() { 120 | dependencies(*dependencies) 121 | import(*imports) 122 | 123 | onLoaded { 124 | 125 | val mutableOptions = options.toMutableMap() 126 | 127 | declare( 128 | VariableDeclaration( 129 | name = sparkPropertiesName, 130 | value = object : Properties, MutableMap by mutableOptions { 131 | override fun toString(): String = "Properties: $mutableOptions" 132 | }, 133 | type = typeOf(), 134 | isMutable = true, 135 | ) 136 | ) 137 | 138 | @Language("kts") 139 | val _0 = execute( 140 | """ 141 | @Deprecated("Use ${displayLimitName}=${properties.displayLimit} in %use magic or ${sparkPropertiesName}.${displayLimitName} = ${properties.displayLimit} instead", ReplaceWith("${sparkPropertiesName}.${displayLimitName}")) 142 | var $displayLimitOld: Int 143 | get() = ${sparkPropertiesName}.${displayLimitName} 144 | set(value) { 145 | println("$displayLimitOld is deprecated: Use ${sparkPropertiesName}.${displayLimitName} instead") 146 | ${sparkPropertiesName}.${displayLimitName} = value 147 | } 148 | 149 | @Deprecated("Use 
${displayTruncateName}=${properties.displayTruncate} in %use magic or ${sparkPropertiesName}.${displayTruncateName} = ${properties.displayTruncate} instead", ReplaceWith("${sparkPropertiesName}.${displayTruncateName}")) 150 | var $displayTruncateOld: Int 151 | get() = ${sparkPropertiesName}.${displayTruncateName} 152 | set(value) { 153 | println("$displayTruncateOld is deprecated: Use ${sparkPropertiesName}.${displayTruncateName} instead") 154 | ${sparkPropertiesName}.${displayTruncateName} = value 155 | } 156 | """.trimIndent() 157 | ) 158 | 159 | onLoaded() 160 | } 161 | 162 | beforeCellExecution { 163 | if (scalaCompatVersion.toDouble() >= 2.13) 164 | execute("scala.`Console\$`.`MODULE\$`.setOutDirect(System.out)") 165 | else 166 | execute("""scala.Console.setOut(System.out)""") 167 | 168 | beforeCellExecution() 169 | } 170 | 171 | afterCellExecution { snippetInstance, result -> 172 | afterCellExecution(snippetInstance, result) 173 | } 174 | 175 | onInterrupt { 176 | onInterrupt() 177 | } 178 | 179 | onShutdown { 180 | onShutdown() 181 | } 182 | 183 | 184 | // Render Dataset 185 | render> { 186 | with(properties) { 187 | HTML(it.toHtml(limit = displayLimit, truncate = displayTruncate)) 188 | } 189 | } 190 | 191 | render> { 192 | with(properties) { 193 | HTML(it.toJavaRDD().toHtml(limit = displayLimit, truncate = displayTruncate)) 194 | } 195 | } 196 | 197 | render> { 198 | with(properties) { 199 | HTML(it.toHtml(limit = displayLimit, truncate = displayTruncate)) 200 | } 201 | 202 | } 203 | 204 | onLoadedAlsoDo() 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /jupyter/src/main/kotlin/org/jetbrains/kotlinx/spark/api/jupyter/JupyterConfiguration.kt: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.api.jupyter 2 | 3 | //class JupyterConfiguration(init: JupyterConfiguration.() -> Unit = {}) { 4 | // init { invoke(init) } 5 | // val sparkProps: MutableMap = mutableMapOf() 6 | // operator fun invoke(block: JupyterConfiguration.() -> Unit): JupyterConfiguration { 7 | // block(this) 8 | // return this 9 | // } 10 | //} 11 | 12 | interface JupyterConfiguration { 13 | val sparkProps: MutableMap 14 | 15 | operator fun invoke(block: JupyterConfiguration.() -> Unit): JupyterConfiguration { 16 | block(this) 17 | return this 18 | } 19 | } -------------------------------------------------------------------------------- /jupyter/src/main/kotlin/org/jetbrains/kotlinx/spark/api/jupyter/Properties.kt: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.api.jupyter 2 | 3 | import kotlinx.serialization.Serializable 4 | import kotlinx.serialization.json.Json 5 | import kotlinx.serialization.json.buildJsonObject 6 | import kotlinx.serialization.json.decodeFromJsonElement 7 | import kotlinx.serialization.json.put 8 | 9 | interface Properties : MutableMap { 10 | 11 | companion object { 12 | internal const val sparkPropertiesName = "sparkProperties" 13 | 14 | internal const val sparkMasterName = "spark.master" 15 | internal const val appNameName = "spark.app.name" 16 | internal const val sparkName = "spark" 17 | internal const val scalaName = "scala" 18 | internal const val versionName = "v" 19 | internal const val displayLimitName = "displayLimit" 20 | internal const val displayTruncateName = "displayTruncate" 21 | } 22 | 23 | /** The value which limits the number of rows while displaying an RDD or Dataset. 
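 * For example (editor's illustration): `sparkProperties.displayLimit = 5` caps every rendered
 * table at 5 rows.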
24 | * Default: 20 25 | */ 26 | var displayLimit: Int 27 | set(value) { this[displayLimitName] = value.toString() } 28 | get() = this[displayLimitName]?.toIntOrNull() ?: 20 29 | 30 | /** The value which limits the number characters per cell while displaying an RDD or Dataset. 31 | * `-1` for no limit. 32 | * Default: 30 33 | */ 34 | var displayTruncate: Int 35 | set(value) { this[displayTruncateName] = value.toString() } 36 | get() = this[displayTruncateName]?.toIntOrNull() ?: 30 37 | 38 | 39 | operator fun invoke(block: Properties.() -> Unit): Properties = apply(block) 40 | } 41 | -------------------------------------------------------------------------------- /jupyter/src/main/kotlin/org/jetbrains/kotlinx/spark/api/jupyter/SparkIntegration.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | @file:Suppress("UsePropertyAccessSyntax") 21 | 22 | package org.jetbrains.kotlinx.spark.api.jupyter 23 | 24 | 25 | import org.intellij.lang.annotations.Language 26 | import org.jetbrains.kotlinx.jupyter.api.KotlinKernelHost 27 | import org.jetbrains.kotlinx.jupyter.api.Notebook 28 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.appNameName 29 | import org.jetbrains.kotlinx.spark.api.jupyter.Properties.Companion.sparkMasterName 30 | 31 | 32 | /** 33 | * %use spark 34 | */ 35 | @Suppress("UNUSED_VARIABLE", "LocalVariableName") 36 | @OptIn(ExperimentalStdlibApi::class) 37 | class SparkIntegration(notebook: Notebook, options: MutableMap) : Integration(notebook, options) { 38 | 39 | override fun KotlinKernelHost.onLoaded() { 40 | val _0 = execute("""%dumpClassesForSpark""") 41 | 42 | properties { 43 | getOrPut(sparkMasterName) { "local[*]" } 44 | getOrPut(appNameName) { "Kotlin Spark API - Jupyter" } 45 | getOrPut("spark.sql.codegen.wholeStage") { "false" } 46 | getOrPut("fs.hdfs.impl") { org.apache.hadoop.hdfs.DistributedFileSystem::class.java.name } 47 | getOrPut("fs.file.impl") { org.apache.hadoop.fs.LocalFileSystem::class.java.name } 48 | } 49 | 50 | @Language("kts") 51 | val _1 = listOf( 52 | """ 53 | val spark = org.jetbrains.kotlinx.spark.api.SparkSession 54 | .builder() 55 | .apply { 56 | ${ 57 | buildString { 58 | val sparkProps = properties.filterKeys { it !in usingProperties } 59 | println("received properties: $properties, providing Spark with: $sparkProps") 60 | 61 | sparkProps.forEach { (key, value) -> 62 | appendLine("config(\"${key}\", \"$value\")") 63 | } 64 | } 65 | } 66 | } 67 | .getOrCreate()""".trimIndent(), 68 | """ 69 | spark.sparkContext.setLogLevel(org.jetbrains.kotlinx.spark.api.SparkLogLevel.ERROR)""".trimIndent(), 70 | """ 71 | val sc by lazy { 72 | org.apache.spark.api.java.JavaSparkContext(spark.sparkContext) 73 | }""".trimIndent(), 74 | """ 75 | 
println("Spark session (Spark: $sparkVersion, Scala: $scalaCompatVersion, v: $version) has been started and is running. No `withSpark { }` necessary, you can access `spark` and `sc` directly. To use Spark streaming, use `%use spark-streaming` instead.")""".trimIndent(), 76 | """ 77 | inline fun List.toDS(): Dataset = toDS(spark)""".trimIndent(), 78 | """ 79 | inline fun List.toDF(vararg colNames: String): Dataset = toDF(spark, *colNames)""".trimIndent(), 80 | """ 81 | inline fun Array.toDS(): Dataset = toDS(spark)""".trimIndent(), 82 | """ 83 | inline fun Array.toDF(vararg colNames: String): Dataset = toDF(spark, *colNames)""".trimIndent(), 84 | """ 85 | inline fun dsOf(vararg arg: T): Dataset = spark.dsOf(*arg)""".trimIndent(), 86 | """ 87 | inline fun dfOf(vararg arg: T): Dataset = spark.dfOf(*arg)""".trimIndent(), 88 | """ 89 | inline fun emptyDataset(): Dataset = spark.emptyDataset(encoder())""".trimIndent(), 90 | """ 91 | inline fun dfOf(colNames: Array, vararg arg: T): Dataset = spark.dfOf(colNames, *arg)""".trimIndent(), 92 | """ 93 | inline fun RDD.toDS(): Dataset = toDS(spark)""".trimIndent(), 94 | """ 95 | inline fun JavaRDDLike.toDS(): Dataset = toDS(spark)""".trimIndent(), 96 | """ 97 | inline fun RDD.toDF(vararg colNames: String): Dataset = toDF(spark, *colNames)""".trimIndent(), 98 | """ 99 | inline fun JavaRDDLike.toDF(vararg colNames: String): Dataset = toDF(spark, *colNames)""".trimIndent(), 100 | """ 101 | fun List.toRDD(numSlices: Int = sc.defaultParallelism()): JavaRDD = sc.toRDD(this, numSlices)""".trimIndent(), 102 | """ 103 | fun rddOf(vararg elements: T, numSlices: Int = sc.defaultParallelism()): JavaRDD = sc.toRDD(elements.toList(), numSlices)""".trimIndent(), 104 | """ 105 | val udf: UDFRegistration get() = spark.udf()""".trimIndent(), 106 | """ 107 | inline fun > NAMED_UDF.register(): NAMED_UDF = spark.udf().register(namedUdf = this)""".trimIndent(), 108 | """ 109 | inline fun > UserDefinedFunction.register(name: String): NAMED_UDF = spark.udf().register(name = name, udf = this)""".trimIndent(), 110 | ).map(::execute) 111 | } 112 | 113 | override fun KotlinKernelHost.onShutdown() { 114 | execute("""spark.stop()""") 115 | } 116 | } 117 | 118 | -------------------------------------------------------------------------------- /jupyter/src/main/kotlin/org/jetbrains/kotlinx/spark/api/jupyter/SparkStreamingIntegration.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api.jupyter 21 | 22 | 23 | import org.apache.spark.streaming.StreamingContextState 24 | import org.apache.spark.streaming.api.java.JavaStreamingContext 25 | import org.intellij.lang.annotations.Language 26 | import org.jetbrains.kotlinx.jupyter.api.KotlinKernelHost 27 | import org.jetbrains.kotlinx.jupyter.api.Notebook 28 | import org.jetbrains.kotlinx.jupyter.api.VariableDeclaration 29 | import org.jetbrains.kotlinx.jupyter.api.declare 30 | import kotlin.reflect.typeOf 31 | 32 | /** 33 | * %use spark-streaming 34 | */ 35 | @Suppress("UNUSED_VARIABLE", "LocalVariableName") 36 | class SparkStreamingIntegration(notebook: Notebook, options: MutableMap) : Integration(notebook, options) { 37 | 38 | override val imports: Array = super.imports + arrayOf( 39 | "org.apache.spark.deploy.SparkHadoopUtil", 40 | "org.apache.hadoop.conf.Configuration", 41 | ) 42 | 43 | private val sscCollection = mutableSetOf() 44 | 45 | override fun KotlinKernelHost.onLoaded() { 46 | 47 | declare( 48 | VariableDeclaration( 49 | name = ::sscCollection.name, 50 | value = sscCollection, 51 | isMutable = false, 52 | type = typeOf>(), 53 | ) 54 | ) 55 | 56 | val _0 = execute("""%dumpClassesForSpark""") 57 | 58 | @Language("kts") 59 | val _1 = listOf( 60 | """ 61 | @JvmOverloads 62 | fun withSparkStreaming( 63 | batchDuration: Duration = Durations.seconds(1L), 64 | checkpointPath: String? = null, 65 | hadoopConf: Configuration = SparkHadoopUtil.get().conf(), 66 | createOnError: Boolean = false, 67 | props: Map = emptyMap(), 68 | master: String = SparkConf().get("spark.master", "local[*]"), 69 | appName: String = "Kotlin Spark Sample", 70 | timeout: Long = -1L, 71 | startStreamingContext: Boolean = true, 72 | func: KSparkStreamingSession.() -> Unit, 73 | ) { 74 | 75 | // will only be set when a new context is created 76 | var kSparkStreamingSession: KSparkStreamingSession? = null 77 | 78 | val creatingFunc = { 79 | val sc = SparkConf() 80 | .setAppName(appName) 81 | .setMaster(master) 82 | .setAll( 83 | props 84 | .map { (key, value) -> key X value.toString() } 85 | .asScalaIterable() 86 | ) 87 | 88 | val ssc = JavaStreamingContext(sc, batchDuration) 89 | ssc.checkpoint(checkpointPath) 90 | 91 | kSparkStreamingSession = KSparkStreamingSession(ssc) 92 | func(kSparkStreamingSession!!) 93 | 94 | ssc 95 | } 96 | 97 | val ssc = when { 98 | checkpointPath != null -> 99 | JavaStreamingContext.getOrCreate(checkpointPath, creatingFunc, hadoopConf, createOnError) 100 | 101 | else -> creatingFunc() 102 | } 103 | sscCollection += ssc 104 | 105 | if (startStreamingContext) { 106 | ssc.start() 107 | kSparkStreamingSession?.invokeRunAfterStart() 108 | } 109 | ssc.awaitTerminationOrTimeout(timeout) 110 | ssc.stop() 111 | } 112 | """.trimIndent(), 113 | """ 114 | println("To start a Spark (Spark: $sparkVersion, Scala: $scalaCompatVersion, v: $version) Streaming session, simply use `withSparkStreaming { }` inside a cell. To use Spark normally, use `withSpark { }` in a cell, or use `%use spark` to start a Spark session for the whole notebook.")""".trimIndent(), 115 | ).map(::execute) 116 | } 117 | 118 | private fun cleanUp(e: Throwable): String { 119 | while (sscCollection.isNotEmpty()) 120 | sscCollection.first().let { 121 | while (it.state != StreamingContextState.STOPPED) { 122 | try { 123 | it.stop(true, true) 124 | } catch (_: Exception) { 125 | } 126 | } 127 | sscCollection.remove(it) 128 | } 129 | 130 | return "Spark streams cleaned up. 
Cause: $e" 131 | } 132 | 133 | override fun Builder.onLoadedAlsoDo() { 134 | renderThrowable { 135 | cleanUp(it) 136 | } 137 | } 138 | 139 | override fun KotlinKernelHost.onInterrupt() { 140 | println( 141 | cleanUp(InterruptedException("Kernel was interrupted.")) 142 | ) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /jupyter/src/main/resources/table.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --background: #fff; 3 | --background-odd: #f5f5f5; 4 | --background-hover: #d9edfd; 5 | --header-text-color: #474747; 6 | --text-color: #848484; 7 | --text-color-dark: #000; 8 | --text-color-medium: #737373; 9 | --text-color-pale: #b3b3b3; 10 | --inner-border-color: #aaa; 11 | --bold-border-color: #000; 12 | --link-color: #296eaa; 13 | --link-color-pale: #296eaa; 14 | --link-hover: #1a466c; 15 | } 16 | 17 | :root[theme="dark"], :root [data-jp-theme-light="false"]{ 18 | --background: #303030; 19 | --background-odd: #3c3c3c; 20 | --background-hover: #464646; 21 | --header-text-color: #dddddd; 22 | --text-color: #b3b3b3; 23 | --text-color-dark: #dddddd; 24 | --text-color-medium: #b2b2b2; 25 | --text-color-pale: #737373; 26 | --inner-border-color: #707070; 27 | --bold-border-color: #777777; 28 | --link-color: #008dc0; 29 | --link-color-pale: #97e1fb; 30 | --link-hover: #00688e; 31 | } 32 | 33 | table.dataset { 34 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 35 | font-size: 12px; 36 | background-color: var(--background); 37 | color: var(--text-color); 38 | border: none; 39 | border-collapse: collapse; 40 | } 41 | 42 | table.dataset th, td { 43 | padding: 6px; 44 | border: 1px solid transparent; 45 | text-align: left; 46 | } 47 | 48 | table.dataset th { 49 | background-color: var(--background); 50 | color: var(--header-text-color); 51 | } 52 | 53 | table.dataset td { 54 | vertical-align: top; 55 | } 56 | 57 | table.dataset th.bottomBorder { 58 | border-bottom-color: var(--bold-border-color); 59 | } 60 | 61 | table.dataset tbody > tr:nth-child(odd) { 62 | background: var(--background-odd); 63 | } 64 | 65 | table.dataset tbody > tr:nth-child(even) { 66 | background: var(--background); 67 | } 68 | 69 | table.dataset tbody > tr:hover { 70 | background: var(--background-hover); 71 | } 72 | 73 | table.dataset a { 74 | cursor: pointer; 75 | color: var(--link-color); 76 | text-decoration: none; 77 | } 78 | 79 | table.dataset tr:hover > td a { 80 | color: var(--link-color-pale); 81 | } 82 | 83 | table.dataset a:hover { 84 | color: var(--link-hover); 85 | text-decoration: underline; 86 | } 87 | 88 | table.dataset img { 89 | max-width: fit-content; 90 | } 91 | 92 | table.dataset th.complex { 93 | background-color: var(--background); 94 | border: 1px solid var(--background); 95 | } 96 | 97 | table.dataset .leftBorder { 98 | border-left-color: var(--inner-border-color); 99 | } 100 | 101 | table.dataset .rightBorder { 102 | border-right-color: var(--inner-border-color); 103 | } 104 | 105 | table.dataset .rightAlign { 106 | text-align: right; 107 | } 108 | 109 | table.dataset .expanderSvg { 110 | width: 8px; 111 | height: 8px; 112 | margin-right: 3px; 113 | } 114 | 115 | table.dataset .expander { 116 | display: flex; 117 | align-items: center; 118 | } 119 | 120 | /* formatting */ 121 | 122 | table.dataset .null { 123 | color: var(--text-color-pale); 124 | } 125 | 126 | table.dataset .structural { 127 | color: var(--text-color-medium); 128 | font-weight: bold; 129 | } 130 | 131 | 
table.dataset .datasetCaption { 132 | font-weight: bold; 133 | } 134 | 135 | table.dataset .numbers { 136 | color: var(--text-color-dark); 137 | } 138 | 139 | table.dataset td:hover .formatted .structural, .null { 140 | color: var(--text-color-dark); 141 | } 142 | 143 | table.dataset tr:hover .formatted .structural, .null { 144 | color: var(--text-color-dark); 145 | } 146 | 147 | -------------------------------------------------------------------------------- /kotlin-spark-api/build.gradle.kts: -------------------------------------------------------------------------------- 1 | @file:Suppress("UnstableApiUsage", "NOTHING_TO_INLINE") 2 | 3 | import com.igormaznitsa.jcp.gradle.JcpTask 4 | import com.vanniktech.maven.publish.JavadocJar.Dokka 5 | import com.vanniktech.maven.publish.KotlinJvm 6 | import org.jetbrains.dokka.gradle.AbstractDokkaLeafTask 7 | 8 | plugins { 9 | kotlin 10 | dokka 11 | mavenPublishBase 12 | jcp 13 | idea 14 | } 15 | 16 | group = Versions.groupID 17 | version = Versions.project 18 | 19 | 20 | repositories { 21 | mavenCentral() 22 | } 23 | 24 | tasks.withType().configureEach { 25 | useJUnitPlatform() 26 | maxHeapSize = "8g" 27 | } 28 | 29 | dependencies { 30 | 31 | with(Projects) { 32 | api( 33 | core, 34 | scalaTuplesInKotlin, 35 | ) 36 | } 37 | 38 | with(Dependencies) { 39 | 40 | // https://github.com/FasterXML/jackson-bom/issues/52 41 | if (Versions.spark == "3.3.1") implementation(jacksonDatabind) 42 | 43 | implementation( 44 | kotlinStdLib, 45 | reflect, 46 | sparkSql, 47 | sparkStreaming, 48 | hadoopClient, 49 | ) 50 | 51 | testImplementation( 52 | sparkStreamingKafka, 53 | kotest, 54 | kotestTestcontainers, 55 | klaxon, 56 | atrium, 57 | sparkStreaming, 58 | kafkaStreamsTestUtils, 59 | sparkMl, 60 | ) 61 | } 62 | } 63 | 64 | // Setup preprocessing with JCP for main sources 65 | 66 | val kotlinMainSources = kotlin.sourceSets.main.get().kotlin.sourceDirectories 67 | 68 | val preprocessMain by tasks.creating(JcpTask::class) { 69 | sources.set(kotlinMainSources) 70 | clearTarget.set(true) 71 | fileExtensions.set(listOf("kt")) 72 | vars.set(Versions.versionMap) 73 | outputs.upToDateWhen { target.get().exists() } 74 | } 75 | 76 | tasks.compileKotlin { 77 | dependsOn(preprocessMain) 78 | outputs.upToDateWhen { 79 | preprocessMain.outcomingFiles.files.isEmpty() 80 | } 81 | 82 | doFirst { 83 | kotlin { 84 | sourceSets { 85 | main { 86 | kotlin.setSrcDirs(listOf(preprocessMain.target.get())) 87 | } 88 | } 89 | } 90 | } 91 | 92 | doLast { 93 | kotlin { 94 | sourceSets { 95 | main { 96 | kotlin.setSrcDirs(kotlinMainSources) 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | // Setup preprocessing with JCP for test sources 104 | 105 | val kotlinTestSources = kotlin.sourceSets.test.get().kotlin.sourceDirectories 106 | 107 | val preprocessTest by tasks.creating(JcpTask::class) { 108 | sources.set(kotlinTestSources) 109 | clearTarget.set(true) 110 | fileExtensions.set(listOf("kt")) 111 | vars.set(Versions.versionMap) 112 | outputs.upToDateWhen { target.get().exists() } 113 | } 114 | 115 | tasks.compileTestKotlin { 116 | dependsOn(preprocessTest) 117 | outputs.upToDateWhen { 118 | preprocessTest.outcomingFiles.files.isEmpty() 119 | } 120 | 121 | doFirst { 122 | kotlin { 123 | sourceSets { 124 | test { 125 | kotlin.setSrcDirs(listOf(preprocessTest.target.get())) 126 | } 127 | } 128 | } 129 | } 130 | 131 | doLast { 132 | kotlin { 133 | sourceSets { 134 | test { 135 | kotlin.setSrcDirs(kotlinTestSources) 136 | } 137 | } 138 | } 139 | } 140 | } 141 | 142 | kotlin { 143 | jvmToolchain { 
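// (Added note: the toolchain below pins compilation to the Java version named by
//  Versions.jvmTarget, rather than whichever JDK happens to run Gradle.)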
144 | languageVersion.set( 145 | JavaLanguageVersion.of(Versions.jvmTarget) 146 | ) 147 | } 148 | } 149 | 150 | tasks.withType { 151 | dokkaSourceSets { 152 | all { 153 | sourceRoot(preprocessMain.target.get()) 154 | } 155 | } 156 | } 157 | 158 | mavenPublishing { 159 | configure(KotlinJvm(Dokka("dokkaHtml"))) 160 | } 161 | 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/main/kotlin/org/jetbrains/kotlinx/spark/api/DataStreamWriter.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api 21 | 22 | import org.apache.spark.api.java.function.VoidFunction2 23 | import org.apache.spark.sql.Dataset 24 | import org.apache.spark.sql.streaming.DataStreamWriter 25 | 26 | /** 27 | * :: Experimental :: 28 | * 29 | * (Scala-specific) Sets the output of the streaming query to be processed using the provided 30 | * function. This is supported only in the micro-batch execution modes (that is, when the 31 | * trigger is not continuous). In every micro-batch, the provided function will be called in 32 | * every micro-batch with (i) the output rows as a Dataset and (ii) the batch identifier. 33 | * The batchId can be used to deduplicate and transactionally write the output 34 | * (that is, the provided Dataset) to external systems. The output Dataset is guaranteed 35 | * to be exactly the same for the same batchId (assuming all operations are deterministic 36 | * in the query). 37 | * 38 | * @since 2.4.0 39 | */ 40 | fun DataStreamWriter.forEachBatch( 41 | func: (batch: Dataset, batchId: Long) -> Unit, 42 | ): DataStreamWriter = foreachBatch(VoidFunction2(func)) 43 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/main/kotlin/org/jetbrains/kotlinx/spark/api/GroupState.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.0+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2021 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | 21 | /** 22 | * This file contains some helper functions to more easily work with [GroupState] from Kotlin. 23 | */ 24 | 25 | package org.jetbrains.kotlinx.spark.api 26 | 27 | import org.apache.spark.sql.streaming.GroupState 28 | import kotlin.reflect.KProperty 29 | 30 | /** 31 | * (Kotlin-specific) 32 | * Returns the group state value if it exists, else `null`. 33 | * This is comparable to [GroupState.getOption], but instead utilises Kotlin's nullability features 34 | * to get the same result. 35 | */ 36 | fun GroupState.getOrNull(): S? = if (exists()) get() else null 37 | 38 | /** 39 | * (Kotlin-specific) 40 | * Allows the group state object to be used as a delegate. Will be `null` if it does not exist. 41 | * 42 | * For example: 43 | * ```kotlin 44 | * groupedDataset.mapGroupsWithState(GroupStateTimeout.NoTimeout()) { key, values, state: GroupState -> 45 | * var s by state 46 | * ... 47 | * } 48 | * ``` 49 | */ 50 | operator fun GroupState.getValue(thisRef: Any?, property: KProperty<*>): S? = getOrNull() 51 | 52 | /** 53 | * (Kotlin-specific) 54 | * Allows the group state object to be used as a delegate. Will be `null` if it does not exist. 55 | * 56 | * For example: 57 | * ```kotlin 58 | * groupedDataset.mapGroupsWithState(GroupStateTimeout.NoTimeout()) { key, values, state: GroupState -> 59 | * var s by state 60 | * ... 61 | * } 62 | * ``` 63 | */ 64 | operator fun GroupState.setValue(thisRef: Any?, property: KProperty<*>, value: S?): Unit = update(value) 65 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/main/kotlin/org/jetbrains/kotlinx/spark/api/Iterators.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | 21 | /** 22 | * This file contains several ways to wrap and modify iterators lazily. 23 | * This includes mapping, filtering, and partitioning. 24 | */ 25 | 26 | package org.jetbrains.kotlinx.spark.api 27 | 28 | /** Partitions the values of the iterator lazily in groups of [size]. */ 29 | class PartitioningIterator( 30 | private val source: Iterator, 31 | private val size: Int, 32 | private val cutIncomplete: Boolean = false, 33 | ) : AbstractIterator>() { 34 | 35 | override fun computeNext() { 36 | if (!source.hasNext()) return done() 37 | val interimResult = arrayListOf() 38 | repeat(size) { 39 | if (source.hasNext()) 40 | interimResult.add(source.next()) 41 | else 42 | return if (cutIncomplete) 43 | done() 44 | else 45 | setNext(interimResult) 46 | } 47 | setNext(interimResult) 48 | } 49 | 50 | } 51 | 52 | /** Maps the values of the iterator lazily using [func]. 
*/ 53 | @Deprecated("[Iterator.map] now uses the [Sequence.map] function") 54 | class MappingIterator( 55 | private val source: Iterator, 56 | private val func: (T) -> R, 57 | ) : AbstractIterator() { 58 | 59 | override fun computeNext(): Unit = 60 | if (source.hasNext()) 61 | setNext(func(source.next())) 62 | else 63 | done() 64 | } 65 | 66 | /** Filters the values of the iterator lazily using [predicate]. */ 67 | @Deprecated("[Iterator.filter] now uses the [Sequence.filter] function") 68 | class FilteringIterator( 69 | private val source: Iterator, 70 | private val predicate: (T) -> Boolean, 71 | ) : AbstractIterator() { 72 | 73 | override fun computeNext() { 74 | while (source.hasNext()) { 75 | val next = source.next() 76 | if (predicate(next)) { 77 | setNext(next) 78 | return 79 | } 80 | } 81 | done() 82 | } 83 | 84 | } 85 | 86 | /** Allows to transform an Iterator using the Sequence functions. */ 87 | fun Iterator.transformAsSequence(func: Sequence.() -> Sequence): Iterator = 88 | func(this.asSequence()).iterator() 89 | 90 | /** Flattens iterator. */ 91 | fun Iterator>.flatten(): Iterator = transformAsSequence { flatMap { it.asSequence() } } 92 | 93 | /** Maps the values of the iterator lazily using [func]. */ 94 | fun Iterator.map(func: (T) -> R): Iterator = transformAsSequence { map(func) } 95 | 96 | /** Filters the values of the iterator lazily using [predicate]. */ 97 | fun Iterator.filter(predicate: (T) -> Boolean): Iterator = transformAsSequence { filter(predicate) } 98 | 99 | /** Partitions the values of the iterator lazily in groups of [size]. */ 100 | fun Iterator.partition(size: Int, cutIncomplete: Boolean = false): Iterator> = 101 | PartitioningIterator(this, size, cutIncomplete) 102 | 103 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/main/kotlin/org/jetbrains/kotlinx/spark/api/Rdd.kt: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.api 2 | 3 | import org.apache.spark.api.java.JavaRDD 4 | import org.apache.spark.api.java.JavaSparkContext 5 | import java.io.Serializable 6 | 7 | /** 8 | * Utility method to create an RDD from a list. 9 | * NOTE: [T] must be [Serializable]. 10 | */ 11 | fun JavaSparkContext.rddOf( 12 | vararg elements: T, 13 | numSlices: Int = defaultParallelism(), 14 | ): JavaRDD = parallelize(elements.toList(), numSlices) 15 | 16 | /** 17 | * Utility method to create an RDD from a list. 18 | * NOTE: [T] must be [Serializable]. 19 | */ 20 | fun JavaSparkContext.toRDD( 21 | elements: List, 22 | numSlices: Int = defaultParallelism(), 23 | ): JavaRDD = parallelize(elements, numSlices) 24 | 25 | /** 26 | * Returns the minimum element from this RDD as defined by the specified 27 | * [Comparator]. 28 | * 29 | * @return the minimum of the RDD 30 | */ 31 | fun > JavaRDD.min(): T = min( 32 | object : Comparator, Serializable { 33 | override fun compare(o1: T, o2: T): Int = o1.compareTo(o2) 34 | } 35 | ) 36 | 37 | /** 38 | * Returns the maximum element from this RDD as defined by the specified 39 | * [Comparator]. 
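 * For example (illustrative, assuming a running [JavaSparkContext] named `sc`):
 * ```kotlin
 * sc.rddOf(1, 3, 2).max() // == 3
 * ```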
40 | * 41 | * @return the maximum of the RDD 42 | */ 43 | fun > JavaRDD.max(): T = max( 44 | object : Comparator, Serializable { 45 | override fun compare(o1: T, o2: T): Int = o1.compareTo(o2) 46 | } 47 | ) 48 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/main/kotlin/org/jetbrains/kotlinx/spark/api/RddDouble.kt: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.api 2 | 3 | import org.apache.spark.api.java.JavaDoubleRDD 4 | import org.apache.spark.api.java.JavaRDD 5 | import org.apache.spark.partial.BoundedDouble 6 | import org.apache.spark.partial.PartialResult 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.util.StatCounter 9 | import scala.Tuple2 10 | 11 | /** Utility method to convert [JavaRDD]<[Number]> to [JavaDoubleRDD]. */ 12 | @Suppress("UNCHECKED_CAST") 13 | inline fun JavaRDD.toJavaDoubleRDD(): JavaDoubleRDD = 14 | JavaDoubleRDD.fromRDD( 15 | when (T::class) { 16 | Double::class -> this 17 | else -> map(Number::toDouble) 18 | }.rdd() as RDD 19 | ) 20 | 21 | /** Utility method to convert [JavaDoubleRDD] to [JavaRDD]<[Double]>. */ 22 | @Suppress("UNCHECKED_CAST") 23 | fun JavaDoubleRDD.toDoubleRDD(): JavaRDD = 24 | JavaDoubleRDD.toRDD(this).toJavaRDD() as JavaRDD 25 | 26 | /** Add up the elements in this RDD. */ 27 | inline fun JavaRDD.sum(): Double = toJavaDoubleRDD().sum() 28 | 29 | /** 30 | * Return a [org.apache.spark.util.StatCounter] object that captures the mean, variance and 31 | * count of the RDD's elements in one operation. 32 | */ 33 | inline fun JavaRDD.stats(): StatCounter = toJavaDoubleRDD().stats() 34 | 35 | /** Compute the mean of this RDD's elements. */ 36 | inline fun JavaRDD.mean(): Double = toJavaDoubleRDD().mean() 37 | 38 | /** Compute the population variance of this RDD's elements. */ 39 | inline fun JavaRDD.variance(): Double = toJavaDoubleRDD().variance() 40 | 41 | /** Compute the population standard deviation of this RDD's elements. */ 42 | inline fun JavaRDD.stdev(): Double = toJavaDoubleRDD().stdev() 43 | 44 | /** 45 | * Compute the sample standard deviation of this RDD's elements (which corrects for bias in 46 | * estimating the standard deviation by dividing by N-1 instead of N). 47 | */ 48 | inline fun JavaRDD.sampleStdev(): Double = toJavaDoubleRDD().sampleStdev() 49 | 50 | /** 51 | * Compute the sample variance of this RDD's elements (which corrects for bias in 52 | * estimating the variance by dividing by N-1 instead of N). 53 | */ 54 | inline fun JavaRDD.sampleVariance(): Double = toJavaDoubleRDD().sampleVariance() 55 | 56 | /** Compute the population standard deviation of this RDD's elements. */ 57 | inline fun JavaRDD.popStdev(): Double = toJavaDoubleRDD().popStdev() 58 | 59 | /** Compute the population variance of this RDD's elements. */ 60 | inline fun JavaRDD.popVariance(): Double = toJavaDoubleRDD().popVariance() 61 | 62 | /** Approximate operation to return the mean within a timeout. */ 63 | inline fun JavaRDD.meanApprox( 64 | timeout: Long, 65 | confidence: Double = 0.95, 66 | ): PartialResult = toJavaDoubleRDD().meanApprox(timeout, confidence) 67 | 68 | /** Approximate operation to return the sum within a timeout. 
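 * For example (illustrative): `rdd.sumApprox(timeout = 1000L, confidence = 0.9)` waits at most one
 * second and yields a [PartialResult] wrapping a [BoundedDouble] estimate of the sum.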
*/ 69 | inline fun JavaRDD.sumApprox( 70 | timeout: Long, 71 | confidence: Double = 0.95, 72 | ): PartialResult = toJavaDoubleRDD().sumApprox(timeout, confidence) 73 | 74 | /** 75 | * Compute a histogram of the data using bucketCount number of buckets evenly 76 | * spaced between the minimum and maximum of the RDD. For example if the min 77 | * value is 0 and the max is 100 and there are two buckets the resulting 78 | * buckets will be `[0, 50)` `[50, 100]`. bucketCount must be at least 1 79 | * If the RDD contains infinity, NaN throws an exception 80 | * If the elements in RDD do not vary (max == min) always returns a single bucket. 81 | */ 82 | inline fun JavaRDD.histogram(bucketCount: Int): Tuple2 = 83 | toJavaDoubleRDD().histogram(bucketCount) 84 | 85 | /** 86 | * Compute a histogram using the provided buckets. The buckets are all open 87 | * to the right except for the last which is closed. 88 | * e.g. for the array 89 | * `[1, 10, 20, 50]` the buckets are `[1, 10) [10, 20) [20, 50]` 90 | * e.g. ` <=x<10, 10<=x<20, 20<=x<=50` 91 | * And on the input of 1 and 50 we would have a histogram of 1, 0, 1 92 | * 93 | * Note: If your histogram is evenly spaced (e.g. `[0, 10, 20, 30]`) this can be switched 94 | * from an O(log n) insertion to O(1) per element. (where n = # buckets) if you set evenBuckets 95 | * to true. 96 | * buckets must be sorted and not contain any duplicates. 97 | * buckets array must be at least two elements 98 | * All NaN entries are treated the same. If you have a NaN bucket it must be 99 | * the maximum value of the last position and all NaN entries will be counted 100 | * in that bucket. 101 | */ 102 | inline fun JavaRDD.histogram( 103 | buckets: Array, 104 | evenBuckets: Boolean = false, 105 | ): LongArray = toJavaDoubleRDD().histogram(buckets, evenBuckets) -------------------------------------------------------------------------------- /kotlin-spark-api/src/main/kotlin/org/jetbrains/kotlinx/spark/api/Seq.kt: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.api 2 | 3 | import scala.collection.immutable.`Seq$`.`MODULE$` as Seq 4 | import scala.collection.immutable.Seq as Seq 5 | import scala.collection.mutable.`Seq$`.`MODULE$` as MutableSeq 6 | import scala.collection.mutable.Seq as MutableSeq 7 | 8 | /** Returns a new empty immutable Seq. */ 9 | fun emptySeq(): Seq = Seq.empty() as Seq 10 | 11 | /** Returns a new immutable Seq with the given elements. */ 12 | fun seqOf(vararg elements: T): Seq = 13 | if (elements.isEmpty()) 14 | emptySeq() 15 | else 16 | Seq.newBuilder().apply { 17 | for (it in elements) 18 | `$plus$eq`(it) 19 | }.result() as Seq 20 | 21 | /** Returns a new mutable Seq with the given elements. */ 22 | fun emptyMutableSeq(): MutableSeq = MutableSeq.empty() as MutableSeq 23 | 24 | /** Returns a new mutable Seq with the given elements. 
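 * For example (illustrative): `mutableSeqOf(1, 2, 3)` builds a `scala.collection.mutable.Seq`
 * that can be handed to Spark APIs expecting a Scala collection.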
*/ 25 | fun mutableSeqOf(vararg elements: T): MutableSeq = 26 | if (elements.isEmpty()) 27 | emptyMutableSeq() 28 | else 29 | MutableSeq.newBuilder().apply { 30 | for (it in elements) 31 | `$plus$eq`(it) 32 | }.result() as MutableSeq 33 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/main/kotlin/org/jetbrains/kotlinx/spark/api/UserDefinedFunction.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | @file:Suppress("unused") 21 | 22 | package org.jetbrains.kotlinx.spark.api 23 | 24 | import org.apache.spark.sql.* 25 | import org.apache.spark.sql.types.DataType 26 | import scala.collection.Seq 27 | import java.io.Serializable 28 | import kotlin.reflect.KClass 29 | import kotlin.reflect.KProperty 30 | import kotlin.reflect.full.isSubclassOf 31 | import kotlin.reflect.full.primaryConstructor 32 | import org.apache.spark.sql.expressions.UserDefinedFunction as SparkUserDefinedFunction 33 | 34 | /** Unwraps [DataTypeWithClass]. */ 35 | fun DataType.unWrap(): DataType = 36 | when (this) { 37 | is DataTypeWithClass -> DataType.fromJson(dt().json()) 38 | else -> this 39 | } 40 | 41 | /** 42 | * Checks if [this] is of a valid type for a UDF, otherwise it throws a [TypeOfUDFParameterNotSupportedException] 43 | */ 44 | @PublishedApi 45 | internal fun KClass<*>.checkForValidType(parameterName: String) { 46 | if (this == String::class || isSubclassOf(Seq::class) 47 | //#if scalaCompat < 2.13 48 | //$|| isSubclassOf(scala.collection.mutable.WrappedArray::class) 49 | //#endif 50 | ) 51 | return // Most of the time we need strings or WrappedArrays/Seqs 52 | 53 | if (isSubclassOf(Iterable::class) 54 | || java.isArray 55 | || isSubclassOf(Char::class) 56 | || isSubclassOf(Map::class) 57 | || isSubclassOf(Array::class) 58 | || isSubclassOf(ByteArray::class) 59 | || isSubclassOf(CharArray::class) 60 | || isSubclassOf(ShortArray::class) 61 | || isSubclassOf(IntArray::class) 62 | || isSubclassOf(LongArray::class) 63 | || isSubclassOf(FloatArray::class) 64 | || isSubclassOf(DoubleArray::class) 65 | || isSubclassOf(BooleanArray::class) 66 | ) throw TypeOfUDFParameterNotSupportedException(this, parameterName) 67 | } 68 | 69 | /** 70 | * An exception thrown when the UDF is generated with illegal types for the parameters 71 | */ 72 | class TypeOfUDFParameterNotSupportedException(kClass: KClass<*>, parameterName: String) : IllegalArgumentException( 73 | "Parameter $parameterName is subclass of ${kClass.qualifiedName}. If you need to process an array use ${Seq::class.qualifiedName}. You can convert any typed array/list-like column using [asSeq()]." 
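// (Added note: the asSeq() overloads below perform exactly this conversion for typed array and
//  collection columns.)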
74 | ) 75 | 76 | @JvmName("arrayColumnAsSeq") 77 | fun TypedColumn>.asSeq(): TypedColumn> = typed() 78 | @JvmName("iterableColumnAsSeq") 79 | fun > TypedColumn.asSeq(): TypedColumn> = typed() 80 | @JvmName("byteArrayColumnAsSeq") 81 | fun TypedColumn.asSeq(): TypedColumn> = typed() 82 | @JvmName("charArrayColumnAsSeq") 83 | fun TypedColumn.asSeq(): TypedColumn> = typed() 84 | @JvmName("shortArrayColumnAsSeq") 85 | fun TypedColumn.asSeq(): TypedColumn> = typed() 86 | @JvmName("intArrayColumnAsSeq") 87 | fun TypedColumn.asSeq(): TypedColumn> = typed() 88 | @JvmName("longArrayColumnAsSeq") 89 | fun TypedColumn.asSeq(): TypedColumn> = typed() 90 | @JvmName("floatArrayColumnAsSeq") 91 | fun TypedColumn.asSeq(): TypedColumn> = typed() 92 | @JvmName("doubleArrayColumnAsSeq") 93 | fun TypedColumn.asSeq(): TypedColumn> = typed() 94 | @JvmName("booleanArrayColumnAsSeq") 95 | fun TypedColumn.asSeq(): TypedColumn> = typed() 96 | 97 | 98 | /** 99 | * Registers a user-defined function (UDF) with name, for a UDF that's already defined using the Dataset 100 | * API (i.e. of type [NamedUserDefinedFunction]). 101 | * @see UDFRegistration.register 102 | */ 103 | inline fun > UDFRegistration.register( 104 | namedUdf: NamedUdf, 105 | ): NamedUdf = namedUdf.copy(udf = register(namedUdf.name, namedUdf.udf)) 106 | 107 | inline fun > UDFRegistration.register( 108 | name: String, 109 | udf: UserDefinedFunction, 110 | ): NamedUdf = udf.withName(name).copy(udf = register(name, udf.udf)) 111 | 112 | /** 113 | * Typed wrapper around [SparkUserDefinedFunction] with defined encoder. 114 | * 115 | * @param Return the return type of the udf 116 | * @param NamedUdf a type reference to the named version of the [SparkUserDefinedFunction] implementing class 117 | */ 118 | sealed interface UserDefinedFunction : Serializable { 119 | val udf: SparkUserDefinedFunction 120 | val encoder: Encoder 121 | 122 | /** Returns true when the UDF can return a nullable value. */ 123 | val nullable: Boolean get() = udf.nullable() 124 | 125 | /** Returns true iff the UDF is deterministic, i.e. the UDF produces the same output given the same input. */ 126 | val deterministic: Boolean get() = udf.deterministic() 127 | 128 | /** Converts this [UserDefinedFunction] to a [NamedUserDefinedFunction]. */ 129 | fun withName(name: String): NamedUdf 130 | 131 | /** 132 | * Converts this [UserDefinedFunction] to a [NamedUserDefinedFunction]. 133 | * @see withName 134 | */ 135 | operator fun getValue(thisRef: Any?, property: KProperty<*>): NamedUdf 136 | } 137 | 138 | /** 139 | * Typed and named wrapper around [SparkUserDefinedFunction] with defined encoder. 140 | * 141 | * @param Return the return type of the udf 142 | * @param This a self reference to the named version of the [SparkUserDefinedFunction] implementing class. 143 | * Unfortunately needed to allow functions to treat any [NamedTypedUserDefinedFunction] as a normal [TypedUserDefinedFunction]. 144 | */ 145 | sealed interface NamedUserDefinedFunction : UserDefinedFunction { 146 | val name: String 147 | } 148 | 149 | /** Copy method for all [NamedUserDefinedFunction] functions. 
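 * For example (illustrative): `myNamedUdf.copy(name = "renamed")` keeps the underlying udf and
 * encoder but changes only the name under which it will be registered; `myNamedUdf` stands in for
 * any [NamedUserDefinedFunction] instance.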
*/ 150 | inline fun > T.copy( 151 | name: String = this.name, 152 | udf: SparkUserDefinedFunction = this.udf, 153 | encoder: Encoder = this.encoder, 154 | ): T = T::class.primaryConstructor!!.run { 155 | callBy( 156 | parameters.associateWith { 157 | when (it.name) { 158 | NamedUserDefinedFunction<*, *>::name.name -> name 159 | NamedUserDefinedFunction<*, *>::udf.name -> udf 160 | NamedUserDefinedFunction<*, *>::encoder.name -> encoder 161 | else -> error("Wrong arguments") 162 | } 163 | } 164 | ) 165 | } 166 | 167 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/test/kotlin/org/jetbrains/kotlinx/spark/api/ApiTest.kt: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.api/*- 2 | * =LICENSE= 3 | * Kotlin Spark API 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | import ch.tutteli.atrium.api.fluent.en_GB.* 21 | import ch.tutteli.atrium.api.verbs.expect 22 | import io.kotest.core.spec.style.ShouldSpec 23 | import io.kotest.matchers.shouldBe 24 | import scala.collection.Seq 25 | import java.io.Serializable 26 | import kotlin.collections.Iterator 27 | import scala.collection.Iterator as ScalaIterator 28 | import scala.collection.Map as ScalaMap 29 | import scala.collection.mutable.Map as ScalaMutableMap 30 | 31 | class ApiTest : ShouldSpec({ 32 | 33 | context("miscellaneous integration tests") { 34 | withSpark(props = mapOf("spark.sql.codegen.comments" to true)) { 35 | 36 | should("Create Seqs") { 37 | spark.createDataset(seqOf(1, 2, 3), encoder()) 38 | .collectAsList() shouldBe listOf(1, 2, 3) 39 | 40 | 41 | seqOf(1, 2, 3) shouldBe seqOf(1, 2, 3) 42 | mutableSeqOf(1, 2, 3) shouldBe mutableSeqOf(1, 2, 3) 43 | 44 | seqOf() shouldBe emptySeq() 45 | mutableSeqOf() shouldBe emptyMutableSeq() 46 | } 47 | 48 | @OptIn(ExperimentalStdlibApi::class) 49 | should("broadcast variables") { 50 | val largeList = (1..15).map { SomeClass(a = (it..15).toList().toIntArray(), b = it) } 51 | val broadcast = spark.broadcast(largeList) 52 | val broadcast2 = spark.broadcast(arrayOf(doubleArrayOf(1.0, 2.0, 3.0, 4.0))) 53 | 54 | val result: List = listOf(1, 2, 3, 4, 5) 55 | .toDS() 56 | .mapPartitions { iterator -> 57 | val receivedBroadcast = broadcast.value 58 | val receivedBroadcast2 = broadcast2.value 59 | 60 | buildList { 61 | iterator.forEach { 62 | this.add(it + receivedBroadcast[it].b * receivedBroadcast2[0][0]) 63 | } 64 | }.iterator() 65 | } 66 | .collectAsList() 67 | 68 | expect(result).toContain.inOrder.only.values(3.0, 5.0, 7.0, 9.0, 11.0) 69 | } 70 | 71 | should("Handle JavaConversions in Kotlin") { 72 | // Test the iterator conversion 73 | val scalaIterator: ScalaIterator = listOf("test1", "test2").iterator().asScalaIterator() 74 | scalaIterator.next() shouldBe "test1" 75 | 76 | val kotlinIterator: Iterator = scalaIterator.asKotlinIterator() 
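// (Added note: asKotlinIterator() wraps the same underlying Scala iterator, which was already
// advanced past "test1" above, so the next element observed here is "test2".)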
77 | kotlinIterator.next() shouldBe "test2" 78 | 79 | 80 | val scalaMap: ScalaMap = mapOf(1 to "a", 2 to "b").asScalaMap() 81 | scalaMap.get(1).get() shouldBe "a" 82 | scalaMap.get(2).get() shouldBe "b" 83 | 84 | val kotlinMap: Map = scalaMap.asKotlinMap() 85 | kotlinMap[1] shouldBe "a" 86 | kotlinMap[2] shouldBe "b" 87 | 88 | 89 | val scalaMutableMap: ScalaMutableMap = mutableMapOf(1 to "a").asScalaMutableMap() 90 | scalaMutableMap.get(1).get() shouldBe "a" 91 | 92 | scalaMutableMap.put(2, "b") 93 | 94 | val kotlinMutableMap: MutableMap = scalaMutableMap.asKotlinMutableMap() 95 | kotlinMutableMap[1] shouldBe "a" 96 | kotlinMutableMap[2] shouldBe "b" 97 | 98 | val scalaSeq: Seq = listOf("a", "b").iterator().asScalaIterator().toSeq() 99 | scalaSeq.take(1).toList().last() shouldBe "a" 100 | scalaSeq.take(2).toList().last() shouldBe "b" 101 | 102 | val kotlinList: List = scalaSeq.asKotlinList() 103 | kotlinList.first() shouldBe "a" 104 | kotlinList.last() shouldBe "b" 105 | } 106 | 107 | should("Map iterators") { 108 | val data = (1..50).toList() 109 | val iterator = iterator { yieldAll(data) } 110 | .map { it.toString() } 111 | 112 | iterator.asSequence().toList() shouldBe data.map { it.toString() } 113 | } 114 | 115 | should("Filter iterators") { 116 | val data = (1..50).toList() 117 | val iterator = iterator { yieldAll(data) } 118 | .filter { it % 2 == 0 } 119 | 120 | iterator.asSequence().toList() shouldBe data.filter { it % 2 == 0 } 121 | } 122 | 123 | should("Partition iterators") { 124 | val data = (1..50).toList() 125 | 126 | val iterator1 = iterator { yieldAll(data) } 127 | .partition(8, cutIncomplete = false) 128 | val result1 = iterator1.asSequence().toList() 129 | result1.size shouldBe (50 / 8 + 1) 130 | result1.map { it.size }.distinct().size shouldBe 2 // two difference sizes should exist, 8 and the rest 131 | 132 | val iterator2 = iterator { yieldAll(data) } 133 | .partition(8, cutIncomplete = true) 134 | 135 | val result2 = iterator2.asSequence().toList() 136 | result2.size shouldBe (50 / 8) 137 | result2.forEach { it.size shouldBe 8 } 138 | } 139 | 140 | should("Flatten iterators") { 141 | val data = (1..50).toList() 142 | val (data1, data2) = data.partition { it <= 25 } 143 | val iterator = iterator { 144 | yield(data1.iterator()) 145 | yield(data2.iterator()) 146 | }.flatten() 147 | 148 | iterator.asSequence().toList() shouldBe data 149 | } 150 | 151 | should("Flatmap iterators using transformAsSequence") { 152 | val data = (1..50).toList() 153 | val iterator = data.iterator() 154 | .transformAsSequence { 155 | flatMap { 156 | listOf(it.toDouble(), it + 0.5) 157 | } 158 | } 159 | 160 | iterator.asSequence().toList() shouldBe data.flatMap { listOf(it.toDouble(), it + 0.5) } 161 | } 162 | } 163 | } 164 | }) 165 | 166 | 167 | // (data) class must be Serializable to be broadcast 168 | data class SomeClass(val a: IntArray, val b: Int) : Serializable 169 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/test/kotlin/org/jetbrains/kotlinx/spark/api/KafkaStreamingTest.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 
9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api 21 | 22 | import io.kotest.core.Tag 23 | import io.kotest.core.extensions.install 24 | import io.kotest.core.spec.style.FunSpec 25 | import io.kotest.extensions.testcontainers.TestContainerExtension 26 | import io.kotest.extensions.testcontainers.kafka.createStringStringConsumer 27 | import io.kotest.extensions.testcontainers.kafka.createStringStringProducer 28 | import io.kotest.matchers.collections.shouldContain 29 | import io.kotest.matchers.collections.shouldContainAll 30 | import org.apache.kafka.clients.consumer.ConsumerConfig 31 | import org.apache.kafka.clients.consumer.ConsumerRecord 32 | import org.apache.kafka.clients.producer.ProducerRecord 33 | import org.apache.kafka.common.serialization.StringDeserializer 34 | import org.apache.spark.streaming.Durations 35 | import org.apache.spark.streaming.api.java.JavaInputDStream 36 | import org.apache.spark.streaming.kafka010.ConsumerStrategies 37 | import org.apache.spark.streaming.kafka010.KafkaUtils 38 | import org.apache.spark.streaming.kafka010.LocationStrategies 39 | import org.jetbrains.kotlinx.spark.api.tuples.* 40 | import org.testcontainers.containers.ContainerLaunchException 41 | import org.testcontainers.containers.KafkaContainer 42 | import org.testcontainers.utility.DockerImageName 43 | import scala.Tuple3 44 | import java.io.Serializable 45 | import java.time.Duration 46 | 47 | object Kafka : Tag() 48 | 49 | class KafkaStreamingTest : FunSpec() { 50 | 51 | init { 52 | 53 | tags(Kafka) 54 | 55 | val kafka = run { 56 | var attempts = 0 57 | while (true) { 58 | try { 59 | return@run install( 60 | TestContainerExtension( 61 | KafkaContainer(DockerImageName.parse("confluentinc/cp-kafka:7.0.1")) 62 | ) 63 | ) { 64 | withEmbeddedZookeeper() 65 | withEnv("KAFKA_AUTO_CREATE_TOPICS_ENABLE", "true") 66 | } 67 | } catch (e: ContainerLaunchException) { 68 | attempts++ 69 | if (attempts > 10) throw e 70 | } 71 | } 72 | @Suppress("UNREACHABLE_CODE") 73 | error("Unreachable") 74 | } 75 | 76 | println(kafka.bootstrapServers) 77 | test("Streaming should support kafka") { 78 | val topic1 = "test1" 79 | val topic2 = "test2" 80 | 81 | val resultLists = mapOf( 82 | topic1 to listOf( 83 | "Hello" X 1, 84 | "this" X 1, 85 | "is" X 1, 86 | "a" X 1, 87 | "test" X 3, 88 | ), 89 | topic2 to listOf( 90 | "This" X 1, 91 | "is" X 1, 92 | "also" X 2, 93 | "a" X 1, 94 | "test" X 2, 95 | "something" X 1, 96 | ) 97 | ) 98 | val data = arrayListOf>>() 99 | 100 | withSparkStreaming( 101 | batchDuration = Durations.milliseconds(1000), 102 | appName = "KotlinDirectKafkaWordCount", 103 | timeout = 10_000L, 104 | master = "local" 105 | ) { 106 | 107 | setRunAfterStart { 108 | val producer = autoClose(kafka.createStringStringProducer()) 109 | producer.send(ProducerRecord(topic1, "Hello this is a test test test")) 110 | producer.send(ProducerRecord(topic2, "This is also also a test test something")) 111 | } 112 | 113 | val kafkaParams: Map = mapOf( 114 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG to 
"${kafka.host}:${kafka.getMappedPort(KafkaContainer.KAFKA_PORT)}", 115 | ConsumerConfig.GROUP_ID_CONFIG to "consumer-group", 116 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG to StringDeserializer::class.java, 117 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG to StringDeserializer::class.java, 118 | ) 119 | // Create direct kafka stream with brokers and topics 120 | val messages: JavaInputDStream> = KafkaUtils.createDirectStream( 121 | ssc, 122 | LocationStrategies.PreferBrokers(), 123 | ConsumerStrategies.Subscribe(setOf(topic1, topic2), kafkaParams), 124 | ) 125 | 126 | // Get the lines, split them into words, count the words and print 127 | 128 | val wordCounts = messages 129 | .map { it.topic() X it.value() } 130 | .flatMapValues { it.split(" ").iterator() } 131 | .map { t(it, 1) } 132 | .reduceByKey { a: Int, b: Int -> a + b } 133 | .map { (tup, counter) -> tup + counter } 134 | 135 | 136 | wordCounts.foreachRDD { rdd, _ -> 137 | data.add(rdd.collect()) 138 | } 139 | } 140 | 141 | val resultList = resultLists.flatMap { (topic, tuples) -> 142 | tuples.map { it.prependedBy(topic) } 143 | } 144 | data.flatten() shouldContainAll resultList 145 | } 146 | } 147 | } -------------------------------------------------------------------------------- /kotlin-spark-api/src/test/kotlin/org/jetbrains/kotlinx/spark/api/ProjectConfig.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api 21 | 22 | import io.kotest.core.config.AbstractProjectConfig 23 | 24 | @Suppress("unused") 25 | object ProjectConfig : AbstractProjectConfig() { 26 | 27 | } 28 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/test/kotlin/org/jetbrains/kotlinx/spark/api/RddTest.kt: -------------------------------------------------------------------------------- 1 | package org.jetbrains.kotlinx.spark.api 2 | 3 | import io.kotest.core.spec.style.ShouldSpec 4 | import io.kotest.matchers.collections.shouldContainAll 5 | import io.kotest.matchers.shouldBe 6 | import org.apache.spark.api.java.JavaRDD 7 | import org.jetbrains.kotlinx.spark.api.tuples.* 8 | import scala.Tuple2 9 | 10 | class RddTest : ShouldSpec({ 11 | context("RDD extension functions") { 12 | 13 | withSpark(logLevel = SparkLogLevel.DEBUG) { 14 | 15 | context("Key/value") { 16 | should("work with spark example") { 17 | val rdd = rddOf(1, 1, 2, 2, 2, 3).map(Int::toString) 18 | 19 | val pairs = rdd.map { it X 1 } 20 | val counts = pairs.reduceByKey { a, b -> a + b } 21 | val list = counts.collect().toList() 22 | list.shouldContainAll("1" X 2, "2" X 3, "3" X 1) 23 | } 24 | 25 | should("Have handy functions") { 26 | val rdd = rddOf( 27 | 1 X "a", 28 | 2 X "b", 29 | 3 X "c", 30 | 4 X "d", 31 | 5 X "e", 32 | 6 X "f", 33 | ) 34 | 35 | //#if sparkMinor >= 3.1 36 | val rangeFiltered: JavaRDD> = rdd.filterByRange(2..5) 37 | rangeFiltered.collect().shouldContainAll( 38 | 2 X "b", 39 | 3 X "c", 40 | 4 X "d", 41 | 5 X "e", 42 | ) 43 | //#endif 44 | 45 | val result = rdd 46 | .flatMapValues { 47 | listOf(it + 1, it + 2, it + 3, it + 4).iterator() 48 | } 49 | .also { 50 | it.countByKey().values.forEach { it shouldBe 4 } 51 | } 52 | .foldByKey("", String::plus) // (1,"a1a2a3a4") etc. 53 | .mapValues { it.toSortedSet().fold("", String::plus) } // (1,"1234a") etc. 54 | .map { it.swap() } // ("1234a",1) etc. 55 | .mapKeys { it.take(4) } // ("1234",1) etc. 
56 | .groupByKey() 57 | .mapValues { it.toList().sorted() } // ("1234",[1,2,3,4,5,6]) 58 | .collect() 59 | .single() 60 | 61 | result shouldBe t("1234", listOf(1, 2, 3, 4, 5, 6)) 62 | } 63 | } 64 | 65 | context("Double functions") { 66 | should("get max/min") { 67 | val rdd = rddOf(1, 1, 2, 2, 2, 3) 68 | 69 | rdd.max() shouldBe 3.0 70 | rdd.min() shouldBe 1.0 71 | } 72 | 73 | context("Work with any number") { 74 | 75 | should("Work with Bytes") { 76 | val data = listOf(1, 1, 2, 2, 2, 3).map(Int::toByte) 77 | val rdd = data.toRDD() 78 | rdd.sum() shouldBe data.sum().toDouble() 79 | } 80 | 81 | should("Work with Shorts") { 82 | val data = listOf(1, 1, 2, 2, 2, 3).map(Int::toShort) 83 | val rdd = data.toRDD() 84 | rdd.sum() shouldBe data.sum().toDouble() 85 | } 86 | 87 | should("Work with Ints") { 88 | val data = listOf(1, 1, 2, 2, 2, 3) 89 | val rdd = data.toRDD() 90 | rdd.sum() shouldBe data.sum().toDouble() 91 | } 92 | 93 | should("Work with Longs") { 94 | val data = listOf(1, 1, 2, 2, 2, 3).map(Int::toLong) 95 | val rdd = data.toRDD() 96 | rdd.sum() shouldBe data.sum().toDouble() 97 | } 98 | 99 | should("Work with Floats") { 100 | val data = listOf(1, 1, 2, 2, 2, 3).map(Int::toFloat) 101 | val rdd = data.toRDD() 102 | rdd.sum() shouldBe data.sum().toDouble() 103 | } 104 | 105 | should("Work with Doubles") { 106 | val data = listOf(1, 1, 2, 2, 2, 3).map(Int::toDouble) 107 | val rdd = data.toRDD().toJavaDoubleRDD() 108 | rdd.sum() shouldBe data.sum().toDouble() 109 | } 110 | } 111 | } 112 | } 113 | } 114 | }) -------------------------------------------------------------------------------- /kotlin-spark-api/src/test/kotlin/org/jetbrains/kotlinx/spark/api/UdtTest.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2022 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api 21 | 22 | import io.kotest.core.spec.style.ShouldSpec 23 | import io.kotest.matchers.shouldBe 24 | import org.glassfish.jersey.internal.guava.MoreObjects 25 | import org.apache.spark.ml.linalg.* 26 | import org.apache.spark.sql.catalyst.InternalRow 27 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 28 | import org.apache.spark.sql.types.* 29 | import org.apache.spark.unsafe.types.UTF8String 30 | import org.jetbrains.kotlinx.spark.api.tuples.t 31 | import kotlin.reflect.jvm.jvmName 32 | 33 | class UdtTest : ShouldSpec({ 34 | context("udt") { 35 | withSpark { 36 | should("Recognize UDTs from libraries like MlLib") { 37 | val input = t( 38 | Vectors.dense(doubleArrayOf(1.0, 2.0, 3.0)), 39 | DenseVector(doubleArrayOf(1.0, 2.0, 3.0)), 40 | SparseVector(3, intArrayOf(0, 1, 2), doubleArrayOf(1.0, 2.0, 3.0)), 41 | Matrices.eye(1), 42 | DenseMatrix.eye(2), 43 | SparseMatrix.speye(2), 44 | ) 45 | 46 | val ds = dsOf(input) 47 | 48 | ds.collectAsList().single() shouldBe input 49 | } 50 | 51 | should("Recognize locally registered UDTs with annotation") { 52 | val input = t( 53 | City("Amsterdam", 1), 54 | City("Breda", 2), 55 | City("Oosterhout", 3), 56 | ) 57 | 58 | val ds = dsOf(input) 59 | 60 | ds.collectAsList().single() shouldBe input 61 | } 62 | 63 | should("Recognize locally registered UDTs with register function") { 64 | UDTRegistration.register(City::class.jvmName, CityUserDefinedType::class.jvmName) 65 | 66 | val input = t( 67 | City("Amsterdam", 1), 68 | City("Breda", 2), 69 | City("Oosterhout", 3), 70 | ) 71 | 72 | val ds = dsOf(input) 73 | 74 | ds.collectAsList().single() shouldBe input 75 | } 76 | 77 | should("Be able to create encoder from UDT too") { 78 | 79 | val input = listOf( 80 | City("Amsterdam", 1), 81 | City("Breda", 2), 82 | City("Oosterhout", 3), 83 | ) 84 | 85 | val ds = input.toDS() 86 | 87 | ds.collectAsList() shouldBe input 88 | } 89 | } 90 | } 91 | }) 92 | 93 | class CityUserDefinedType : UserDefinedType() { 94 | 95 | override fun sqlType(): DataType = DATA_TYPE 96 | 97 | override fun serialize(city: City): InternalRow = GenericInternalRow(2).apply { 98 | setInt(DEPT_NUMBER_INDEX, city.departmentNumber) 99 | update(NAME_INDEX, UTF8String.fromString(city.name)) 100 | } 101 | 102 | override fun deserialize(datum: Any): City = 103 | if (datum is InternalRow) 104 | City( 105 | name = datum.getString(NAME_INDEX), 106 | departmentNumber = datum.getInt(DEPT_NUMBER_INDEX), 107 | ) 108 | else throw IllegalStateException("Unsupported conversion") 109 | 110 | override fun userClass(): Class = City::class.java 111 | 112 | companion object { 113 | private const val DEPT_NUMBER_INDEX = 0 114 | private const val NAME_INDEX = 1 115 | private val DATA_TYPE = DataTypes.createStructType( 116 | arrayOf( 117 | DataTypes.createStructField( 118 | "departmentNumber", 119 | DataTypes.IntegerType, 120 | false, 121 | MetadataBuilder().putLong("maxNumber", 99).build(), 122 | ), 123 | DataTypes.createStructField("name", DataTypes.StringType, false) 124 | ) 125 | ) 126 | } 127 | } 128 | 129 | @SQLUserDefinedType(udt = CityUserDefinedType::class) 130 | class City(val name: String, val departmentNumber: Int) { 131 | 132 | override fun toString(): String = 133 | MoreObjects 134 | .toStringHelper(this) 135 | .add("name", name) 136 | .add("departmentNumber", departmentNumber) 137 | .toString() 138 | 139 | override fun equals(other: Any?): Boolean { 140 | if (this === other) return true 141 | if (javaClass != 
other?.javaClass) return false 142 | 143 | other as City 144 | 145 | if (name != other.name) return false 146 | if (departmentNumber != other.departmentNumber) return false 147 | 148 | return true 149 | } 150 | 151 | override fun hashCode(): Int { 152 | var result = name.hashCode() 153 | result = 31 * result + departmentNumber 154 | return result 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /kotlin-spark-api/src/test/kotlin/org/jetbrains/kotlinx/spark/api/struct/model/models.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API 4 | * ---------- 5 | * Copyright (C) 2019 - 2020 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * =LICENSEEND= 19 | */ 20 | package org.jetbrains.kotlinx.spark.api.struct.model 21 | 22 | import com.beust.klaxon.Converter 23 | import com.beust.klaxon.JsonObject 24 | import com.beust.klaxon.JsonValue 25 | import com.beust.klaxon.Klaxon 26 | 27 | private fun Klaxon.convert( 28 | k: kotlin.reflect.KClass<*>, 29 | fromJson: (JsonValue) -> T, 30 | toJson: (T) -> String, 31 | isUnion: Boolean = false, 32 | ) = 33 | this.converter(object : Converter { 34 | @Suppress("UNCHECKED_CAST") 35 | override fun toJson(value: Any) = toJson(value as T) 36 | 37 | override fun fromJson(jv: JsonValue) = fromJson(jv) as Any 38 | override fun canConvert(cls: Class<*>) = cls == k.java || (isUnion && cls.superclass == k.java) 39 | }) 40 | 41 | private val klaxon = Klaxon() 42 | .convert(JsonObject::class, { it.obj!! }, { it.toJsonString() }) 43 | .convert(DataType::class, { DataType.fromJson(it) }, { it.toJson() }, true) 44 | .convert(ElementType::class, { ElementType.fromJson(it) }, { it.toJson() }, true) 45 | 46 | data class Struct( 47 | val type: String, 48 | val fields: List? = null, 49 | val containsNull: Boolean? = null, 50 | val elementType: ElementType? = null, 51 | ) { 52 | public fun toJson() = klaxon.toJsonString(this) 53 | 54 | companion object { 55 | public fun fromJson(json: String) = klaxon.parse(json) 56 | } 57 | } 58 | 59 | data class StructField( 60 | val name: String, 61 | val type: DataType, 62 | val nullable: Boolean, 63 | val metadata: Metadata, 64 | ) 65 | 66 | typealias Metadata = JsonObject 67 | 68 | sealed class DataType { 69 | data class StructType(val value: Struct) : DataType() 70 | data class TypeName(val value: String) : DataType() 71 | 72 | public fun toJson(): String = klaxon.toJsonString(when (this) { 73 | is StructType -> this.value 74 | is TypeName -> this.value 75 | }) 76 | 77 | companion object { 78 | public fun fromJson(jv: JsonValue): DataType = when (jv.inside) { 79 | is JsonObject -> StructType(jv.obj?.let { klaxon.parseFromJsonObject(it) }!!) 80 | is String -> TypeName(jv.string!!) 
81 | else -> throw IllegalArgumentException() 82 | } 83 | } 84 | } 85 | 86 | sealed class ElementType { 87 | data class SimpleElement(val value: String) : ElementType() 88 | data class ComplexElement(val value: Struct) : ElementType() 89 | 90 | public fun toJson(): String = klaxon.toJsonString(when (this) { 91 | is SimpleElement -> this.value 92 | is ComplexElement -> this.value 93 | }) 94 | 95 | companion object { 96 | public fun fromJson(jv: JsonValue): ElementType = when (jv.inside) { 97 | is JsonObject -> ComplexElement(jv.obj?.let { klaxon.parseFromJsonObject(it) }!!) 98 | is String -> SimpleElement(jv.string!!) 99 | else -> throw IllegalArgumentException() 100 | } 101 | } 102 | 103 | } 104 | 105 | -------------------------------------------------------------------------------- /qodana.yaml: -------------------------------------------------------------------------------- 1 | version: "1.0" 2 | linter: jetbrains/qodana-jvm-community:2021.3 3 | profile: 4 | name: qodana.recommended 5 | exclude: 6 | - name: All 7 | paths: 8 | - scala-tuples-in-kotlin/src/main/kotlin/org/jetbrains/kotlinx/spark/api/tuples 9 | - kotlin-spark-api/3.2/src/main/kotlin/org/jetbrains/kotlinx/spark/api/Arities.kt 10 | -------------------------------------------------------------------------------- /scala-tuples-in-kotlin/build.gradle.kts: -------------------------------------------------------------------------------- 1 | @file:Suppress("UnstableApiUsage") 2 | 3 | import com.vanniktech.maven.publish.JavadocJar.Dokka 4 | import com.vanniktech.maven.publish.KotlinJvm 5 | import org.jetbrains.dokka.gradle.AbstractDokkaLeafTask 6 | import org.jetbrains.dokka.gradle.DokkaTask 7 | import org.jetbrains.dokka.gradle.DokkaTaskPartial 8 | import org.jetbrains.kotlin.gradle.tasks.KotlinCompile 9 | 10 | plugins { 11 | scala 12 | kotlin 13 | dokka 14 | mavenPublishBase 15 | } 16 | 17 | group = Versions.groupID 18 | version = Versions.project 19 | 20 | repositories { 21 | mavenCentral() 22 | } 23 | 24 | tasks.withType().configureEach { 25 | useJUnitPlatform() 26 | maxHeapSize = "4g" 27 | } 28 | 29 | dependencies { 30 | with(Dependencies) { 31 | implementation( 32 | kotlinStdLib, 33 | scalaLibrary, 34 | ) 35 | testImplementation( 36 | kotest, 37 | atrium, 38 | kotlinTest, 39 | ) 40 | } 41 | } 42 | 43 | 44 | kotlin { 45 | jvmToolchain { 46 | languageVersion.set( 47 | JavaLanguageVersion.of(Versions.jvmTarget) 48 | ) 49 | } 50 | } 51 | 52 | 53 | tasks.withType { 54 | dokkaSourceSets { 55 | all { 56 | sourceRoot( 57 | kotlin.sourceSets 58 | .main.get() 59 | .kotlin 60 | .srcDirs 61 | .first { it.path.endsWith("kotlin") } 62 | ) 63 | } 64 | } 65 | } 66 | 67 | mavenPublishing { 68 | configure(KotlinJvm(Dokka("dokkaHtml"))) 69 | } 70 | 71 | 72 | // Publishing of scala-tuples-in-kotlin can be skipped since it's only dependent on the Scala version 73 | val skipScalaTuplesInKotlin = System.getProperty("skipScalaTuplesInKotlin").toBoolean() 74 | tasks 75 | .filter { "publish" in it.name } 76 | .forEach { it.onlyIf { !skipScalaTuplesInKotlin } } 77 | 78 | -------------------------------------------------------------------------------- /scala-tuples-in-kotlin/src/main/kotlin/org/jetbrains/kotlinx/spark/api/Conversions.kt: -------------------------------------------------------------------------------- 1 | /*- 2 | * =LICENSE= 3 | * Kotlin Spark API: API for Spark 3.0+ (Scala 2.12) 4 | * ---------- 5 | * Copyright (C) 2019 - 2021 JetBrains 6 | * ---------- 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may 
not use this file except in compliance with the License.
9 |  * You may obtain a copy of the License at
10 |  * 
11 |  * http://www.apache.org/licenses/LICENSE-2.0
12 |  * 
13 |  * Unless required by applicable law or agreed to in writing, software
14 |  * distributed under the License is distributed on an "AS IS" BASIS,
15 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  * See the License for the specific language governing permissions and
17 |  * limitations under the License.
18 |  * =LICENSEEND=
19 |  */
20 | 
21 | /**
22 |  * This file contains conversions of Tuples between the Scala-
23 |  * and Kotlin/Java variants.
24 |  */
25 | 
26 | @file:Suppress("NOTHING_TO_INLINE", "RemoveExplicitTypeArguments", "unused")
27 | 
28 | package org.jetbrains.kotlinx.spark.api
29 | 
30 | import scala.*
31 | 
32 | 
33 | /**
34 |  * Returns a new [Tuple2] based on the arguments in the current [Pair].
35 |  */
36 | fun <A, B> Pair<A, B>.toTuple(): Tuple2<A, B> = Tuple2(first, second)
37 | 
38 | /**
39 |  * Returns a new [Pair] based on the arguments in the current [Tuple2].
40 |  */
41 | fun <A, B> Tuple2<A, B>.toPair(): Pair<A, B> = Pair(_1(), _2())
42 | 
43 | /**
44 |  * Returns a new [Tuple3] based on the arguments in the current [Triple].
45 |  */
46 | fun <A, B, C> Triple<A, B, C>.toTuple(): Tuple3<A, B, C> = Tuple3(first, second, third)
47 | 
48 | /**
49 |  * Returns a new [Triple] based on the arguments in the current [Tuple3].
50 |  */
51 | fun <A, B, C> Tuple3<A, B, C>.toTriple(): Triple<A, B, C> = Triple(_1(), _2(), _3())
52 | 
--------------------------------------------------------------------------------
/scala-tuples-in-kotlin/src/main/kotlin/org/jetbrains/kotlinx/spark/api/tuples/EmptyTuple.kt:
--------------------------------------------------------------------------------
1 | /*-
2 |  * =LICENSE=
3 |  * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12)
4 |  * ----------
5 |  * Copyright (C) 2019 - 2022 JetBrains
6 |  * ----------
7 |  * Licensed under the Apache License, Version 2.0 (the "License");
8 |  * you may not use this file except in compliance with the License.
9 |  * You may obtain a copy of the License at
10 |  * 
11 |  * http://www.apache.org/licenses/LICENSE-2.0
12 |  * 
13 |  * Unless required by applicable law or agreed to in writing, software
14 |  * distributed under the License is distributed on an "AS IS" BASIS,
15 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  * See the License for the specific language governing permissions and
17 |  * limitations under the License.
18 |  * =LICENSEEND=
19 |  */
20 | package org.jetbrains.kotlinx.spark.api.tuples
21 | 
22 | import scala.*
23 | import java.io.Serializable
24 | 
25 | /**
26 |  * Just as in Scala3, we provide the [EmptyTuple]. It is the result of dropping the last item from a [Tuple1]
27 |  * or when calling `tupleOf()` for instance.
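 *
 * A small illustration (`tupleOf()` is the builder mentioned above; `emptyTuple()` is defined below):
 *
 * ```kotlin
 * emptyTuple() == tupleOf()        // both yield the single EmptyTuple instance
 * emptyTuple().productArity() == 0 // it carries no elements
 * ```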
28 |  */
29 | 
30 | object EmptyTuple : Product, Serializable {
31 |     override fun canEqual(that: Any?): Boolean = that == EmptyTuple
32 |     override fun productElement(n: Int): Nothing = throw IndexOutOfBoundsException("EmptyTuple has no members")
33 |     override fun productArity(): Int = 0
34 |     override fun toString(): String = "()"
35 | }
36 | 
37 | public fun emptyTuple(): EmptyTuple = EmptyTuple
38 | 
--------------------------------------------------------------------------------
/scala-tuples-in-kotlin/src/main/kotlin/org/jetbrains/kotlinx/spark/api/tuples/ProductExtensions.kt:
--------------------------------------------------------------------------------
1 | /*-
2 |  * =LICENSE=
3 |  * Kotlin Spark API: API for Spark 3.2+ (Scala 2.12)
4 |  * ----------
5 |  * Copyright (C) 2019 - 2022 JetBrains
6 |  * ----------
7 |  * Licensed under the Apache License, Version 2.0 (the "License");
8 |  * you may not use this file except in compliance with the License.
9 |  * You may obtain a copy of the License at
10 |  * 
11 |  * http://www.apache.org/licenses/LICENSE-2.0
12 |  * 
13 |  * Unless required by applicable law or agreed to in writing, software
14 |  * distributed under the License is distributed on an "AS IS" BASIS,
15 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  * See the License for the specific language governing permissions and
17 |  * limitations under the License.
18 |  * =LICENSEEND=
19 |  */
20 | package org.jetbrains.kotlinx.spark.api.tuples
21 | 
22 | import scala.Product
23 | import scala.collection.JavaConverters
24 | import kotlin.jvm.Throws
25 | 
26 | /**
27 |  * Extra extensions for Scala [Product]s such as Tuples.
28 |  * In most cases, the functions of `SameTypeProductExtensions.kt` will be used
29 |  * instead of these. But these help for the overview and generic case.
30 |  *
31 |  * For example:
32 |  *
33 |  * ```kotlin
34 |  * 1 in tupleOf(1, 2, 3) == true
35 |  *
36 |  * for (x in tupleOf("a", "b", "c")) { ... }
37 |  *
38 |  * val a: List<Any?> = tupleOf(1, "a", 3L).asIterable().toList()
39 |  *
40 |  * tupleOf(1, 2, 3).size == 3
41 |  *
42 |  * tupleOf(1, 2, 3)[0] == 1
43 |  *
44 |  * tupleOf(1, 1, 2)[1..2] == tupleOf(1, 2, 2)[0..1]
45 |  * ```
46 |  *
47 |  */
48 | 
49 | /** Tests whether this iterator contains a given value as an element.
50 |  * Note: may not terminate for infinite iterators.
51 |  *
52 |  * @param item the element to test.
53 |  * @return `true` if this iterator produces some value that
54 |  * is equal (as determined by `==`) to `elem`, `false` otherwise.
55 |  * @note Reuse: After calling this method, one should discard the iterator it was called on.
56 |  * Using it is undefined and subject to change.
57 |  */
58 | operator fun Product.contains(item: Any?): Boolean = productIterator().contains(item)
59 | 
60 | /**
61 |  * An iterator over all the elements of this product.
62 |  * @return in the default implementation, an `Iterator`
63 |  */
64 | operator fun Product.iterator(): Iterator<Any?> = JavaConverters.asJavaIterator(productIterator())
65 | 
66 | /**
67 |  * Converts this product to an `Any?` iterable.
68 |  */
69 | fun Product.asIterable(): Iterable<Any?> = object : Iterable<Any?> {
70 |     override fun iterator(): Iterator<Any?> = JavaConverters.asJavaIterator(productIterator())
71 | }
72 | 
73 | /** The size of this product.
74 |  * @return for a product `A(x,,1,,, ..., x,,k,,)`, returns `k`
75 |  */
76 | val Product.size: Int
77 |     get() = productArity()
78 | 
79 | /** The n'th element of this product, 0-based.
In other words, for a 80 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 81 | * 82 | * @param n the index of the element to return 83 | * @throws IndexOutOfBoundsException 84 | * @return the element `n` elements after the first element 85 | */ 86 | @Throws(IndexOutOfBoundsException::class) 87 | operator fun Product.get(n: Int): Any? = productElement(n) 88 | 89 | /** The n'th element of this product, 0-based. In other words, for a 90 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 91 | * 92 | * @param n the index of the element to return 93 | * @return the element `n` elements after the first element, `null` if out of bounds 94 | */ 95 | fun Product.getOrNull(n: Int): Any? = if (n in 0 until size) productElement(n) else null 96 | 97 | /** The n'th element of this product, 0-based. In other words, for a 98 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 99 | * The result is cast to the given type [T]. 100 | * 101 | * @param n the index of the element to return 102 | * @throws IndexOutOfBoundsException 103 | * @throws ClassCastException 104 | * @return the element `n` elements after the first element 105 | */ 106 | @Suppress("UNCHECKED_CAST") 107 | @Throws(IndexOutOfBoundsException::class, ClassCastException::class) 108 | inline fun Product.getAs(n: Int): T = productElement(n) as T 109 | 110 | /** The n'th element of this product, 0-based. In other words, for a 111 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 112 | * The result is cast to the given type [T]. 113 | * 114 | * @param n the index of the element to return 115 | * @return the element `n` elements after the first element, `null` if out of bounds or unable to be cast 116 | */ 117 | @Suppress("UNCHECKED_CAST") 118 | inline fun Product.getAsOrNull(n: Int): T? = getOrNull(n) as? T 119 | 120 | /** The range of n'th elements of this product, 0-based. In other words, for a 121 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 122 | * 123 | * @param indexRange the indices of the elements to return 124 | * @throws IndexOutOfBoundsException 125 | * @return the elements in [indexRange] 126 | */ 127 | @Throws(IndexOutOfBoundsException::class) 128 | operator fun Product.get(indexRange: IntRange): List = indexRange.map(::get) 129 | 130 | /** The range of n'th elements of this product, 0-based. In other words, for a 131 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 132 | * 133 | * @param indexRange the indices of the elements to return 134 | * @return the elements in [indexRange], `null` if out of bounds 135 | */ 136 | fun Product.getOrNull(indexRange: IntRange): List = indexRange.map(::getOrNull) 137 | 138 | /** The range of n'th elements of this product, 0-based. In other words, for a 139 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 140 | * The results are cast to the given type [T]. 141 | * 142 | * @param indexRange the indices of the elements to return 143 | * @throws IndexOutOfBoundsException 144 | * @throws ClassCastException 145 | * @return the elements in [indexRange] 146 | */ 147 | @Throws(IndexOutOfBoundsException::class, ClassCastException::class) 148 | inline fun Product.getAs(indexRange: IntRange): List = indexRange.map(::getAs) 149 | 150 | /** The range of n'th elements of this product, 0-based. In other words, for a 151 | * product `A(x,,1,,, ..., x,,k,,)`, returns `x,,(n+1),,` where `0 <= n < k`. 
152 |  * The results are cast to the given type [T].
153 |  *
154 |  * @param indexRange the indices of the elements to return
155 |  * @return the elements in [indexRange], `null` if out of bounds or unable to be cast
156 |  */
157 | inline fun <reified T> Product.getAsOrNull(indexRange: IntRange): List<T?> = indexRange.map(::getAsOrNull)
158 | 
--------------------------------------------------------------------------------
/settings.gradle.kts:
--------------------------------------------------------------------------------
1 | plugins {
2 |     id("com.gradle.enterprise") version "3.10.3"
3 | }
4 | 
5 | gradleEnterprise {
6 |     buildScan {
7 |         termsOfServiceUrl = "https://gradle.com/terms-of-service"
8 |         termsOfServiceAgree = "yes"
9 |     }
10 | }
11 | 
12 | 
13 | val spark: String by settings
14 | val scala: String by settings
15 | val skipScalaTuplesInKotlin: String by settings
16 | System.setProperty("spark", spark)
17 | System.setProperty("scala", scala)
18 | System.setProperty("skipScalaTuplesInKotlin", skipScalaTuplesInKotlin)
19 | 
20 | 
21 | val scalaCompat
22 |     get() = scala.substringBeforeLast('.')
23 | 
24 | val versions = "${spark}_${scalaCompat}"
25 | 
26 | rootProject.name = "kotlin-spark-api-parent_$versions"
27 | 
28 | include("core")
29 | include("scala-tuples-in-kotlin")
30 | include("kotlin-spark-api")
31 | include("jupyter")
32 | include("examples")
33 | 
34 | project(":core").name = "core_$versions"
35 | project(":scala-tuples-in-kotlin").name = "scala-tuples-in-kotlin_$scalaCompat"
36 | project(":kotlin-spark-api").name = "kotlin-spark-api_$versions"
37 | project(":jupyter").name = "jupyter_$versions"
38 | project(":examples").name = "examples_$versions"
39 | 
40 | buildCache {
41 |     local {
42 |         removeUnusedEntriesAfterDays = 30
43 |     }
44 | }
45 | 
--------------------------------------------------------------------------------
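The tuple utilities in `scala-tuples-in-kotlin` are meant to compose. The sketch below shows one way they can fit together; it only uses the conversions from `Conversions.kt`, the `Product` extensions from `ProductExtensions.kt`, and `emptyTuple()` from `EmptyTuple.kt`, and it assumes that `tupleOf` (from `TupleBuilders.kt`, listed in the tree but not reproduced in this dump) builds a `Tuple3` for three arguments, as its usage in the `ProductExtensions.kt` KDoc suggests.

```kotlin
import org.jetbrains.kotlinx.spark.api.toPair
import org.jetbrains.kotlinx.spark.api.toTuple
import org.jetbrains.kotlinx.spark.api.tuples.*
import scala.Tuple2

fun main() {
    // Pair <-> Tuple2 round trip via Conversions.kt
    val tuple: Tuple2<Int, String> = (1 to "a").toTuple()
    val pair: Pair<Int, String> = tuple.toPair()
    println(pair)                      // (1, a)

    // Generic Product accessors from ProductExtensions.kt
    val t3 = tupleOf(1, "a", 3L)       // assumed builder, as used in the ProductExtensions KDoc
    println(t3.size)                   // 3
    println(t3[0])                     // 1 (0-based operator get)
    println(t3.getAs<String>(1))       // "a" (element 1 cast to String)
    println("a" in t3)                 // true (operator contains)
    for (x in t3) println(x)           // iterates all elements as Any?

    // EmptyTuple is the zero-arity case
    println(emptyTuple().size)         // 0
}
```

Note the design split visible in `ProductExtensions.kt`: `getAs` throws on an out-of-bounds index or a failed cast, while the `getAsOrNull` variants return `null` instead.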