├── .cargo └── config.toml ├── .git-blame-ignore-revs ├── .github ├── dependabot.yml └── workflows │ ├── pr-lint.yml │ └── release.yml ├── .gitignore ├── .scalafmt.conf ├── LICENSE ├── README.md ├── build.sbt ├── core └── src │ ├── main │ └── scala │ │ └── org │ │ └── polars │ │ └── scala │ │ └── polars │ │ ├── NativeLoader.scala │ │ ├── Polars.scala │ │ ├── api │ │ ├── DataFrame.scala │ │ ├── JSeries.java │ │ ├── LazyFrame.scala │ │ ├── Row.scala │ │ ├── Series.scala │ │ ├── expressions │ │ │ ├── Column.scala │ │ │ └── Expression.scala │ │ ├── io │ │ │ ├── Scannable.scala │ │ │ └── Writeable.scala │ │ └── types │ │ │ ├── DataTypes.scala │ │ │ └── Schema.scala │ │ ├── config │ │ ├── Config.scala │ │ └── constants.scala │ │ ├── functions.scala │ │ ├── internal │ │ └── jni │ │ │ ├── Natively.scala │ │ │ ├── common.scala │ │ │ ├── data_frame.scala │ │ │ ├── expressions │ │ │ ├── column_expr.scala │ │ │ └── literal_expr.scala │ │ │ ├── io │ │ │ ├── scan.scala │ │ │ └── write.scala │ │ │ ├── lazy_frame.scala │ │ │ ├── row.scala │ │ │ └── series.scala │ │ └── package.scala │ └── site │ └── index.html ├── examples └── src │ └── main │ ├── java │ └── examples │ │ └── java │ │ ├── InstantiateDataFrame.java │ │ ├── InstantiateSeries.java │ │ ├── configuration │ │ └── ConfiguringPolars.java │ │ ├── expressions │ │ └── ApplyingSimpleExpressions.java │ │ └── io │ │ ├── LazyAndEagerAPI.java │ │ ├── ReadingFileDatasets.java │ │ └── WritingToFileDatasets.java │ ├── resources │ └── files │ │ └── web-ds │ │ ├── data.csv │ │ ├── data.ipc │ │ ├── data.json │ │ └── data.parquet │ └── scala │ └── examples │ └── scala │ ├── InstantiateDataFrame.scala │ ├── InstantiateSeries.scala │ ├── configuration │ └── ConfiguringPolars.scala │ ├── expressions │ └── ApplyingSimpleExpressions.scala │ ├── io │ ├── LazyAndEagerAPI.scala │ ├── ReadingFileDatasets.scala │ └── WritingToFileDatasets.scala │ └── utils │ └── CommonUtils.scala ├── native ├── Cargo.lock ├── Cargo.toml ├── rustfmt.toml └── src │ ├── internal_jni │ ├── expr │ │ ├── column.rs │ │ ├── literal.rs │ │ └── mod.rs │ ├── frame.rs │ ├── io │ │ ├── mod.rs │ │ ├── scan │ │ │ ├── csv.rs │ │ │ ├── ipc.rs │ │ │ ├── json_lines.rs │ │ │ ├── mod.rs │ │ │ └── parquet.rs │ │ └── write │ │ │ ├── avro.rs │ │ │ ├── csv.rs │ │ │ ├── ipc.rs │ │ │ ├── json.rs │ │ │ ├── mod.rs │ │ │ └── parquet.rs │ ├── lazy.rs │ ├── mod.rs │ ├── row.rs │ ├── series.rs │ └── utils.rs │ ├── lib.rs │ └── utils │ ├── error.rs │ └── mod.rs ├── project ├── DocSettings.scala ├── ExtraCommands.scala ├── GeneralSettings.scala ├── NativeBuildSettings.scala ├── ProjectDependencies.scala ├── PublishingSettings.scala ├── Utils.scala ├── build.properties └── plugins.sbt └── version.sbt /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.aarch64-unknown-linux-gnu] 2 | linker = "aarch64-linux-gnu-gcc" 3 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Scala Steward: Reformat with scalafmt 3.8.6 2 | 7500ed15d9d50a19828222fc9a521c84d7d8b2e1 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 
-------------------------------------------------------------------------------- /.github/workflows/pr-lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint All 2 | 3 | on: [ pull_request ] 4 | 5 | permissions: 6 | contents: read 7 | 8 | env: 9 | RUSTFLAGS: -C debuginfo=0 10 | 11 | jobs: 12 | check-formatting: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_TOKEN: ${{ secrets.MY_GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up JDK 20 | uses: actions/setup-java@v4 21 | with: 22 | java-version: "8" 23 | distribution: 'temurin' 24 | cache: 'sbt' 25 | 26 | - name: Install rust toolchain 27 | uses: dtolnay/rust-toolchain@nightly 28 | with: 29 | components: "clippy, rustfmt" 30 | 31 | - uses: Swatinem/rust-cache@v2 32 | with: 33 | workspaces: native 34 | prefix-key: lint 35 | 36 | - name: Check all formatting 37 | run: | 38 | cargo install cargo-sort 39 | sbt fmtCheckAll 40 | 41 | - name: Check doc issues 42 | run: sbt makeSite 43 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish Artifacts 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: read 9 | 10 | env: 11 | NATIVE_LIB_LOCATION: /tmp/native-libs/ 12 | SBT_OPTS: "-Dsbt.ci=true" 13 | JAVA_OPTS: "-XX:+UseG1GC -Xms2G -Xmx8G -Xss6M -XX:ReservedCodeCacheSize=256M -Dfile.encoding=UTF-8" 14 | 15 | jobs: 16 | check-formatting: 17 | runs-on: ubuntu-latest 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.MY_GITHUB_TOKEN }} 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Set up JDK 24 | uses: actions/setup-java@v4 25 | with: 26 | java-version: "8" 27 | distribution: "zulu" 28 | cache: "sbt" 29 | 30 | - uses: sbt/setup-sbt@v1 31 | 32 | - name: Install rust toolchain 33 | uses: dtolnay/rust-toolchain@nightly 34 | with: 35 | components: "clippy, rustfmt" 36 | 37 | - uses: Swatinem/rust-cache@v2 38 | with: 39 | workspaces: native 40 | prefix-key: lint 41 | 42 | - name: Check all formatting 43 | run: | 44 | cargo install cargo-sort 45 | sbt fmtCheckAll 46 | 47 | - name: Check doc issues 48 | run: sbt makeSite 49 | 50 | build-natives: 51 | name: ${{ matrix.arch }} build 52 | runs-on: ${{ matrix.os }} 53 | env: 54 | TARGET_TRIPLE: ${{ matrix.arch }} 55 | needs: [check-formatting] 56 | strategy: 57 | fail-fast: true 58 | matrix: 59 | include: 60 | - os: ubuntu-latest 61 | arch: aarch64-unknown-linux-gnu 62 | packages: "sudo apt update && sudo apt-get install gcc-aarch64-linux-gnu" 63 | 64 | - os: ubuntu-latest 65 | arch: x86_64-unknown-linux-gnu 66 | packages: "" 67 | 68 | - os: windows-latest 69 | arch: aarch64-pc-windows-msvc 70 | packages: "" 71 | 72 | - os: windows-latest 73 | arch: x86_64-pc-windows-msvc 74 | packages: "" 75 | 76 | - os: macos-latest 77 | arch: x86_64-apple-darwin 78 | packages: "brew install sbt" 79 | 80 | - os: macos-latest 81 | arch: aarch64-apple-darwin 82 | packages: "" 83 | steps: 84 | - name: Install system packages 85 | run: ${{ matrix.packages }} 86 | 87 | - uses: actions/checkout@v4 88 | 89 | - name: Set up JDK 90 | uses: actions/setup-java@v4 91 | with: 92 | java-version: "8" 93 | distribution: "zulu" 94 | cache: "sbt" 95 | 96 | - uses: sbt/setup-sbt@v1 97 | 98 | - name: Install rust toolchain 99 | uses: dtolnay/rust-toolchain@nightly 100 | 101 | - uses: Swatinem/rust-cache@v2 102 | with: 103 | workspaces: native 104 | prefix-key: ${{ 
matrix.arch }} 105 | 106 | - name: Cross publish artifacts containing native library 107 | run: | 108 | rustup target add ${{ matrix.arch }} 109 | sbt generateNativeLibrary 110 | 111 | - name: Temporarily save native library for ${{ matrix.arch }} 112 | uses: actions/upload-artifact@v4 113 | with: 114 | name: native_libs-${{ matrix.arch }} 115 | path: ${{env.NATIVE_LIB_LOCATION}} 116 | retention-days: 1 117 | if-no-files-found: error 118 | 119 | test-build: 120 | name: ${{ matrix.os }} ${{ matrix.java }} test 121 | runs-on: ${{ matrix.os }} 122 | needs: [build-natives] 123 | env: 124 | SKIP_NATIVE_GENERATION: true 125 | strategy: 126 | fail-fast: false 127 | matrix: 128 | java: ["8", "11", "17", "21"] 129 | os: ["ubuntu-latest", "windows-latest", "macos-latest"] 130 | 131 | steps: 132 | - uses: actions/checkout@v4 133 | 134 | - name: Set up JDK 135 | uses: actions/setup-java@v4 136 | with: 137 | java-version: ${{ matrix.java }} 138 | distribution: "zulu" 139 | cache: "sbt" 140 | 141 | - uses: sbt/setup-sbt@v1 142 | 143 | - name: Download artifacts 144 | uses: actions/download-artifact@v4 145 | with: 146 | pattern: native_libs-* 147 | path: ${{env.NATIVE_LIB_LOCATION}} 148 | merge-multiple: true 149 | 150 | - name: Test for ${{ matrix.os }} ${{ matrix.java }} 151 | run: | 152 | sbt +assembly 153 | java -cp ./examples/target/scala-2.12/scala-polars-examples-assembly-0.1.0-SNAPSHOT.jar examples.scala.io.LazyAndEagerAPI 154 | java -cp ./examples/target/scala-2.13/scala-polars-examples-assembly-0.1.0-SNAPSHOT.jar examples.scala.io.LazyAndEagerAPI 155 | java -cp ./examples/target/scala-3.3.4/scala-polars-examples-assembly-0.1.0-SNAPSHOT.jar examples.scala.io.LazyAndEagerAPI 156 | 157 | publish: 158 | timeout-minutes: 15 159 | runs-on: ubuntu-latest 160 | env: 161 | GITHUB_TOKEN: ${{ secrets.MY_GITHUB_TOKEN }} 162 | SKIP_NATIVE_GENERATION: true 163 | needs: [test-build] 164 | steps: 165 | - uses: actions/checkout@v4 166 | 167 | - name: Configure SSH 168 | uses: webfactory/ssh-agent@v0.9.1 169 | with: 170 | ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} 171 | 172 | - name: Set up JDK 173 | uses: actions/setup-java@v4 174 | with: 175 | java-version: "8" 176 | distribution: "zulu" 177 | cache: "sbt" 178 | 179 | - uses: sbt/setup-sbt@v1 180 | 181 | - name: Download artifacts 182 | uses: actions/download-artifact@v4 183 | with: 184 | pattern: native_libs-* 185 | path: ${{env.NATIVE_LIB_LOCATION}} 186 | merge-multiple: true 187 | 188 | - name: List the built artifacts 189 | run: ls -lhtR 190 | working-directory: ${{env.NATIVE_LIB_LOCATION}} 191 | 192 | - name: Publish Artifacts 193 | run: sbt +aetherDeploy 194 | 195 | - name: Publish API Docs 196 | run: | 197 | git config --global user.email "git@github.com" 198 | git config --global user.name "git" 199 | sbt ghpagesPushSite 200 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .DS_Store 4 | 5 | # IntelliJ IDEA specific 6 | .idea/ 7 | .fleet/ 8 | *.iml 9 | 10 | # SBT specific 11 | .bsp/ 12 | coverage.xml 13 | target/ 14 | .classpath 15 | .project 16 | .settings/ 17 | .metals 18 | .bloop 19 | metals.sbt 20 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.8.6 2 | runner.dialect = scala213 3 | project.git = true 4 | maxColumn = 98 5 | rewrite { 6 | rules = [ 
7 | Imports, 8 | RedundantBraces, 9 | RedundantParens 10 | ] 11 | imports { 12 | sort = ascii 13 | groups = [ 14 | ["javax?\\..*"], 15 | ["sbt\\..*"], 16 | ["scala\\..*"], 17 | ["org\\..*"] 18 | ] 19 | } 20 | } 21 | align.tokens = none 22 | assumeStandardLibraryStripMargin = true 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scala-polars 2 | ============ 3 | 4 | `scala-polars` is a library for using the awesome [Polars](https://www.pola.rs/) DataFrame library in 5 | Scala and Java projects. 6 | 7 | ## About 8 | 9 | ### About Polars 10 | 11 | Polars is a blazing fast DataFrames library implemented in Rust using 12 | [Apache Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) as the memory model. 13 | 14 | - Lazy / eager execution 15 | - Multithreaded 16 | - SIMD 17 | - Query optimization 18 | - Powerful expression API 19 | - Hybrid Streaming (larger than RAM datasets) 20 | - Rust | Python | NodeJS | ... 21 | 22 | ### About scala-polars 23 | 24 | This library is written mostly in Scala and leverages [JNI](https://en.wikipedia.org/wiki/Java_Native_Interface) 25 | to offload heavy data processing tasks to its native counterpart written completely in Rust. The aim of this library is 26 | to provide an easy-to-use interface through which Scala/Java developers can leverage the amazing Polars library 27 | in their existing projects. 28 | 29 | The project is mainly divided into 2 submodules, 30 | 31 | - `core` - Contains the user-facing interfaces written in Scala that are used to work with data. Internally, this 32 | module relies on the `native` submodule. 33 | - `native` - An internal module written in Rust which relies on the official Rust implementation of Polars. 34 | 35 | ### Examples 36 | 37 | - [Java Examples](examples/src/main/java/examples/java/) 38 | - [Scala Examples](examples/src/main/scala/examples/scala/) 39 | 40 | ## Compatibility 41 | 42 | - JDK version `>=8` 43 | - Scala version `2.12.x`, `2.13.x` and `3.3.x`. Default is `2.13.x` 44 | - Rust version `>=1.58` 45 | 46 | ## Building 47 | 48 | ### Prerequisites 49 | 50 | The following tooling is required to start building `scala-polars`, 51 | 52 | - JDK 8+ ([OpenJDK](https://openjdk.org/projects/jdk/) 53 | or [Oracle Java SE](https://www.oracle.com/java/technologies/javase/)) 54 | - [Rust](https://www.rust-lang.org/tools/install) (cargo, rustc etc.) 55 | - [sbt](https://www.scala-sbt.org/index.html) 56 | 57 | ### How to Compile? 58 | 59 | sbt is the primary build tool for this project, and all the required interlinking has been done in such a way that your 60 | IntelliJ IDE build or an external build works in the same way. This means that whether you are in development mode or want to 61 | build to distribute, the build process remains the same and is more or less abstracted. 62 | 63 | The build process that sbt triggers involves the following steps, 64 | 65 | - Compile the Rust code present in the `native` module to a binary. 66 | - Compile the Scala and Java (if any) facade code. 67 | - Copy the built Rust binary to a fixed location on the classpath of the Scala code during its build. 68 | 69 | All of the above steps happen automatically when you run an sbt build job that triggers the `compile` phase. Other than 70 | this, during the package phase, the Scala and Java code and the built Rust binary are added to the built jar(s). 
To keep 71 | everything monolithic, the `native` module is not packaged as a jar; only the `core` module is. 72 | 73 | The above process might look complicated, and it actually is 😂, but since all the internal sbt wiring is already in 74 | place, the user-facing process is fairly straightforward. To get started, first ensure that JDK 8+, sbt and the latest Rust 75 | compiler are installed, and then follow the 76 | commands below as per the need. 77 | 78 | **Compilation** 79 | 80 | ```shell 81 | # To compile the whole project (Scala/Java/Rust) in one go 82 | sbt compile 83 | ``` 84 | 85 | **Local packaging/installation** 86 | 87 | ```shell 88 | # To package the project and install locally as slim jars with the default Scala version. 89 | sbt publishLocal 90 | 91 | # To package the project and install locally as slim jars for all supported Scala versions. 92 | sbt +publishLocal 93 | ``` 94 | 95 | **Build Assembly (fat jar)** 96 | 97 | ```shell 98 | # To package the project as a fat jar with the default Scala version. 99 | sbt assembly 100 | 101 | # To package the project as fat jars for all supported Scala versions. 102 | sbt +assembly 103 | ``` 104 | 105 | **Generate Native Binary Only** 106 | 107 | ```shell 108 | # To compile only the native module containing Rust code to a binary. 109 | sbt generateNativeLibrary 110 | ``` 111 | 112 | ## License 113 | 114 | Apache License 2.0, see [LICENSE](LICENSE). 115 | 116 | ## Community 117 | 118 | Reach out to the Polars community on [Discord](https://discord.gg/4UfP5cfBE7). 119 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import DocSettings.* 2 | import Utils.* 3 | 4 | ThisBuild / publish / skip := true 5 | ThisBuild / publishArtifact := false 6 | 7 | /* 8 | *********************** 9 | * Core Module * 10 | *********************** 11 | */ 12 | 13 | lazy val core = project 14 | .in(file("core")) 15 | .withId("scala-polars") 16 | .settings(name := "scala-polars") 17 | .enablePlugins(GhpagesPlugin, SiteScaladocPlugin) 18 | .settings( 19 | // unidocSourceFilePatterns := Nil, 20 | git.remoteRepo := "git@github.com:chitralverma/scala-polars.git", 21 | SiteScaladoc / siteSubdirName := "api/latest" 22 | ) 23 | .settings(ProjectDependencies.dependencies) 24 | .settings(GeneralSettings.commonSettings) 25 | .settings(PublishingSettings.settings) 26 | .settings( 27 | nativeRoot := baseDirectory.value.toPath.resolveSibling("native").toFile, 28 | inConfig(Compile)(NativeBuildSettings.settings) 29 | ) 30 | .settings(ExtraCommands.commands) 31 | .settings(ExtraCommands.commandAliases) 32 | // .configureUnidoc("scala-polars API Reference") 33 | 34 | /* 35 | *********************** 36 | * Examples Module * 37 | *********************** 38 | */ 39 | 40 | lazy val examples = project 41 | .in(file("examples")) 42 | .withId("scala-polars-examples") 43 | .settings(name := "scala-polars-examples") 44 | .settings(GeneralSettings.commonSettings) 45 | .settings( 46 | Compile / packageBin / publishArtifact := false, 47 | Compile / packageDoc / publishArtifact := false, 48 | Compile / packageSrc / publishArtifact := false 49 | ) 50 | .dependsOn(core) 51 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/NativeLoader.scala: 
-------------------------------------------------------------------------------- 1 | package org.polars.scala.polars 2 | 3 | import java.nio.file._ 4 | 5 | class NativeLoader(nativeLibrary: String) { 6 | NativeLoader.load(nativeLibrary) 7 | } 8 | 9 | object NativeLoader { 10 | def load(nativeLibrary: String): Unit = { 11 | def loadPackaged(arch: String): Unit = { 12 | val lib: String = System.mapLibraryName(nativeLibrary) 13 | val resourcePath: String = s"/native/$arch/$lib" 14 | 15 | val resourceStream = Option( 16 | this.getClass.getResourceAsStream(resourcePath) 17 | ) match { 18 | case Some(s) => s 19 | case None => 20 | throw new UnsatisfiedLinkError( 21 | s"Native library $lib ($resourcePath) cannot be found on the classpath." 22 | ) 23 | } 24 | 25 | val tmp: Path = Files.createTempDirectory("jni-") 26 | val extractedPath = tmp.resolve(lib) 27 | 28 | try 29 | Files.copy(resourceStream, extractedPath) 30 | catch { 31 | case ex: Exception => 32 | throw new UnsatisfiedLinkError( 33 | s"Error while extracting native library:\n$ex" 34 | ) 35 | } 36 | 37 | System.load(extractedPath.toAbsolutePath.toString) 38 | } 39 | 40 | def load(): Unit = try 41 | System.loadLibrary(nativeLibrary) 42 | catch { 43 | case e: Throwable => 44 | try 45 | loadPackaged("aarch64") 46 | catch { 47 | case t: Throwable => 48 | t.addSuppressed(e) 49 | try 50 | loadPackaged("x86_64") 51 | catch { 52 | case ex: Throwable => 53 | ex.addSuppressed(t) 54 | throw new IllegalStateException( 55 | s"Unable to load the provided native library '$nativeLibrary'.", 56 | ex 57 | ) 58 | } 59 | } 60 | 61 | } 62 | 63 | load() 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/Polars.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars 2 | 3 | import org.polars.scala.polars.api.io.Scannable 4 | import org.polars.scala.polars.api.{DataFrame, LazyFrame} 5 | import org.polars.scala.polars.config.Config 6 | import org.polars.scala.polars.internal.jni.{common, data_frame, lazy_frame} 7 | 8 | object Polars { 9 | 10 | def config: Config = Config.getConfig 11 | 12 | def version(): String = common.version() 13 | 14 | /** Returns a [[org.polars.scala.polars.api.io.Scannable Scannable]] that can be used to lazily 15 | * scan datasets of various formats ([[org.polars.scala.polars.api.io.Scannable.parquet 16 | * parquet]], [[org.polars.scala.polars.api.io.Scannable.ipc ipc]], 17 | * [[org.polars.scala.polars.api.io.Scannable.csv csv]] and 18 | * [[org.polars.scala.polars.api.io.Scannable.jsonLines jsonLines]]) from local filesystems and 19 | * cloud object stores (aws, gcp and azure) as a 20 | * [[org.polars.scala.polars.api.LazyFrame LazyFrame]]. 
21 | * @return 22 | * [[org.polars.scala.polars.api.io.Scannable Scannable]] 23 | */ 24 | def scan: Scannable = new Scannable() 25 | 26 | def concat(lazyFrame: LazyFrame, lazyFrames: Array[LazyFrame]): LazyFrame = 27 | concat(lazyFrame, lazyFrames, reChunk = false, parallel = true) 28 | 29 | def concat( 30 | lazyFrame: LazyFrame, 31 | lazyFrames: Array[LazyFrame], 32 | reChunk: Boolean = false, 33 | parallel: Boolean = true 34 | ): LazyFrame = 35 | if (lazyFrames.isEmpty) lazyFrame 36 | else { 37 | val ptr = 38 | lazy_frame.concatLazyFrames( 39 | lazyFrames.+:(lazyFrame).map(_.ptr), 40 | reChunk = reChunk, 41 | parallel = parallel 42 | ) 43 | 44 | LazyFrame.withPtr(ptr) 45 | } 46 | 47 | def concat(dataFrame: DataFrame, dataFrames: Array[DataFrame]): DataFrame = 48 | if (dataFrames.isEmpty) dataFrame 49 | else { 50 | val ptr = data_frame.concatDataFrames(dataFrames.+:(dataFrame).map(_.ptr)) 51 | 52 | DataFrame.withPtr(ptr) 53 | } 54 | 55 | } 56 | 57 | private[polars] object LibraryStates extends Enumeration { 58 | type LibraryState = Value 59 | 60 | val NOT_LOADED, LOADING, LOADED = Value 61 | } 62 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/api/DataFrame.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api 2 | 3 | import java.util.Collections 4 | 5 | import scala.annotation.varargs 6 | import scala.jdk.CollectionConverters._ 7 | 8 | import org.polars.scala.polars.api.expressions.Expression 9 | import org.polars.scala.polars.api.io.Writeable 10 | import org.polars.scala.polars.api.types.Schema 11 | import org.polars.scala.polars.config.UniqueKeepStrategies 12 | import org.polars.scala.polars.internal.jni.data_frame 13 | 14 | class DataFrame private (private[polars] val ptr: Long) { 15 | 16 | val schema: Schema = { 17 | val schemaString = data_frame.schemaString(ptr) 18 | Schema.fromString(schemaString) 19 | } 20 | 21 | val width: Int = schema.getFields.length 22 | 23 | val height: Long = count() 24 | 25 | val shape: (Long, Int) = (height, width) 26 | 27 | @varargs 28 | def select(colName: String, colNames: String*): DataFrame = 29 | toLazy.select(colName, colNames: _*).collect(noOptimization = true) 30 | 31 | @varargs 32 | def select(column: Expression, columns: Expression*): DataFrame = 33 | toLazy.select(column, columns: _*).collect(noOptimization = true) 34 | 35 | def filter(predicate: Expression): DataFrame = 36 | toLazy.filter(predicate).collect(noOptimization = true) 37 | 38 | def sort( 39 | cols: Array[String], 40 | descending: Array[Boolean], 41 | nullLast: Array[Boolean], 42 | maintainOrder: Boolean 43 | ): DataFrame = 44 | toLazy.sort(cols, descending, nullLast, maintainOrder).collect(noOptimization = true) 45 | 46 | def sort( 47 | expr: String, 48 | descending: Boolean, 49 | nullLast: Boolean, 50 | maintainOrder: Boolean 51 | ): DataFrame = 52 | toLazy 53 | .sort( 54 | cols = Array(expr), 55 | descending = Array(descending), 56 | nullLast = Array(nullLast), 57 | maintainOrder = maintainOrder 58 | ) 59 | .collect(noOptimization = true) 60 | 61 | def sort( 62 | exprs: Array[Expression], 63 | null_last: Array[Boolean], 64 | maintain_order: Boolean 65 | ): DataFrame = 66 | toLazy.sort(exprs, null_last, maintain_order).collect(noOptimization = true) 67 | 68 | def sort(expr: Expression, null_last: Boolean, maintain_order: Boolean): DataFrame = 69 | toLazy 70 | .sort(Array(expr), Array(null_last), maintainOrder = maintain_order) 
71 | .collect(noOptimization = true) 72 | 73 | def set_sorted(mapping: Map[String, Boolean]): DataFrame = 74 | set_sorted(mapping.asJava) 75 | 76 | def set_sorted(mapping: java.util.Map[String, Boolean]): DataFrame = 77 | toLazy.set_sorted(mapping).collect(noOptimization = true) 78 | 79 | def top_k( 80 | k: Int, 81 | cols: Array[String], 82 | descending: Array[Boolean], 83 | nullLast: Array[Boolean], 84 | maintainOrder: Boolean 85 | ): DataFrame = 86 | toLazy 87 | .top_k(k, cols, descending, nullLast, maintainOrder) 88 | .collect(projectionPushdown = false, predicatePushdown = false, commSubplanElim = false) 89 | 90 | def top_k( 91 | k: Int, 92 | expr: String, 93 | descending: Boolean, 94 | nullLast: Boolean, 95 | maintainOrder: Boolean 96 | ): DataFrame = 97 | toLazy 98 | .top_k( 99 | k = k, 100 | cols = Array(expr), 101 | descending = Array(descending), 102 | nullLast = Array(nullLast), 103 | maintainOrder = maintainOrder 104 | ) 105 | .collect(projectionPushdown = false, predicatePushdown = false, commSubplanElim = false) 106 | 107 | def top_k( 108 | k: Int, 109 | exprs: Array[Expression], 110 | null_last: Array[Boolean], 111 | maintain_order: Boolean 112 | ): DataFrame = 113 | toLazy 114 | .top_k(k, exprs, null_last, maintain_order) 115 | .collect(projectionPushdown = false, predicatePushdown = false, commSubplanElim = false) 116 | 117 | def top_k(k: Int, expr: Expression, null_last: Boolean, maintain_order: Boolean): DataFrame = 118 | toLazy 119 | .top_k(k, Array(expr), Array(null_last), maintainOrder = maintain_order) 120 | .collect(projectionPushdown = false, predicatePushdown = false, commSubplanElim = false) 121 | 122 | def limit(n: Long): DataFrame = DataFrame.withPtr(data_frame.limit(ptr, n)) 123 | 124 | def head(n: Long): DataFrame = limit(n) 125 | 126 | def first(): DataFrame = limit(1) 127 | 128 | def tail(n: Long): DataFrame = DataFrame.withPtr(data_frame.tail(ptr, n)) 129 | 130 | def last(): DataFrame = tail(1) 131 | 132 | def with_column(name: String, expr: Expression): DataFrame = 133 | toLazy.with_column(name, expr).collect(noOptimization = true) 134 | 135 | @varargs 136 | def drop(colName: String, colNames: String*): DataFrame = 137 | toLazy.drop(colName, colNames: _*).collect(noOptimization = true) 138 | 139 | def drop_nulls: DataFrame = drop_nulls() 140 | 141 | def drop_nulls( 142 | subset: Array[String] = Array.empty 143 | ): DataFrame = 144 | toLazy.drop_nulls(subset).collect(noOptimization = true) 145 | 146 | def rename(oldName: String, newName: String): DataFrame = 147 | rename(Collections.singletonMap(oldName, newName)) 148 | 149 | def rename(mapping: Map[String, String]): DataFrame = 150 | rename(mapping.asJava) 151 | 152 | def rename(mapping: java.util.Map[String, String]): DataFrame = 153 | toLazy.rename(mapping).collect(noOptimization = true) 154 | 155 | def unique: DataFrame = unique() 156 | 157 | def unique( 158 | subset: Array[String] = Array.empty, 159 | keep: UniqueKeepStrategies.UniqueKeepStrategy = UniqueKeepStrategies.any, 160 | maintainOrder: Boolean = false 161 | ): DataFrame = 162 | toLazy.unique(subset, keep, maintainOrder).collect(noOptimization = true) 163 | 164 | def toLazy: LazyFrame = LazyFrame.withPtr(data_frame.toLazy(ptr)) 165 | 166 | def show(): Unit = data_frame.show(ptr) 167 | 168 | def count(): Long = data_frame.count(ptr) 169 | 170 | /** Provides an iterator to traverse a specified number of rows from the DataFrame. 
171 | * @param nRows 172 | * number of rows to traverse 173 | * @note 174 | * if `nRows` is greater than the total rows in DataFrame then all rows are included. 175 | * @return 176 | * Iterator of [[Row]] 177 | */ 178 | def rows(nRows: Long): Iterator[Row] = RowIterator.withPtr(ptr).lazyIterator(nRows) 179 | 180 | /** Provides an iterator to traverse all rows from the DataFrame. 181 | * @return 182 | * Iterator of [[Row]] 183 | */ 184 | def rows(): Iterator[Row] = rows(-1L) 185 | 186 | def write(): Writeable = new Writeable(ptr) 187 | 188 | } 189 | 190 | object DataFrame { 191 | 192 | private[polars] def withPtr(ptr: Long) = new DataFrame(ptr) 193 | 194 | /** Initialize new [[org.polars.scala.polars.api.DataFrame]] from one or more 195 | * [[org.polars.scala.polars.api.Series]]. The name of a series is used as column name and its 196 | * values are the values of this column. 197 | * 198 | * @param series 199 | * Series 200 | * @param more 201 | * Series as a scala or java array 202 | * 203 | * @return 204 | * [[org.polars.scala.polars.api.DataFrame]] formed from the provided 205 | * [[org.polars.scala.polars.api.Series]] 206 | */ 207 | @varargs 208 | def fromSeries(series: Series, more: Series*): DataFrame = 209 | DataFrame.withPtr(data_frame.fromSeries(more.+:(series).map(_.ptr).toArray)) 210 | 211 | /** Initialize new [[org.polars.scala.polars.api.DataFrame]] from one or more 212 | * [[org.polars.scala.polars.api.Series]]. The name of a series is used as column name and its 213 | * values are the values of this column. 214 | * 215 | * @param series 216 | * Series 217 | * @param more 218 | * Series as a scala iterable 219 | * 220 | * @return 221 | * [[org.polars.scala.polars.api.DataFrame]] formed from the provided 222 | * [[org.polars.scala.polars.api.Series]] 223 | */ 224 | def fromSeries(series: Series, more: Iterable[Series]): DataFrame = 225 | DataFrame.withPtr(data_frame.fromSeries(more.toSeq.+:(series).map(_.ptr).toArray)) 226 | 227 | /** Initialize new [[org.polars.scala.polars.api.DataFrame]] from one or more 228 | * [[org.polars.scala.polars.api.Series]]. The name of a series is used as column name and its 229 | * values are the values of this column. 
230 | * 231 | * @param series 232 | * Series 233 | * @param more 234 | * Series as a java iterable 235 | * 236 | * @return 237 | * [[org.polars.scala.polars.api.DataFrame]] formed from the provided 238 | * [[org.polars.scala.polars.api.Series]] 239 | */ 240 | def fromSeries(series: Series, more: java.lang.Iterable[Series]): DataFrame = 241 | fromSeries(series, more.asScala) 242 | 243 | } 244 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/api/JSeries.java: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api; 2 | 3 | import scala.Boolean; 4 | import scala.Int; 5 | import scala.jdk.javaapi.CollectionConverters; 6 | 7 | import java.time.LocalDate; 8 | import java.time.LocalDateTime; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.Iterator; 12 | import java.util.List; 13 | import java.util.stream.Collectors; 14 | import java.util.stream.StreamSupport; 15 | 16 | import org.polars.scala.polars.internal.jni.series; 17 | 18 | class JSeries { 19 | final static String EmptyString = ""; 20 | 21 | @SuppressWarnings({ "unchecked", "rawtypes" }) 22 | static Series ofList(String name, Iterable values) { 23 | Iterator valuesIter = values.iterator(); 24 | List sList = new ArrayList<>(); 25 | 26 | while (valuesIter.hasNext()) { 27 | Iterable subList = valuesIter.next(); 28 | Object head = subList.iterator().next(); 29 | 30 | Series thisSeries; 31 | if (head instanceof Integer || head instanceof Int) { 32 | thisSeries = Series.ofInt(EmptyString, subList); 33 | } else if (head instanceof Long) { 34 | thisSeries = Series.ofLong(EmptyString, subList); 35 | } else if (head instanceof Float) { 36 | thisSeries = Series.ofFloat(EmptyString, subList); 37 | } else if (head instanceof Double) { 38 | thisSeries = Series.ofDouble(EmptyString, subList); 39 | } else if (head instanceof Boolean) { 40 | thisSeries = Series.ofBoolean(EmptyString, subList); 41 | } else if (head instanceof LocalDate) { 42 | thisSeries = Series.ofDate(EmptyString, subList); 43 | } else if (head instanceof LocalDateTime) { 44 | thisSeries = Series.ofDateTime(EmptyString, subList); 45 | } else if (head instanceof String) { 46 | thisSeries = Series.ofString(EmptyString, subList); 47 | } else if (head instanceof java.lang.Iterable) { 48 | thisSeries = ofList(EmptyString, subList); 49 | } else if (head instanceof scala.collection.Iterable) { 50 | Iterable s = (Iterable) StreamSupport.stream(subList.spliterator(), false) 51 | .map(v -> CollectionConverters.asJava((scala.collection.Iterable) v)) 52 | .collect(Collectors.toList()); 53 | 54 | thisSeries = ofList(EmptyString, s); 55 | } else if (head.getClass().isArray()) { 56 | Iterable s = (Iterable) StreamSupport.stream(subList.spliterator(), false) 57 | .map(v -> Arrays.asList((Object[]) v)) 58 | .collect(Collectors.toList()); 59 | 60 | thisSeries = ofList(EmptyString, s); 61 | } else { 62 | throw new IllegalArgumentException( 63 | String.format("Nested series of provided internal type `%s` is currently not supported.", head.getClass().getSimpleName()) 64 | ); 65 | } 66 | 67 | sList.add(thisSeries); 68 | } 69 | 70 | long[] ptrs = sList.stream().map(Series::ptr).mapToLong(Long::longValue).toArray(); 71 | return Series.withPtr(series.new_list_series(name, ptrs)); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- 
/core/src/main/scala/org/polars/scala/polars/api/LazyFrame.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api 2 | 3 | import java.util.Collections 4 | 5 | import scala.annotation.varargs 6 | import scala.jdk.CollectionConverters._ 7 | 8 | import org.polars.scala.polars.api.expressions.Expression 9 | import org.polars.scala.polars.api.types.Schema 10 | import org.polars.scala.polars.config.UniqueKeepStrategies 11 | import org.polars.scala.polars.internal.jni.expressions.column_expr 12 | import org.polars.scala.polars.internal.jni.lazy_frame 13 | 14 | class LazyFrame private (private[polars] val ptr: Long) { 15 | 16 | val schema: Schema = { 17 | val schemaString = lazy_frame.schemaString(ptr) 18 | Schema.fromString(schemaString) 19 | } 20 | 21 | val width: Int = schema.getFields.length 22 | 23 | @varargs 24 | def select(colName: String, colNames: String*): LazyFrame = { 25 | val ldfPtr = lazy_frame.selectFromStrings(ptr, colNames.+:(colName).distinct.toArray) 26 | 27 | LazyFrame.withPtr(ldfPtr) 28 | } 29 | 30 | @varargs 31 | def select(column: Expression, columns: Expression*): LazyFrame = { 32 | val ldfPtr = lazy_frame.selectFromExprs(ptr, columns.+:(column).map(_.ptr).distinct.toArray) 33 | 34 | LazyFrame.withPtr(ldfPtr) 35 | } 36 | 37 | def filter(predicate: Expression): LazyFrame = { 38 | val ldfPtr = lazy_frame.filterFromExprs(ptr, predicate.ptr) 39 | 40 | LazyFrame.withPtr(ldfPtr) 41 | } 42 | 43 | def sort( 44 | cols: Array[String], 45 | descending: Array[Boolean], 46 | nullLast: Array[Boolean], 47 | maintainOrder: Boolean 48 | ): LazyFrame = { 49 | assert( 50 | cols.length == descending.length, 51 | s"Length of provided list columns(${cols.length}) and their " + 52 | s"sorting directions((${descending.length})) is not equal." 53 | ) 54 | 55 | val exprs = cols.zip(descending).map { case (column, bool) => 56 | Expression.withPtr(column_expr.sort_column_by_name(column, bool)) 57 | } 58 | 59 | sort(exprs, nullLast, maintainOrder = maintainOrder) 60 | } 61 | 62 | def sort( 63 | col: String, 64 | descending: Boolean, 65 | nullLast: Boolean, 66 | maintainOrder: Boolean 67 | ): LazyFrame = 68 | sort(Array(col), Array(descending), Array(nullLast), maintainOrder = maintainOrder) 69 | 70 | def sort( 71 | exprs: Array[Expression], 72 | null_last: Array[Boolean], 73 | maintainOrder: Boolean 74 | ): LazyFrame = { 75 | assert( 76 | exprs.length == null_last.length, 77 | s"Length of provided expressions (${exprs.length}) and their " + 78 | s"null_last (${null_last.length}) is not equal." 79 | ) 80 | 81 | val ldfPtr = 82 | lazy_frame.sortFromExprs(ptr, exprs.map(_.ptr).distinct, null_last, maintainOrder) 83 | 84 | LazyFrame.withPtr(ldfPtr) 85 | } 86 | 87 | def sort(expr: Expression, nullLast: Boolean, maintainOrder: Boolean): LazyFrame = 88 | sort(Array(expr), Array(nullLast), maintainOrder = maintainOrder) 89 | 90 | def set_sorted(mapping: Map[String, Boolean]): LazyFrame = 91 | set_sorted(mapping.asJava) 92 | 93 | def set_sorted(mapping: java.util.Map[String, Boolean]): LazyFrame = { 94 | val ldfPtr = lazy_frame.set_sorted(ptr, mapping) 95 | 96 | LazyFrame.withPtr(ldfPtr) 97 | } 98 | 99 | def top_k( 100 | k: Int, 101 | exprs: Array[Expression], 102 | null_last: Array[Boolean], 103 | maintainOrder: Boolean 104 | ): LazyFrame = { 105 | assert( 106 | exprs.length == null_last.length, 107 | s"Length of provided expressions (${exprs.length}) and their " + 108 | s"null_last (${null_last.length}) is not equal." 
109 | ) 110 | val ldfPtr = 111 | lazy_frame.topKFromExprs(ptr, k, exprs.map(_.ptr).distinct, null_last, maintainOrder) 112 | 113 | LazyFrame.withPtr(ldfPtr) 114 | } 115 | 116 | def top_k(k: Int, expr: Expression, nullLast: Boolean, maintainOrder: Boolean): LazyFrame = 117 | top_k(k, Array(expr), Array(nullLast), maintainOrder = maintainOrder) 118 | 119 | def top_k( 120 | k: Int, 121 | cols: Array[String], 122 | descending: Array[Boolean], 123 | nullLast: Array[Boolean], 124 | maintainOrder: Boolean 125 | ): LazyFrame = { 126 | assert( 127 | cols.length == descending.length, 128 | s"Length of provided list columns (${cols.length}) and their " + 129 | s"sorting directions (${descending.length}) is not equal." 130 | ) 131 | 132 | val exprs = cols.zip(descending).map { case (column, bool) => 133 | Expression.withPtr(column_expr.sort_column_by_name(column, bool)) 134 | } 135 | 136 | top_k(k, exprs, null_last = nullLast, maintainOrder = maintainOrder) 137 | } 138 | 139 | def top_k( 140 | k: Int, 141 | col: String, 142 | descending: Boolean, 143 | nullLast: Boolean, 144 | maintainOrder: Boolean 145 | ): LazyFrame = 146 | top_k(k, Array(col), Array(descending), Array(nullLast), maintainOrder = maintainOrder) 147 | 148 | def limit(n: Long): LazyFrame = LazyFrame.withPtr(lazy_frame.limit(ptr, n)) 149 | 150 | def head(n: Long): LazyFrame = limit(n) 151 | 152 | def first(): LazyFrame = limit(1) 153 | 154 | def tail(n: Long): LazyFrame = LazyFrame.withPtr(lazy_frame.tail(ptr, n)) 155 | 156 | def last(): LazyFrame = tail(1) 157 | 158 | @varargs 159 | def drop(colName: String, colNames: String*): LazyFrame = { 160 | val ldfPtr = lazy_frame.drop(ptr, colNames.+:(colName).distinct.toArray) 161 | 162 | LazyFrame.withPtr(ldfPtr) 163 | } 164 | 165 | def with_column(name: String, expr: Expression): LazyFrame = { 166 | val ldfPtr = lazy_frame.withColumn(ptr, name, expr.ptr) 167 | 168 | LazyFrame.withPtr(ldfPtr) 169 | } 170 | 171 | def rename(oldName: String, newName: String): LazyFrame = 172 | rename(Collections.singletonMap(oldName, newName)) 173 | 174 | def rename(mapping: Map[String, String]): LazyFrame = rename(mapping.asJava) 175 | 176 | def rename(mapping: java.util.Map[String, String]): LazyFrame = { 177 | val ldfPtr = lazy_frame.rename(ptr, mapping) 178 | 179 | LazyFrame.withPtr(ldfPtr) 180 | } 181 | 182 | def unique: LazyFrame = unique() 183 | 184 | def unique( 185 | subset: Array[String] = Array.empty, 186 | keep: UniqueKeepStrategies.UniqueKeepStrategy = UniqueKeepStrategies.any, 187 | maintainOrder: Boolean = false 188 | ): LazyFrame = { 189 | val ldfPtr = lazy_frame.unique(ptr, subset, keep.toString, maintainOrder) 190 | 191 | LazyFrame.withPtr(ldfPtr) 192 | } 193 | 194 | def drop_nulls: LazyFrame = drop_nulls() 195 | 196 | def drop_nulls( 197 | subset: Array[String] = Array.empty 198 | ): LazyFrame = { 199 | val ldfPtr = lazy_frame.drop_nulls(ptr, subset) 200 | 201 | LazyFrame.withPtr(ldfPtr) 202 | } 203 | 204 | def explain: Unit = explain() 205 | 206 | def explain( 207 | optimized: Boolean = true, 208 | typeCoercion: Boolean = true, 209 | predicatePushdown: Boolean = true, 210 | projectionPushdown: Boolean = true, 211 | simplifyExpression: Boolean = true, 212 | slicePushdown: Boolean = true, 213 | commSubplanElim: Boolean = true, 214 | commSubexprElim: Boolean = true, 215 | streaming: Boolean = false, 216 | treeFormat: Boolean = false 217 | ): Unit = { 218 | val planStr = if (optimized) { 219 | lazy_frame.explain( 220 | lazy_frame.optimization_toggle( 221 | ptr, 222 | typeCoercion = typeCoercion, 
223 | predicatePushdown = predicatePushdown, 224 | projectionPushdown = projectionPushdown, 225 | simplifyExpr = simplifyExpression, 226 | slicePushdown = slicePushdown, 227 | commSubplanElim = commSubplanElim, 228 | commSubexprElim = commSubexprElim, 229 | streaming = streaming 230 | ), 231 | optimized = true, 232 | treeFormat 233 | ) 234 | } else lazy_frame.explain(ptr, optimized = false, treeFormat) 235 | 236 | println(planStr) 237 | } 238 | 239 | def cache: LazyFrame = { 240 | val ldfPtr = lazy_frame.cache(ptr) 241 | 242 | LazyFrame.withPtr(ldfPtr) 243 | } 244 | 245 | def collect: DataFrame = collect() 246 | 247 | def collect( 248 | typeCoercion: Boolean = true, 249 | predicatePushdown: Boolean = true, 250 | projectionPushdown: Boolean = true, 251 | simplifyExpression: Boolean = true, 252 | noOptimization: Boolean = false, 253 | slicePushdown: Boolean = true, 254 | commSubplanElim: Boolean = true, 255 | commSubexprElim: Boolean = true, 256 | streaming: Boolean = false 257 | ): DataFrame = { 258 | val ldf = LazyFrame.withPtr( 259 | lazy_frame.optimization_toggle( 260 | ptr, 261 | typeCoercion = typeCoercion, 262 | predicatePushdown = if (noOptimization) false else predicatePushdown, 263 | projectionPushdown = if (noOptimization) false else projectionPushdown, 264 | simplifyExpr = simplifyExpression, 265 | slicePushdown = if (noOptimization) false else slicePushdown, 266 | commSubplanElim = if (noOptimization || streaming) false else commSubplanElim, 267 | commSubexprElim = if (noOptimization) false else commSubexprElim, 268 | streaming = streaming 269 | ) 270 | ) 271 | 272 | val dfPtr = lazy_frame.collect(ldf.ptr) 273 | DataFrame.withPtr(dfPtr) 274 | } 275 | 276 | } 277 | 278 | object LazyFrame { 279 | 280 | def withPtr(ptr: Long) = new LazyFrame(ptr) 281 | } 282 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/api/expressions/Column.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api.expressions 2 | 3 | import org.polars.scala.polars.functions.lit 4 | import org.polars.scala.polars.internal.jni.expressions.column_expr 5 | 6 | object UnaryOperators extends Enumeration { 7 | type UnaryOperator = Value 8 | 9 | val NOT, IS_NULL, IS_NOT_NULL, IS_NAN, IS_NOT_NAN, BETWEEN, IS_IN, LIKE, CAST = Value 10 | } 11 | 12 | object BinaryOperators extends Enumeration { 13 | type BinaryOperator = Value 14 | 15 | val EQUAL_TO, NOT_EQUAL_TO, LESS_THAN, LESS_THAN_EQUAL_TO, GREATER_THAN, GREATER_THAN_EQUAL_TO, 16 | OR, AND, PLUS, MINUS, MULTIPLY, DIVIDE, MODULUS = Value 17 | } 18 | 19 | class Column private (override protected[polars] val ptr: Long) extends Expression(ptr) { 20 | import BinaryOperators._ 21 | import UnaryOperators._ 22 | 23 | /** Not. */ 24 | def unary_! : Column = Column.withPtr(column_expr.applyUnary(ptr, NOT.id)) 25 | 26 | /** Is Null. */ 27 | def isNull: Column = Column.withPtr(column_expr.applyUnary(ptr, IS_NULL.id)) 28 | 29 | /** Is Not Null. */ 30 | def isNotNull: Column = Column.withPtr(column_expr.applyUnary(ptr, IS_NOT_NULL.id)) 31 | 32 | /** Is NaN. */ 33 | def isNaN: Column = Column.withPtr(column_expr.applyUnary(ptr, IS_NAN.id)) 34 | 35 | /** Is Not NaN. */ 36 | def isNotNaN: Column = Column.withPtr(column_expr.applyUnary(ptr, IS_NOT_NAN.id)) 37 | 38 | /** Plus. 
*/ 39 | def +(value: Any): Column = { 40 | val rightPtr = lit(value).ptr 41 | 42 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, PLUS.id)) 43 | } 44 | 45 | def plus(other: Any): Column = this + other 46 | 47 | /** Minus. */ 48 | def -(value: Any): Column = { 49 | val rightPtr = lit(value).ptr 50 | 51 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, MINUS.id)) 52 | } 53 | 54 | def minus(other: Any): Column = this - other 55 | 56 | /** Multiply. */ 57 | def *(value: Any): Column = { 58 | val rightPtr = lit(value).ptr 59 | 60 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, MULTIPLY.id)) 61 | } 62 | 63 | def multiply(other: Any): Column = this * other 64 | 65 | /** Divide. */ 66 | def /(value: Any): Column = { 67 | val rightPtr = lit(value).ptr 68 | 69 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, DIVIDE.id)) 70 | } 71 | 72 | def divide(other: Any): Column = this / other 73 | 74 | /** Modulus. */ 75 | def %(value: Any): Column = { 76 | val rightPtr = lit(value).ptr 77 | 78 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, MODULUS.id)) 79 | } 80 | 81 | def mod(other: Any): Column = this % other 82 | 83 | /** And. */ 84 | def &&(value: Any): Column = { 85 | val rightPtr = lit(value).ptr 86 | 87 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, AND.id)) 88 | } 89 | 90 | def and(other: Any): Column = this && other 91 | 92 | /** Or. */ 93 | def ||(value: Any): Column = { 94 | val rightPtr = lit(value).ptr 95 | 96 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, OR.id)) 97 | } 98 | 99 | def or(other: Any): Column = this || other 100 | 101 | /** EqualTo. */ 102 | def ===(value: Any): Column = { 103 | val rightPtr = lit(value).ptr 104 | 105 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, EQUAL_TO.id)) 106 | } 107 | 108 | def equalTo(other: Any): Column = this === other 109 | 110 | /** NotEqualTo. */ 111 | def <>(value: Any): Column = { 112 | val rightPtr = lit(value).ptr 113 | 114 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, NOT_EQUAL_TO.id)) 115 | } 116 | 117 | def notEqualTo(other: Any): Column = this <> other 118 | 119 | /** LessThan. */ 120 | def <(value: Any): Column = { 121 | val rightPtr = lit(value).ptr 122 | 123 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, LESS_THAN.id)) 124 | } 125 | 126 | def lessThan(other: Any): Column = this < other 127 | 128 | /** LessThanEqualTo. */ 129 | def <=(value: Any): Column = { 130 | val rightPtr = lit(value).ptr 131 | 132 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, LESS_THAN_EQUAL_TO.id)) 133 | 134 | } 135 | 136 | def lessThanEqualTo(other: Any): Column = this <= other 137 | 138 | /** GreaterThan. */ 139 | def >(value: Any): Column = { 140 | val rightPtr = lit(value).ptr 141 | 142 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, GREATER_THAN.id)) 143 | 144 | } 145 | 146 | def greaterThan(other: Any): Column = this > other 147 | 148 | /** GreaterThanEqualTo. 
*/ 149 | def >=(value: Any): Column = { 150 | val rightPtr = lit(value).ptr 151 | 152 | Column.withPtr(column_expr.applyBinary(ptr, rightPtr, GREATER_THAN_EQUAL_TO.id)) 153 | } 154 | 155 | def greaterThanEqualTo(other: Any): Column = this >= other 156 | 157 | } 158 | 159 | object Column { 160 | 161 | private[polars] def withPtr(ptr: Long) = new Column(ptr) 162 | 163 | private[polars] def from(name: String): Column = { 164 | val ptr = column_expr.column(name) 165 | new Column(ptr) 166 | } 167 | 168 | } 169 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/api/expressions/Expression.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api.expressions 2 | 3 | class Expression(protected[polars] val ptr: Long) {} 4 | 5 | object Expression { 6 | 7 | private[polars] def withPtr(ptr: Long) = new Expression(ptr) 8 | 9 | } 10 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/api/io/Writeable.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api.io 2 | 3 | import scala.collection.mutable.{Map => MutableMap} 4 | import scala.jdk.CollectionConverters._ 5 | 6 | import org.polars.scala.polars.api.DataFrame 7 | import org.polars.scala.polars.internal.jni.io.write._ 8 | 9 | /** Interface used to write a [[DataFrame]] in various formats to local filesystems and cloud 10 | * object stores (aws, gcp and azure). Use [[DataFrame.write write()]] to access this. 11 | * 12 | * Cloud options are global and can be set by methods like [[option option[s]()]] 13 | * - For amazon s3 options, see 14 | * [[https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants here]] 15 | * - For google cloud options, see 16 | * [[https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants here]] 17 | * - For azure options, see 18 | * [[https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants here]] 19 | * 20 | * This interface also supports the following global options, 21 | * - `write_mode`: Specifies the behavior when data already exists at provided path. Supported 22 | * values 'overwrite', 'error'. Default: error. 23 | * - overwrite: Overwrites the existing data at the provided location. 24 | * - error: Throw an exception if data already exists at the provided location. 25 | */ 26 | class Writeable private[polars] (ptr: Long) { 27 | import org.polars.scala.polars.jsonMapper 28 | 29 | private val _options: MutableMap[String, String] = MutableMap("write_mode" -> "error") 30 | 31 | /** Adds options for the underlying output format. */ 32 | def options(opts: Iterable[(String, String)]): Writeable = synchronized { 33 | opts.foreach { case (key, value) => option(key, value) } 34 | this 35 | } 36 | 37 | /** Adds options for the underlying output format. */ 38 | def options(opts: java.util.Map[String, String]): Writeable = synchronized { 39 | opts.asScala.foreach { case (key, value) => option(key, value) } 40 | this 41 | } 42 | 43 | /** Adds an option for the underlying output format. 
*/ 44 | def option(key: String, value: String): Writeable = synchronized { 45 | if (Option(key).exists(_.trim.isEmpty) || Option(value).exists(_.trim.isEmpty)) { 46 | throw new IllegalArgumentException("Option key or value cannot be null or empty.") 47 | } 48 | 49 | _options.put(key.trim, value.trim) 50 | this 51 | } 52 | 53 | /** Saves the content of the [[DataFrame]] in Parquet format at the specified path (local and 54 | * cloud). 55 | * 56 | * Supported options: 57 | * - `write_parquet_parallel`: Serializes columns in parallel. Default: true. 58 | * - `write_parquet_data_page_size`: Sets the maximum bytes size of a data page. Default: 59 | * 1024^2^ bytes. 60 | * - `write_parquet_row_group_size`: Sets the row group size (in number of rows) during 61 | * writing. This can reduce memory pressure and improve writing performance. Default: 62 | * 512^2^ rows. 63 | * - `write_compression`: Sets the compression codec used for pages, for more compatibility 64 | * guarantees, consider using Snappy. Supported values 'uncompressed', 'snappy', 'gzip', 65 | * 'lzo', 'brotli', 'lz4', 'zstd'. Default: zstd. 66 | * - `write_compression_level`: Sets a valid level for codecs like 'gzip', 'brotli', 'zstd'. 67 | * Defaults to compression default. 68 | * - `write_parquet_stats`: Allows computation and writing of column statistics. Supported 69 | * values 'full', 'none', 'some'. Default: some 70 | * 71 | * @param filePath 72 | * output file location 73 | */ 74 | def parquet(filePath: String): Unit = 75 | writeParquet( 76 | ptr = ptr, 77 | filePath = filePath, 78 | options = jsonMapper.writeValueAsString(_options) 79 | ) 80 | 81 | /** Saves the content of the [[DataFrame]] in IPC format at the specified path (local and 82 | * cloud). 83 | * 84 | * Supported options: 85 | * - `write_ipc_compat_level`: Sets compatibility. Supported values 'oldest', 'newest'. 86 | * Default: newest. 87 | * - `write_compression`: Sets the compression codec used for pages. Supported values 88 | * 'uncompressed', 'lz4', 'zstd'. Default: zstd. 89 | * 90 | * @param filePath 91 | * output file location 92 | */ 93 | def ipc(filePath: String): Unit = 94 | writeIPC( 95 | ptr = ptr, 96 | filePath = filePath, 97 | options = jsonMapper.writeValueAsString(_options) 98 | ) 99 | 100 | /** Saves the content of the [[DataFrame]] in Avro format at the specified path (local and 101 | * cloud). 102 | * 103 | * Supported options: 104 | * - `write_avro_record_name`: Sets the name of avro record. Default: "". 105 | * - `write_compression`: Sets the compression codec used for blocks. Supported values 106 | * 'uncompressed', 'deflate', 'snappy'. Default: uncompressed. 107 | * 108 | * @param filePath 109 | * output file location 110 | */ 111 | def avro(filePath: String): Unit = 112 | writeAvro( 113 | ptr = ptr, 114 | filePath = filePath, 115 | options = jsonMapper.writeValueAsString(_options) 116 | ) 117 | 118 | /** Saves the content of the [[DataFrame]] in CSV format at the specified path (local and 119 | * cloud). 120 | * 121 | * Supported options: 122 | * - `write_csv_include_bom`: Sets whether to include UTF-8 Byte Order Mark (BOM) in the CSV 123 | * output. Default: `false`. 124 | * - `write_csv_include_header`: Sets whether to include header in the CSV output. Default: 125 | * `true`. 126 | * - `write_csv_float_scientific`: Sets whether to use scientific form always (true), never 127 | * (false), or automatically (if not set) for `Float` and `Double` datatypes. 
128 | * - `write_csv_float_precision`: Sets the number of decimal places to write for `Float` and 129 | * `Double` datatypes. 130 | * - `write_csv_separator`: Sets the CSV file's column separator, defaulting to `,` 131 | * character. 132 | * - `write_csv_quote_char`: Sets the single byte character used for quoting, defaulting to 133 | * `"` character. 134 | * - `write_csv_date_format`: Sets the CSV file's date format defined by 135 | * [[https://docs.rs/chrono/latest/chrono/format/strftime/index.html chrono]]. If no format 136 | * specified, the default fractional-second precision is inferred from the maximum timeunit 137 | * found in the frame's Datetime cols (if any). 138 | * - `write_csv_time_format`: Sets the CSV file's time format defined by 139 | * [[https://docs.rs/chrono/latest/chrono/format/strftime/index.html chrono]]. 140 | * - `write_csv_datetime_format`: Sets the CSV file's datetime format defined by 141 | * [[https://docs.rs/chrono/latest/chrono/format/strftime/index.html chrono]]. 142 | * - `write_csv_line_terminator`: Sets the CSV file's line terminator. Default: "\n". 143 | * - `write_csv_null_value`: Sets the CSV file's null value representation defaulting to the 144 | * empty string. 145 | * - `write_csv_quote_style`: Sets the CSV file's quoting style which indicates when to 146 | * insert quotes around a field. Supported values 'necessary', 'always', 'non_numeric', 147 | * 'never'. 148 | * - necessary (default): This puts quotes around fields only when necessary. They are 149 | * necessary when fields contain a quote, separator or record terminator. Quotes are also 150 | * necessary when writing an empty record (which is indistinguishable from a record with 151 | * one empty field). 152 | * - always: This puts quotes around every field. Always. 153 | * - never: This never puts quotes around fields, even if that results in invalid CSV data 154 | * (e.g.: by not quoting strings containing the separator). 155 | * - non_numeric: This puts quotes around all fields that are non-numeric. Namely, when 156 | * writing a field that does not parse as a valid float or integer, then quotes will be 157 | * used even if they aren't strictly necessary. 158 | * 159 | * @note 160 | * compression is not supported for this format. 161 | * @param filePath 162 | * output file location 163 | */ 164 | def csv(filePath: String): Unit = 165 | writeCSV( 166 | ptr = ptr, 167 | filePath = filePath, 168 | options = jsonMapper.writeValueAsString(_options) 169 | ) 170 | 171 | /** Saves the content of the [[DataFrame]] in JSON format at the specified path (local and 172 | * cloud). 173 | * 174 | * A single JSON array containing each DataFrame row as an object. The length of the array is 175 | * the number of rows in the DataFrame. Use this to create valid JSON that can be deserialized 176 | * back into an array in one fell swoop. 177 | * 178 | * @note 179 | * compression is not supported for this format. 180 | * 181 | * @param filePath 182 | * output file location 183 | */ 184 | def json(filePath: String): Unit = { 185 | option("write_json_format", "json") 186 | writeJson( 187 | ptr = ptr, 188 | filePath = filePath, 189 | options = jsonMapper.writeValueAsString(_options) 190 | ) 191 | } 192 | 193 | /** Saves the content of the [[DataFrame]] in Newline Delimited JSON (ndjson) format at the 194 | * specified path (local and cloud). 195 | * 196 | * Each DataFrame row is serialized as a JSON object on a separate line. The number of lines in 197 | * the output is the number of rows in the DataFrame. 
198 | * 199 | * The [[https://pola-rs.github.io/polars/py-polars/html/reference/config.html JSON Lines]] 200 | * format makes it easy to read records in a streaming fashion, one (line) at a time. But the 201 | * output in its entirety is not valid JSON; only the individual lines are. It is recommended 202 | * to use the file extension `.jsonl` when saving as JSON Lines. 203 | * 204 | * @note 205 | * compression is not supported for this format. 206 | * @param filePath 207 | * output file location 208 | */ 209 | def jsonLines(filePath: String): Unit = { 210 | option("write_json_format", "json_lines") 211 | writeJson( 212 | ptr = ptr, 213 | filePath = filePath, 214 | options = jsonMapper.writeValueAsString(_options) 215 | ) 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/api/types/DataTypes.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api.types 2 | 3 | import java.time.ZoneId 4 | import java.util.Locale 5 | import java.util.concurrent.TimeUnit 6 | 7 | import scala.reflect.ClassTag 8 | import scala.util.Try 9 | import scala.util.matching.Regex 10 | 11 | trait DataType { 12 | def simpleName: String = 13 | this.getClass.getSimpleName 14 | .stripSuffix("$") 15 | .stripSuffix("Type") 16 | .stripSuffix("UDT") 17 | .toLowerCase(Locale.ROOT) 18 | } 19 | 20 | trait BasicDataType extends DataType 21 | 22 | case object StringType extends BasicDataType 23 | 24 | case object BooleanType extends BasicDataType 25 | 26 | case object IntegerType extends BasicDataType 27 | 28 | case object LongType extends BasicDataType 29 | 30 | case object FloatType extends BasicDataType 31 | 32 | case object DoubleType extends BasicDataType 33 | 34 | case object DateType extends BasicDataType 35 | 36 | case object TimeType extends DataType 37 | 38 | case object DateTimeType extends DataType 39 | 40 | case object ListType extends DataType 41 | 42 | case object StructType extends DataType 43 | 44 | case class TimeType(protected val unitStr: String) extends DataType { 45 | val timeUnit: Option[TimeUnit] = 46 | unitStr match { 47 | case s if s.toLowerCase(Locale.ROOT).contains("nano") => Some(TimeUnit.NANOSECONDS) 48 | case s if s.toLowerCase(Locale.ROOT).contains("micro") => Some(TimeUnit.MICROSECONDS) 49 | case s if s.toLowerCase(Locale.ROOT).contains("milli") => Some(TimeUnit.MILLISECONDS) 50 | case _ => None 51 | } 52 | 53 | override def simpleName: String = timeUnit match { 54 | case Some(TimeUnit.NANOSECONDS) => "time[ns]" 55 | case Some(TimeUnit.MICROSECONDS) => "time[us]" 56 | case Some(TimeUnit.MILLISECONDS) => "time[ms]" 57 | case _ => "time" 58 | } 59 | } 60 | 61 | case class DateTimeType(protected val unitStr: String, protected val tzStr: String) 62 | extends DataType { 63 | val timeUnit: Option[TimeUnit] = 64 | unitStr match { 65 | case null => None 66 | case s if s.toLowerCase(Locale.ROOT).contains("nano") => Some(TimeUnit.NANOSECONDS) 67 | case s if s.toLowerCase(Locale.ROOT).contains("micro") => Some(TimeUnit.MICROSECONDS) 68 | case s if s.toLowerCase(Locale.ROOT).contains("milli") => Some(TimeUnit.MILLISECONDS) 69 | case _ => None 70 | } 71 | 72 | val timeZone: Option[ZoneId] = Try(ZoneId.of(tzStr)).toOption 73 | 74 | override def simpleName: String = { 75 | val tu = timeUnit match { 76 | case Some(TimeUnit.NANOSECONDS) => "ns" 77 | case Some(TimeUnit.MICROSECONDS) => "us" 78 | case Some(TimeUnit.MILLISECONDS) => "ms" 79 | case _ => 
null 80 | } 81 | 82 | val tz = timeZone.orNull 83 | 84 | (tu, tz) match { 85 | case (null, null) => "datetime" 86 | case (tu, null) => s"datetime[$tu]" 87 | case (null, tz) => s"datetime[$tz]" 88 | case (tu, tz) => s"datetime[$tu, $tz]" 89 | } 90 | 91 | } 92 | } 93 | 94 | case class Duration(protected val unitStr: String) extends DataType { 95 | val timeUnit: Option[TimeUnit] = 96 | unitStr match { 97 | case s if s.toLowerCase(Locale.ROOT).contains("nano") => Some(TimeUnit.NANOSECONDS) 98 | case s if s.toLowerCase(Locale.ROOT).contains("micro") => Some(TimeUnit.MICROSECONDS) 99 | case s if s.toLowerCase(Locale.ROOT).contains("milli") => Some(TimeUnit.MILLISECONDS) 100 | case _ => None 101 | } 102 | 103 | override def simpleName: String = timeUnit match { 104 | case Some(TimeUnit.NANOSECONDS) => "duration[ns]" 105 | case Some(TimeUnit.MICROSECONDS) => "duration[us]" 106 | case Some(TimeUnit.MILLISECONDS) => "duration[ms]" 107 | case _ => "duration" 108 | } 109 | } 110 | 111 | case class ListType(tpe: DataType) extends DataType { 112 | override def simpleName: String = "list" 113 | 114 | /** Borrowed from Apache Spark source to represent [[ListType]] as a tree string. */ 115 | private[polars] def buildFormattedString(prefix: String, buffer: StringBuffer): Unit = { 116 | buffer.append(s"$prefix-- element: ${tpe.simpleName}\n") 117 | DataType.buildFormattedString(tpe, s"$prefix |", buffer) 118 | } 119 | 120 | } 121 | 122 | case class StructType(fields: Array[Field]) extends DataType { 123 | override def simpleName: String = "struct" 124 | 125 | def toSchema: Schema = Schema.fromFields(fields) 126 | 127 | /** Borrowed from Apache Spark source to represent [[StructType]] as a tree string. */ 128 | private[polars] def buildFormattedString(prefix: String, buffer: StringBuffer): Unit = 129 | fields.foreach(field => field.buildFormattedString(prefix, buffer)) 130 | } 131 | 132 | object DataType { 133 | 134 | private[polars] final val StringRegex: Regex = """^(?i)Utf8|LargeUtf8|String$""".r 135 | private[polars] final val BooleanRegex: Regex = """^(?i)Boolean$""".r 136 | private[polars] final val IntRegex: Regex = """^(?i)Int8|Int16|Int32|UInt8|UInt16|UInt32$""".r 137 | private[polars] final val LongRegex: Regex = """^(?i)Int64|UInt64$""".r 138 | private[polars] final val FloatRegex: Regex = """^(?i)Float32$""".r 139 | private[polars] final val DoubleRegex: Regex = """^(?i)Float64$""".r 140 | private[polars] final val DateRegex: Regex = """^(?i)Date|Date32|Date64$""".r 141 | 142 | def fromBasicType(typeStr: String): DataType = typeStr match { 143 | case StringRegex() => StringType 144 | case BooleanRegex() => BooleanType 145 | case IntRegex() => IntegerType 146 | case LongRegex() => LongType 147 | case FloatRegex() => FloatType 148 | case DoubleRegex() => DoubleType 149 | case DateRegex() => DateType 150 | case typeStr => 151 | throw new IllegalArgumentException(s"Unknown basic type `$typeStr` is not supported.") 152 | } 153 | 154 | def typeToDataType[T: ClassTag](): DataType = { 155 | val clazz = implicitly[ClassTag[T]].runtimeClass 156 | clazz match { 157 | case c if c == classOf[java.lang.Integer] || c == classOf[Int] => IntegerType 158 | case c if c == classOf[java.lang.Long] || c == classOf[Long] => LongType 159 | case c if c == classOf[java.lang.Boolean] || c == classOf[Boolean] => BooleanType 160 | case c if c == classOf[java.lang.Float] || c == classOf[Float] => FloatType 161 | case c if c == classOf[java.lang.Double] || c == classOf[Double] => DoubleType 162 | case c if c == 
classOf[java.time.LocalDate] => DateType 163 | case c if c == classOf[java.time.LocalTime] => TimeType 164 | case c if c == classOf[java.time.ZonedDateTime] => DateTimeType 165 | case c if c == classOf[java.lang.String] || c == classOf[String] => StringType 166 | case c if c == classOf[java.util.List[_]] => ListType 167 | case c => 168 | throw new IllegalArgumentException( 169 | s"Data type could not be found for class `${c.getSimpleName}`" 170 | ) 171 | } 172 | } 173 | 174 | /** Borrowed from Apache Spark source to represent [[DataType]] as a tree string. */ 175 | private[polars] def buildFormattedString( 176 | dataType: DataType, 177 | prefix: String, 178 | buffer: StringBuffer 179 | ): Unit = 180 | dataType match { 181 | case array: ListType => array.buildFormattedString(prefix, buffer) 182 | case struct: StructType => struct.buildFormattedString(prefix, buffer) 183 | case _ => 184 | } 185 | 186 | } 187 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/api/types/Schema.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.api.types 2 | 3 | import scala.jdk.CollectionConverters._ 4 | import scala.util.Try 5 | 6 | import org.polars.scala.polars.jsonMapper 7 | 8 | import com.fasterxml.jackson.databind.JsonNode 9 | import com.fasterxml.jackson.databind.node.JsonNodeType 10 | 11 | case class Field(name: String, dataType: DataType) { 12 | 13 | /** Borrowed from Apache Spark source to represent [[Field]] as a tree string. */ 14 | private[polars] def buildFormattedString(prefix: String, buffer: StringBuffer): Unit = { 15 | buffer.append(s"$prefix-- $name: ${dataType.simpleName} \n") 16 | DataType.buildFormattedString(dataType, s"$prefix |", buffer) 17 | } 18 | 19 | } 20 | 21 | class Schema { 22 | 23 | private var _fields: Array[Field] = _ 24 | 25 | def getFields: Array[Field] = _fields 26 | 27 | def getFieldNames: Array[String] = _fields.map(f => f.name) 28 | 29 | def getField(i: Int): Option[Field] = Try(getFields(i)).toOption 30 | 31 | def getField(name: String, ignoreCase: Boolean = false): Option[Field] = 32 | getFields.find { field => 33 | val fieldName = field.name 34 | if (ignoreCase) fieldName.equalsIgnoreCase(name) 35 | else fieldName.equals(name) 36 | } 37 | 38 | def getFieldIndex(name: String, ignoreCase: Boolean = false): Option[Int] = 39 | getField(name, ignoreCase).map(f => getFields.indexOf(f)) 40 | 41 | override def toString: String = treeString 42 | 43 | private def toField(field: (String, JsonNode, JsonNodeType)): Field = field match { 44 | // For Basic Types 45 | case (name, node, _ @JsonNodeType.STRING) => 46 | Field(name, DataType.fromBasicType(node.textValue())) 47 | 48 | // For Time Type 49 | case (name, node, _ @JsonNodeType.OBJECT) 50 | if node.hasNonNull("Time") || node.hasNonNull("Time32") || node.hasNonNull("Time64") => 51 | Seq(node.get("Time"), node.get("Time32"), node.get("Time64")) 52 | .map(Option(_)) 53 | .collectFirst { case Some(v) => v } match { 54 | case Some(timeUnit) => Field(name, TimeType(timeUnit.textValue())) 55 | 56 | case None => 57 | throw new IllegalArgumentException("Invalid time cannot be parsed.") 58 | } 59 | 60 | // For Duration Type 61 | case (name, node, _ @JsonNodeType.OBJECT) if node.hasNonNull("Duration") => 62 | val timeUnit = node.get("Duration") 63 | Field(name, Duration(timeUnit.textValue())) 64 | 65 | // For DateTime Type 66 | case (name, node, _ @JsonNodeType.OBJECT) if 
node.hasNonNull("Timestamp") => 67 | node.get("Timestamp").elements().asScala.map(_.asText(null)).toSeq match { 68 | case Seq(tu, tz) => 69 | Field(name, DateTimeType(tu, tz)) 70 | case _ => 71 | Field(name, DateTimeType(null, null)) 72 | } 73 | 74 | // For (Nested) List Type 75 | case (name, node, _ @JsonNodeType.OBJECT) 76 | if node.hasNonNull("List") || node.hasNonNull("LargeList") => 77 | Seq(node.get("List"), node.get("LargeList")) 78 | .map(Option(_)) 79 | .collectFirst { case Some(v) => v } match { 80 | case Some(listNode) => 81 | val listNodeType = listNode.get("dtype") 82 | Field(name, ListType(toField((name, listNodeType, listNodeType.getNodeType)).dataType)) 83 | 84 | case None => 85 | throw new IllegalArgumentException("Invalid list cannot be parsed as a JSON.") 86 | } 87 | 88 | // For (Nested) Struct Type 89 | case (name, node, _ @JsonNodeType.OBJECT) if node.has("Struct") => 90 | val structNode = node.get("Struct") 91 | val structFields = structNode.iterator().asScala 92 | val sf = structFields.map { 93 | case node: JsonNode if node.hasNonNull("name") && node.hasNonNull("dtype") => 94 | val structFieldName: String = node.get("name").textValue() 95 | val structFieldType: JsonNode = node.get("dtype") 96 | 97 | Field( 98 | structFieldName, 99 | toField(name, structFieldType, structFieldType.getNodeType).dataType 100 | ) 101 | 102 | case _ => 103 | throw new IllegalArgumentException("Invalid struct cannot be parsed as a JSON.") 104 | }.toArray 105 | 106 | Field(name, StructType(sf)) 107 | 108 | case _ => 109 | throw new IllegalArgumentException("Invalid field cannot be parsed as a JSON.") 110 | } 111 | 112 | private def setFields(fields: Array[Field]): Schema = { 113 | fields match { 114 | case f if f == null || f.isEmpty => 115 | throw new IllegalArgumentException("Provided fields cannot be null or empty.") 116 | 117 | case _ => 118 | _fields = fields 119 | } 120 | 121 | this 122 | } 123 | 124 | private def deserialize(json: String): Schema = { 125 | Try(jsonMapper.reader.readTree(json)).toOption match { 126 | case None => 127 | throw new IllegalArgumentException("Provided schema string cannot be parsed as a JSON.") 128 | 129 | case Some(node: JsonNode) if node.hasNonNull("fields") => 130 | val fields = node.get("fields").elements().asScala.toList 131 | _fields = fields 132 | .map(f => 133 | toField(f.get("name").textValue(), f.get("dtype"), f.get("dtype").getNodeType) 134 | ) 135 | .toArray 136 | 137 | case _ => 138 | throw new IllegalArgumentException("Provided schema string is an invalid JSON.") 139 | } 140 | 141 | this 142 | } 143 | 144 | /** Borrowed from Apache Spark source to represent Schema as a tree string. 
*/ 145 | private[polars] def treeString: String = { 146 | val stringBuffer = new StringBuffer() 147 | stringBuffer.append("root\n") 148 | val prefix = " |" 149 | getFields.foreach(field => field.buildFormattedString(prefix, stringBuffer)) 150 | 151 | stringBuffer.toString 152 | } 153 | 154 | } 155 | 156 | object Schema { 157 | def fromString(jsonString: String): Schema = new Schema().deserialize(jsonString) 158 | 159 | def fromFields(fields: Array[Field]): Schema = new Schema().setFields(fields) 160 | } 161 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/config/Config.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.config 2 | 3 | import java.io.File 4 | import java.nio.charset.StandardCharsets 5 | import java.nio.file.{Files, Paths, StandardOpenOption} 6 | 7 | import scala.jdk.CollectionConverters._ 8 | 9 | import org.polars.scala.polars.internal.jni.common 10 | import org.polars.scala.polars.jsonMapper 11 | 12 | private case class ConfigExport( 13 | environment: Map[String, String], 14 | local: Map[String, String] = Map.empty[String, String] 15 | ) 16 | 17 | class Config private (val options: Map[String, String]) { 18 | 19 | class ConfigUpdateBuilder private[config] () { 20 | private[this] val options = new java.util.HashMap[String, String]() 21 | 22 | /** Sets a configs from a Java Map. 23 | * 24 | * For more details, see 25 | * [[https://pola-rs.github.io/polars/py-polars/html/reference/config.html this.]] 26 | */ 27 | def withOptions(opts: java.util.Map[String, String]): ConfigUpdateBuilder = synchronized { 28 | withOptions(opts.asScala) 29 | 30 | this 31 | } 32 | 33 | /** Sets a configs from an Iterable of key and value pairs. 34 | * 35 | * For more details, see 36 | * [[https://pola-rs.github.io/polars/py-polars/html/reference/config.html this.]] 37 | */ 38 | def withOptions(opts: Iterable[(String, String)]): ConfigUpdateBuilder = synchronized { 39 | opts.foreach { case (key, value) => withOption(key, value) } 40 | 41 | this 42 | } 43 | 44 | /** Sets a config option from a key and value pair. 45 | * 46 | * For more details, see 47 | * [[https://pola-rs.github.io/polars/py-polars/html/reference/config.html this]] and 48 | * [[https://github.com/pola-rs/polars/blob/d3f4d63d6fcd02e4bddb445dc24ad8533f8b069d/py-polars/polars/config.py#L24 this]]. 49 | */ 50 | def withOption(key: String, value: String): ConfigUpdateBuilder = synchronized { 51 | (key, value) match { 52 | case (_, null) | (null, _) | (null, null) => 53 | throw new IllegalArgumentException("Config key or value cannot be null or empty.") 54 | 55 | case (k, v) => 56 | options.put(k.trim, v.trim) 57 | this 58 | } 59 | } 60 | 61 | /** Sets a configs from an existing file. */ 62 | def fromPath(path: String): ConfigUpdateBuilder = synchronized { 63 | val configFile = new File(path) 64 | 65 | if (!configFile.exists() || !configFile.isFile) 66 | throw new IllegalArgumentException("Provided path must point to an existing file.") 67 | 68 | fromPath(configFile) 69 | } 70 | 71 | /** Sets a configs from an existing file. */ 72 | def fromPath(file: File): ConfigUpdateBuilder = synchronized { 73 | val content: String = 74 | new String(Files.readAllBytes(Paths.get(file.toURI)), StandardCharsets.UTF_8) 75 | 76 | fromString(content) 77 | } 78 | 79 | /** Sets a configs from a JSON config string. 
*/ 80 | def fromString(configStr: String): ConfigUpdateBuilder = synchronized { 81 | val config = jsonMapper.readValue(configStr, classOf[ConfigExport]) 82 | 83 | withOptions(config.environment) 84 | this 85 | } 86 | 87 | /** Set table formatting style. 88 | * 89 | * For more details, see 90 | * [[https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.Config.set_tbl_formatting.html this.]] 91 | */ 92 | def withTableFormatting(format: TableFormats.TableFormat): ConfigUpdateBuilder = 93 | synchronized { 94 | options.put("POLARS_FMT_TABLE_FORMATTING", format.toString) 95 | this 96 | } 97 | 98 | /** Set the max number of columns used to print tables. 99 | * 100 | * If n < 0, then print all the columns. 101 | * 102 | * For more details, see 103 | * [[https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.Config.set_tbl_cols.html this.]] 104 | */ 105 | def withMaxTableColumns(nCols: Int): ConfigUpdateBuilder = synchronized { 106 | options.put("POLARS_FMT_MAX_COLS", nCols.toString) 107 | this 108 | } 109 | 110 | /** Set the max number of rows used to print tables. 111 | * 112 | * If n < 0, then print all the rows. 113 | * 114 | * For more details, see 115 | * [[https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.Config.set_tbl_rows.html this.]] 116 | */ 117 | def withMaxTableRows(nRows: Int): ConfigUpdateBuilder = synchronized { 118 | options.put("POLARS_FMT_MAX_ROWS", nRows.toString) 119 | this 120 | } 121 | 122 | /** Print the dataframe shape below the dataframe when displaying tables. 123 | * 124 | * For more details, see 125 | * [[https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.Config.set_tbl_dataframe_shape_below.html this.]] 126 | */ 127 | def withDataFrameShapeBelow(active: Boolean): ConfigUpdateBuilder = synchronized { 128 | options.put("POLARS_FMT_TABLE_DATAFRAME_SHAPE_BELOW", if (active) "1" else "0") 129 | this 130 | } 131 | 132 | /** Clear the current state of config. */ 133 | def reset(): ConfigUpdateBuilder = { 134 | options.clear() 135 | this 136 | } 137 | 138 | /** Applies current configuration in a persistent way. */ 139 | def apply(): Boolean = synchronized { 140 | Config.updateConfig(new Config(options.asScala.toMap)) 141 | common.setConfigs(options) 142 | } 143 | } 144 | 145 | /** Creates a builder for Polars [[Config]]. */ 146 | def update(): ConfigUpdateBuilder = new ConfigUpdateBuilder() 147 | 148 | /** Save the config to a specified path as a JSON config string. */ 149 | def saveTo(path: String, overwrite: Boolean): Unit = { 150 | val configFile = new File(path) 151 | 152 | saveTo(configFile, overwrite) 153 | } 154 | 155 | /** Save the config to a specified path as a JSON config string. 
*/ 156 | def saveTo(path: File, overwrite: Boolean): Unit = synchronized { 157 | val configStr = this.toString 158 | 159 | if (path.exists() && path.isDirectory) 160 | throw new IllegalArgumentException("Provided path points to an existing directory.") 161 | 162 | val openOption = 163 | if (overwrite) Nil else Seq(StandardOpenOption.CREATE_NEW) 164 | 165 | Files.write( 166 | Paths.get(path.toURI), 167 | s"$configStr\n".getBytes(StandardCharsets.UTF_8), 168 | openOption: _* 169 | ) 170 | } 171 | 172 | override def toString: String = 173 | jsonMapper.writeValueAsString(ConfigExport(environment = options)) 174 | } 175 | 176 | object Config { 177 | 178 | private var _instance: Config = _ 179 | 180 | private[polars] def updateConfig(config: Config): Unit = synchronized { 181 | _instance = config 182 | } 183 | 184 | private[polars] def getConfig: Config = synchronized { 185 | Option(_instance) match { 186 | case None => 187 | _instance = new Config(Map.empty[String, String]) 188 | 189 | case _ => 190 | } 191 | 192 | _instance 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/config/constants.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.config 2 | 3 | object TableFormats extends Enumeration { 4 | type TableFormat = Value 5 | 6 | val NOTHING, ASCII_FULL, ASCII_FULL_CONDENSED, ASCII_NO_BORDERS, ASCII_BORDERS_ONLY, 7 | ASCII_BORDERS_ONLY_CONDENSED, ASCII_HORIZONTAL_ONLY, ASCII_MARKDOWN, UTF8_FULL, 8 | UTF8_FULL_CONDENSED, UTF8_NO_BORDERS, UTF8_BORDERS_ONLY, UTF8_HORIZONTAL_ONLY = Value 9 | } 10 | 11 | object UniqueKeepStrategies extends Enumeration { 12 | type UniqueKeepStrategy = Value 13 | 14 | val first, last, any, none = Value 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/functions.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars 2 | 3 | import java.time.format.DateTimeFormatter 4 | import java.time.{LocalDate, LocalTime, ZonedDateTime} 5 | 6 | import org.polars.scala.polars.api.expressions.{Column, Expression} 7 | import org.polars.scala.polars.internal.jni.expressions.{column_expr, literal_expr} 8 | 9 | object functions { 10 | 11 | def col(name: String): Column = Column.from(name) 12 | 13 | def lit(value: Any): Expression = { 14 | val ptr = value match { 15 | case null => literal_expr.nullLit() 16 | case v: Expression => v.ptr 17 | case v: Boolean => literal_expr.fromBool(v) 18 | case v: Int => literal_expr.fromInt(v) 19 | case v: Long => literal_expr.fromLong(v) 20 | case v: Float => literal_expr.fromFloat(v) 21 | case v: Double => literal_expr.fromDouble(v) 22 | case v: LocalDate => 23 | literal_expr.fromDate(java.time.format.DateTimeFormatter.ISO_LOCAL_DATE.format(v)) 24 | case v: LocalTime => 25 | literal_expr.fromTime(java.time.format.DateTimeFormatter.ISO_LOCAL_TIME.format(v)) 26 | case v: ZonedDateTime => 27 | literal_expr.fromDateTime( 28 | java.time.format.DateTimeFormatter.ISO_LOCAL_DATE_TIME.format(v) 29 | ) 30 | case v: String => literal_expr.fromString(v) 31 | case _ => 32 | throw new IllegalArgumentException( 33 | s"Unsupported value `$value` of type `${value.getClass.getSimpleName}` was provided." 
34 | ) 35 | } 36 | 37 | Expression.withPtr(ptr) 38 | } 39 | 40 | def desc(col_name: String): Expression = { 41 | val ptr = column_expr.sort_column_by_name(col_name, descending = true) 42 | Expression.withPtr(ptr) 43 | } 44 | 45 | def asc(col_name: String): Expression = { 46 | val ptr = column_expr.sort_column_by_name(col_name, descending = false) 47 | Expression.withPtr(ptr) 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/Natively.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni 2 | 3 | import org.polars.scala.polars.loadLibraryIfRequired 4 | 5 | private[jni] trait Natively { loadLibraryIfRequired() } 6 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/common.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni 2 | 3 | private[polars] object common extends Natively { 4 | 5 | @native def version(): String 6 | 7 | @native def setConfigs(options: java.util.Map[String, String]): Boolean 8 | 9 | } 10 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/data_frame.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni 2 | 3 | private[polars] object data_frame extends Natively { 4 | 5 | @native def concatDataFrames(ptrs: Array[Long]): Long 6 | 7 | @native def schemaString(ptr: Long): String 8 | 9 | @native def toLazy(ptr: Long): Long 10 | 11 | @native def show(ptr: Long): Unit 12 | 13 | @native def count(ptr: Long): Long 14 | 15 | @native def limit(ptr: Long, n: Long): Long 16 | 17 | @native def tail(ptr: Long, n: Long): Long 18 | 19 | @native def fromSeries(ptrs: Array[Long]): Long 20 | 21 | } 22 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/expressions/column_expr.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni.expressions 2 | 3 | import org.polars.scala.polars.internal.jni.Natively 4 | 5 | private[polars] object column_expr extends Natively { 6 | 7 | @native def column(name: String): Long 8 | 9 | @native def sort_column_by_name(name: String, descending: Boolean): Long 10 | 11 | @native def applyUnary(ptr: Long, op: Int): Long 12 | 13 | @native def applyBinary(leftPtr: Long, rightPtr: Long, op: Int): Long 14 | 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/expressions/literal_expr.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni.expressions 2 | 3 | import org.polars.scala.polars.internal.jni.Natively 4 | 5 | private[polars] object literal_expr extends Natively { 6 | 7 | @native def nullLit(): Long 8 | 9 | @native def fromString(value: String): Long 10 | 11 | @native def fromBool(value: Boolean): Long 12 | 13 | @native def fromInt(value: Int): Long 14 | 15 | @native def fromLong(value: Long): Long 16 | 17 | @native def fromFloat(value: Float): Long 18 | 19 | @native def fromDouble(value: 
Double): Long 20 | 21 | @native def fromDate(value: String): Long 22 | 23 | @native def fromTime(value: String): Long 24 | 25 | @native def fromDateTime(value: String): Long 26 | 27 | } 28 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/io/scan.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni.io 2 | 3 | import org.polars.scala.polars.internal.jni.Natively 4 | 5 | private[polars] object scan extends Natively { 6 | 7 | @native def scanParquet(paths: Array[String], options: String): Long 8 | 9 | @native def scanIPC(paths: Array[String], options: String): Long 10 | 11 | @native def scanCSV(paths: Array[String], options: String): Long 12 | 13 | @native def scanJsonLines(paths: Array[String], options: String): Long 14 | 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/io/write.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni.io 2 | 3 | import org.polars.scala.polars.internal.jni.Natively 4 | 5 | object write extends Natively { 6 | 7 | @native def writeParquet( 8 | ptr: Long, 9 | filePath: String, 10 | options: String 11 | ): Unit 12 | 13 | @native def writeIPC( 14 | ptr: Long, 15 | filePath: String, 16 | options: String 17 | ): Unit 18 | 19 | @native def writeAvro( 20 | ptr: Long, 21 | filePath: String, 22 | options: String 23 | ): Unit 24 | 25 | @native def writeCSV( 26 | ptr: Long, 27 | filePath: String, 28 | options: String 29 | ): Unit 30 | 31 | @native def writeJson( 32 | ptr: Long, 33 | filePath: String, 34 | options: String 35 | ): Unit 36 | 37 | } 38 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/lazy_frame.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni 2 | 3 | private[polars] object lazy_frame extends Natively { 4 | 5 | @native def concatLazyFrames(ptrs: Array[Long], reChunk: Boolean, parallel: Boolean): Long 6 | 7 | @native def schemaString(ptr: Long): String 8 | 9 | @native def selectFromStrings(ptr: Long, cols: Array[String]): Long 10 | 11 | @native def selectFromExprs(ptr: Long, exprs: Array[Long]): Long 12 | 13 | @native def filterFromExprs(ldfPtr: Long, exprPtr: Long): Long 14 | 15 | @native def limit(ptr: Long, n: Long): Long 16 | 17 | @native def tail(ptr: Long, n: Long): Long 18 | 19 | @native def drop(ptr: Long, cols: Array[String]): Long 20 | 21 | @native def drop_nulls(ptr: Long, subset: Array[String]): Long 22 | 23 | @native def rename(ptr: Long, mapping: java.util.Map[String, String]): Long 24 | 25 | @native def sortFromExprs( 26 | ldfPtr: Long, 27 | exprPtrs: Array[Long], 28 | nullLast: Array[Boolean], 29 | maintainOrder: Boolean 30 | ): Long 31 | 32 | @native def topKFromExprs( 33 | ldfPtr: Long, 34 | k: Int, 35 | exprPtrs: Array[Long], 36 | nullLast: Array[Boolean], 37 | maintainOrder: Boolean 38 | ): Long 39 | 40 | @native def withColumn(ldfPtr: Long, name: String, exprPtr: Long): Long 41 | 42 | @native def unique(ptr: Long, subset: Array[String], keep: String, maintainOrder: Boolean): Long 43 | 44 | @native def explain(ptr: Long, optimized: Boolean, tree_format: Boolean): String 45 | 46 | @native def set_sorted(ptr: Long, 
mapping: java.util.Map[String, Boolean]): Long 47 | 48 | @native def cache(ptr: Long): Long 49 | 50 | @native def collect(ptr: Long): Long 51 | 52 | @native def optimization_toggle( 53 | ptr: Long, 54 | typeCoercion: Boolean, 55 | predicatePushdown: Boolean, 56 | projectionPushdown: Boolean, 57 | simplifyExpr: Boolean, 58 | slicePushdown: Boolean, 59 | commSubplanElim: Boolean, 60 | commSubexprElim: Boolean, 61 | streaming: Boolean 62 | ): Long 63 | 64 | } 65 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/row.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni 2 | 3 | private[polars] object row extends Natively { 4 | 5 | @native def createIterator(dfPtr: Long, nRows: Long): Long 6 | 7 | @native def advanceIterator(ptr: Long): Array[Object] 8 | 9 | @native def schemaString(ptr: Long): String 10 | 11 | } 12 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/internal/jni/series.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala.polars.internal.jni 2 | 3 | private[polars] object series extends Natively { 4 | 5 | @native def show(ptr: Long): Unit 6 | 7 | @native def new_str_series(name: String, data: Array[String]): Long 8 | 9 | @native def new_int_series(name: String, data: Array[Int]): Long 10 | 11 | @native def new_float_series(name: String, data: Array[Float]): Long 12 | 13 | @native def new_double_series(name: String, data: Array[Double]): Long 14 | 15 | @native def new_long_series(name: String, data: Array[Long]): Long 16 | 17 | @native def new_boolean_series(name: String, data: Array[Boolean]): Long 18 | 19 | @native def new_date_series(name: String, data: Array[String]): Long 20 | 21 | @native def new_datetime_series(name: String, data: Array[String]): Long 22 | 23 | @native def new_time_series(name: String, data: Array[String]): Long 24 | 25 | @native def new_list_series(name: String, ptrs: Array[Long]): Long 26 | 27 | @native def new_struct_series(name: String, ptrs: Array[Long]): Long 28 | 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/scala/org/polars/scala/polars/package.scala: -------------------------------------------------------------------------------- 1 | package org.polars.scala 2 | 3 | import java.util.concurrent.atomic.AtomicReference 4 | 5 | import scala.util.{Failure, Success, Try} 6 | 7 | import com.fasterxml.jackson.databind.SerializationFeature 8 | import com.fasterxml.jackson.databind.json.JsonMapper 9 | import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule 10 | import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule} 11 | 12 | package object polars { 13 | 14 | private final val NATIVE_LIB_NAME = "scala_polars" 15 | 16 | private[polars] val libraryLoaded = 17 | new AtomicReference[LibraryStates.LibraryState](LibraryStates.NOT_LOADED) 18 | 19 | final val jsonMapper = 20 | JsonMapper 21 | .builder() 22 | .addModules( 23 | DefaultScalaModule, 24 | new JavaTimeModule() 25 | ) 26 | .build() :: ClassTagExtensions 27 | 28 | jsonMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS) 29 | 30 | private[polars] def loadLibraryIfRequired(): Unit = { 31 | if (libraryLoaded.get() == LibraryStates.LOADED) 32 | return 33 | 34 | if 
(libraryLoaded.compareAndSet(LibraryStates.NOT_LOADED, LibraryStates.LOADING)) { 35 | Try(NativeLoader.load(NATIVE_LIB_NAME)) match { 36 | case Success(_) => 37 | libraryLoaded.set(LibraryStates.LOADED) 38 | 39 | case Failure(e) => 40 | libraryLoaded.set(LibraryStates.NOT_LOADED) 41 | throw new RuntimeException(s"Unable to load the `$NATIVE_LIB_NAME` native library.", e) 42 | } 43 | 44 | return 45 | } 46 | 47 | while (libraryLoaded.get() == LibraryStates.LOADING) 48 | Thread.sleep(10) 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /core/src/site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Project Documentation 6 | 16 | 17 | 18 | Go to the project documentation 19 | 20 | 21 | -------------------------------------------------------------------------------- /examples/src/main/java/examples/java/InstantiateDataFrame.java: -------------------------------------------------------------------------------- 1 | package examples.java; 2 | 3 | import java.util.Arrays; 4 | import org.polars.scala.polars.api.DataFrame; 5 | import org.polars.scala.polars.api.Series; 6 | 7 | public class InstantiateDataFrame { 8 | 9 | public static void main(String[] args) { 10 | DataFrame.fromSeries(Series.ofBoolean("bool_col", new boolean[] {true, false, true})).show(); 11 | 12 | DataFrame.fromSeries( 13 | Series.ofInt("i32_col", new int[] {1, 2, 3}), 14 | Series.ofLong("i64_col", new long[] {1L, 2L, 3L}), 15 | Series.ofBoolean("bool_col", new boolean[] {true, false, true}), 16 | Series.ofList( 17 | "nested_str_col", 18 | new String[][] { 19 | {"a", "b", "c"}, 20 | {"a", "b", "c"}, 21 | {"a", "b", "c"}, 22 | })) 23 | .show(); 24 | 25 | /* Values as Java array(s) */ 26 | 27 | DataFrame.fromSeries( 28 | Series.ofInt("i32_col", new int[] {1, 2, 3}), 29 | new Series[] { 30 | Series.ofLong("i64_col", new long[] {1L, 2L, 3L}), 31 | Series.ofBoolean("bool_col", new boolean[] {true, false, true}), 32 | Series.ofList( 33 | "nested_str_col", 34 | new String[][] { 35 | {"a", "b", "c"}, 36 | {"a", "b", "c"}, 37 | {"a", "b", "c"}, 38 | }), 39 | }) 40 | .show(); 41 | 42 | DataFrame.fromSeries( 43 | Series.ofInt("i32_col", new Integer[] {1, 2, 3}), 44 | new Series[] { 45 | Series.ofLong("i64_col", new Long[] {1L, 2L, 3L}), 46 | Series.ofBoolean("bool_col", new Boolean[] {true, false, true}), 47 | Series.ofFloat("f32_col", new Float[] {1F, 2F, 3F}), 48 | }) 49 | .show(); 50 | 51 | /* Values as Java lists(s) */ 52 | 53 | DataFrame.fromSeries( 54 | Series.ofInt("i32_col", Arrays.asList(1, 2, 3)), 55 | new Series[] { 56 | Series.ofLong("i64_col", Arrays.asList(1L, 2L, 3L)), 57 | Series.ofBoolean("bool_col", Arrays.asList(true, false, true)), 58 | Series.ofFloat("f32_col", Arrays.asList(1F, 2F, 3F)), 59 | }) 60 | .show(); 61 | 62 | /* Values as a mix of Java lists(s) and array(s) */ 63 | 64 | DataFrame.fromSeries( 65 | Series.ofInt("i32_col", Arrays.asList(1, 2, 3)), 66 | new Series[] { 67 | Series.ofLong("i64_col", new Long[] {1L, 2L, 3L}), 68 | Series.ofBoolean("bool_col", new Boolean[] {true, false, true}), 69 | Series.ofFloat("f32_col", Arrays.asList(1F, 2F, 3F)), 70 | }) 71 | .show(); 72 | 73 | DataFrame.fromSeries( 74 | Series.ofInt("i32_col", Arrays.asList(1, 2, 3)), 75 | new Series[] { 76 | Series.ofLong("i64_col", new Long[] {1L, 2L, 3L}), 77 | Series.ofBoolean("bool_col", new Boolean[] {true, false, true}), 78 | Series.ofSeries( 79 | "struct_col", 80 | new Series[] { 81 | Series.ofLong("i64_col", new 
Long[] {1L, 2L, 3L}), 82 | Series.ofBoolean("bool_col", new Boolean[] {true, false, true}), 83 | Series.ofFloat("f32_col", Arrays.asList(1F, 2F, 3F)), 84 | }), 85 | }) 86 | .show(); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /examples/src/main/java/examples/java/InstantiateSeries.java: -------------------------------------------------------------------------------- 1 | package examples.java; 2 | 3 | import java.util.Arrays; 4 | import java.util.Collections; 5 | import org.polars.scala.polars.api.Series; 6 | 7 | public class InstantiateSeries { 8 | public static void main(String[] args) { 9 | 10 | /* Values as Java array/ list of Basic Types */ 11 | 12 | // int or java.lang.Integer 13 | Series.ofInt("series_i32_java_array_primitive", new int[] {1, 2, 3}).show(); 14 | Series.ofInt("series_i32_java_array", new java.lang.Integer[] {1, 2, 3}).show(); 15 | Series.ofInt("series_i32_java_list", Arrays.asList(1, 2, 3)).show(); 16 | 17 | // long or java.lang.Long 18 | Series.ofLong("series_i64_java_array_primitive", new long[] {1L, 2L, 3L}).show(); 19 | Series.ofLong("series_i64_java_array", new java.lang.Long[] {1L, 2L, 3L}).show(); 20 | Series.ofLong("series_i64_java_list", Arrays.asList(1L, 2L, 3L)).show(); 21 | 22 | // float or java.lang.Float 23 | Series.ofFloat("series_f32_java_array_primitive", new float[] {1f, 2f, 3f}).show(); 24 | Series.ofFloat("series_f32_java_array", new java.lang.Float[] {1f, 2f, 3f}).show(); 25 | Series.ofFloat("series_f32_java_list", Arrays.asList(1f, 2f, 3f)).show(); 26 | 27 | // double or java.lang.Double 28 | Series.ofDouble("series_f64_java_array_primitive", new double[] {1d, 2d, 3d}).show(); 29 | Series.ofDouble("series_f64_java_array", new java.lang.Double[] {1d, 2d, 3d}).show(); 30 | Series.ofDouble("series_f64_java_list", Arrays.asList(1d, 2d, 3d)).show(); 31 | 32 | // boolean or java.lang.Boolean 33 | Series.ofBoolean("series_bool_java_array_primitive", new boolean[] {true, false, true, true}) 34 | .show(); 35 | Series.ofBoolean("series_bool_java_array", new java.lang.Boolean[] {true, false, true, true}) 36 | .show(); 37 | Series.ofBoolean("series_bool_java_list", Arrays.asList(true, false, true, true)).show(); 38 | 39 | // String 40 | Series.ofString("series_str_java_array_primitive", new String[] {"a", "b"}).show(); 41 | Series.ofString("series_str_java_list", Arrays.asList("a", "b")).show(); 42 | 43 | // java.time.LocalDate 44 | Series.ofDate( 45 | "series_date_java_array_primitive", 46 | new java.time.LocalDate[] {java.time.LocalDate.now()}) 47 | .show(); 48 | Series.ofDate("series_date_java_list", Collections.singletonList(java.time.LocalDate.now())) 49 | .show(); 50 | 51 | // java.time.LocalTime 52 | Series.ofTime( 53 | "series_time_java_array_primitive", 54 | new java.time.LocalTime[] {java.time.LocalTime.now()}) 55 | .show(); 56 | Series.ofTime("series_time_java_list", Collections.singletonList(java.time.LocalTime.now())) 57 | .show(); 58 | 59 | // java.time.ZonedDateTime 60 | Series.ofDateTime( 61 | "series_datetime_java_array_primitive", 62 | new java.time.ZonedDateTime[] {java.time.ZonedDateTime.now()}) 63 | .show(); 64 | Series.ofDateTime( 65 | "series_datetime_java_list", Collections.singletonList(java.time.ZonedDateTime.now())) 66 | .show(); 67 | 68 | /* Values as Java array/ list of Nested List Types */ 69 | 70 | // int or java.lang.Integer 71 | Series.ofList("series_list_int_java_array", new java.lang.Integer[][] {{1, 2, 3}}).show(); 72 | Series.ofList("series_list_int_java_list", 
Collections.singletonList(Arrays.asList(1, 2, 3))) 73 | .show(); 74 | 75 | // String 76 | Series.ofList("series_list_str_java_array", new String[][] {{"a", "b"}}).show(); 77 | Series.ofList("series_list_str_java_list", Collections.singletonList(Arrays.asList("a", "b"))) 78 | .show(); 79 | 80 | // Deep Nested 81 | Series.ofList("series_list_list_str_java_array", new String[][][] {{{"a", "b"}}}).show(); 82 | Series.ofList( 83 | "series_list_list_str_java_list", 84 | Collections.singletonList(Collections.singletonList(Arrays.asList("a", "b")))) 85 | .show(); 86 | 87 | /* Values as Java array/ list of Struct Types */ 88 | 89 | Series.ofSeries( 90 | "series_struct_java_array", 91 | new Series[] { 92 | Series.ofInt("int_col", new int[] {1, 2, 3}), 93 | Series.ofString("str_col", new String[] {"a", "b", "c"}), 94 | Series.ofBoolean("bool_col", new boolean[] {true, false, true}), 95 | }) 96 | .show(); 97 | Series.ofSeries( 98 | "series_struct_java_list", 99 | Arrays.asList( 100 | Series.ofInt("int_col", Arrays.asList(1, 2, 3)), 101 | Series.ofString("str_col", Arrays.asList("a", "b", "c")), 102 | Series.ofBoolean("bool_col", Arrays.asList(true, false, true)))) 103 | .show(); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /examples/src/main/java/examples/java/configuration/ConfiguringPolars.java: -------------------------------------------------------------------------------- 1 | package examples.java.configuration; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.nio.file.Files; 6 | import java.nio.file.Path; 7 | import java.util.HashMap; 8 | import org.polars.scala.polars.Polars; 9 | 10 | public class ConfiguringPolars { 11 | 12 | public static void main(String[] args) throws IOException { 13 | 14 | /* Checking the version scala-polars is compiled against. */ 15 | String version = Polars.version(); 16 | System.out.printf("scala-polars has been compiled against version '%s'%n%n", version); 17 | 18 | /* Get default configuration. */ 19 | System.out.printf("Default Configuration:%n%s%n%n", Polars.config()); 20 | 21 | /* Updating configuration. 
*/ 22 | 23 | /* Update the number of rows shown while doing `df.show()` */ 24 | Polars.config().update().withMaxTableRows(20).apply(); 25 | System.out.printf("After updating number of rows:%n%s%n%n", Polars.config()); 26 | 27 | /* Update the number of columns shown while doing `df.show()` */ 28 | Polars.config().update().withMaxTableColumns(20).apply(); 29 | System.out.printf("After updating number of columns:%n%s%n%n", Polars.config()); 30 | 31 | /* Reset config */ 32 | Polars.config().update().reset().apply(); 33 | System.out.printf("After resetting config:%n%s%n%n", Polars.config()); 34 | 35 | /* Chaining configuration options */ 36 | HashMap options = new HashMap<>(); 37 | options.put("POLARS_TABLE_WIDTH", "5000"); 38 | 39 | Polars.config() 40 | .update() 41 | .withMaxTableRows(20) 42 | .withMaxTableColumns(20) 43 | .withOption("POLARS_FMT_TABLE_CELL_ALIGNMENT", "RIGHT") 44 | .withOptions(options) 45 | .apply(); 46 | 47 | System.out.printf("After chained configs:%n%s%n%n", Polars.config()); 48 | 49 | /* Persisting current configuration to file */ 50 | Path tempDirectory = Files.createTempDirectory("polars-config-"); 51 | File tempFile = Files.createTempFile(tempDirectory, "temp-polars-config-", "plcfg").toFile(); 52 | Polars.config().saveTo(tempFile, true); 53 | 54 | /* Reloading current configuration to file */ 55 | Polars.config().update().reset().apply(); 56 | System.out.printf("After resetting config:%n%s%n%n", Polars.config()); 57 | 58 | Polars.config().update().fromPath(tempFile).apply(); 59 | System.out.printf("After reloading config from file path:%n%s%n", Polars.config()); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /examples/src/main/java/examples/java/expressions/ApplyingSimpleExpressions.java: -------------------------------------------------------------------------------- 1 | package examples.java.expressions; 2 | 3 | import static org.polars.scala.polars.functions.*; 4 | 5 | import examples.scala.utils.CommonUtils; 6 | import java.util.Collections; 7 | import java.util.Random; 8 | import org.polars.scala.polars.Polars; 9 | import org.polars.scala.polars.api.DataFrame; 10 | import org.polars.scala.polars.api.LazyFrame; 11 | 12 | public class ApplyingSimpleExpressions { 13 | 14 | public static void main(String[] args) { 15 | /* Read a dataset as a DataFrame lazily or eagerly */ 16 | String path = CommonUtils.getResource("/files/web-ds/data.json"); 17 | LazyFrame input = Polars.scan().jsonLines(path); 18 | 19 | /* Apply multiple operations on the LazyFrame or DataFrame */ 20 | LazyFrame ldf = 21 | input 22 | .cache() 23 | .select("id", "name") 24 | .with_column("lower_than_four", col("id").lessThanEqualTo(4)) 25 | .filter(col("lower_than_four")) 26 | .with_column("long_value", lit(new Random().nextLong())) 27 | .with_column("date", lit(java.time.LocalDate.now())) 28 | .with_column("time", lit(java.time.LocalTime.now())) 29 | .with_column("current_ts", lit(java.time.ZonedDateTime.now())) 30 | .sort(asc("name"), true, false) 31 | .set_sorted(Collections.singletonMap("name", false)) 32 | .top_k(2, "id", true, true, false) 33 | .limit(2) // .head(2) 34 | .tail(2) 35 | .drop("long_value") 36 | .rename("lower_than_four", "less_than_four") 37 | .drop_nulls(); 38 | 39 | ldf = Polars.concat(ldf, new LazyFrame[] {ldf, ldf}); 40 | ldf = ldf.unique(); 41 | 42 | System.out.println("Showing LazyFrame plan to stdout."); 43 | ldf.explain(); 44 | 45 | DataFrame df = ldf.collect(); 46 | 47 | System.out.println("Showing resultant 
DataFrame to stdout."); 48 | df.show(); 49 | 50 | System.out.printf("Total rows: %s%n%n", df.count()); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /examples/src/main/java/examples/java/io/LazyAndEagerAPI.java: -------------------------------------------------------------------------------- 1 | package examples.java.io; 2 | 3 | import examples.scala.utils.CommonUtils; 4 | import org.polars.scala.polars.Polars; 5 | import org.polars.scala.polars.api.DataFrame; 6 | import org.polars.scala.polars.api.LazyFrame; 7 | import org.polars.scala.polars.api.Row; 8 | import scala.collection.Iterator; 9 | 10 | /** 11 | * Polars provides 2 API for reading datasets lazily ({@code scan}) or eagerly ({@code read}). 12 | * 13 | *

These APIs serve different purposes and result in either a {@link LazyFrame} or a {@link 14 | * DataFrame}. A LazyFrame can be materialized to a DataFrame and vice-versa if required. 15 | * 16 | *

17 | * 18 | *

Lazy API

19 | * 20 | * With the lazy API Polars doesn't run each query line-by-line but instead processes the full query 21 | * end-to-end. To get the most out of Polars it is important that you use the lazy API because: 22 | * 23 | *
    24 | *
  • the lazy API allows Polars to apply automatic query optimization with the query optimizer. 25 | *
  • the lazy API allows you to work with larger-than-memory datasets using streaming. 26 | *
  • the lazy API can catch schema errors before processing the data. 27 | *
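As a rough illustration of these points only — not part of the original example file, with a placeholder path and calls mirroring the ones used in the examples further below — a lazy query is built first and only materialized at the end:

import static org.polars.scala.polars.functions.*;

import org.polars.scala.polars.Polars;
import org.polars.scala.polars.api.DataFrame;
import org.polars.scala.polars.api.LazyFrame;

public class LazyQuerySketch {
  public static void main(String[] args) {
    // Build the query lazily; Polars sees the whole plan before reading anything.
    LazyFrame ldf =
        Polars.scan()
            .csv("/path/to/data.csv") // placeholder path
            .select("id", "name")
            .with_column("lower_than_four", col("id").lessThanEqualTo(4))
            .filter(col("lower_than_four"));

    // Only collect() executes the (optimized) plan and returns an eager DataFrame.
    DataFrame df = ldf.collect();
    df.show();
  }
}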
28 | * 29 | *

More info can be found in the Polars documentation. 31 | * 32 | *

33 | * 34 | *

Eager API

35 | * 36 | * With eager API the queries are executed line-by-line in contrast to the lazy API. 37 | */ 38 | public class LazyAndEagerAPI { 39 | 40 | public static void main(String[] args) { 41 | /* Lazily read data from file based datasets */ 42 | String path = CommonUtils.getResource("/files/web-ds/data.csv"); 43 | LazyFrame ldf = Polars.scan().option("scan_csv_n_rows", "2").csv(path); 44 | 45 | /* Materialize LazyFrame to DataFrame */ 46 | DataFrame df = ldf.collect(); 47 | 48 | System.out.println("Showing CSV file as a DataFrame to stdout"); 49 | df.show(); 50 | 51 | System.out.printf("Total rows: %s%n%n", df.count()); 52 | System.out.printf("Total columns: %s%n%n", df.schema().getFields().length); 53 | 54 | /* Lazily read only first 3 rows */ 55 | df = Polars.scan().option("scan_csv_n_rows", "3").csv(path).collect(); 56 | System.out.printf("Total rows: %s%n%n", df.count()); 57 | 58 | System.out.println("Rows:"); 59 | Iterator rows = df.rows(); 60 | 61 | while (rows.hasNext()) { 62 | System.out.println(rows.next()); 63 | } 64 | System.out.println("\n"); 65 | 66 | /* Convert DataFrame back to LazyFrame */ 67 | LazyFrame backToLdf = df.toLazy(); 68 | System.out.printf("Show schema: %s%n%n", backToLdf.schema()); 69 | 70 | /* Eagerly read data from file based datasets */ 71 | df = Polars.scan().csv(path).collect(); 72 | 73 | System.out.println("Showing CSV file as a DataFrame to stdout"); 74 | df.show(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /examples/src/main/java/examples/java/io/ReadingFileDatasets.java: -------------------------------------------------------------------------------- 1 | package examples.java.io; 2 | 3 | import examples.scala.utils.CommonUtils; 4 | import org.polars.scala.polars.Polars; 5 | import org.polars.scala.polars.api.DataFrame; 6 | import org.polars.scala.polars.api.LazyFrame; 7 | 8 | /** 9 | * Polars supports various input file formats like the following, 10 | * 11 | *
    12 | *
  • {@link org.polars.scala.polars.api.io.Scannable#csv CSV} (delimited format like CSV, TSV, 13 | * etc.) 14 | *
  • {@link org.polars.scala.polars.api.io.Scannable#parquet Apache Parquet} 15 | *
  • {@link org.polars.scala.polars.api.io.Scannable#ipc Apache Arrow IPC} 16 | *
  • {@link org.polars.scala.polars.api.io.Scannable#jsonLines New line delimited JSON} 17 | *
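Each of the formats above has its own entry point on the scan builder; a minimal sketch (the paths are placeholders, not files from this repository) looks like:

import org.polars.scala.polars.Polars;
import org.polars.scala.polars.api.LazyFrame;

public class ScanEntryPointsSketch {
  public static void main(String[] args) {
    // Every call below returns a LazyFrame for the corresponding file format.
    LazyFrame csv = Polars.scan().csv("/path/data.csv");
    LazyFrame parquet = Polars.scan().parquet("/path/data.parquet");
    LazyFrame ipc = Polars.scan().ipc("/path/data.ipc");
    LazyFrame jsonLines = Polars.scan().jsonLines("/path/data.json");
  }
}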
18 | * 19 | *

All the above formats are compatible with the lazy or eager input API and users can supply 1 20 | * or more file paths which will be read in parallel to return a {@link LazyFrame} or a {@link 21 | * DataFrame}. 22 | * 23 | *

Since each format may have its own additional options (for example, the delimiter for the CSV format), 24 | * Polars allows a simple builder pattern which can be used to supply these options. 25 | *
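A rough sketch of that builder pattern — the option keys are the ones used in the example body further below, and the paths are placeholders:

import org.polars.scala.polars.Polars;
import org.polars.scala.polars.api.DataFrame;

public class ScanWithOptionsSketch {
  public static void main(String[] args) {
    // Options are plain string key/value pairs supplied before the input paths;
    // several paths can be passed and are read in parallel.
    DataFrame df =
        Polars.scan()
            .option("scan_parquet_n_rows", "3")
            .option("scan_parquet_cache", "false")
            .parquet("/path/a.parquet", "/path/b.parquet")
            .collect();

    df.show();
  }
}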

While the examples below have been provided for Parquet files only, they apply in the same way 27 | * to the other supported file formats. 28 | *

Some additional examples may also be found in {@link LazyAndEagerAPI}. 30 | */ 31 | public class ReadingFileDatasets { 32 | 33 | public static void main(String[] args) { 34 | 35 | /* For one Parquet file */ 36 | String path = CommonUtils.getResource("/files/web-ds/data.parquet"); 37 | DataFrame df = Polars.scan().parquet(path).collect(); 38 | 39 | System.out.println("Showing parquet file as a DataFrame to stdout."); 40 | df.show(); 41 | 42 | System.out.printf("Total rows: %s%n%n", df.count()); 43 | 44 | /* For multiple Parquet file(s) */ 45 | DataFrame multiLdf = Polars.scan().parquet(path, path, path).collect(); 46 | 47 | System.out.println("Showing multiple parquet files as 1 DataFrame to stdout."); 48 | multiLdf.show(); 49 | System.out.printf("Total rows: %s%n%n", multiLdf.count()); 50 | 51 | /* Providing additional options with Parquet file input */ 52 | DataFrame pqDfWithOpts = 53 | Polars.scan() 54 | .option("scan_parquet_low_memory", "true") 55 | .option("scan_parquet_n_rows", "3") 56 | .option("scan_parquet_cache", "false") 57 | .option("scan_parquet_row_index_name", "SerialNum") 58 | .parquet(path) 59 | .collect(); 60 | 61 | System.out.println("Showing parquet file as a DataFrame to stdout."); 62 | pqDfWithOpts.show(); 63 | 64 | System.out.printf("Total rows: %s%n%n", pqDfWithOpts.count()); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /examples/src/main/java/examples/java/io/WritingToFileDatasets.java: -------------------------------------------------------------------------------- 1 | package examples.java.io; 2 | 3 | import examples.scala.utils.CommonUtils; 4 | import org.polars.scala.polars.Polars; 5 | import org.polars.scala.polars.api.DataFrame; 6 | 7 | /** 8 | * Polars supports various output file formats like the following, 9 | * 10 | *

    11 | *
  • {@link org.polars.scala.polars.api.io.Writeable#parquet(String) Apache Parquet} 12 | *
  • {@link org.polars.scala.polars.api.io.Writeable#ipc(String) Apache Arrow IPC} 13 | *
14 | * 15 | *

A {@link DataFrame} can be written to object storage as a file in one of the supported 16 | * formats mentioned above. 17 | *
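For the object-storage case, a sketch along the lines of the S3 example further below — bucket, region and credentials are placeholders:

import org.polars.scala.polars.Polars;
import org.polars.scala.polars.api.DataFrame;

public class WriteToObjectStorageSketch {
  public static void main(String[] args) {
    DataFrame df = Polars.scan().ipc("/path/data.ipc").collect(); // placeholder local input

    // Storage-specific options are passed through the same writer builder.
    df.write()
        .option("write_mode", "overwrite")
        .option("aws_default_region", "us-east-2")
        .option("aws_access_key_id", "<access-key>")
        .option("aws_secret_access_key", "<secret-key>")
        .parquet("s3://my-bucket/output.pq");
  }
}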

Since each format and storage may have its own additional options, Polars allows a simple 19 | * builder pattern which can be used to supply these options. 20 | * 21 | *
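As a sketch, the same builder also accepts format-specific keys; the CSV keys below come from the writer's option documentation elsewhere in this project, and the paths are placeholders:

import org.polars.scala.polars.Polars;
import org.polars.scala.polars.api.DataFrame;

public class WriteCsvOptionsSketch {
  public static void main(String[] args) {
    DataFrame df = Polars.scan().ipc("/path/data.ipc").collect(); // placeholder input

    // Write CSV output, quoting non-numeric fields and writing "NULL" for missing values.
    df.write()
        .option("write_csv_quote_style", "non_numeric")
        .option("write_csv_null_value", "NULL")
        .option("write_mode", "overwrite")
        .csv("/path/output.csv");
  }
}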

While the examples below have been provided for Parquet files only, they also similarly apply 22 | * on the other supported file formats. 23 | */ 24 | public class WritingToFileDatasets { 25 | 26 | public static void main(String[] args) { 27 | 28 | /* Read a dataset as a DataFrame lazily or eagerly */ 29 | String path = CommonUtils.getResource("/files/web-ds/data.ipc"); 30 | DataFrame df = Polars.scan().ipc(path).collect(); 31 | 32 | System.out.println("Showing ipc file as a DataFrame to stdout."); 33 | df.show(); 34 | 35 | System.out.printf("Total rows: %s%n%n", df.count()); 36 | 37 | /* Write this DataFrame to local filesystem at the provided path */ 38 | String outputPath = CommonUtils.getOutputLocation("output.pq"); 39 | df.write().parquet(outputPath); 40 | System.out.printf("File written to location: %s%n%n", outputPath); 41 | 42 | /* Overwrite output if already exists */ 43 | df.write().option("write_mode", "overwrite").parquet(outputPath); 44 | System.out.printf("File overwritten at location: %s%n%n", outputPath); 45 | 46 | /* Write output file with compression */ 47 | df.write() 48 | .option("write_compression", "zstd") 49 | .option("write_mode", "overwrite") 50 | .option("write_parquet_stats", "full") 51 | .parquet(outputPath); 52 | System.out.printf("File overwritten at location: %s with compression%n%n", outputPath); 53 | 54 | /* Write output file to Amazon S3 object store */ 55 | String s3Path = "s3://bucket/output.pq"; 56 | df.write() 57 | .option("write_compression", "zstd") 58 | .option("write_mode", "overwrite") 59 | .option("write_parquet_stats", "full") 60 | .option("aws_default_region", "us‑east‑2") 61 | .option("aws_access_key_id", "ABC") 62 | .option("aws_secret_access_key", "XYZ") 63 | .parquet(s3Path); 64 | System.out.printf("File overwritten at location: %s with compression%n%n", s3Path); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /examples/src/main/resources/files/web-ds/data.csv: -------------------------------------------------------------------------------- 1 | id,name,created_utc,updated_on,comment_karma,link_karma 2 | 1,truman48lamb_jasonbroken,1397113470,1536527864,0,0 3 | 2,johnethen06_jasonbroken,1397113483,1536527864,0,0 4 | 3,yaseinrez_jasonbroken,1397113483,1536527864,0,1 5 | 4,Valve92_jasonbroken,1397113503,1536527864,0,0 6 | 5,srbhuyan_jasonbroken,1397113506,1536527864,0,0 7 | 6,taojianlong_jasonbroken,1397113510,1536527864,4,0 8 | 7,YourPalGrant92_jasonbroken,1397113513,1536527864,0,0 9 | 8,Lucki87_jasonbroken,1397113515,1536527864,0,0 10 | 9,punkstock_jasonbroken,1397113517,1536527864,0,0 11 | 10,duder_con_chile_jasonbroken,1397113519,1536527864,0,2 12 | 11,IHaveBigBalls_jasonbroken,1397113520,1536527864,0,0 13 | 12,Foggybanana_jasonbroken,1397113523,1536527864,0,0 14 | 13,Thedrinkdriver_jasonbroken,1397113527,1536527864,-9,0 15 | 14,littlemissd_jasonbroken,1397113530,1536527864,0,-3 16 | 15,phonethaway_jasonbroken,1397113537,1536527864,0,0 17 | 16,DreamingOfWinterfell_jasonbroken,1397113538,1536527864,0,0 18 | 17,ssaig_jasonbroken,1397113544,1536527864,1,0 19 | 18,divinetribe_jasonbroken,1397113549,1536527864,0,0 20 | 19,fdbvfdssdgfds_jasonbroken,1397113552,1536527864,3,0 21 | 20,hjtrsh54yh43_jasonbroken,1397113559,1536527864,-1,-1 22 | -------------------------------------------------------------------------------- /examples/src/main/resources/files/web-ds/data.ipc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chitralverma/scala-polars/9997baf445782bf8b5642a19d3346df2595b292b/examples/src/main/resources/files/web-ds/data.ipc -------------------------------------------------------------------------------- /examples/src/main/resources/files/web-ds/data.json: -------------------------------------------------------------------------------- 1 | {"id":1,"name":null,"created_utc":1397113470,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 2 | {"id":2,"name":"johnethen06_jasonbroken","created_utc":1397113483,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 3 | {"id":3,"name":"yaseinrez_jasonbroken","created_utc":1397113483,"updated_on":1536527864,"comment_karma":0,"link_karma":1} 4 | {"id":null,"name":"Valve92_jasonbroken","created_utc":1397113503,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 5 | {"id":5,"name":"srbhuyan_jasonbroken","created_utc":1397113506,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 6 | {"id":6,"name":"taojianlong_jasonbroken","created_utc":1397113510,"updated_on":1536527864,"comment_karma":4,"link_karma":0} 7 | {"id":7,"name":"YourPalGrant92_jasonbroken","created_utc":1397113513,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 8 | {"id":8,"name":"Lucki87_jasonbroken","created_utc":1397113515,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 9 | {"id":9,"name":"punkstock_jasonbroken","created_utc":1397113517,"updated_on":null,"comment_karma":0,"link_karma":0} 10 | {"id":10,"name":"duder_con_chile_jasonbroken","created_utc":1397113519,"updated_on":1536527864,"comment_karma":0,"link_karma":2} 11 | {"id":null,"name":"IHaveBigBalls_jasonbroken","created_utc":1397113520,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 12 | {"id":12,"name":"Foggybanana_jasonbroken","created_utc":1397113523,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 13 | {"id":13,"name":"Thedrinkdriver_jasonbroken","created_utc":1397113527,"updated_on":1536527864,"comment_karma":-9,"link_karma":0} 14 | {"id":14,"name":"littlemissd_jasonbroken","created_utc":1397113530,"updated_on":1536527864,"comment_karma":0,"link_karma":-3} 15 | {"id":15,"name":"phonethaway_jasonbroken","created_utc":1397113537,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 16 | {"id":16,"name":"DreamingOfWinterfell_jasonbroken","created_utc":1397113538,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 17 | {"id":17,"name":"ssaig_jasonbroken","created_utc":1397113544,"updated_on":1536527864,"comment_karma":1,"link_karma":0} 18 | {"id":18,"name":"divinetribe_jasonbroken","created_utc":1397113549,"updated_on":1536527864,"comment_karma":0,"link_karma":0} 19 | {"id":19,"name":"fdbvfdssdgfds_jasonbroken","created_utc":1397113552,"updated_on":1536527864,"comment_karma":3,"link_karma":0} 20 | {"id":20,"name":"hjtrsh54yh43_jasonbroken","created_utc":1397113559,"updated_on":1536527864,"comment_karma":-1,"link_karma":-1} 21 | -------------------------------------------------------------------------------- /examples/src/main/resources/files/web-ds/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chitralverma/scala-polars/9997baf445782bf8b5642a19d3346df2595b292b/examples/src/main/resources/files/web-ds/data.parquet -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/InstantiateDataFrame.scala: -------------------------------------------------------------------------------- 1 | package 
examples.scala 2 | 3 | import org.polars.scala.polars.api.{DataFrame, Series} 4 | 5 | object InstantiateDataFrame { 6 | 7 | def main(args: Array[String]): Unit = { 8 | DataFrame.fromSeries(Series.ofBoolean("bool_col", Array[Boolean](true, false, true))).show() 9 | 10 | DataFrame 11 | .fromSeries( 12 | Series.ofInt("i32_col", Array[Int](1, 2, 3)), 13 | Series.ofLong("i64_col", Array[Long](1L, 2L, 3L)), 14 | Series.ofBoolean("bool_col", Array[Boolean](true, false, true)), 15 | Series.ofList( 16 | "nested_str_col", 17 | Array[Array[String]](Array("a", "b", "c"), Array("a", "b", "c"), Array("a", "b", "c")) 18 | ) 19 | ) 20 | .show() 21 | 22 | /* Values as Scala array(s) */ 23 | DataFrame 24 | .fromSeries( 25 | Series.ofInt("i32_col", Array[Int](1, 2, 3)), 26 | Array[Series]( 27 | Series.ofLong("i64_col", Array[Long](1L, 2L, 3L)), 28 | Series.ofBoolean("bool_col", Array[Boolean](true, false, true)), 29 | Series.ofList( 30 | "nested_str_col", 31 | Array[Array[String]](Array("a", "b", "c"), Array("a", "b", "c"), Array("a", "b", "c")) 32 | ) 33 | ) 34 | ) 35 | .show() 36 | 37 | /* Values as scala lists(s) */ 38 | 39 | DataFrame 40 | .fromSeries( 41 | Series.ofInt("i32_col", Seq(1, 2, 3)), 42 | Array[Series]( 43 | Series.ofLong("i64_col", Seq(1L, 2L, 3L)), 44 | Series.ofBoolean("bool_col", Seq(true, false, true)), 45 | Series.ofFloat("f32_col", Seq(1f, 2f, 3f)) 46 | ) 47 | ) 48 | .show() 49 | 50 | /* Values as a mix of Scala lists(s) and array(s) */ 51 | 52 | DataFrame 53 | .fromSeries( 54 | Series.ofInt("i32_col", Seq(1, 2, 3)), 55 | Array[Series]( 56 | Series.ofLong("i64_col", Array[Long](1L, 2L, 3L)), 57 | Series.ofBoolean("bool_col", Array[Boolean](true, false, true)), 58 | Series.ofFloat("f32_col", Seq(1f, 2f, 3f)) 59 | ) 60 | ) 61 | .show() 62 | 63 | DataFrame 64 | .fromSeries( 65 | Series.ofInt("i32_col", Array[Int](1, 2, 3)), 66 | Series.ofLong("i64_col", Array[Long](1L, 2L, 3L)), 67 | Series.ofBoolean("bool_col", Array[Boolean](true, false, true)), 68 | Series.ofSeries( 69 | "struct_col", 70 | Array[Series]( 71 | Series.ofLong("i64_col", Array[Long](1L, 2L, 3L)), 72 | Series.ofBoolean("bool_col", Array[Boolean](true, false, true)), 73 | Series.ofFloat("f32_col", Seq(1f, 2f, 3f)) 74 | ) 75 | ) 76 | ) 77 | .show() 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/InstantiateSeries.scala: -------------------------------------------------------------------------------- 1 | package examples.scala 2 | 3 | import org.polars.scala.polars.api.Series 4 | 5 | object InstantiateSeries { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | /* Values as Scala array/ iterable of Basic Types */ 10 | 11 | // Int 12 | Series.ofInt("series_i32_scala_array", Array(1, 2, 3)).show() 13 | Series.ofInt("series_i32_scala_iterable", Seq(1, 2, 3)).show() 14 | 15 | // Long 16 | Series.ofLong("series_i64_scala_array", Array(1L, 2L, 3L)).show() 17 | Series.ofLong("series_i64_scala_iterable", Seq(1L, 2L, 3L)).show() 18 | 19 | // Float 20 | Series.ofFloat("series_f32_scala_array", Array(1f, 2f, 3f)).show() 21 | Series.ofFloat("series_f32_scala_iterable", Seq(1f, 2f, 3f)).show() 22 | 23 | // Double 24 | Series.ofDouble("series_f64_scala_array", Array(1d, 2d, 3d)).show() 25 | Series.ofDouble("series_f64_scala_iterable", Seq(1d, 2d, 3d)).show() 26 | 27 | // Boolean 28 | Series.ofBoolean("series_bool_scala_array", Array(true, false, true, true)).show() 29 | Series.ofBoolean("series_bool_scala_iterable", Seq(true, false, true, 
true)).show() 30 | 31 | // String 32 | Series.ofString("series_str_scala_array", Array("a", "b")).show() 33 | Series.ofString("series_str_scala_iterable", Seq("a", "b")).show() 34 | 35 | // java.time.LocalDate 36 | Series.ofDate("series_date_scala_array", Array(java.time.LocalDate.now())).show() 37 | Series.ofDate("series_date_scala_iterable", Seq(java.time.LocalDate.now())).show() 38 | 39 | // java.time.LocalTime 40 | Series.ofTime("series_time_scala_array", Array(java.time.LocalTime.now())).show() 41 | Series.ofTime("series_time_scala_iterable", Seq(java.time.LocalTime.now())).show() 42 | 43 | // java.time.ZonedDateTime 44 | Series.ofDateTime("series_datetime_scala_array", Array(java.time.ZonedDateTime.now())).show() 45 | Series.ofDateTime("series_datetime_scala_iterable", Seq(java.time.ZonedDateTime.now())).show() 46 | 47 | /* Values as Scala array/ iterable of Nested List Types */ 48 | 49 | // int or java.lang.Integer 50 | Series.ofList("series_list_int_scala_array", Array(Array(1, 2, 3))).show() 51 | Series.ofList("series_list_int_scala_iterable", Seq(Seq(1, 2, 3))).show() 52 | 53 | // String 54 | Series.ofList("series_list_str_scala_array", Array(Array("a", "b"))).show() 55 | Series.ofList("series_list_str_scala_iterable", Seq(Seq("a", "b"))).show() 56 | 57 | // Deep Nested 58 | Series.ofList("series_list_list_str_scala_array", Array(Array(Array("a", "b")))).show() 59 | Series.ofList("series_list_list_str_scala_iterable", Seq(Seq(Seq("a", "b")))).show() 60 | 61 | /* Values as Scala array/ iterable of Struct Types */ 62 | Series 63 | .ofSeries( 64 | "series_struct_scala_array", 65 | Array( 66 | Series.ofInt("int_col", Array(1, 2, 3)), 67 | Series.ofString("str_col", Array("a", "b", "c")), 68 | Series.ofBoolean("bool_col", Array(true, false, true)) 69 | ) 70 | ) 71 | .show() 72 | Series 73 | .ofSeries( 74 | "series_struct_scala_iterable", 75 | Seq( 76 | Series.ofInt("int_col", Seq(1, 2, 3)), 77 | Series.ofString("str_col", Seq("a", "b", "c")), 78 | Series.ofBoolean("bool_col", Seq(true, false, true)) 79 | ) 80 | ) 81 | .show() 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/configuration/ConfiguringPolars.scala: -------------------------------------------------------------------------------- 1 | package examples.scala.configuration 2 | 3 | import java.io.File 4 | import java.nio.file.{Files, Path} 5 | 6 | import org.polars.scala.polars.Polars 7 | 8 | object ConfiguringPolars { 9 | 10 | def main(args: Array[String]): Unit = { 11 | 12 | /* Checking the version scala-polars is compiled against. */ 13 | val version: String = Polars.version() 14 | printf("scala-polars has been compiled against version '%s'%n%n", version) 15 | 16 | /* Get default configuration. */ 17 | printf("Default Configuration:%n%s%n%n", Polars.config) 18 | 19 | /* Updating configuration.
*/ 20 | 21 | /* Update the number of rows shown while doing `df.show()` */ 22 | Polars.config.update().withMaxTableRows(20).apply() 23 | printf("After updating number of rows:%n%s%n%n", Polars.config) 24 | 25 | /* Update the number of columns shown while doing `df.show()` */ 26 | Polars.config.update().withMaxTableColumns(20).apply() 27 | printf("After updating number of columns:%n%s%n%n", Polars.config) 28 | 29 | /* Reset config */ 30 | Polars.config.update().reset().apply() 31 | printf("After resetting config:%n%s%n%n", Polars.config) 32 | 33 | /* Chaining configuration options */ 34 | val options = Map("POLARS_TABLE_WIDTH" -> "5000") 35 | 36 | Polars.config 37 | .update() 38 | .withMaxTableRows(20) 39 | .withMaxTableColumns(20) 40 | .withOption("POLARS_FMT_TABLE_CELL_ALIGNMENT", "RIGHT") 41 | .withOptions(options) 42 | .apply() 43 | 44 | printf("After chained configs:%n%s%n%n", Polars.config) 45 | 46 | /* Persisting current configuration to file */ 47 | val tempDirectory: Path = Files.createTempDirectory("polars-config-") 48 | val tempFile: File = 49 | Files.createTempFile(tempDirectory, "temp-polars-config-", "plcfg").toFile 50 | Polars.config.saveTo(tempFile, overwrite = true) 51 | 52 | /* Reloading current configuration from file */ 53 | Polars.config.update().reset().apply() 54 | printf("After resetting config:%n%s%n%n", Polars.config) 55 | 56 | Polars.config.update().fromPath(tempFile).apply() 57 | printf("After reloading config from file path:%n%s%n", Polars.config) 58 | 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/expressions/ApplyingSimpleExpressions.scala: -------------------------------------------------------------------------------- 1 | package examples.scala.expressions 2 | 3 | import scala.util.Random 4 | 5 | import org.polars.scala.polars.Polars 6 | import org.polars.scala.polars.functions._ 7 | 8 | import examples.scala.utils.CommonUtils 9 | 10 | object ApplyingSimpleExpressions { 11 | def main(args: Array[String]): Unit = { 12 | 13 | /* Read a dataset as a DataFrame lazily or eagerly */ 14 | val path = CommonUtils.getResource("/files/web-ds/data.json") 15 | val input = Polars.scan.jsonLines(path) 16 | 17 | /* Apply multiple operations on the LazyFrame or DataFrame */ 18 | var ldf = input.cache 19 | .select("id", "name") 20 | .with_column("lower_than_four", col("id") <= 4) 21 | .filter(col("lower_than_four")) 22 | .with_column("long_value", lit(Random.nextLong())) 23 | .with_column("date", lit(java.time.LocalDate.now())) 24 | .with_column("time", lit(java.time.LocalTime.now())) 25 | .with_column("current_ts", lit(java.time.ZonedDateTime.now())) 26 | .sort(asc("name"), nullLast = true, maintainOrder = false) 27 | .set_sorted(Map("name" -> false)) 28 | .top_k(2, "id", descending = true, nullLast = true, maintainOrder = false) 29 | .limit(2) // .head(2) 30 | .tail(2) 31 | .drop("long_value") 32 | .rename("lower_than_four", "less_than_four") 33 | .drop_nulls() 34 | 35 | ldf = Polars.concat(ldf, Array(ldf, ldf)) 36 | ldf = ldf.unique() 37 | 38 | println("Showing LazyFrame plan to stdout.") 39 | ldf.explain() 40 | 41 | val df = ldf.collect() 42 | 43 | println("Showing resultant DataFrame to stdout.") 44 | df.show() 45 | 46 | printf("Total rows: %s%n%n", df.count()) 47 | 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/io/LazyAndEagerAPI.scala:
-------------------------------------------------------------------------------- 1 | package examples.scala.io 2 | 3 | import org.polars.scala.polars.Polars 4 | import org.polars.scala.polars.api.{DataFrame, LazyFrame} 5 | 6 | import examples.scala.utils.CommonUtils 7 | 8 | /** Polars provides 2 APIs for reading datasets lazily (`scan`) or eagerly (`read`). 9 | * 10 | * These APIs serve different purposes and result in either a [[LazyFrame]] or a [[DataFrame]]. A 11 | * LazyFrame can be materialized to a DataFrame and vice-versa if required. 12 | * ==Lazy API== 13 | * With the lazy API, Polars doesn't run each query line-by-line but instead processes the full 14 | * query end-to-end. To get the most out of Polars it is important that you use the lazy API 15 | * because: 16 | * 17 | * - the lazy API allows Polars to apply automatic query optimization with the query optimizer. 18 | * - the lazy API allows you to work with larger-than-memory datasets using streaming. 19 | * - the lazy API can catch schema errors before processing the data. 20 | * 21 | * More info can be found 22 | * [[https://pola-rs.github.io/polars-book/user-guide/lazy-api/intro.html here]]. 23 | * ==Eager API== 24 | * With the eager API, queries are executed line-by-line, in contrast to the lazy API. 25 | */ 26 | 27 | object LazyAndEagerAPI { 28 | 29 | def main(args: Array[String]): Unit = { 30 | /* Lazily read data from file-based datasets */ 31 | val path = CommonUtils.getResource("/files/web-ds/data.csv") 32 | val ldf = Polars.scan.csv(path) 33 | 34 | /* Materialize LazyFrame to DataFrame */ 35 | var df: DataFrame = ldf.collect() 36 | 37 | println("Showing CSV file as a DataFrame to stdout.") 38 | df.show() 39 | 40 | printf("Total rows: %s%n%n", df.count()) 41 | printf("Total columns: %s%n%n", df.schema.getFields.length) 42 | 43 | /* Lazily read only the first 3 rows */ 44 | df = Polars.scan.option("scan_csv_n_rows", "3").csv(path).collect() 45 | printf("Total rows: %s%n%n", df.count()) 46 | 47 | println("Rows:") 48 | df.rows().foreach(println) 49 | println("\n") 50 | 51 | /* Convert DataFrame back to LazyFrame */ 52 | val backToLdf: LazyFrame = df.toLazy 53 | printf("Show schema: %s%n%n", backToLdf.schema) 54 | 55 | /* Eagerly read data from file-based datasets */ 56 | df = Polars.scan.csv(path).collect 57 | 58 | println("Showing CSV file as a DataFrame to stdout") 59 | df.show() 60 | 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/io/ReadingFileDatasets.scala: -------------------------------------------------------------------------------- 1 | package examples.scala.io 2 | 3 | import org.polars.scala.polars.Polars 4 | import org.polars.scala.polars.api.{DataFrame, LazyFrame} 5 | 6 | import examples.scala.utils.CommonUtils 7 | 8 | /** Polars supports various input file formats like the following, 9 | * - [[org.polars.scala.polars.api.io.Scannable.csv CSV]] (delimited format like CSV, TSV, 10 | * etc.) 11 | * - [[org.polars.scala.polars.api.io.Scannable.parquet Apache Parquet]] 12 | * - [[org.polars.scala.polars.api.io.Scannable.ipc Apache Arrow IPC]] 13 | * - [[org.polars.scala.polars.api.io.Scannable.jsonLines Newline-delimited JSON]] 14 | * 15 | * All the above formats are compatible with the lazy or eager input API and users can supply 1 16 | * or more file paths which will be read in parallel to return a [[LazyFrame]] or a 17 | * [[DataFrame]].
18 | * 19 | * Since each format may have its own additional options (example: delimiter for CSV format), 20 | * Polars allows a simple builder pattern which can be used to supply these options. 21 | * 22 | * While the examples below have been provided for Parquet files only, they also similarly apply 23 | * on the other supported file formats. 24 | * 25 | * Some additional examples may also be found in [[examples.scala.io.LazyAndEagerAPI]]. 26 | */ 27 | 28 | object ReadingFileDatasets { 29 | 30 | def main(args: Array[String]): Unit = { 31 | 32 | /* For one Parquet file */ 33 | val path = CommonUtils.getResource("/files/web-ds/data.parquet") 34 | val df = Polars.scan 35 | .parquet(path) 36 | .collect() 37 | 38 | println("Showing parquet file as a DataFrame to stdout.") 39 | df.show() 40 | 41 | printf("Total rows: %s%n%n", df.count()) 42 | 43 | /* For multiple Parquet file(s) */ 44 | val multiLdf = Polars.scan.parquet(path, path, path).collect() 45 | 46 | println("Showing multiple parquet files as 1 DataFrame to stdout.") 47 | multiLdf.show() 48 | printf("Total rows: %s%n%n", multiLdf.count()) 49 | 50 | /* Providing additional options with Parquet file input */ 51 | val pqDfWithOpts = Polars.scan 52 | .option("scan_parquet_low_memory", "true") 53 | .option("scan_parquet_n_rows", "3") 54 | .option("scan_parquet_cache", "false") 55 | .option("scan_parquet_row_index_name", "SerialNum") 56 | .parquet(path) 57 | .collect() 58 | 59 | println("Showing parquet file as a DataFrame to stdout.") 60 | pqDfWithOpts.show() 61 | 62 | printf("Total rows: %s%n%n", pqDfWithOpts.count()) 63 | 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/io/WritingToFileDatasets.scala: -------------------------------------------------------------------------------- 1 | package examples.scala.io 2 | 3 | import org.polars.scala.polars.Polars 4 | 5 | import examples.scala.utils.CommonUtils 6 | 7 | /** Polars supports various output file formats like the following, 8 | * - [[org.polars.scala.polars.api.io.Writeable.parquet Apache Parquet]] 9 | * - [[org.polars.scala.polars.api.io.Writeable.ipc Apache IPC]] 10 | * 11 | * A [[org.polars.scala.polars.api.DataFrame DataFrame]] can be written to an object storage as a 12 | * file in one of the supported formats mentioned above. 13 | * 14 | * Since each format and storage may have its own additional options, Polars allows a simple 15 | * builder pattern which can be used to supply these options. 16 | * 17 | * While the examples below have been provided for Parquet files only, they also similarly apply 18 | * on the other supported file formats. 
19 | */ 20 | object WritingToFileDatasets { 21 | 22 | def main(args: Array[String]): Unit = { 23 | 24 | /* Read a dataset as a DataFrame lazily or eagerly */ 25 | val path = CommonUtils.getResource("/files/web-ds/data.ipc") 26 | val df = Polars.scan.ipc(path).collect 27 | 28 | println("Showing ipc file as a DataFrame to stdout.") 29 | df.show() 30 | 31 | printf("Total rows: %s%n%n", df.count()) 32 | 33 | /* Write this DataFrame to the local filesystem at the provided path */ 34 | val outputPath = CommonUtils.getOutputLocation("output.pq") 35 | df.write().parquet(outputPath) 36 | printf("File written to location: %s%n%n", outputPath) 37 | 38 | /* Overwrite the output if it already exists */ 39 | df.write().option("write_mode", "overwrite").parquet(outputPath) 40 | printf("File overwritten at location: %s%n%n", outputPath) 41 | 42 | /* Write output file with compression */ 43 | df.write() 44 | .options( 45 | Map( 46 | "write_compression" -> "zstd", 47 | "write_mode" -> "overwrite", 48 | "write_parquet_stats" -> "full" 49 | ) 50 | ) 51 | .parquet(outputPath) 52 | printf("File overwritten at location: %s with compression%n%n", outputPath) 53 | 54 | /* Write output file to Amazon S3 object store */ 55 | val s3Path: String = "s3://bucket/output.pq" 56 | df.write() 57 | .options( 58 | Map( 59 | "write_compression" -> "zstd", 60 | "write_mode" -> "overwrite", 61 | "write_parquet_stats" -> "full", 62 | "aws_default_region" -> "us-east-2", 63 | "aws_access_key_id" -> "ABC", 64 | "aws_secret_access_key" -> "XYZ" 65 | ) 66 | ) 67 | .parquet(s3Path) 68 | printf("File overwritten at location: %s with compression%n%n", s3Path) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /examples/src/main/scala/examples/scala/utils/CommonUtils.scala: -------------------------------------------------------------------------------- 1 | package examples.scala.utils 2 | 3 | import java.nio.file.{Files, Paths, StandardCopyOption} 4 | 5 | object CommonUtils { 6 | 7 | def getResource(path: String): String = { 8 | val target = 9 | Files.createTempFile("tmp-resource-", s"-${Paths.get(path).getFileName.toString}") 10 | Files.copy( 11 | this.getClass.getResourceAsStream(path), 12 | target, 13 | StandardCopyOption.REPLACE_EXISTING 14 | ) 15 | 16 | target.toAbsolutePath.toString 17 | } 18 | 19 | def getOutputLocation(path: String): String = { 20 | val target = 21 | Files.createTempFile("tmp-resource-", s"-${Paths.get(path).getFileName.toString}") 22 | Files.deleteIfExists(target) 23 | 24 | target.toAbsolutePath.toString 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /native/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "scala-polars-native" 3 | version = "0.1.0" 4 | authors = ["chitralverma "] 5 | edition = "2021" 6 | license = "Apache-2.0" 7 | readme = "../README.md" 8 | publish = false 9 | 10 | [lib] 11 | name = "scala_polars" 12 | crate-type = ["cdylib"] 13 | doc = false 14 | 15 | [dependencies] 16 | anyhow = "1" 17 | jni = "0.21.1" 18 | jni_fn = "0.1" 19 | num-derive = "0.4" 20 | num-traits = "0.2" 21 | object_store = { version = "0.11", features = ["aws", "azure", "gcp", "http"] } 22 | polars-core = { version = "0.45", default-features = false } 23 | rust_decimal = "1.36.0" 24 | serde_json = "1" 25 | toml = "0.8" 26 | 27 | [dependencies.polars] 28 | version = "0.45" 29 | default-features = false 30 | features = [ 31 | "json", 32 | "parquet", 33 | "ipc", 34
| "ipc_streaming", 35 | "avro", 36 | "csv", 37 | "cloud", 38 | "approx_unique", 39 | "array_any_all", 40 | "array_count", 41 | "bitwise", 42 | "is_in", 43 | "repeat_by", 44 | "trigonometry", 45 | "sign", 46 | "list_gather", 47 | "list_count", 48 | "list_sets", 49 | "list_any_all", 50 | "list_drop_nulls", 51 | "list_sample", 52 | "cutqcut", 53 | "rle", 54 | "extract_groups", 55 | "pivot", 56 | "extract_jsonpath", 57 | "asof_join", 58 | "cross_join", 59 | "pct_change", 60 | "search_sorted", 61 | "merge_sorted", 62 | "top_k", 63 | "propagate_nans", 64 | "timezones", 65 | "peaks", 66 | "hist", 67 | "find_many", 68 | "dtype-full", 69 | "meta", 70 | "decompress", 71 | "regex", 72 | "binary_encoding", 73 | "polars_cloud", 74 | "performant", 75 | "lazy", 76 | "fmt", 77 | "temporal", 78 | "strings", 79 | "serde", 80 | "serde-lazy", 81 | "rows", 82 | "async", 83 | "aws", 84 | "gcp", 85 | "azure", 86 | "http", 87 | ] 88 | 89 | [profile.release] 90 | codegen-units = 1 91 | lto = true 92 | -------------------------------------------------------------------------------- /native/rustfmt.toml: -------------------------------------------------------------------------------- 1 | group_imports = "StdExternalCrate" 2 | imports_granularity = "Module" 3 | match_block_trailing_comma = true 4 | -------------------------------------------------------------------------------- /native/src/internal_jni/expr/column.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use std::ops::{Add, Div, Mul, Rem, Sub}; 4 | 5 | use anyhow::Context; 6 | use jni::objects::{JClass, JString}; 7 | use jni::sys::{jint, jlong}; 8 | use jni::JNIEnv; 9 | use jni_fn::jni_fn; 10 | use num_derive::FromPrimitive; 11 | use num_traits::FromPrimitive; 12 | use polars::prelude::*; 13 | 14 | use crate::internal_jni::utils::{j_string_to_string, to_ptr}; 15 | use crate::utils::error::ResultExt; 16 | 17 | #[derive(Clone, PartialEq, Eq, Debug, FromPrimitive)] 18 | pub enum BinaryOperator { 19 | EqualTo = 0, 20 | NotEqualTo = 1, 21 | LessThan = 2, 22 | LessThanEqualTo = 3, 23 | GreaterThan = 4, 24 | GreaterThanEqualTo = 5, 25 | Or = 6, 26 | And = 7, 27 | Plus = 8, 28 | Minus = 9, 29 | Multiply = 10, 30 | Divide = 11, 31 | Modulus = 12, 32 | } 33 | 34 | #[derive(Clone, PartialEq, Eq, Debug, FromPrimitive)] 35 | pub enum UnaryOperator { 36 | NOT = 0, 37 | IsNull = 1, 38 | IsNotNull = 2, 39 | IsNan = 3, 40 | IsNotNan = 4, 41 | Between = 5, 42 | IsIn = 6, 43 | Like = 7, 44 | Cast = 8, 45 | } 46 | 47 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.column_expr$")] 48 | pub fn column(mut env: JNIEnv, _: JClass, value: JString) -> jlong { 49 | let name = j_string_to_string( 50 | &mut env, 51 | &value, 52 | Some("Failed to parse provided column name as string"), 53 | ); 54 | 55 | let expr = col(name.as_str()); 56 | to_ptr(expr) 57 | } 58 | 59 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.column_expr$")] 60 | pub fn sort_column_by_name(mut env: JNIEnv, _: JClass, value: JString, descending: bool) -> jlong { 61 | let name = j_string_to_string( 62 | &mut env, 63 | &value, 64 | Some("Failed to parse provided column name as string"), 65 | ); 66 | 67 | let expr = Expr::Sort { 68 | expr: Arc::new(col(name.as_str())), 69 | options: SortOptions { 70 | descending, 71 | ..Default::default() 72 | }, 73 | }; 74 | 75 | to_ptr(expr) 76 | } 77 | 78 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.column_expr$")] 79 | pub unsafe fn applyUnary(mut env: JNIEnv, _: JClass, 
expr_ptr: *mut Expr, operator: jint) -> jlong { 80 | let l_expr = (*expr_ptr).clone(); 81 | 82 | let expr = UnaryOperator::from_i32(operator) 83 | .and_then(|option| match option { 84 | UnaryOperator::NOT => Some(l_expr.not()), 85 | UnaryOperator::IsNull => Some(l_expr.is_null()), 86 | UnaryOperator::IsNotNull => Some(l_expr.is_not_null()), 87 | UnaryOperator::IsNan => Some(l_expr.is_nan()), 88 | UnaryOperator::IsNotNan => Some(l_expr.is_not_nan()), 89 | _ => None, 90 | }) 91 | .context(format!( 92 | "Failed to parse provided ID `{operator}` as unary operator." 93 | )) 94 | .unwrap_or_throw(&mut env); 95 | 96 | to_ptr(expr) 97 | } 98 | 99 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.column_expr$")] 100 | pub unsafe fn applyBinary( 101 | mut env: JNIEnv, 102 | _: JClass, 103 | left_ptr: *mut Expr, 104 | right_ptr: *mut Expr, 105 | operator: jint, 106 | ) -> jlong { 107 | let l_expr = (*left_ptr).clone(); 108 | let r_expr = (*right_ptr).clone(); 109 | 110 | let expr = BinaryOperator::from_i32(operator) 111 | .map(|option| match option { 112 | BinaryOperator::EqualTo => l_expr.eq(r_expr), 113 | BinaryOperator::NotEqualTo => l_expr.neq(r_expr), 114 | BinaryOperator::LessThan => l_expr.lt(r_expr), 115 | BinaryOperator::LessThanEqualTo => l_expr.lt_eq(r_expr), 116 | BinaryOperator::GreaterThan => l_expr.gt(r_expr), 117 | BinaryOperator::GreaterThanEqualTo => l_expr.gt_eq(r_expr), 118 | BinaryOperator::Or => l_expr.or(r_expr), 119 | BinaryOperator::And => l_expr.and(r_expr), 120 | BinaryOperator::Plus => l_expr.add(r_expr), 121 | BinaryOperator::Minus => l_expr.sub(r_expr), 122 | BinaryOperator::Multiply => l_expr.mul(r_expr), 123 | BinaryOperator::Divide => l_expr.div(r_expr), 124 | BinaryOperator::Modulus => l_expr.rem(r_expr), 125 | }) 126 | .context(format!( 127 | "Failed to parse provided ID `{operator}` as binary operator." 
128 | )) 129 | .unwrap_or_throw(&mut env); 130 | 131 | to_ptr(expr) 132 | } 133 | -------------------------------------------------------------------------------- /native/src/internal_jni/expr/literal.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use anyhow::Context; 4 | use jni::objects::{JClass, JString}; 5 | use jni::sys::{jboolean, jdouble, jfloat, jint, jlong}; 6 | use jni::JNIEnv; 7 | use jni_fn::jni_fn; 8 | use polars::export::chrono::{NaiveDate, NaiveDateTime}; 9 | use polars::prelude::*; 10 | use polars_core::export::chrono::{NaiveTime, Timelike}; 11 | 12 | use crate::internal_jni::utils::{j_string_to_string, to_ptr}; 13 | use crate::utils::error::ResultExt; 14 | 15 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 16 | pub fn nullLit(_: JNIEnv, _: JClass) -> jlong { 17 | let expr = NULL.lit(); 18 | to_ptr(expr) 19 | } 20 | 21 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 22 | pub fn fromString(mut env: JNIEnv, _: JClass, value: JString) -> jlong { 23 | let string_value = j_string_to_string( 24 | &mut env, 25 | &value, 26 | Some("Failed to parse provided literal value as string"), 27 | ); 28 | let expr = lit(string_value); 29 | to_ptr(expr) 30 | } 31 | 32 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 33 | pub fn fromBool(_: JNIEnv, _: JClass, value: jboolean) -> jlong { 34 | let expr = lit(value); 35 | to_ptr(expr) 36 | } 37 | 38 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 39 | pub fn fromInt(_: JNIEnv, _: JClass, value: jint) -> jlong { 40 | let expr = lit(value); 41 | to_ptr(expr) 42 | } 43 | 44 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 45 | pub fn fromLong(_: JNIEnv, _: JClass, value: jlong) -> jlong { 46 | let expr = lit(value); 47 | to_ptr(expr) 48 | } 49 | 50 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 51 | pub fn fromFloat(_: JNIEnv, _: JClass, value: jfloat) -> jlong { 52 | let expr = lit(value); 53 | to_ptr(expr) 54 | } 55 | 56 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 57 | pub fn fromDouble(_: JNIEnv, _: JClass, value: jdouble) -> jlong { 58 | let expr = lit(value); 59 | to_ptr(expr) 60 | } 61 | 62 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 63 | pub fn fromDate(mut env: JNIEnv, _: JClass, value: JString) -> jlong { 64 | let string_value = j_string_to_string( 65 | &mut env, 66 | &value, 67 | Some("Failed to parse provided literal value as string"), 68 | ); 69 | 70 | let date = NaiveDate::parse_from_str(string_value.as_str(), "%Y-%m-%d") 71 | .context(format!( 72 | "Failed to parse value `{}` as date with format `%Y-%m-%d`", 73 | string_value 74 | )) 75 | .unwrap_or_throw(&mut env); 76 | 77 | let expr = lit(date); 78 | to_ptr(expr) 79 | } 80 | 81 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 82 | pub fn fromTime(mut env: JNIEnv, _: JClass, value: JString) -> jlong { 83 | let string_value = j_string_to_string( 84 | &mut env, 85 | &value, 86 | Some("Failed to parse provided literal value as string"), 87 | ); 88 | 89 | let time = NaiveTime::parse_from_str(string_value.as_str(), "%H:%M:%S%.f") 90 | .context(format!( 91 | "Failed to parse value `{}` as time with format `%H:%M:%S%.f`", 92 | string_value 93 | )) 94 | .unwrap_or_throw(&mut env); 95 | 96 | let total_seconds = time.num_seconds_from_midnight() as 
i64; 97 | let nanos = time.nanosecond() as i64; 98 | 99 | let expr = Expr::Literal(LiteralValue::Time((total_seconds) * 1_000_000_000 + nanos)); 100 | to_ptr(expr) 101 | } 102 | 103 | #[jni_fn("org.polars.scala.polars.internal.jni.expressions.literal_expr$")] 104 | pub fn fromDateTime(mut env: JNIEnv, _: JClass, value: JString) -> jlong { 105 | let string_value = j_string_to_string( 106 | &mut env, 107 | &value, 108 | Some("Failed to parse provided literal value as string"), 109 | ); 110 | 111 | let datetime = NaiveDateTime::parse_from_str(string_value.as_str(), "%FT%T%.f") 112 | .context(format!( 113 | "Failed to parse value `{}` as datetime with format `%FT%T%.f`", 114 | string_value 115 | )) 116 | .unwrap_or_throw(&mut env); 117 | 118 | let expr = lit(datetime); 119 | to_ptr(expr) 120 | } 121 | -------------------------------------------------------------------------------- /native/src/internal_jni/expr/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod column; 2 | pub mod literal; 3 | -------------------------------------------------------------------------------- /native/src/internal_jni/frame.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | use std::borrow::ToOwned; 3 | use std::iter::Iterator; 4 | 5 | use anyhow::Context; 6 | use jni::objects::{JClass, JLongArray}; 7 | use jni::sys::{jlong, jstring}; 8 | use jni::JNIEnv; 9 | use jni_fn::jni_fn; 10 | use polars::prelude::*; 11 | use polars_core::utils::concat_df; 12 | 13 | use crate::internal_jni::utils::*; 14 | use crate::utils::error::ResultExt; 15 | 16 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 17 | pub unsafe fn schemaString(mut env: JNIEnv, _: JClass, df_ptr: *mut DataFrame) -> jstring { 18 | let df = &mut *df_ptr; 19 | 20 | serde_json::to_string(&df.schema().to_arrow(CompatLevel::oldest())) 21 | .map(|schema_string| string_to_j_string(&mut env, schema_string, None::<&str>)) 22 | .context("Failed to serialize schema") 23 | .unwrap_or_throw(&mut env) 24 | } 25 | 26 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 27 | pub unsafe fn show(_: JNIEnv, _: JClass, df_ptr: *mut DataFrame) { 28 | let df = &mut *df_ptr; 29 | println!("{:?}", df) 30 | } 31 | 32 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 33 | pub unsafe fn count(_: JNIEnv, _: JClass, df_ptr: *mut DataFrame) -> jlong { 34 | (*df_ptr).shape().0 as i64 35 | } 36 | 37 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 38 | pub unsafe fn concatDataFrames(mut env: JNIEnv, _: JClass, inputs: JLongArray) -> jlong { 39 | let dfs: Vec<_> = JavaArrayToVec::to_vec(&mut env, inputs) 40 | .into_iter() 41 | .map(|ptr| (*(ptr as *mut DataFrame)).to_owned()) 42 | .collect(); 43 | 44 | let concatenated_df = concat_df(dfs.iter()) 45 | .context("Failed to concatenate dataframes") 46 | .unwrap_or_throw(&mut env); 47 | 48 | to_ptr(concatenated_df) 49 | } 50 | 51 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 52 | pub unsafe fn toLazy(_: JNIEnv, _: JClass, df_ptr: *mut DataFrame) -> jlong { 53 | let ldf = (*df_ptr).clone().lazy(); 54 | to_ptr(ldf) 55 | } 56 | 57 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 58 | pub unsafe fn limit(_: JNIEnv, _: JClass, df_ptr: *mut DataFrame, n: jlong) -> jlong { 59 | let limited_df = (*df_ptr).head(Some(n as usize)); 60 | to_ptr(limited_df) 61 | } 62 | 63 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 64 | pub unsafe fn 
tail(_: JNIEnv, _: JClass, df_ptr: *mut DataFrame, n: jlong) -> jlong { 65 | let limited_df = (*df_ptr).tail(Some(n as usize)); 66 | to_ptr(limited_df) 67 | } 68 | 69 | #[jni_fn("org.polars.scala.polars.internal.jni.data_frame$")] 70 | pub unsafe fn fromSeries(mut env: JNIEnv, _: JClass, ptrs: JLongArray) -> jlong { 71 | let data: Vec<_> = JavaArrayToVec::to_vec(&mut env, ptrs) 72 | .into_iter() 73 | .map(|ptr| (*(ptr as *mut Series)).to_owned()) 74 | .collect(); 75 | 76 | let df = DataFrame::from_iter(data); 77 | to_ptr(df) 78 | } 79 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/mod.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context; 2 | use jni::objects::JString; 3 | use jni::sys::jint; 4 | use jni::JNIEnv; 5 | use polars::io::RowIndex; 6 | use polars::prelude::{IdxSize, PlHashMap}; 7 | 8 | use super::utils::j_string_to_string; 9 | use crate::utils::error::ResultExt; 10 | 11 | pub mod scan; 12 | pub mod write; 13 | 14 | pub fn get_file_path(env: &mut JNIEnv, file_path: JString) -> String { 15 | j_string_to_string(env, &file_path, Some("Failed to get provided path")) 16 | } 17 | 18 | fn parse_json_to_options(env: &mut JNIEnv, options: JString) -> PlHashMap { 19 | Ok(j_string_to_string( 20 | env, 21 | &options, 22 | Some("Failed to deserialize the provided options"), 23 | )) 24 | .and_then(|s| serde_json::from_str(&s)) 25 | .context("Failed to parse the provided options") 26 | .unwrap_or_throw(env) 27 | } 28 | 29 | pub fn get_row_index( 30 | env: &mut JNIEnv, 31 | row_count_col_name: JString, 32 | row_count_col_offset: jint, 33 | ) -> Option { 34 | if !row_count_col_name.is_null() { 35 | Some(RowIndex { 36 | name: j_string_to_string( 37 | env, 38 | &row_count_col_name, 39 | Some("Failed to get the provided row column name"), 40 | ) 41 | .into(), 42 | offset: if row_count_col_offset.is_positive() { 43 | row_count_col_offset as IdxSize 44 | } else { 45 | 0 46 | }, 47 | }) 48 | } else { 49 | None 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/scan/csv.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | use std::sync::Arc; 3 | 4 | use anyhow::Context; 5 | use jni::objects::{JClass, JObject, JObjectArray, JString}; 6 | use jni::sys::jlong; 7 | use jni::JNIEnv; 8 | use jni_fn::jni_fn; 9 | use polars::io::cloud::CloudOptions; 10 | use polars::io::RowIndex; 11 | use polars::prelude::*; 12 | 13 | use crate::internal_jni::io::{get_file_path, parse_json_to_options}; 14 | use crate::internal_jni::utils::{to_ptr, JavaArrayToVec}; 15 | use crate::utils::error::ResultExt; 16 | 17 | #[jni_fn("org.polars.scala.polars.internal.jni.io.scan$")] 18 | pub unsafe fn scanCSV(mut env: JNIEnv, _: JClass, paths: JObjectArray, options: JString) -> jlong { 19 | let mut options = parse_json_to_options(&mut env, options); 20 | 21 | let n_rows = options 22 | .remove("scan_csv_n_rows") 23 | .and_then(|s| s.parse::().ok()); 24 | 25 | let row_index_offset = options 26 | .remove("scan_csv_row_index_offset") 27 | .and_then(|s| s.parse::().ok()) 28 | .unwrap_or(0); 29 | 30 | let row_index = options 31 | .remove("scan_csv_row_index_name") 32 | .map(|name| RowIndex { 33 | name: name.into(), 34 | offset: row_index_offset, 35 | }); 36 | 37 | let cache = options 38 | .remove("scan_csv_cache") 39 | .and_then(|s| s.parse::().ok()) 40 | .unwrap_or(true); 41 | 42 | let 
glob = options 43 | .remove("scan_csv_glob") 44 | .and_then(|s| s.parse::().ok()) 45 | .unwrap_or(true); 46 | 47 | let low_memory = options 48 | .remove("scan_csv_low_memory") 49 | .and_then(|s| s.parse::().ok()) 50 | .unwrap_or(false); 51 | 52 | let rechunk = options 53 | .remove("scan_csv_rechunk") 54 | .and_then(|s| s.parse::().ok()) 55 | .unwrap_or(false); 56 | 57 | let file_path_col = options 58 | .remove("scan_csv_include_file_paths") 59 | .map(PlSmallStr::from_string); 60 | 61 | let raise_if_empty = options 62 | .remove("scan_csv_raise_if_empty") 63 | .and_then(|s| s.parse::().ok()) 64 | .unwrap_or(true); 65 | 66 | let ignore_errors = options 67 | .remove("scan_csv_ignore_errors") 68 | .and_then(|s| s.parse::().ok()) 69 | .unwrap_or(false); 70 | 71 | let has_header = options 72 | .remove("scan_csv_has_header") 73 | .and_then(|s| s.parse::().ok()) 74 | .unwrap_or(true); 75 | 76 | let missing_is_null = options 77 | .remove("scan_csv_missing_is_null") 78 | .and_then(|s| s.parse::().ok()) 79 | .unwrap_or(true); 80 | 81 | let truncate_ragged_lines = options 82 | .remove("scan_csv_truncate_ragged_lines") 83 | .and_then(|s| s.parse::().ok()) 84 | .unwrap_or(false); 85 | 86 | let try_parse_dates = options 87 | .remove("scan_csv_try_parse_dates") 88 | .and_then(|s| s.parse::().ok()) 89 | .unwrap_or(false); 90 | 91 | let decimal_comma = options 92 | .remove("scan_csv_decimal_comma") 93 | .and_then(|s| s.parse::().ok()) 94 | .unwrap_or(false); 95 | 96 | let chunk_size = options 97 | .remove("scan_csv_chunk_size") 98 | .and_then(|s| s.parse::().ok()) 99 | .unwrap_or(1 << 18); 100 | 101 | let skip_rows = options 102 | .remove("scan_csv_skip_rows") 103 | .and_then(|s| s.parse::().ok()) 104 | .unwrap_or(0); 105 | 106 | let skip_rows_after_header = options 107 | .remove("scan_csv_skip_rows_after_header") 108 | .and_then(|s| s.parse::().ok()) 109 | .unwrap_or(0); 110 | 111 | let infer_schema_length = options 112 | .remove("scan_csv_skip_infer_schema_length") 113 | .and_then(|s| s.parse::().ok()) 114 | .map_or(Some(100), Some); 115 | 116 | let separator = options 117 | .remove("scan_csv_separator") 118 | .and_then(|s| s.parse::().ok()) 119 | .unwrap_or(b','); 120 | 121 | let eol_char = options 122 | .remove("scan_csv_eol_char") 123 | .and_then(|s| s.parse::().ok()) 124 | .unwrap_or(b'\n'); 125 | 126 | let quote_char = options 127 | .remove("scan_csv_quote_char") 128 | .and_then(|s| s.parse::().ok()) 129 | .map_or(Some(b'"'), Some); 130 | 131 | let encoding = options 132 | .remove("scan_csv_encoding") 133 | .map(|s| match s.as_str() { 134 | "lossy_utf8" => CsvEncoding::LossyUtf8, 135 | _ => CsvEncoding::Utf8, 136 | }) 137 | .unwrap_or_default(); 138 | 139 | let null_value = options 140 | .remove("scan_csv_null_value") 141 | .map(|s| NullValues::AllColumnsSingle(s.as_str().into())); 142 | 143 | let comment_prefix = options 144 | .remove("scan_csv_comment_prefix") 145 | .map(PlSmallStr::from); 146 | 147 | let paths_vec: Vec = JavaArrayToVec::to_vec(&mut env, paths) 148 | .into_iter() 149 | .map(|o| JObject::from_raw(o)) 150 | .map(|o| get_file_path(&mut env, JString::from(o))) 151 | .map(PathBuf::from) 152 | .collect(); 153 | 154 | let first_path = paths_vec 155 | .first() 156 | .and_then(|p| p.to_str()) 157 | .context("Failed to get first path from provided list of paths") 158 | .unwrap_or_throw(&mut env); 159 | 160 | let cloud_options = CloudOptions::from_untyped_config(first_path, &options).ok(); 161 | 162 | let ldf = LazyCsvReader::new_paths(Arc::from(paths_vec.into_boxed_slice())) 163 | 
.with_glob(glob) 164 | .with_cache(cache) 165 | .with_include_file_paths(file_path_col) 166 | .with_low_memory(low_memory) 167 | .with_rechunk(rechunk) 168 | .with_n_rows(n_rows) 169 | .with_row_index(row_index) 170 | .with_raise_if_empty(raise_if_empty) 171 | .with_ignore_errors(ignore_errors) 172 | .with_has_header(has_header) 173 | .with_missing_is_null(missing_is_null) 174 | .with_truncate_ragged_lines(truncate_ragged_lines) 175 | .with_try_parse_dates(try_parse_dates) 176 | .with_decimal_comma(decimal_comma) 177 | .with_chunk_size(chunk_size) 178 | .with_skip_rows(skip_rows) 179 | .with_skip_rows_after_header(skip_rows_after_header) 180 | .with_infer_schema_length(infer_schema_length) 181 | .with_separator(separator) 182 | .with_quote_char(quote_char) 183 | .with_eol_char(eol_char) 184 | .with_encoding(encoding) 185 | .with_null_values(null_value) 186 | .with_comment_prefix(comment_prefix) 187 | .with_cloud_options(cloud_options) 188 | .finish() 189 | .context("Failed to perform csv scan") 190 | .unwrap_or_throw(&mut env); 191 | 192 | to_ptr(ldf) 193 | } 194 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/scan/ipc.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | use std::sync::Arc; 3 | 4 | use anyhow::Context; 5 | use jni::objects::{JClass, JObject, JObjectArray, JString}; 6 | use jni::sys::jlong; 7 | use jni::JNIEnv; 8 | use jni_fn::jni_fn; 9 | use polars::io::cloud::CloudOptions; 10 | use polars::io::{HiveOptions, RowIndex}; 11 | use polars::prelude::*; 12 | 13 | use crate::internal_jni::io::{get_file_path, parse_json_to_options}; 14 | use crate::internal_jni::utils::{to_ptr, JavaArrayToVec}; 15 | use crate::utils::error::ResultExt; 16 | 17 | #[jni_fn("org.polars.scala.polars.internal.jni.io.scan$")] 18 | pub unsafe fn scanIPC(mut env: JNIEnv, _: JClass, paths: JObjectArray, options: JString) -> jlong { 19 | let mut options = parse_json_to_options(&mut env, options); 20 | 21 | let n_rows = options 22 | .remove("scan_ipc_n_rows") 23 | .and_then(|s| s.parse::().ok()); 24 | 25 | let cache = options 26 | .remove("scan_ipc_cache") 27 | .and_then(|s| s.parse::().ok()) 28 | .unwrap_or(true); 29 | 30 | let rechunk = options 31 | .remove("scan_ipc_rechunk") 32 | .and_then(|s| s.parse::().ok()) 33 | .unwrap_or(false); 34 | 35 | let row_index_offset = options 36 | .remove("scan_ipc_row_index_offset") 37 | .and_then(|s| s.parse::().ok()) 38 | .unwrap_or(0); 39 | 40 | let row_index = options 41 | .remove("scan_ipc_row_index_name") 42 | .map(|name| RowIndex { 43 | name: name.into(), 44 | offset: row_index_offset, 45 | }); 46 | 47 | let file_path_col = options 48 | .remove("scan_ipc_include_file_paths") 49 | .map(PlSmallStr::from_string); 50 | 51 | let hive_scan_partitions = options 52 | .remove("scan_ipc_hive_scan_partitions") 53 | .and_then(|s| s.parse::().ok()) 54 | .map_or(Some(true), Some); 55 | 56 | let hive_try_parse_dates = options 57 | .remove("scan_ipc_hive_try_parse_dates") 58 | .and_then(|s| s.parse::().ok()) 59 | .unwrap_or(true); 60 | 61 | let paths_vec: Vec = JavaArrayToVec::to_vec(&mut env, paths) 62 | .into_iter() 63 | .map(|o| JObject::from_raw(o)) 64 | .map(|o| get_file_path(&mut env, JString::from(o))) 65 | .map(PathBuf::from) 66 | .collect(); 67 | 68 | let first_path = paths_vec 69 | .first() 70 | .and_then(|p| p.to_str()) 71 | .context("Failed to get first path from provided list of paths") 72 | .unwrap_or_throw(&mut env); 73 | 74 | let 
cloud_options = CloudOptions::from_untyped_config(first_path, &options).ok(); 75 | 76 | let scan_args = ScanArgsIpc { 77 | n_rows, 78 | cache, 79 | rechunk, 80 | row_index, 81 | cloud_options, 82 | hive_options: HiveOptions { 83 | enabled: hive_scan_partitions, 84 | hive_start_idx: 0, 85 | schema: None, 86 | try_parse_dates: hive_try_parse_dates, 87 | }, 88 | include_file_paths: file_path_col, 89 | }; 90 | 91 | let ldf = LazyFrame::scan_ipc_files(Arc::from(paths_vec.into_boxed_slice()), scan_args) 92 | .context("Failed to perform ipc scan") 93 | .unwrap_or_throw(&mut env); 94 | 95 | to_ptr(ldf) 96 | } 97 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/scan/json_lines.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroUsize; 2 | use std::path::PathBuf; 3 | use std::str::FromStr; 4 | use std::sync::Arc; 5 | 6 | use anyhow::Context; 7 | use jni::objects::{JClass, JObject, JObjectArray, JString}; 8 | use jni::sys::jlong; 9 | use jni::JNIEnv; 10 | use jni_fn::jni_fn; 11 | use polars::io::cloud::CloudOptions; 12 | use polars::io::RowIndex; 13 | use polars::prelude::*; 14 | 15 | use crate::internal_jni::io::{get_file_path, parse_json_to_options}; 16 | use crate::internal_jni::utils::{to_ptr, JavaArrayToVec}; 17 | use crate::utils::error::ResultExt; 18 | 19 | #[jni_fn("org.polars.scala.polars.internal.jni.io.scan$")] 20 | pub unsafe fn scanJsonLines( 21 | mut env: JNIEnv, 22 | _: JClass, 23 | paths: JObjectArray, 24 | options: JString, 25 | ) -> jlong { 26 | let mut options = parse_json_to_options(&mut env, options); 27 | 28 | let n_rows = options 29 | .remove("scan_ndjson_n_rows") 30 | .and_then(|s| s.parse::().ok()); 31 | 32 | let row_index_offset = options 33 | .remove("scan_ndjson_row_index_offset") 34 | .and_then(|s| s.parse::().ok()) 35 | .unwrap_or(0); 36 | 37 | let row_index = options 38 | .remove("scan_ndjson_row_index_name") 39 | .map(|name| RowIndex { 40 | name: name.into(), 41 | offset: row_index_offset, 42 | }); 43 | 44 | let low_memory = options 45 | .remove("scan_ndjson_low_memory") 46 | .and_then(|s| s.parse::().ok()) 47 | .unwrap_or(false); 48 | 49 | let rechunk = options 50 | .remove("scan_ndjson_rechunk") 51 | .and_then(|s| s.parse::().ok()) 52 | .unwrap_or(false); 53 | 54 | let file_path_col = options 55 | .remove("scan_ndjson_include_file_paths") 56 | .map(PlSmallStr::from_string); 57 | 58 | let ignore_errors = options 59 | .remove("scan_ndjson_ignore_errors") 60 | .and_then(|s| s.parse::().ok()) 61 | .unwrap_or(false); 62 | 63 | let batch_size = options 64 | .remove("scan_ndjson_batch_size") 65 | .and_then(|s| NonZeroUsize::from_str(s.as_str()).ok()); 66 | 67 | let infer_schema_length = options 68 | .remove("scan_ndjson_infer_schema_length") 69 | .and_then(|s| NonZeroUsize::from_str(s.as_str()).ok()) 70 | .map_or(NonZeroUsize::new(100), Some); 71 | 72 | let paths_vec: Vec = JavaArrayToVec::to_vec(&mut env, paths) 73 | .into_iter() 74 | .map(|o| JObject::from_raw(o)) 75 | .map(|o| get_file_path(&mut env, JString::from(o))) 76 | .map(PathBuf::from) 77 | .collect(); 78 | 79 | let first_path = paths_vec 80 | .first() 81 | .and_then(|p| p.to_str()) 82 | .context("Failed to get first path from provided list of paths") 83 | .unwrap_or_throw(&mut env); 84 | 85 | let cloud_options = CloudOptions::from_untyped_config(first_path, &options).ok(); 86 | 87 | let ldf = LazyJsonLineReader::new_paths(Arc::from(paths_vec.into_boxed_slice())) 88 | .low_memory(low_memory) 89 
| .with_rechunk(rechunk) 90 | .with_n_rows(n_rows) 91 | .with_row_index(row_index) 92 | .with_infer_schema_length(infer_schema_length) 93 | .with_ignore_errors(ignore_errors) 94 | .with_batch_size(batch_size) 95 | .with_include_file_paths(file_path_col) 96 | .with_cloud_options(cloud_options) 97 | .finish() 98 | .context("Failed to perform ndjson scan") 99 | .unwrap_or_throw(&mut env); 100 | 101 | to_ptr(ldf) 102 | } 103 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/scan/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod csv; 2 | pub mod ipc; 3 | pub mod json_lines; 4 | pub mod parquet; 5 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/scan/parquet.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | use std::sync::Arc; 3 | 4 | use anyhow::Context; 5 | use jni::objects::{JClass, JObject, JObjectArray, JString}; 6 | use jni::sys::jlong; 7 | use jni::JNIEnv; 8 | use jni_fn::jni_fn; 9 | use polars::io::cloud::CloudOptions; 10 | use polars::io::{HiveOptions, RowIndex}; 11 | use polars::prelude::*; 12 | 13 | use crate::internal_jni::io::{get_file_path, parse_json_to_options}; 14 | use crate::internal_jni::utils::{to_ptr, JavaArrayToVec}; 15 | use crate::utils::error::ResultExt; 16 | 17 | #[jni_fn("org.polars.scala.polars.internal.jni.io.scan$")] 18 | pub unsafe fn scanParquet( 19 | mut env: JNIEnv, 20 | _: JClass, 21 | paths: JObjectArray, 22 | options: JString, 23 | ) -> jlong { 24 | let mut options = parse_json_to_options(&mut env, options); 25 | 26 | let n_rows = options 27 | .remove("scan_parquet_n_rows") 28 | .and_then(|s| s.parse::().ok()); 29 | 30 | let parallel = options 31 | .remove("scan_parquet_parallel") 32 | .map(|s| match s.as_str() { 33 | "columns" => ParallelStrategy::Columns, 34 | "prefiltered" => ParallelStrategy::Prefiltered, 35 | "row_groups" => ParallelStrategy::RowGroups, 36 | "none" => ParallelStrategy::None, 37 | _ => ParallelStrategy::default(), 38 | }) 39 | .unwrap_or_default(); 40 | 41 | let row_index_offset = options 42 | .remove("scan_parquet_row_index_offset") 43 | .and_then(|s| s.parse::().ok()) 44 | .unwrap_or(0); 45 | 46 | let row_index = options 47 | .remove("scan_parquet_row_index_name") 48 | .map(|name| RowIndex { 49 | name: name.into(), 50 | offset: row_index_offset, 51 | }); 52 | 53 | let use_statistics = options 54 | .remove("scan_parquet_use_statistics") 55 | .and_then(|s| s.parse::().ok()) 56 | .unwrap_or(true); 57 | 58 | let cache = options 59 | .remove("scan_parquet_cache") 60 | .and_then(|s| s.parse::().ok()) 61 | .unwrap_or(true); 62 | 63 | let glob = options 64 | .remove("scan_parquet_glob") 65 | .and_then(|s| s.parse::().ok()) 66 | .unwrap_or(true); 67 | 68 | let low_memory = options 69 | .remove("scan_parquet_low_memory") 70 | .and_then(|s| s.parse::().ok()) 71 | .unwrap_or(false); 72 | 73 | let rechunk = options 74 | .remove("scan_parquet_rechunk") 75 | .and_then(|s| s.parse::().ok()) 76 | .unwrap_or(false); 77 | 78 | let allow_missing_columns = options 79 | .remove("scan_parquet_allow_missing_columns") 80 | .and_then(|s| s.parse::().ok()) 81 | .unwrap_or(false); 82 | 83 | let file_path_col = options 84 | .remove("scan_parquet_include_file_paths") 85 | .map(PlSmallStr::from_string); 86 | 87 | let hive_scan_partitions = options 88 | .remove("scan_parquet_hive_scan_partitions") 89 | .and_then(|s| 
s.parse::().ok()) 90 | .map_or(Some(true), Some); 91 | 92 | let hive_try_parse_dates = options 93 | .remove("scan_parquet_hive_try_parse_dates") 94 | .and_then(|s| s.parse::().ok()) 95 | .unwrap_or(true); 96 | 97 | let paths_vec: Vec = JavaArrayToVec::to_vec(&mut env, paths) 98 | .into_iter() 99 | .map(|o| JObject::from_raw(o)) 100 | .map(|o| get_file_path(&mut env, JString::from(o))) 101 | .map(PathBuf::from) 102 | .collect(); 103 | 104 | let first_path = paths_vec 105 | .first() 106 | .and_then(|p| p.to_str()) 107 | .context("Failed to get first path from provided list of paths") 108 | .unwrap_or_throw(&mut env); 109 | 110 | let cloud_options = CloudOptions::from_untyped_config(first_path, &options).ok(); 111 | 112 | let scan_args = ScanArgsParquet { 113 | n_rows, 114 | parallel, 115 | row_index, 116 | use_statistics, 117 | cache, 118 | glob, 119 | low_memory, 120 | rechunk, 121 | allow_missing_columns, 122 | cloud_options, 123 | include_file_paths: file_path_col, 124 | hive_options: HiveOptions { 125 | enabled: hive_scan_partitions, 126 | hive_start_idx: 0, 127 | schema: None, 128 | try_parse_dates: hive_try_parse_dates, 129 | }, 130 | schema: None, 131 | }; 132 | 133 | let ldf = LazyFrame::scan_parquet_files(Arc::from(paths_vec.into_boxed_slice()), scan_args) 134 | .context("Failed to perform parquet scan") 135 | .unwrap_or_throw(&mut env); 136 | 137 | to_ptr(ldf) 138 | } 139 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/write/avro.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use anyhow::Context; 4 | use jni::objects::{JObject, JString}; 5 | use jni::JNIEnv; 6 | use jni_fn::jni_fn; 7 | use polars::io::avro::{AvroCompression, AvroWriter}; 8 | use polars::prelude::*; 9 | 10 | use crate::internal_jni::io::parse_json_to_options; 11 | use crate::internal_jni::io::write::get_df_and_writer; 12 | use crate::utils::error::ResultExt; 13 | 14 | fn parse_avro_compression(compression: Option) -> Option { 15 | match compression { 16 | Some(t) => match t.to_lowercase().as_str() { 17 | "uncompressed" => None, 18 | "deflate" => Some(AvroCompression::Deflate), 19 | "snappy" => Some(AvroCompression::Snappy), 20 | e => { 21 | polars_warn!(format!( 22 | "Compression must be one of {{'uncompressed', 'deflate', 'snappy'}}, got {e}. Using defaults." 
23 | )); 24 | None 25 | }, 26 | }, 27 | _ => None, 28 | } 29 | } 30 | 31 | #[jni_fn("org.polars.scala.polars.internal.jni.io.write$")] 32 | pub fn writeAvro( 33 | mut env: JNIEnv, 34 | _object: JObject, 35 | df_ptr: *mut DataFrame, 36 | filePath: JString, 37 | options: JString, 38 | ) { 39 | let mut options = parse_json_to_options(&mut env, options); 40 | 41 | let record_name = options.remove("write_avro_record_name"); 42 | 43 | let overwrite_mode = options 44 | .remove("write_mode") 45 | .map(|s| matches!(s.to_lowercase().as_str(), "overwrite")) 46 | .unwrap_or(false); 47 | 48 | let compression = options.remove("write_compression"); 49 | 50 | let (mut dataframe, writer) = 51 | get_df_and_writer(&mut env, df_ptr, filePath, overwrite_mode, options); 52 | 53 | let avro_compression = parse_avro_compression(compression); 54 | 55 | let mut avro_writer = AvroWriter::new(writer).with_compression(avro_compression); 56 | 57 | if let Some(value) = record_name { 58 | avro_writer = avro_writer.with_name(value) 59 | } 60 | 61 | avro_writer 62 | .finish(&mut dataframe) 63 | .context("Failed to write Avro data") 64 | .unwrap_or_throw(&mut env); 65 | } 66 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/write/csv.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use anyhow::Context; 4 | use jni::objects::{JObject, JString}; 5 | use jni::JNIEnv; 6 | use jni_fn::jni_fn; 7 | use polars::prelude::*; 8 | 9 | use crate::internal_jni::io::parse_json_to_options; 10 | use crate::internal_jni::io::write::get_df_and_writer; 11 | use crate::utils::error::ResultExt; 12 | 13 | #[jni_fn("org.polars.scala.polars.internal.jni.io.write$")] 14 | pub fn writeCSV( 15 | mut env: JNIEnv, 16 | _object: JObject, 17 | df_ptr: *mut DataFrame, 18 | filePath: JString, 19 | options: JString, 20 | ) { 21 | let mut options = parse_json_to_options(&mut env, options); 22 | 23 | let include_bom = options 24 | .remove("write_csv_include_bom") 25 | .and_then(|s| s.parse::().ok()); 26 | 27 | let include_header = options 28 | .remove("write_csv_include_header") 29 | .and_then(|s| s.parse::().ok()); 30 | 31 | let float_scientific = options 32 | .remove("write_csv_float_scientific") 33 | .and_then(|s| s.parse::().ok()); 34 | 35 | let float_precision = options 36 | .remove("write_csv_float_precision") 37 | .and_then(|s| s.parse::().ok()); 38 | 39 | let separator = options 40 | .remove("write_csv_separator") 41 | .and_then(|s| s.parse::().ok()); 42 | 43 | let quote_char = options 44 | .remove("write_csv_quote_char") 45 | .and_then(|s| s.parse::().ok()); 46 | 47 | let date_format = options.remove("write_csv_date_format"); 48 | let time_format = options.remove("write_csv_time_format"); 49 | let datetime_format = options.remove("write_csv_datetime_format"); 50 | 51 | let line_terminator = options.remove("write_csv_line_terminator"); 52 | let null_value = options.remove("write_csv_null_value"); 53 | 54 | let quote_style = options 55 | .remove("write_csv_quote_style") 56 | .map(|s| match s.as_str() { 57 | "always" => QuoteStyle::Always, 58 | "non_numeric" => QuoteStyle::NonNumeric, 59 | "never" => QuoteStyle::Never, 60 | _ => QuoteStyle::Necessary, 61 | }); 62 | 63 | let overwrite_mode = options 64 | .remove("write_mode") 65 | .map(|s| matches!(s.to_lowercase().as_str(), "overwrite")) 66 | .unwrap_or(false); 67 | 68 | let (mut dataframe, writer) = 69 | get_df_and_writer(&mut env, df_ptr, filePath, overwrite_mode, 
options); 70 | 71 | let mut csv_writer = CsvWriter::new(writer) 72 | .with_date_format(date_format) 73 | .with_time_format(time_format) 74 | .with_datetime_format(datetime_format) 75 | .with_float_precision(float_precision) 76 | .with_float_scientific(float_scientific); 77 | 78 | if let Some(value) = include_bom { 79 | csv_writer = csv_writer.include_bom(value) 80 | } 81 | 82 | if let Some(value) = include_header { 83 | csv_writer = csv_writer.include_header(value) 84 | } 85 | 86 | if let Some(value) = separator { 87 | csv_writer = csv_writer.with_separator(value) 88 | } 89 | 90 | if let Some(value) = quote_char { 91 | csv_writer = csv_writer.with_quote_char(value) 92 | } 93 | 94 | if let Some(value) = line_terminator { 95 | csv_writer = csv_writer.with_line_terminator(value) 96 | } 97 | 98 | if let Some(value) = null_value { 99 | csv_writer = csv_writer.with_null_value(value) 100 | } 101 | 102 | if let Some(value) = quote_style { 103 | csv_writer = csv_writer.with_quote_style(value) 104 | } 105 | 106 | csv_writer 107 | .finish(&mut dataframe) 108 | .context("Failed to write CSV data") 109 | .unwrap_or_throw(&mut env); 110 | } 111 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/write/ipc.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use anyhow::Context; 4 | use jni::objects::{JObject, JString}; 5 | use jni::JNIEnv; 6 | use jni_fn::jni_fn; 7 | use polars::prelude::*; 8 | 9 | use crate::internal_jni::io::parse_json_to_options; 10 | use crate::internal_jni::io::write::get_df_and_writer; 11 | use crate::utils::error::ResultExt; 12 | 13 | fn parse_ipc_compression(compression: Option) -> Option { 14 | match compression { 15 | Some(t) => match t.to_lowercase().as_str() { 16 | "uncompressed" => None, 17 | "lz4" => Some(IpcCompression::LZ4), 18 | "zstd" => Some(IpcCompression::ZSTD), 19 | e => { 20 | polars_warn!(format!( 21 | "Compression must be one of {{'uncompressed', 'lz4', 'zstd'}}, got {e}. Using defaults." 
22 | )); 23 | None 24 | }, 25 | }, 26 | _ => None, 27 | } 28 | } 29 | 30 | #[jni_fn("org.polars.scala.polars.internal.jni.io.write$")] 31 | pub fn writeIPC( 32 | mut env: JNIEnv, 33 | _object: JObject, 34 | df_ptr: *mut DataFrame, 35 | filePath: JString, 36 | options: JString, 37 | ) { 38 | let mut options = parse_json_to_options(&mut env, options); 39 | 40 | let compat_level = 41 | options 42 | .remove("write_ipc_compat_level") 43 | .map(|s| match s.to_lowercase().as_str() { 44 | "newest" => CompatLevel::newest(), 45 | _ => CompatLevel::oldest(), 46 | }); 47 | 48 | let overwrite_mode = options 49 | .remove("write_mode") 50 | .map(|s| matches!(s.to_lowercase().as_str(), "overwrite")) 51 | .unwrap_or(false); 52 | 53 | let compression = options.remove("write_compression"); 54 | 55 | let (mut dataframe, writer) = 56 | get_df_and_writer(&mut env, df_ptr, filePath, overwrite_mode, options); 57 | 58 | let ipc_compression = parse_ipc_compression(compression); 59 | 60 | let mut ipc_writer = IpcWriter::new(writer).with_compression(ipc_compression); 61 | 62 | if let Some(value) = compat_level { 63 | ipc_writer = ipc_writer.with_compat_level(value) 64 | } 65 | 66 | ipc_writer 67 | .finish(&mut dataframe) 68 | .context("Failed to write IPC data") 69 | .unwrap_or_throw(&mut env); 70 | } 71 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/write/json.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use anyhow::Context; 4 | use jni::objects::{JObject, JString}; 5 | use jni::JNIEnv; 6 | use jni_fn::jni_fn; 7 | use polars::prelude::*; 8 | 9 | use crate::internal_jni::io::parse_json_to_options; 10 | use crate::internal_jni::io::write::get_df_and_writer; 11 | use crate::utils::error::ResultExt; 12 | 13 | #[jni_fn("org.polars.scala.polars.internal.jni.io.write$")] 14 | pub fn writeJson( 15 | mut env: JNIEnv, 16 | _object: JObject, 17 | df_ptr: *mut DataFrame, 18 | filePath: JString, 19 | options: JString, 20 | ) { 21 | let mut options = parse_json_to_options(&mut env, options); 22 | 23 | let json_format = options 24 | .remove("write_json_format") 25 | .and_then(|s| match s.to_lowercase().as_str() { 26 | "json" => Some(JsonFormat::Json), 27 | "json_lines" => Some(JsonFormat::JsonLines), 28 | _ => None, 29 | }) 30 | .unwrap_or(JsonFormat::Json); 31 | 32 | let overwrite_mode = options 33 | .remove("write_mode") 34 | .map(|s| matches!(s.to_lowercase().as_str(), "overwrite")) 35 | .unwrap_or(false); 36 | 37 | let (mut dataframe, writer) = 38 | get_df_and_writer(&mut env, df_ptr, filePath, overwrite_mode, options); 39 | 40 | let mut json_writer = JsonWriter::new(writer).with_json_format(json_format); 41 | 42 | json_writer 43 | .finish(&mut dataframe) 44 | .context("Failed to write JSON data") 45 | .unwrap_or_throw(&mut env); 46 | } 47 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/write/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod avro; 2 | pub mod csv; 3 | pub mod ipc; 4 | pub mod json; 5 | pub mod parquet; 6 | 7 | use std::sync::Arc; 8 | 9 | use anyhow::Context; 10 | use jni::objects::JString; 11 | use jni::JNIEnv; 12 | use object_store::path::Path; 13 | use object_store::ObjectStore; 14 | use polars::io::cloud::{build_object_store, CloudOptions, CloudWriter}; 15 | use polars::io::pl_async::get_runtime; 16 | use polars::prelude::*; 17 | 18 | use 
super::get_file_path; 19 | use crate::utils::error::ResultExt; 20 | 21 | async fn ensure_write_mode( 22 | object_store_ref: &Arc<dyn ObjectStore>, 23 | uri: &str, 24 | prefix: &str, 25 | overwrite_mode: bool, 26 | ) -> PolarsResult<()> { 27 | let meta = object_store_ref.head(&Path::from(prefix)).await; 28 | match meta { 29 | Err(object_store::Error::NotFound { .. }) => Ok(()), 30 | Err(e) => Err(PolarsError::IO { 31 | error: Arc::new(e.into()), 32 | msg: Some("Failed to connect to object store, recheck the provided options".into()), 33 | }), 34 | Ok(_) if !overwrite_mode => Err( 35 | polars_err!(ComputeError: "File already exists at the provided location `{uri}` and overwrite option is not set"), 36 | ), 37 | _ => Ok(()), 38 | } 39 | } 40 | 41 | async fn create_cloud_writer( 42 | uri: &str, 43 | cloud_options: Option<&CloudOptions>, 44 | overwrite_mode: bool, 45 | ) -> PolarsResult<CloudWriter> { 46 | let (cloud_location, object_store) = build_object_store(uri, cloud_options, false).await?; 47 | let dyn_store = object_store.to_dyn_object_store().await; 48 | ensure_write_mode( 49 | &dyn_store, 50 | uri, 51 | cloud_location.prefix.as_ref(), 52 | overwrite_mode, 53 | ) 54 | .await?; 55 | 56 | let cloud_writer = CloudWriter::new_with_object_store( 57 | dyn_store.clone(), 58 | cloud_location.prefix.clone().into(), 59 | )?; 60 | 61 | Ok(cloud_writer) 62 | } 63 | 64 | fn get_df_and_writer( 65 | env: &mut JNIEnv, 66 | df_ptr: *mut DataFrame, 67 | filePath: JString, 68 | overwrite_mode: bool, 69 | writer_options: PlHashMap<String, String>, 70 | ) -> (DataFrame, CloudWriter) { 71 | let full_path = get_file_path(env, filePath); 72 | let uri = full_path.as_str(); 73 | 74 | let cloud_options = CloudOptions::from_untyped_config(uri, &writer_options); 75 | let writer: CloudWriter = get_runtime() 76 | .block_on_potential_spawn(async { 77 | create_cloud_writer(uri, cloud_options.ok().as_ref(), overwrite_mode).await 78 | }) 79 | .context("Failed to create writer") 80 | .unwrap_or_throw(env); 81 | 82 | let dataframe = unsafe { &*df_ptr }.clone(); 83 | (dataframe, writer) 84 | } 85 | -------------------------------------------------------------------------------- /native/src/internal_jni/io/write/parquet.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use anyhow::Context; 4 | use jni::objects::{JObject, JString}; 5 | use jni::JNIEnv; 6 | use jni_fn::jni_fn; 7 | use num_traits::ToPrimitive; 8 | use polars::prelude::*; 9 | 10 | use crate::internal_jni::io::parse_json_to_options; 11 | use crate::internal_jni::io::write::get_df_and_writer; 12 | use crate::utils::error::ResultExt; 13 | 14 | fn parse_parquet_compression( 15 | compression: Option<String>, 16 | compression_level: Option<i32>, 17 | ) -> Option<ParquetCompression> { 18 | match (compression, compression_level) { 19 | (Some(t), l) => match t.to_lowercase().as_str() { 20 | "uncompressed" => Some(ParquetCompression::Uncompressed), 21 | "snappy" => Some(ParquetCompression::Snappy), 22 | "lz4" => Some(ParquetCompression::Lz4Raw), 23 | "lzo" => Some(ParquetCompression::Lzo), 24 | "gzip" => { 25 | let level = l.and_then(|v| GzipLevel::try_new(v.to_u8()?).ok()); 26 | Some(ParquetCompression::Gzip(level)) 27 | }, 28 | "brotli" => { 29 | let level = l.and_then(|v| BrotliLevel::try_new(v.to_u32()?).ok()); 30 | Some(ParquetCompression::Brotli(level)) 31 | }, 32 | "zstd" => { 33 | let level = l.and_then(|v| ZstdLevel::try_new(v).ok()); 34 | Some(ParquetCompression::Zstd(level)) 35 | }, 36 | e => { 37 | polars_warn!(format!("Compression must be one of {{'uncompressed', 
'snappy', 'gzip', 'lzo', 'brotli', 'lz4', 'zstd'}}, got {e}. Using defaults.")); 38 | None 39 | }, 40 | }, 41 | _ => None, 42 | } 43 | } 44 | 45 | #[jni_fn("org.polars.scala.polars.internal.jni.io.write$")] 46 | pub fn writeParquet( 47 | mut env: JNIEnv, 48 | _object: JObject, 49 | df_ptr: *mut DataFrame, 50 | filePath: JString, 51 | options: JString, 52 | ) { 53 | let mut options = parse_json_to_options(&mut env, options); 54 | 55 | let is_parallel = options 56 | .remove("write_parquet_parallel") 57 | .and_then(|s| s.parse::<bool>().ok()); 58 | 59 | let data_page_size = options 60 | .remove("write_parquet_data_page_size") 61 | .and_then(|s| s.parse::<usize>().ok()); 62 | 63 | let row_group_size = options 64 | .remove("write_parquet_row_group_size") 65 | .and_then(|s| s.parse::<usize>().ok()); 66 | 67 | let overwrite_mode = options 68 | .remove("write_mode") 69 | .map(|s| matches!(s.to_lowercase().as_str(), "overwrite")) 70 | .unwrap_or(false); 71 | 72 | let compression = options.remove("write_compression"); 73 | let compression_level = options 74 | .remove("write_compression_level") 75 | .and_then(|s| s.parse::<i32>().ok()); 76 | 77 | let write_stats = options 78 | .remove("write_parquet_stats") 79 | .map(|s| match s.as_str() { 80 | "full" => StatisticsOptions::full(), 81 | "none" => StatisticsOptions::empty(), 82 | _ => StatisticsOptions::default(), 83 | }); 84 | 85 | let (mut dataframe, writer) = 86 | get_df_and_writer(&mut env, df_ptr, filePath, overwrite_mode, options); 87 | 88 | let parquet_compression = parse_parquet_compression(compression, compression_level); 89 | 90 | let mut parquet_writer = ParquetWriter::new(writer) 91 | .with_data_page_size(data_page_size) 92 | .with_row_group_size(row_group_size); 93 | 94 | if let Some(value) = is_parallel { 95 | parquet_writer = parquet_writer.set_parallel(value) 96 | } 97 | 98 | if let Some(value) = write_stats { 99 | parquet_writer = parquet_writer.with_statistics(value) 100 | } 101 | 102 | if let Some(value) = parquet_compression { 103 | parquet_writer = parquet_writer.with_compression(value) 104 | } 105 | 106 | parquet_writer 107 | .finish(&mut dataframe) 108 | .context("Failed to write Parquet data") 109 | .unwrap_or_throw(&mut env); 110 | } 111 | -------------------------------------------------------------------------------- /native/src/internal_jni/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod expr; 2 | pub mod frame; 3 | pub mod io; 4 | pub mod lazy; 5 | pub mod row; 6 | pub mod series; 7 | pub mod utils; 8 | -------------------------------------------------------------------------------- /native/src/internal_jni/series.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | 3 | use std::iter::Iterator; 4 | 5 | use anyhow::{Context, Error}; 6 | use jni::objects::*; 7 | use jni::sys::jlong; 8 | use jni::JNIEnv; 9 | use jni_fn::jni_fn; 10 | use polars::export::chrono::{NaiveDate, NaiveDateTime, NaiveTime}; 11 | use polars::prelude::*; 12 | 13 | use crate::internal_jni::utils::{j_string_to_string, to_ptr, JavaArrayToVec}; 14 | use crate::utils::error::ResultExt; 15 | 16 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 17 | pub unsafe fn new_str_series( 18 | mut env: JNIEnv, 19 | _: JClass, 20 | name: JString, 21 | values: JObjectArray, 22 | ) -> jlong { 23 | let data: Vec<String> = JavaArrayToVec::to_vec(&mut env, values) 24 | .into_iter() 25 | .map(|o| JObject::from_raw(o)) 26 | .map(|o| { 27 | j_string_to_string( 28 | &mut env, 29 | 
&JString::from(o), 30 | Some("Failed to parse the provided value as a series element"), 31 | ) 32 | }) 33 | .collect(); 34 | 35 | let series_name = j_string_to_string( 36 | &mut env, 37 | &name, 38 | Some("Failed to parse the provided value as a series name"), 39 | ); 40 | let series = Series::new(PlSmallStr::from_string(series_name), data); 41 | 42 | to_ptr(series) 43 | } 44 | 45 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 46 | pub fn new_long_series(mut env: JNIEnv, _: JClass, name: JString, values: JLongArray) -> jlong { 47 | let data = JavaArrayToVec::to_vec(&mut env, values); 48 | 49 | let series_name = j_string_to_string( 50 | &mut env, 51 | &name, 52 | Some("Failed to parse the provided value as a series name"), 53 | ); 54 | let series = Series::new(PlSmallStr::from_string(series_name), data); 55 | 56 | to_ptr(series) 57 | } 58 | 59 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 60 | pub fn new_int_series(mut env: JNIEnv, _: JClass, name: JString, values: JIntArray) -> jlong { 61 | let data = JavaArrayToVec::to_vec(&mut env, values); 62 | 63 | let series_name = j_string_to_string( 64 | &mut env, 65 | &name, 66 | Some("Failed to parse the provided value as a series name"), 67 | ); 68 | let series = Series::new(PlSmallStr::from_string(series_name), data); 69 | 70 | to_ptr(series) 71 | } 72 | 73 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 74 | pub fn new_float_series(mut env: JNIEnv, _: JClass, name: JString, values: JFloatArray) -> jlong { 75 | let data = JavaArrayToVec::to_vec(&mut env, values); 76 | 77 | let series_name = j_string_to_string( 78 | &mut env, 79 | &name, 80 | Some("Failed to parse the provided value as a series name"), 81 | ); 82 | let series = Series::new(PlSmallStr::from_string(series_name), data); 83 | 84 | to_ptr(series) 85 | } 86 | 87 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 88 | pub fn new_double_series(mut env: JNIEnv, _: JClass, name: JString, values: JDoubleArray) -> jlong { 89 | let data = JavaArrayToVec::to_vec(&mut env, values); 90 | 91 | let series_name = j_string_to_string( 92 | &mut env, 93 | &name, 94 | Some("Failed to parse the provided value as a series name"), 95 | ); 96 | let series = Series::new(PlSmallStr::from_string(series_name), data); 97 | 98 | to_ptr(series) 99 | } 100 | 101 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 102 | pub fn new_boolean_series( 103 | mut env: JNIEnv, 104 | _: JClass, 105 | name: JString, 106 | values: JBooleanArray, 107 | ) -> jlong { 108 | let data = JavaArrayToVec::to_vec(&mut env, values); 109 | 110 | let series_name = j_string_to_string( 111 | &mut env, 112 | &name, 113 | Some("Failed to parse the provided value as a series name"), 114 | ); 115 | let series = Series::new(PlSmallStr::from_string(series_name), data); 116 | 117 | to_ptr(series) 118 | } 119 | 120 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 121 | pub unsafe fn new_date_series( 122 | mut env: JNIEnv, 123 | _: JClass, 124 | name: JString, 125 | values: JObjectArray, 126 | ) -> jlong { 127 | let data: Vec<NaiveDate> = JavaArrayToVec::to_vec(&mut env, values) 128 | .into_iter() 129 | .map(|o| JObject::from_raw(o)) 130 | .map(|o| { 131 | j_string_to_string( 132 | &mut env, 133 | &JString::from(o), 134 | Some("Failed to parse the provided value as a series element"), 135 | ) 136 | }) 137 | .map(|s| { 138 | let lit = s.as_str(); 139 | NaiveDate::parse_from_str(lit, "%Y-%m-%d").context(format!( 140 | "Failed to parse value `{}` as date with format `%Y-%m-%d`", 141 | lit 142 | ))
143 | }) 144 | .collect::<Result<Vec<_>, Error>>() 145 | .unwrap_or_throw(&mut env); 146 | 147 | let series_name = j_string_to_string( 148 | &mut env, 149 | &name, 150 | Some("Failed to parse the provided value as a series name"), 151 | ); 152 | let series = Series::new(PlSmallStr::from_string(series_name), data); 153 | 154 | to_ptr(series) 155 | } 156 | 157 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 158 | pub unsafe fn new_time_series( 159 | mut env: JNIEnv, 160 | _: JClass, 161 | name: JString, 162 | values: JObjectArray, 163 | ) -> jlong { 164 | let data: Vec<NaiveTime> = JavaArrayToVec::to_vec(&mut env, values) 165 | .into_iter() 166 | .map(|o| JObject::from_raw(o)) 167 | .map(|o| { 168 | j_string_to_string( 169 | &mut env, 170 | &JString::from(o), 171 | Some("Failed to parse the provided value as a series element"), 172 | ) 173 | }) 174 | .map(|s| { 175 | let lit = s.as_str(); 176 | NaiveTime::parse_from_str(lit, "%H:%M:%S%.f").context(format!( 177 | "Failed to parse value `{}` as time with format `%H:%M:%S%.f`", 178 | lit 179 | )) 180 | }) 181 | .collect::<Result<Vec<_>, Error>>() 182 | .unwrap_or_throw(&mut env); 183 | 184 | let series_name = j_string_to_string( 185 | &mut env, 186 | &name, 187 | Some("Failed to parse the provided value as a series name"), 188 | ); 189 | let series = Series::new(PlSmallStr::from_string(series_name), data); 190 | 191 | to_ptr(series) 192 | } 193 | 194 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 195 | pub unsafe fn new_datetime_series( 196 | mut env: JNIEnv, 197 | _: JClass, 198 | name: JString, 199 | values: JObjectArray, 200 | ) -> jlong { 201 | let data: Vec<NaiveDateTime> = JavaArrayToVec::to_vec(&mut env, values) 202 | .into_iter() 203 | .map(|o| JObject::from_raw(o)) 204 | .map(|o| { 205 | j_string_to_string( 206 | &mut env, 207 | &JString::from(o), 208 | Some("Failed to parse the provided value as a series element"), 209 | ) 210 | }) 211 | .map(|s| { 212 | let lit = s.as_str(); 213 | NaiveDateTime::parse_from_str(lit, "%FT%T%.f").context(format!( 214 | "Failed to parse value `{}` as datetime with format `%FT%T%.f`", 215 | lit 216 | )) 217 | }) 218 | .collect::<Result<Vec<_>, Error>>() 219 | .unwrap_or_throw(&mut env); 220 | 221 | let series_name = j_string_to_string( 222 | &mut env, 223 | &name, 224 | Some("Failed to parse the provided value as a series name"), 225 | ); 226 | let series = Series::new(PlSmallStr::from_string(series_name), data); 227 | 228 | to_ptr(series) 229 | } 230 | 231 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 232 | pub unsafe fn new_list_series( 233 | mut env: JNIEnv, 234 | _: JClass, 235 | name: JString, 236 | values: JLongArray, 237 | ) -> jlong { 238 | let data: Vec<Series> = JavaArrayToVec::to_vec(&mut env, values) 239 | .into_iter() 240 | .map(|ptr| (*(ptr as *mut Series)).to_owned()) 241 | .collect(); 242 | 243 | let series_name = j_string_to_string( 244 | &mut env, 245 | &name, 246 | Some("Failed to parse the provided value as a series name"), 247 | ); 248 | let series = Series::new(PlSmallStr::from_string(series_name), data); 249 | 250 | to_ptr(series) 251 | } 252 | 253 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 254 | pub unsafe fn new_struct_series( 255 | mut env: JNIEnv, 256 | _: JClass, 257 | name: JString, 258 | values: JLongArray, 259 | ) -> jlong { 260 | let data: Vec<Series> = JavaArrayToVec::to_vec(&mut env, values) 261 | .into_iter() 262 | .map(|ptr| (*(ptr as *mut Series)).to_owned()) 263 | .collect(); 264 | 265 | let series_name = j_string_to_string( 266 | &mut env, 267 | &name, 268 | Some("Failed to parse the provided 
value as a series name"), 269 | ); 270 | let series = StructChunked::from_series( 271 | PlSmallStr::from_string(series_name), 272 | data.len(), 273 | data.iter(), 274 | ) 275 | .context("Failed to create struct series from provided list of series") 276 | .unwrap_or_throw(&mut env) 277 | .into_series(); 278 | 279 | to_ptr(series) 280 | } 281 | 282 | #[jni_fn("org.polars.scala.polars.internal.jni.series$")] 283 | pub unsafe fn show(_: JNIEnv, _: JClass, series_ptr: *mut Series) { 284 | let series = &*series_ptr; 285 | println!("{:?}", series) 286 | } 287 | -------------------------------------------------------------------------------- /native/src/internal_jni/utils.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Display; 2 | 3 | use anyhow::Context; 4 | use jni::objects::ReleaseMode::NoCopyBack; 5 | use jni::objects::*; 6 | use jni::strings::JNIString; 7 | use jni::sys::*; 8 | use jni::JNIEnv; 9 | 10 | use crate::utils::error::ResultExt; 11 | 12 | pub trait JavaArrayToVec { 13 | type Output; 14 | type InternalType; 15 | 16 | fn get_elements<'local, 'array, 'other_local, 'env>( 17 | env: &'env mut JNIEnv<'local>, 18 | array: &'array JPrimitiveArray<'other_local, <Self as JavaArrayToVec>::InternalType>, 19 | ) -> AutoElementsCritical<'local, 'other_local, 'array, 'env, Self::InternalType> 20 | where 21 | <Self as JavaArrayToVec>::InternalType: TypeArray, 22 | { 23 | unsafe { 24 | let mut cloned_env = env.unsafe_clone(); 25 | env.get_array_elements_critical(array, NoCopyBack) 26 | .context("Failed to get elements of the array") 27 | .unwrap_or_throw(&mut cloned_env) 28 | } 29 | } 30 | 31 | fn to_vec(env: &mut JNIEnv, array: Self) -> Vec<Self::Output>; 32 | } 33 | 34 | impl JavaArrayToVec for JBooleanArray<'_> { 35 | type Output = bool; 36 | type InternalType = jboolean; 37 | 38 | fn to_vec(env: &mut JNIEnv, array: Self) -> Vec<Self::Output> { 39 | let arr = Self::get_elements(env, &array); 40 | arr.iter().map(|&jb| jb == JNI_TRUE).collect() 41 | } 42 | } 43 | 44 | impl JavaArrayToVec for JIntArray<'_> { 45 | type Output = i32; 46 | type InternalType = jint; 47 | 48 | fn to_vec(env: &mut JNIEnv, array: Self) -> Vec<Self::Output> { 49 | let arr = Self::get_elements(env, &array); 50 | arr.iter().copied().collect() 51 | } 52 | } 53 | 54 | impl JavaArrayToVec for JLongArray<'_> { 55 | type Output = i64; 56 | type InternalType = jlong; 57 | 58 | fn to_vec(env: &mut JNIEnv, array: Self) -> Vec<Self::Output> { 59 | let arr = Self::get_elements(env, &array); 60 | arr.iter().copied().collect() 61 | } 62 | } 63 | 64 | impl JavaArrayToVec for JFloatArray<'_> { 65 | type Output = f32; 66 | type InternalType = jfloat; 67 | 68 | fn to_vec(env: &mut JNIEnv, array: Self) -> Vec<Self::Output> { 69 | let arr = Self::get_elements(env, &array); 70 | arr.iter().copied().collect() 71 | } 72 | } 73 | 74 | impl JavaArrayToVec for JDoubleArray<'_> { 75 | type Output = f64; 76 | type InternalType = jdouble; 77 | 78 | fn to_vec(env: &mut JNIEnv, array: Self) -> Vec<Self::Output> { 79 | let arr = Self::get_elements(env, &array); 80 | arr.iter().copied().collect() 81 | } 82 | } 83 | 84 | impl JavaArrayToVec for JObjectArray<'_> { 85 | type Output = jobject; 86 | type InternalType = jobject; 87 | fn to_vec(env: &mut JNIEnv, array: Self) -> Vec<Self::Output> { 88 | let len = env 89 | .get_array_length(&array) 90 | .context("Error getting length of the array") 91 | .unwrap_or_throw(env); 92 | let mut result = Vec::with_capacity(len as usize); 93 | 94 | for i in 0..len { 95 | let obj = env 96 | .get_object_array_element(&array, i) 97 | .context("Error getting element of the array") 98 | .unwrap_or_throw(env); 99 | 
result.push(obj.into_raw()); 100 | } 101 | 102 | result 103 | } 104 | } 105 | 106 | pub fn string_to_j_string<T, S: Into<JNIString>>(env: &mut JNIEnv, s: S, msg: Option<T>) -> jstring 107 | where 108 | T: AsRef<str> + Send + Sync + Display + 'static, 109 | { 110 | if let Some(c) = msg { 111 | env.new_string(s).context(c) 112 | } else { 113 | env.new_string(s) 114 | .context("Error converting JString to Rust String") 115 | } 116 | .unwrap_or_throw(env) 117 | .as_raw() 118 | } 119 | 120 | pub fn j_string_to_string<T>(env: &mut JNIEnv, s: &JString, msg: Option<T>) -> String 121 | where 122 | T: AsRef<str> + Send + Sync + Display + 'static, 123 | { 124 | if let Some(c) = msg { 125 | env.get_string(s).context(c) 126 | } else { 127 | env.get_string(s) 128 | .context("Error converting JString to Rust String") 129 | } 130 | .unwrap_or_throw(env) 131 | .into() 132 | } 133 | 134 | pub fn get_n_rows(n_rows: jlong) -> Option<usize> { 135 | if n_rows.is_positive() { 136 | Some(n_rows as usize) 137 | } else { 138 | None 139 | } 140 | } 141 | 142 | pub fn to_ptr<T: Clone>(v: T) -> jlong { 143 | Box::into_raw(Box::new(v.clone())) as jlong 144 | } 145 | 146 | pub fn find_java_class<'a>(env: &mut JNIEnv<'a>, class: &str) -> JClass<'a> { 147 | env.find_class(class) 148 | .context(format!( 149 | "Error finding Java class for provided value `{class}`" 150 | )) 151 | .unwrap_or_throw(env) 152 | } 153 | -------------------------------------------------------------------------------- /native/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_snake_case)] 2 | #![allow(clippy::missing_safety_doc)] 3 | #![allow(clippy::expect_fun_call)] 4 | 5 | use anyhow::Context; 6 | use internal_jni::utils::{j_string_to_string, string_to_j_string}; 7 | use jni::objects::{JObject, JString}; 8 | use jni::sys::{jboolean, jstring, JNI_TRUE}; 9 | use jni::JNIEnv; 10 | use jni_fn::jni_fn; 11 | use utils::error::ResultExt; 12 | 13 | pub mod internal_jni; 14 | pub mod utils; 15 | 16 | #[jni_fn("org.polars.scala.polars.internal.jni.common$")] 17 | pub fn version(mut env: JNIEnv, _object: JObject) -> jstring { 18 | let cargo_toml_raw = include_str!("../Cargo.toml"); 19 | let cargo_toml_res: anyhow::Result<toml::Value> = 20 | toml::from_str(cargo_toml_raw).context("Failed to parse Cargo.toml"); 21 | 22 | cargo_toml_res 23 | .map(|cargo_toml| { 24 | let polars_version = cargo_toml 25 | .get("dependencies") 26 | .and_then(|v| v.get("polars")) 27 | .and_then(|v| v.get("version")); 28 | 29 | let polars_version = match polars_version { 30 | Some(toml::Value::String(s)) => s.as_str(), 31 | _ => "unknown", 32 | }; 33 | 34 | string_to_j_string(&mut env, polars_version, None::<&str>) 35 | }) 36 | .context("Failed to get polars_rs version") 37 | .unwrap_or_throw(&mut env) 38 | } 39 | 40 | #[jni_fn("org.polars.scala.polars.internal.jni.common$")] 41 | pub fn setConfigs(mut env: JNIEnv, _object: JObject, options: JObject) -> jboolean { 42 | let map = env 43 | .get_map(&options) 44 | .context("Failed to get the provided config options as a map") 45 | .unwrap_or_throw(&mut env); 46 | 47 | let mut map_iterator = map 48 | .iter(&mut env) 49 | .context("Failed to iterate over the provided config options") 50 | .unwrap_or_throw(&mut env); 51 | 52 | while let Ok(Some((key, value))) = map_iterator.next(&mut env) { 53 | let key_str = j_string_to_string( 54 | &mut env, 55 | &JString::from(key), 56 | Some("Failed to parse the provided config key as string"), 57 | ); 58 | 59 | let value_str = j_string_to_string( 60 | &mut env, 61 | &JString::from(value), 62 | Some("Failed to parse the provided config value as 
string"), 63 | ); 64 | 65 | std::env::set_var(key_str, value_str); 66 | } 67 | 68 | JNI_TRUE 69 | } 70 | -------------------------------------------------------------------------------- /native/src/utils/error.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Error; 2 | use jni::errors::Result as JniResult; 3 | use jni::JNIEnv; 4 | 5 | use crate::internal_jni::utils::find_java_class; 6 | 7 | fn format_nested_error(error: &Error) -> String { 8 | let mut formatted = String::new(); 9 | 10 | for (i, cause) in error.chain().enumerate() { 11 | if i == 0 { 12 | formatted.push_str(&format!("{cause}\n",)); 13 | } else { 14 | formatted.push_str(&format!(" Caused by: {cause}\n",)); 15 | } 16 | } 17 | 18 | formatted.trim_end().to_string() 19 | } 20 | 21 | pub fn throw_java_exception(env: &mut JNIEnv, err: Error) -> JniResult<()> { 22 | // Find the Java exception class 23 | let exception_class = find_java_class(env, "java/lang/RuntimeException"); 24 | 25 | // Throw the exception with the provided message 26 | env.throw_new(exception_class, format_nested_error(&err))?; 27 | Ok(()) 28 | } 29 | 30 | /// Trait to unwrap `Result` or throw an exception. 31 | pub trait ResultExt<T> { 32 | fn unwrap_or_throw(self, env: &mut JNIEnv) -> T; 33 | } 34 | 35 | impl<T> ResultExt<T> for Result<T, Error> { 36 | fn unwrap_or_throw(self, env: &mut JNIEnv) -> T { 37 | match self { 38 | Ok(val) => val, 39 | Err(err) => { 40 | // Map the error to a Java exception 41 | let _ = throw_java_exception(env, err); 42 | 43 | // Describe the pending Java exception and abort, since no valid value can be returned to JNI 44 | env.exception_describe().unwrap_or(()); 45 | std::process::abort(); 46 | }, 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /native/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod error; 2 | -------------------------------------------------------------------------------- /project/DocSettings.scala: -------------------------------------------------------------------------------- 1 | import sbt.* 2 | import sbt.Keys.* 3 | 4 | import sbtunidoc.* 5 | import sbtunidoc.BaseUnidocPlugin.autoImport.* 6 | import sbtunidoc.JavaUnidocPlugin.autoImport.* 7 | import sbtunidoc.ScalaUnidocPlugin.autoImport.* 8 | 9 | /* Borrowed from delta-io/delta */ 10 | 11 | object DocSettings { 12 | val unidocSourceFilePatterns = settingKey[Seq[SourceFilePattern]]( 13 | "Patterns to match (simple substring match) against full source file paths. " + 14 | "Matched files will be selected for generating API docs." 15 | ) 16 | 17 | implicit class PatternsHelper(patterns: Seq[SourceFilePattern]) { 18 | def scopeToProject(projectToAdd: Project): Seq[SourceFilePattern] = 19 | patterns.map(_.copy(project = Some(projectToAdd))) 20 | } 21 | implicit class UnidocHelper(val projectToUpdate: Project) { 22 | def configureUnidoc(docTitle: String = null): Project = 23 | projectToUpdate 24 | .enablePlugins(ScalaUnidocPlugin, GenJavadocPlugin, JavaUnidocPlugin) 25 | .settings( 26 | libraryDependencies ++= Seq( 27 | // Ensure genJavaDoc plugin is of the right version that works with Scala 2.12 28 | compilerPlugin( 29 | "com.typesafe.genjavadoc" %% "genjavadoc-plugin" % "0.18" cross CrossVersion.full 30 | ) 31 | ), 32 | generateUnidocSettings(docTitle), 33 | 34 | // Ensure unidoc is run with tests. 
35 | (Test / test) := ((Test / test) dependsOn (Compile / unidoc)).value 36 | ) 37 | 38 | private def generateUnidocSettings(customDocTitle: String): Def.SettingsDefinition = { 39 | 40 | val internalFilePattern = Seq("/internal/", "/execution/", "$") 41 | 42 | // Generate the full doc title 43 | def fullDocTitle(projectName: String, version: String, isScalaDoc: Boolean): String = { 44 | val namePart = Option(customDocTitle).getOrElse { 45 | projectName.split("-").map(_.capitalize).mkString(" ") 46 | } 47 | val versionPart = version.replaceAll("-SNAPSHOT", "") 48 | val langPart = if (isScalaDoc) "Scala API Docs" else "Java API Docs" 49 | s"$namePart $versionPart - $langPart" 50 | } 51 | 52 | // Remove source files that does not match the pattern 53 | def ignoreUndocumentedSources( 54 | allSourceFiles: Seq[Seq[java.io.File]], 55 | sourceFilePatternsToKeep: Seq[SourceFilePattern] 56 | ): Seq[Seq[java.io.File]] = { 57 | if (sourceFilePatternsToKeep.isEmpty) return Nil 58 | 59 | val projectSrcDirToFilePatternsToKeep = sourceFilePatternsToKeep.map { 60 | case SourceFilePattern(dirs, projOption) => 61 | val projectPath = projOption.getOrElse(projectToUpdate).base.getCanonicalPath 62 | projectPath -> dirs 63 | }.toMap 64 | 65 | def shouldKeep(path: String): Boolean = { 66 | projectSrcDirToFilePatternsToKeep.foreach { case (projBaseDir, filePatterns) => 67 | def isInProjectSrcDir = 68 | path.contains(s"$projBaseDir/src") || path.contains(s"$projBaseDir/target/java/") 69 | def matchesFilePattern = filePatterns.exists(path.contains(_)) 70 | def matchesInternalFilePattern = internalFilePattern.exists(path.contains(_)) 71 | if (isInProjectSrcDir && matchesFilePattern && !matchesInternalFilePattern) 72 | return true 73 | } 74 | false 75 | } 76 | allSourceFiles.map(_.filter(f => shouldKeep(f.getCanonicalPath))) 77 | } 78 | 79 | val javaUnidocSettings = Seq( 80 | // Configure Java unidoc 81 | JavaUnidoc / unidoc / javacOptions := Seq( 82 | "-public", 83 | "-windowtitle", 84 | fullDocTitle((projectToUpdate / name).value, version.value, isScalaDoc = false), 85 | "-noqualifier", 86 | "java.lang", 87 | "-tag", 88 | "implNote:a:Implementation Note:", 89 | "-tag", 90 | "apiNote:a:API Note:", 91 | "-Xdoclint:none" 92 | ), 93 | JavaUnidoc / unidoc / unidocAllSources := 94 | ignoreUndocumentedSources( 95 | allSourceFiles = (JavaUnidoc / unidoc / unidocAllSources).value, 96 | sourceFilePatternsToKeep = unidocSourceFilePatterns.value 97 | ), 98 | 99 | // Settings for plain, old Java doc needed for successful doc generation during publishing. 100 | Compile / doc / javacOptions ++= Seq( 101 | "-public", 102 | "-noqualifier", 103 | "java.lang", 104 | "-tag", 105 | "implNote:a:Implementation Note:", 106 | "-tag", 107 | "apiNote:a:API Note:", 108 | "-Xdoclint:all" 109 | ) 110 | ) 111 | 112 | val scalaUnidocSettings = Seq( 113 | // Configure Scala unidoc 114 | ScalaUnidoc / unidoc / scalacOptions ++= Seq( 115 | "-doc-title", 116 | fullDocTitle((projectToUpdate / name).value, version.value, isScalaDoc = true) 117 | ), 118 | ScalaUnidoc / unidoc / unidocAllSources := 119 | ignoreUndocumentedSources( 120 | allSourceFiles = (ScalaUnidoc / unidoc / unidocAllSources).value, 121 | sourceFilePatternsToKeep = unidocSourceFilePatterns.value 122 | ) 123 | ) 124 | 125 | javaUnidocSettings ++ scalaUnidocSettings 126 | } 127 | } 128 | 129 | /** Patterns are strings to do simple substring matches on the full path of every source file. 
130 | */ 131 | case class SourceFilePattern(patterns: Seq[String], project: Option[Project] = None) 132 | 133 | object SourceFilePattern { 134 | def apply(patterns: String*): SourceFilePattern = SourceFilePattern(patterns.toSeq, None) 135 | } 136 | 137 | } 138 | -------------------------------------------------------------------------------- /project/ExtraCommands.scala: -------------------------------------------------------------------------------- 1 | import sbt.* 2 | import sbt.Keys.* 3 | 4 | import Utils.* 5 | import com.github.sbt.jni.plugins.JniJavah.autoImport.javah 6 | 7 | object ExtraCommands { 8 | 9 | lazy val cleanHeaders = 10 | taskKey[Unit]("Removes all previously generated headers") 11 | lazy val cargoFmt = 12 | taskKey[Unit]("Formats native module and its Cargo.toml.") 13 | lazy val cargoCheck = 14 | taskKey[Unit]("Checks the formatting of native module and its Cargo.toml.") 15 | 16 | lazy val commandAliases: Seq[Setting[_]] = Seq( 17 | addCommandAlias("cleanAll", ";cleanHeaders; clean; cleanFiles; reload"), 18 | addCommandAlias("genHeaders", ";cleanHeaders; javah"), 19 | addCommandAlias("fmtAll", ";scalafmtAll; scalafmtSbt; javafmtAll; cargoFmt; reload"), 20 | addCommandAlias( 21 | "fmtCheckAll", 22 | ";scalafmtCheckAll; scalafmtSbtCheck; javafmtCheckAll; cargoCheck" 23 | ) 24 | ).flatten 25 | 26 | lazy val commands: Seq[Setting[_]] = Seq( 27 | cleanHeaders := { 28 | import scala.reflect.io.Directory 29 | 30 | val headerDir = (javah / target).value 31 | val directory = new Directory(headerDir) 32 | 33 | directory.deleteRecursively() 34 | sLog.value.info(s"Removed headers directory $headerDir") 35 | }, 36 | cargoFmt := { 37 | val nativeRootDir = nativeRoot.value: @sbtUnchecked 38 | val cmds = Seq( 39 | "cargo fix --allow-dirty --allow-staged", 40 | "cargo sort", 41 | "cargo fmt --verbose --all" 42 | ) 43 | 44 | executeProcesses(cmds, cwd = Some(nativeRootDir), sLog.value, infoOnly = true) 45 | }, 46 | cargoCheck := { 47 | val nativeRootDir = nativeRoot.value: @sbtUnchecked 48 | val cmds = Seq( 49 | "cargo fmt --check --all", 50 | "cargo sort --check", 51 | "cargo clippy -- -D warnings" 52 | ) 53 | 54 | executeProcesses(cmds, cwd = Some(nativeRootDir), sLog.value, infoOnly = true) 55 | } 56 | ) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /project/GeneralSettings.scala: -------------------------------------------------------------------------------- 1 | import sbt.* 2 | import sbt.Keys.* 3 | 4 | import Utils.* 5 | import sbtassembly.AssemblyPlugin.autoImport.* 6 | 7 | object GeneralSettings { 8 | 9 | val scala212 = "2.12.20" 10 | val scala213 = "2.13.15" 11 | val scala33 = "3.3.4" 12 | 13 | val defaultScalaVersion: String = scala213 14 | val supportedScalaVersions: Seq[String] = Seq(scala212, scala213, scala33) 15 | 16 | lazy val commonSettings = Seq( 17 | organization := "org.polars", 18 | versionScheme := Some("early-semver"), 19 | licenses := List("Apache-2.0" -> url("https://www.apache.org/licenses/LICENSE-2.0")), 20 | developers := List( 21 | Developer( 22 | id = "chitralverma", 23 | name = "Chitral Verma", 24 | email = "chitral.verma@gmail.com", 25 | url = url("https://github.com/chitralverma") 26 | ) 27 | ), 28 | scalaVersion := defaultScalaVersion, 29 | crossScalaVersions := supportedScalaVersions, 30 | scalacOptions ++= Seq( 31 | "-encoding", 32 | "utf8", 33 | "-deprecation", 34 | "-feature", 35 | "-language:existentials", 36 | "-language:implicitConversions", 37 | "-language:reflectiveCalls", 38 | 
"-language:higherKinds", 39 | "-language:postfixOps", 40 | "-unchecked", 41 | "-Xfatal-warnings" 42 | ) ++ (if (priorTo213(scalaVersion.value)) Seq("-target:jvm-1.8") 43 | else Seq("-release", "8")), 44 | fork := true, 45 | turbo := true, 46 | assembly / assemblyMergeStrategy := { 47 | case PathList("META-INF", xs @ _*) => MergeStrategy.discard 48 | case x => MergeStrategy.first 49 | } 50 | ) 51 | 52 | lazy val settings: Seq[Setting[_]] = Seq( 53 | name := "scala-polars", 54 | nativeRoot := baseDirectory.value.toPath.resolveSibling("native").toFile 55 | ) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /project/NativeBuildSettings.scala: -------------------------------------------------------------------------------- 1 | import java.nio.file.* 2 | 3 | import sbt.* 4 | import sbt.Keys.* 5 | 6 | import scala.collection.JavaConverters.* 7 | import scala.sys.process.* 8 | 9 | import Utils.* 10 | 11 | object NativeBuildSettings { 12 | 13 | lazy val generateNativeLibrary = taskKey[Unit]( 14 | "Generates native library using Cargo which can be added as managed resource to classpath." 15 | ) 16 | 17 | lazy val managedNativeLibraries = taskKey[Seq[Path]]( 18 | "Maps locally built, platform-dependant libraries to their locations on the classpath." 19 | ) 20 | 21 | lazy val settings: Seq[Setting[_]] = Seq( 22 | generateNativeLibrary := Def 23 | .taskDyn[Unit] { 24 | Def.task { 25 | val logger: Logger = sLog.value 26 | 27 | sys.env.get("SKIP_NATIVE_GENERATION") match { 28 | case None => 29 | val processLogger = getProcessLogger(sLog.value, infoOnly = true) 30 | 31 | val targetTriple = sys.env.getOrElse( 32 | "TARGET_TRIPLE", { 33 | logger.warn( 34 | "Environment variable TARGET_TRIPLE was not set, getting value from `rustc`." 35 | ) 36 | 37 | s"rustc -vV".!!.split("\n") 38 | .map(_.trim) 39 | .find(_.startsWith("host")) 40 | .map(_.split(" ")(1).trim) 41 | .getOrElse(throw new IllegalStateException("No target triple found.")) 42 | } 43 | ) 44 | 45 | val arch = targetTriple.toLowerCase(java.util.Locale.ROOT).split("-").head 46 | 47 | val nativeOutputDir = resourceManaged.value.toPath.resolve(s"native/$arch/") 48 | val cargoTomlPath = s"${baseDirectory.value.getParent}/native/Cargo.toml" 49 | 50 | // Build native project using cargo 51 | val cmd = 52 | s"""cargo build 53 | |-Z unstable-options 54 | |--release 55 | |--lib 56 | |--target $targetTriple 57 | |--artifact-dir $nativeOutputDir""".stripMargin.replaceAll("\n", " ") 58 | 59 | executeProcess(cmd = cmd, cwd = Some(nativeRoot.value), sLog.value, infoOnly = true) 60 | logger.success(s"Successfully built native library at location '$nativeOutputDir'") 61 | 62 | sys.env.get("NATIVE_LIB_LOCATION") match { 63 | case Some(path) => 64 | val dest = Paths.get(path, arch).toAbsolutePath 65 | logger.info( 66 | "Environment variable NATIVE_LIB_LOCATION is set, " + 67 | s"copying built native library from location '$nativeOutputDir' to '$dest'." 68 | ) 69 | 70 | IO.copyDirectory(nativeOutputDir.toFile, dest.toFile) 71 | 72 | case None => 73 | } 74 | 75 | case Some(_) => 76 | logger.info( 77 | "Environment variable SKIP_NATIVE_GENERATION is set, skipping cargo build." 
78 | ) 79 | } 80 | } 81 | } 82 | .value, 83 | managedNativeLibraries := Def 84 | .taskDyn[Seq[Path]] { 85 | Def.task { 86 | val managedLibs = sys.env.get("SKIP_NATIVE_GENERATION") match { 87 | case None => 88 | Files 89 | .find( 90 | resourceManaged.value.toPath.resolve("native/"), 91 | Int.MaxValue, 92 | (filePath, _) => filePath.toFile.isFile 93 | ) 94 | .iterator() 95 | .asScala 96 | .toSeq 97 | 98 | case Some(_) => Seq.empty[Path] 99 | } 100 | 101 | val externalNativeLibs = sys.env.get("NATIVE_LIB_LOCATION") match { 102 | case Some(path) => 103 | Files 104 | .find( 105 | Paths.get(path), 106 | Int.MaxValue, 107 | (filePath, _) => filePath.toFile.isFile 108 | ) 109 | .iterator() 110 | .asScala 111 | .toSeq 112 | 113 | case None => Seq.empty[Path] 114 | } 115 | 116 | // Collect paths of built resources to later include in classpath 117 | (managedLibs ++ externalNativeLibs).distinct.map(_.toAbsolutePath) 118 | } 119 | } 120 | .dependsOn(generateNativeLibrary) 121 | .value, 122 | resourceGenerators += Def.task { 123 | // Add all generated resources to manage resources' classpath 124 | managedNativeLibraries.value 125 | .map { path => 126 | val pathStr = path.toString 127 | val arch = path.getParent.getFileName.toString 128 | 129 | val libraryFile = path.toFile 130 | 131 | // native library as a managed resource file 132 | val resource = resourceManaged.value / "native" / arch / libraryFile.getName 133 | 134 | // copy native library to a managed resource, so that it is always available 135 | // on the classpath, even when not packaged as a jar 136 | IO.copyDirectory(libraryFile, resource) 137 | 138 | sLog.value.success( 139 | s"Added resource from location '$pathStr' " + 140 | s"(size: ${libraryFile.length() / (1024 * 1024)} MBs) to classpath." 141 | ) 142 | 143 | resource 144 | } 145 | }.taskValue 146 | ) 147 | 148 | } 149 | -------------------------------------------------------------------------------- /project/ProjectDependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt.* 2 | import sbt.Keys.* 3 | 4 | import Utils.* 5 | import Versions.* 6 | 7 | object ProjectDependencies { 8 | 9 | lazy val dependencies: Seq[Setting[_]] = Seq( 10 | libraryDependencies ++= 11 | Seq( 12 | "org.scala-lang.modules" %% "scala-collection-compat" % scalaCollectionCompat, 13 | "com.fasterxml.jackson.core" % "jackson-databind" % jacksonVersion, 14 | "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonVersion, 15 | "com.fasterxml.jackson.datatype" % "jackson-datatype-jsr310" % jacksonVersion 16 | ) ++ 17 | (if (!priorTo213(scalaVersion.value)) 18 | Seq( 19 | "org.scala-lang.modules" %% "scala-parallel-collections" % scalaParallelCollections 20 | ) 21 | else Nil) ++ 22 | ( 23 | scalaVersion.value match { 24 | // Only include scala-reflect for Scala 2 25 | case v if v.startsWith("2.") => Seq("org.scala-lang" % "scala-reflect" % v) 26 | // No scala-reflect for Scala 3 27 | case _ => Seq.empty 28 | } 29 | ) 30 | ) 31 | 32 | } 33 | 34 | object Versions { 35 | val scalaCollectionCompat = "2.13.0" 36 | val scalaParallelCollections = "1.1.0" 37 | val jacksonVersion = "2.18.4" 38 | } 39 | -------------------------------------------------------------------------------- /project/PublishingSettings.scala: -------------------------------------------------------------------------------- 1 | import sbt.* 2 | import sbt.Keys.* 3 | 4 | object PublishingSettings { 5 | 6 | lazy val settings: Seq[Setting[_]] = Seq( 7 | publish / skip := false, 8 | 
publishArtifact := true, 9 | publishMavenStyle := true, 10 | externalResolvers += "GitHub Package Registry" at "https://maven.pkg.github.com/chitralverma/scala-polars", 11 | publishTo := Some( 12 | "GitHub Package Registry" at "https://maven.pkg.github.com/chitralverma/scala-polars" 13 | ), 14 | credentials += Credentials( 15 | realm = "GitHub Package Registry", 16 | host = "maven.pkg.github.com", 17 | userName = "chitralverma", 18 | passwd = sys.env.getOrElse("GITHUB_TOKEN", "") 19 | ) 20 | ) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /project/Utils.scala: -------------------------------------------------------------------------------- 1 | import sbt.* 2 | 3 | import scala.sys.process.* 4 | 5 | object Utils { 6 | 7 | lazy val nativeRoot = taskKey[File]("Directory pointing to the native project root.") 8 | 9 | def executeProcesses( 10 | cmds: Seq[String], 11 | cwd: Option[File] = None, 12 | logger: Logger, 13 | infoOnly: Boolean = false, 14 | extraEnv: Seq[(String, String)] = Nil 15 | ): Unit = cmds.foreach(cmd => executeProcess(cmd, cwd, logger, infoOnly, extraEnv)) 16 | 17 | def executeProcess( 18 | cmd: String, 19 | cwd: Option[File] = None, 20 | logger: Logger, 21 | infoOnly: Boolean = false, 22 | extraEnv: Seq[(String, String)] = Nil 23 | ): Unit = { 24 | val exitCode = 25 | Process(cmd, cwd, extraEnv: _*).run(getProcessLogger(logger, infoOnly)).exitValue() 26 | 27 | if (exitCode != 0) { 28 | logger.error(s"Failed to execute command `$cmd` with exit code $exitCode.") 29 | System.exit(exitCode) 30 | } else { 31 | logger.success(s"Successfully executed command `$cmd`.") 32 | } 33 | } 34 | 35 | def priorTo213(scalaVersion: String): Boolean = 36 | CrossVersion.partialVersion(scalaVersion) match { 37 | case Some((2, minor)) if minor < 13 => true 38 | case _ => false 39 | } 40 | 41 | def getProcessLogger(logger: Logger, infoOnly: Boolean = false): ProcessLogger = 42 | ProcessLogger( 43 | (o: String) => logger.info(o), 44 | (e: String) => if (infoOnly) logger.info(e) else logger.error(e) 45 | ) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.10.7 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.github.sbt" % "sbt-jni" % "1.7.0") 2 | 3 | addSbtPlugin("com.github.sbt" % "sbt-java-formatter" % "0.10.0") 4 | 5 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.4") 6 | 7 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.3.1") 8 | 9 | addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.6.4") 10 | 11 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.3.0") 12 | 13 | addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.29.1") 14 | 15 | addSbtPlugin("com.github.sbt" % "sbt-dynver" % "5.1.0") 16 | 17 | addSbtPlugin("com.github.sbt" % "sbt-ghpages" % "0.8.0") 18 | 19 | addSbtPlugin("com.github.sbt" % "sbt-unidoc" % "0.5.0") 20 | 21 | addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings" % "3.0.2") 22 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | ThisBuild / version := "0.1.0-SNAPSHOT" 2 | 
--------------------------------------------------------------------------------