├── .github └── workflows │ ├── auto-merge.yml │ ├── blossom-ci.yml │ ├── gcs-benchmark.yml │ ├── license-header-check.yml │ └── signoff-check.yml ├── .gitignore ├── .pylintrc ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── NOTICE-binary ├── README.md ├── SECURITY.md ├── ci ├── Dockerfile ├── Jenkinsfile.premerge ├── deploy.sh ├── docs.sh ├── lint_python.py └── test.sh ├── deprecated ├── README.md ├── native │ ├── CMakeLists.txt │ └── src │ │ ├── CMakeLists.txt │ │ ├── rapidsml_jni.cpp │ │ ├── rapidsml_jni.cu │ │ └── rapidsml_jni.hpp ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── nvidia │ │ │ └── spark │ │ │ └── ml │ │ │ └── linalg │ │ │ └── JniRAPIDSML.java │ └── scala │ │ ├── com │ │ └── nvidia │ │ │ └── spark │ │ │ └── ml │ │ │ └── feature │ │ │ └── PCA.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ ├── feature │ │ └── RapidsPCA.scala │ │ └── linalg │ │ ├── RAPIDSML.scala │ │ └── distributed │ │ └── RapidsRowMatrix.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ ├── com │ └── nvidia │ │ └── spark │ │ └── ml │ │ └── feature │ │ └── PCASuite.scala │ └── org │ └── apache │ └── spark │ └── ml │ └── util │ └── RapidsMLTest.scala ├── docker ├── Dockerfile ├── Dockerfile.pip ├── Dockerfile.python └── README.md ├── docs ├── Makefile ├── requirements.txt ├── site │ ├── 404.html │ ├── FAQ.md │ ├── _config.yml │ ├── api │ │ └── index.md │ ├── compatibility.md │ ├── configuration.md │ ├── contact.md │ ├── get-started │ │ ├── databricks.md │ │ ├── dataproc.md │ │ ├── emr.md │ │ ├── index.md │ │ ├── local.md │ │ └── spark_connect.md │ ├── index.md │ ├── performance.md │ └── security.md └── source │ ├── _templates │ └── autosummary │ │ ├── class.rst │ │ └── class_with_docs.rst │ ├── conf.py │ ├── index.rst │ └── spark_rapids_ml.rst ├── jvm ├── .gitignore ├── README.md ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ ├── org.apache.spark.ml.Estimator │ │ │ └── org.apache.spark.ml.Transformer │ └── scala │ │ ├── com │ │ └── nvidia │ │ │ └── rapids │ │ │ └── ml │ │ │ ├── Plugin.scala │ │ │ ├── RapidsKMeans.scala │ │ │ ├── RapidsLinearRegression.scala │ │ │ ├── RapidsLogisticRegression.scala │ │ │ ├── RapidsPCA.scala │ │ │ ├── RapidsRandomForestClassifier.scala │ │ │ ├── RapidsRandomForestRegressor.scala │ │ │ └── RapidsTraits.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ ├── clustering │ │ └── rapids │ │ │ └── RapidsKMeansModel.scala │ │ └── rapids │ │ ├── ModelHelper.scala │ │ ├── PythonEstimatorRunner.scala │ │ ├── PythonModelRunner.scala │ │ ├── RapidsLinearRegressionModel.scala │ │ ├── RapidsLogisticRegressionModel.scala │ │ ├── RapidsModel.scala │ │ ├── RapidsPCAModel.scala │ │ ├── RapidsRandomForestClassificationModel.scala │ │ ├── RapidsRandomForestRegressionModel.scala │ │ └── Utils.scala │ └── test │ └── scala │ └── com │ └── nvidia │ └── rapids │ └── ml │ └── SparkRapidsMLSuite.scala ├── notebooks ├── README.md ├── approx-nearest-neighbors.ipynb ├── aws-emr │ ├── README.md │ ├── init-bootstrap-action.sh │ └── init-configurations.json ├── cv-rf-regressor.ipynb ├── databricks │ ├── README.md │ └── init-pip-cuda-11.8.sh ├── dataproc │ ├── README.md │ └── spark_rapids_ml.sh ├── dbscan.ipynb ├── kmeans-no-import-change.ipynb ├── kmeans.ipynb ├── knn.ipynb ├── linear-regression.ipynb ├── logistic-regression.ipynb ├── pca.ipynb ├── random-forest-classification.ipynb ├── random-forest-regression.ipynb ├── spark-compat.ipynb └── umap.ipynb ├── python ├── README.md ├── benchmark │ ├── README.md │ 
├── aws-emr │ │ ├── README.md │ │ ├── cpu-init-configurations.json │ │ ├── run_benchmark.sh │ │ ├── setup.sh │ │ └── start_cluster.sh │ ├── benchmark │ │ ├── __init__.py │ │ ├── base.py │ │ ├── bench_approximate_nearest_neighbors.py │ │ ├── bench_dbscan.py │ │ ├── bench_kmeans.py │ │ ├── bench_linear_regression.py │ │ ├── bench_logistic_regression.py │ │ ├── bench_nearest_neighbors.py │ │ ├── bench_pca.py │ │ ├── bench_random_forest.py │ │ ├── bench_umap.py │ │ ├── utils.py │ │ └── utils_knn.py │ ├── benchmark_runner.py │ ├── conftest.py │ ├── databricks │ │ ├── README.md │ │ ├── benchmark_utils.sh │ │ ├── cpu_cluster_spec.sh │ │ ├── gpu_cluster_spec.sh │ │ ├── gpu_etl_cluster_spec.sh │ │ ├── init-cpu.sh │ │ ├── init-pip-cuda-11.8.sh │ │ ├── process_bm_log.sh │ │ ├── results │ │ │ └── running_times.png │ │ ├── run_benchmark.sh │ │ └── setup.sh │ ├── dataproc │ │ ├── README.md │ │ ├── init_benchmark.sh │ │ ├── run_benchmark.sh │ │ ├── setup.sh │ │ └── start_cluster.sh │ ├── gen_data.py │ ├── gen_data_distributed.py │ └── test_gen_data.py ├── pyproject.toml ├── requirements.txt ├── requirements_dev.txt ├── run_benchmark.sh ├── run_plugin_test.sh ├── run_test.sh ├── setup.cfg ├── src │ └── spark_rapids_ml │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── classification.py │ │ ├── clustering.py │ │ ├── common │ │ ├── __init__.py │ │ └── cuml_context.py │ │ ├── connect_plugin.py │ │ ├── core.py │ │ ├── feature.py │ │ ├── install.py │ │ ├── knn.py │ │ ├── metrics │ │ ├── MulticlassMetrics.py │ │ ├── RegressionMetrics.py │ │ └── __init__.py │ │ ├── params.py │ │ ├── pipeline.py │ │ ├── pyspark_rapids.py │ │ ├── regression.py │ │ ├── spark_rapids_submit.py │ │ ├── tree.py │ │ ├── tuning.py │ │ ├── umap.py │ │ └── utils.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── discover_gpu.sh │ ├── sparksession.py │ ├── test_approximate_nearest_neighbors.py │ ├── test_benchmark.py │ ├── test_common_estimator.py │ ├── test_dbscan.py │ ├── test_kmeans.py │ ├── test_linear_model.py │ ├── test_logistic_regression.py │ ├── test_metrics.py │ ├── test_nearest_neighbors.py │ ├── test_pca.py │ ├── test_pipeline.py │ ├── test_random_forest.py │ ├── test_tuning.py │ ├── test_ucx.py │ ├── test_umap.py │ ├── test_utils.py │ └── utils.py ├── tests_large │ ├── __init__.py │ ├── conftest.py │ └── test_large_logistic_regression.py └── tests_no_import_change │ └── test_no_import_change.py └── thirdparty └── LICENSES ├── LICENSE.cuml ├── LICENSE.scikit_learn ├── LICENSE.spark └── LICENSE.xgboost /.github/workflows/auto-merge.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # A workflow to keep BASE branch up-to-date from HEAD branch 16 | name: auto-merge HEAD to BASE 17 | 18 | on: 19 | pull_request_target: 20 | branches: 21 | - branch-* 22 | types: [closed] 23 | 24 | jobs: 25 | auto-merge: 26 | if: github.event.pull_request.merged == true 27 | uses: NVIDIA/spark-rapids-common/.github/workflows/auto-merge.yml@main 28 | with: 29 | owner: ${{ github.repository_owner }} 30 | repo: spark-rapids-ml 31 | branch: ${{ github.event.pull_request.base.ref }} 32 | secrets: 33 | token: ${{ secrets.AUTOMERGE_TOKEN }} 34 | -------------------------------------------------------------------------------- /.github/workflows/gcs-benchmark.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023-2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # A workflow to trigger gcs tests 16 | name: GCS Benchmark Test 17 | 18 | on: 19 | workflow_dispatch: 20 | inputs: 21 | computer_region: 22 | required: true 23 | default: 'us-central1' 24 | description: 'COMPUTER REGION' 25 | schedule: 26 | - cron: "0 13 * * 1" 27 | 28 | jobs: 29 | Benchmark: 30 | if: github.repository == 'NVIDIA/spark-rapids-ml' 31 | runs-on: ubuntu-latest 32 | env: 33 | PROJECT: rapids-spark 34 | DATAPROC_REGION: us-central1 35 | COMPUTE_REGION: ${{ inputs.computer_region || 'us-central1' }} 36 | COMPUTE_ZONE: us-central1-a 37 | GCS_BUCKET: spark-rapids-ml-benchmarking 38 | KEY_FILE_CONTENT: ${{ secrets.GCLOUD_PRIVATE_KEY }} 39 | SERVICE_ACCOUNT: ${{ secrets.GCLOUD_SERVICE_ACCOUNT }} 40 | CLUSTER_NAME: github-spark-rapids-ml-${{github.run_number}} 41 | steps: 42 | - uses: actions/checkout@v4 43 | 44 | - name: run benchmark 45 | shell: bash 46 | run: | 47 | set -x 48 | cat <<< $KEY_FILE_CONTENT > key.json 49 | gcloud auth activate-service-account $SERVICE_ACCOUNT --key-file key.json 50 | gcloud config set project $PROJECT 51 | gcloud config set dataproc/region $DATAPROC_REGION 52 | gcloud config set compute/region $COMPUTE_REGION 53 | gcloud config set compute/zone $COMPUTE_ZONE 54 | export BENCHMARK_HOME=$GCS_BUCKET/benchmark 55 | cd python/benchmark/dataproc 56 | ./setup.sh 57 | ./run_benchmark.sh 58 | 59 | - name: delete cluster 60 | if: ${{ always() }} 61 | shell: bash 62 | continue-on-error: true 63 | run: | 64 | set -x 65 | cat <<< $KEY_FILE_CONTENT > key.json 66 | gcloud auth activate-service-account $SERVICE_ACCOUNT --key-file key.json 67 | gcloud config set project $PROJECT 68 | echo y | gcloud dataproc clusters delete $CLUSTER_NAME --region $COMPUTE_REGION 69 | -------------------------------------------------------------------------------- /.github/workflows/license-header-check.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # A workflow to check copyright/license header 16 | name: license header check 17 | 18 | on: 19 | pull_request: 20 | types: [opened, synchronize, reopened] 21 | 22 | jobs: 23 | license-header-check: 24 | runs-on: ubuntu-latest 25 | if: "!contains(github.event.pull_request.title, '[bot]')" 26 | steps: 27 | - name: Get checkout depth 28 | run: | 29 | echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV 30 | 31 | - name: Checkout code 32 | uses: actions/checkout@v4 33 | with: 34 | fetch-depth: ${{ env.PR_FETCH_DEPTH }} 35 | 36 | - name: license-header-check 37 | uses: NVIDIA/spark-rapids-common/license-header-check@main 38 | with: 39 | included_file_patterns: | 40 | *.sh, 41 | *.py, 42 | *.toml, 43 | *.cfg, 44 | *Dockerfile*, 45 | *Jenkinsfile*, 46 | *.yml, 47 | *.txt, 48 | *.xml 49 | excluded_file_patterns: | 50 | thirdparty/* 51 | -------------------------------------------------------------------------------- /.github/workflows/signoff-check.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # A workflow to check if PR got sign-off 16 | name: signoff check 17 | 18 | on: 19 | pull_request_target: 20 | types: [opened, synchronize, reopened] 21 | 22 | jobs: 23 | signoff-check: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: signoff 27 | uses: NVIDIA/spark-rapids-common/signoff-check@main 28 | with: 29 | owner: ${{ github.repository_owner }} 30 | repo: spark-rapids-ml 31 | pull_number: ${{ github.event.number }} 32 | token: ${{ secrets.GITHUB_TOKEN }} 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.iml 4 | *.ipr 5 | *.iws 6 | *.pyc 7 | *.pyo 8 | *.swp 9 | *~ 10 | .DS_Store 11 | .cache 12 | .classpath 13 | .ensime 14 | .ensime_cache/ 15 | .ensime_lucene 16 | .generated-mima* 17 | .idea/ 18 | .idea_modules/ 19 | .project 20 | .pydevproject 21 | .scala_dependencies 22 | .settings 23 | hs_err*.log 24 | dependency-reduced-pom.xml 25 | scalastyle-on-compile.generated.xml 26 | scalastyle-output.xml 27 | scalastyle.txt 28 | target/ 29 | */metastore_db 30 | */spark-warehouse 31 | */.vscode 32 | */.clang-format 33 | __pycache__/ 34 | dist/ 35 | docs/build/ 36 | */.ipynb_checkpoints/ 37 | *.egg-info/ 38 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | ignore=tests 4 | 5 | extension-pkg-whitelist=numpy 6 | 7 | disable=unexpected-special-method-signature,too-many-nested-blocks,useless-object-inheritance,import-outside-toplevel,unsubscriptable-object,attribute-defined-outside-init 8 | 9 | dummy-variables-rgx=(unused|)_.* 10 | 11 | reports=no 12 | 13 | [BASIC] 14 | 15 | # Enforce naming convention 16 | const-naming-style=UPPER_CASE 17 | class-naming-style=PascalCase 18 | function-naming-style=snake_case 19 | method-naming-style=snake_case 20 | attr-naming-style=snake_case 21 | argument-naming-style=snake_case 22 | variable-naming-style=snake_case 23 | class-attribute-naming-style=snake_case 24 | 25 | # Allow single-letter variables 26 | variable-rgx=[a-zA-Z_][a-z0-9_]{0,30}$ 27 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | RAPIDS ML for Apache Spark 2 | Copyright (c) 2021, NVIDIA CORPORATION 3 | 4 | // ------------------------------------------------------------------ 5 | // NOTICE file corresponding to the section 4d of The Apache License, 6 | // Version 2.0, in this case for 7 | // ------------------------------------------------------------------ 8 | 9 | Apache Spark 10 | Copyright 2014 and onwards The Apache Software Foundation 11 | 12 | This product includes software developed at 13 | The Apache Software Foundation (http://www.apache.org/). 
14 | 15 | --------------------------------------------------------------------- 16 | 17 | raft - RAPIDS Analytics Frameworks Toolset 18 | Copyright 2020 NVIDIA Corporation 19 | 20 | --------------------------------------------------------------------- 21 | 22 | cuML - RAPIDS Machine Learning Library 23 | Copyright 2018 NVIDIA CORPORATION -------------------------------------------------------------------------------- /NOTICE-binary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/spark-rapids-ml/7267705b4f226f0b579844116f4cb72249e64a27/NOTICE-binary -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Rapids ML 2 | 3 | Spark Rapids ML enables GPU accelerated distributed machine learning on [Apache Spark](https://spark.apache.org/). It provides several PySpark ML compatible algorithms powered by the [RAPIDS cuML](https://docs.rapids.ai/api/cuml/stable/) library. 4 | 5 | These APIs seek to minimize any code changes to end-user Spark code. After your environment is configured to support GPUs (with drivers, CUDA toolkit, and RAPIDS dependencies), you should be able to just change an import statement or class name to take advantage of GPU acceleration. See [here](./python/README.md#clis-enabling-no-package-import-change) for experimental CLIs that enable GPU acceleration without the need for changing the `pyspark.ml` package names in an existing PySpark ML application. 6 | 7 | [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/NVIDIA/spark-rapids-ml) 8 | 9 | **Python** 10 | ```python 11 | # from pyspark.ml.feature import PCA 12 | from spark_rapids_ml.feature import PCA 13 | 14 | pca = ( 15 | PCA() 16 | .setK(3) 17 | .setInputCol("features") 18 | .setOutputCol("pca_features") 19 | ) 20 | pca.fit(df) 21 | ``` 22 | 23 | ## Supported Algorithms 24 | 25 | The following table shows the currently supported algorithms. The goal is to expand this over time with support from the underlying RAPIDS cuML libraries. If you would like support for a specific algorithm, please file a [GitHub issue](https://github.com/NVIDIA/spark-rapids-ml/issues) to help us prioritize. 26 | 27 | | Supported Algorithms | Python | 28 | | :--------------------- | :----: | 29 | | CrossValidator | √ | 30 | | DBSCAN (*) | √ | 31 | | KMeans | √ | 32 | | approx/exact k-NN (*) | √ | 33 | | LinearRegression | √ | 34 | | LogisticRegression | √ | 35 | | PCA | √ | 36 | | RandomForestClassifier | √ | 37 | | RandomForestRegressor | √ | 38 | | UMAP (*) | √ | 39 | 40 | (*) Notes: 41 | - As an alternative to KMeans, we also provide a Spark API for GPU accelerated Density-Based Spatial Clustering of Applications with Noise (DBSCAN), a density-based clustering algorithm in the RAPIDS cuML library. 42 | - Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. 43 | - As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. 44 | 45 | ## Getting started 46 | 47 | For PySpark (Python) users, see [this guide](python/README.md).
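The same import-swap pattern shown above carries over to the other algorithms in the table. As a further illustration, here is a minimal sketch using KMeans; it assumes an existing SparkSession, a DataFrame `df` with a `features` vector column, and a GPU-enabled environment configured as described in the guide above:

```python
# from pyspark.ml.clustering import KMeans
from spark_rapids_ml.clustering import KMeans

kmeans = (
    KMeans()
    .setK(4)
    .setFeaturesCol("features")
    .setPredictionCol("cluster")
)
model = kmeans.fit(df)           # fit runs on GPU via cuML
clustered = model.transform(df)  # appends the "cluster" prediction column
```

As with the PCA example, only the import line differs from baseline PySpark ML code.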
48 | 49 | ## Performance 50 | 51 | GPU acceleration can provide significant performance and cost benefits. Benchmarking instructions and results can be found [here](python/benchmark/README.md). 52 | 53 | ## Contributing 54 | 55 | We welcome community contributions! Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) to get started. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all 4 | source code repositories managed through our organization. 5 | 6 | If you need to report a security issue, please use the appropriate contact points outlined 7 | below. **Please do not report security vulnerabilities through GitHub/GitLab.** 8 | 9 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 10 | 11 | To report a potential security vulnerability in any NVIDIA product: 12 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 13 | - E-Mail: psirt@nvidia.com 14 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 15 | - Please include the following information: 16 | - Product/Driver name and version/branch that contains the vulnerability 17 | -------------------------------------------------------------------------------- /ci/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | ARG CUDA_VERSION=11.8.0 18 | FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 19 | 20 | # ubuntu22 21 | RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ 22 | -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \ 23 | /etc/apt/sources.list 24 | # ubuntu24+ 25 | RUN find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \ 26 | -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \ 27 | -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} + 28 | # Install packages to build spark-rapids-ml 29 | RUN chmod 1777 /tmp 30 | RUN apt update -y \ 31 | && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-17-jdk \ 32 | && apt install -y git numactl software-properties-common wget zip maven \ 33 | && rm -rf /var/lib/apt/lists 34 | 35 | # Config JAVA_HOME 36 | ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-amd64 37 | 38 | # Install conda 39 | ENV PATH="/root/miniconda3/bin:${PATH}" 40 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 41 | && mkdir /root/.conda \ 42 | && bash Miniconda3-latest-Linux-x86_64.sh -b \ 43 | && rm -f Miniconda3-latest-Linux-x86_64.sh \ 44 | && conda init && conda update -n base conda \ 45 | && conda install -n base conda-libmamba-solver \ 46 | && conda config --set solver libmamba 47 | 48 | # install cuML 49 | ARG CUML_VER=25.06 50 | RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=11.8 numpy~=1.0 \ 51 | && conda clean --all -f -y 52 | -------------------------------------------------------------------------------- /ci/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | set -ex 19 | 20 | # build plugin jar 21 | pushd jvm 22 | mvn clean package -DskipTests 23 | popd 24 | 25 | # copy plugin jar to python package 26 | JARS_DIR=python/src/spark_rapids_ml/jars 27 | mkdir -p $JARS_DIR 28 | rm -f $JARS_DIR/*.jar 29 | cp jvm/target/*.jar $JARS_DIR 30 | 31 | # build whl package 32 | pushd python 33 | pip install -r requirements_dev.txt && pip install -e . 34 | python -m build 35 | popd 36 | -------------------------------------------------------------------------------- /ci/docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | if [[ $1 == "nightly" ]]; then 19 | TAG=$(git log -1 --format="%h") 20 | BRANCH=$(git branch --show-current) 21 | else 22 | # get version tag 23 | TAG="v$VERSION" 24 | fi 25 | 26 | set -ex 27 | 28 | # build and publish docs 29 | pushd docs 30 | make clean 31 | make html 32 | git worktree add --track -b gh-pages _site origin/gh-pages 33 | 34 | pushd _site 35 | if [[ $1 == "nightly" ]]; then 36 | # draft copy 37 | api_dest=api/python-draft 38 | else 39 | # release copy 40 | api_dest=api/python 41 | # also copy site wide changes for release 42 | cp -r ../site/* . 43 | fi 44 | 45 | # in _site 46 | mkdir -p $api_dest 47 | cp -r ../build/html/* $api_dest/ 48 | 49 | git add --all 50 | dff=$(git diff --staged --stat) 51 | repo_url=$(git config --get remote.origin.url) 52 | url=${repo_url#https://} 53 | github_account=${GITHUB_ACCOUNT:-nvauto} 54 | if [[ -n $dff ]]; then 55 | git commit -m "Update draft api docs to commit ${TAG} on ${BRANCH}" 56 | git push -f https://${github_account}:${GITHUB_TOKEN}@${url} gh-pages 57 | fi 58 | 59 | popd #_site 60 | git worktree remove _site --force 61 | popd 62 | -------------------------------------------------------------------------------- /ci/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | set -ex 19 | 20 | type="$1" 21 | case $type in 22 | "pre-merge" | "") 23 | ut_args="" 24 | ;; 25 | "nightly" | "release") 26 | ut_args="--runslow" 27 | ;; 28 | *) 29 | echo "Unknown test type: $type"; exit 1;; 30 | esac 31 | bench_args="" 32 | 33 | # environment 34 | nvidia-smi 35 | which python 36 | 37 | # spark-rapids-ml and dependencies 38 | cd python 39 | pip install -r requirements_dev.txt && pip install -e . 40 | 41 | # unit tests 42 | ./run_test.sh $ut_args 43 | 44 | # benchmark 45 | ./run_benchmark.sh $bench_args 46 | 47 | # plugin tests 48 | ./run_plugin_test.sh 49 | 50 | # check compatibility with Spark 3.3 in nightly run 51 | # also push draft release docs to gh-pages 52 | if [[ $type == "nightly" ]]; then 53 | pip uninstall pyspark -y 54 | pip install pyspark~=3.3.0 55 | ./run_test.sh 56 | ./run_benchmark.sh $bench_args 57 | # if everything passed till now update draft release docs in gh-pages 58 | # need to invoke docs.sh from top level of repo 59 | cd .. 
# top level of repo 60 | ci/docs.sh nightly 61 | fi 62 | -------------------------------------------------------------------------------- /deprecated/native/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #============================================================================= 2 | # Copyright (c) 2021-2023, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #============================================================================= 16 | cmake_minimum_required(VERSION 3.20) 17 | 18 | file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.04/RAPIDS.cmake 19 | ${CMAKE_BINARY_DIR}/RAPIDS.cmake) 20 | include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) 21 | 22 | include(rapids-cuda) 23 | 24 | rapids_cuda_init_architectures(SPARK_RAPIDS_ML) 25 | project(SPARK_RAPIDS_ML LANGUAGES CXX CUDA C) 26 | 27 | # Build options. 28 | option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON) 29 | 30 | # Set C++ standard. 31 | set(CMAKE_CXX_STANDARD 17) 32 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 33 | set(CMAKE_CXX_EXTENSIONS OFF) 34 | 35 | # Set CUDA C++ standard. 36 | set(CMAKE_CUDA_STANDARD 17) 37 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 38 | set(CMAKE_CUDA_EXTENSIONS OFF) 39 | 40 | find_package(JNI REQUIRED) 41 | find_package(CUDAToolkit REQUIRED) 42 | 43 | # Add the project. 44 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 45 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 46 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 47 | add_subdirectory(src) 48 | -------------------------------------------------------------------------------- /deprecated/native/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #============================================================================= 2 | # Copyright (c) 2021, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #============================================================================= 16 | 17 | 18 | # Install cuDF nightly via Conda, only for local development, will remove in CI. 
19 | find_package(cudf) 20 | 21 | set (CMAKE_CUDA_FLAGS "--extended-lambda") 22 | 23 | if(DEFINED ENV{RAFT_PATH}) 24 | message(STATUS "RAFT_PATH environment variable detected.") 25 | message(STATUS "RAFT_DIR set to $ENV{RAFT_PATH}") 26 | set(RAFT_DIR $ENV{RAFT_PATH}) 27 | 28 | else(DEFINED ENV{RAFT_PATH}) 29 | message(STATUS "RAFT_PATH environment variable NOT detected, cloning RAFT") 30 | set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") 31 | 32 | ExternalProject_Add(raft 33 | GIT_REPOSITORY git@github.com:rapidsai/raft.git 34 | GIT_TAG pinned_commit/git_tag/branch 35 | PREFIX ${RAFT_GIT_DIR} 36 | CONFIGURE_COMMAND "" 37 | BUILD_COMMAND "" 38 | INSTALL_COMMAND "") 39 | 40 | set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") 41 | endif(DEFINED ENV{RAFT_PATH}) 42 | 43 | 44 | 45 | ################################################################################################# 46 | # - CPM ----------------------------------------------------------------------------------------- 47 | 48 | set(CPM_DOWNLOAD_VERSION 0.27.2) 49 | set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 50 | 51 | if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) 52 | message(STATUS "Downloading CPM.cmake") 53 | file(DOWNLOAD https://github.com/TheLartians/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake ${CPM_DOWNLOAD_LOCATION}) 54 | endif() 55 | 56 | include(${CPM_DOWNLOAD_LOCATION}) 57 | ################################################################################################# 58 | 59 | # pull cuDF sources, to use jni_utils.hpp 60 | # cmake options should be added here for CI build. 61 | CPMAddPackage(NAME cudf 62 | VERSION "22.04.00" 63 | GIT_REPOSITORY https://github.com/rapidsai/cudf.git 64 | GIT_TAG branch-22.04 65 | ) 66 | 67 | add_library(rapidsml_jni SHARED rapidsml_jni.cpp 68 | rapidsml_jni.cu 69 | ) 70 | 71 | target_link_options(rapidsml_jni PRIVATE "-Wl,-as-needed") 72 | 73 | target_include_directories(rapidsml_jni PRIVATE 74 | "${JNI_INCLUDE_DIRS}" 75 | "${CUDAToolkit_INCLUDE_DIRS}" 76 | "$ENV{RAFT_PATH}/cpp/include" 77 | "${CMAKE_SOURCE_DIR}/src/utils" 78 | "${cudf_SOURCE_DIR}/java/src/main/native/include" 79 | ) 80 | 81 | target_link_libraries(rapidsml_jni PRIVATE 82 | libcudart_static.a 83 | libcusparse_static.a 84 | libcusolver_static.a 85 | libculibos.a 86 | liblapack_static.a 87 | CUDA::cublas 88 | cudf::cudf) 89 | 90 | if(PER_THREAD_DEFAULT_STREAM) 91 | target_compile_definitions(rapidsml_jni PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) 92 | endif() 93 | 94 | -------------------------------------------------------------------------------- /deprecated/native/src/rapidsml_jni.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "rapidsml_jni.hpp" 31 | #include "jni_utils.hpp" 32 | 33 | extern "C" { 34 | 35 | JNIEXPORT jlong JNICALL Java_com_nvidia_spark_ml_linalg_JniRAPIDSML_dgemmWithColumnViewPtr( 36 | JNIEnv *env, jclass, jint transa, jint transb, jint m, jint n, jint k, jdouble alpha, 37 | jdoubleArray A, jint lda, jlong B, jint ldb, jdouble beta, jint ldc, jint deviceID) { 38 | try { 39 | cudf::jni::native_jdoubleArray native_A(env, A); 40 | auto ret_column = dgemm(transa, transb, m, n, k, alpha, native_A.data(), native_A.size(), lda, 41 | B, ldb, beta, ldc, deviceID); 42 | return ret_column; 43 | } 44 | CATCH_STD(env, 0); 45 | } 46 | 47 | JNIEXPORT jlong JNICALL Java_com_nvidia_spark_ml_linalg_JniRAPIDSML_dgemmCov(JNIEnv *env, jclass, 48 | jint transa, jint transb, jint m, jint n, jint k, jdouble alpha, jlong A, jint lda, jlong B, 49 | jint ldb, jdouble beta, jdoubleArray C, jint ldc, jint deviceID) { 50 | try { 51 | cudf::jni::native_jdoubleArray native_C(env, C); 52 | dgemmCov(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, native_C.data(), ldc, deviceID); 53 | return 0; 54 | } 55 | CATCH_STD(env, 0); 56 | } 57 | 58 | } // extern "C" 59 | -------------------------------------------------------------------------------- /deprecated/native/src/rapidsml_jni.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cublas_v2.h> 20 | 21 | cublasOperation_t convertToCublasOpEnum(int int_type); 22 | 23 | void signFlip(double* input, int n_rows, int n_cols, double* components, 24 | int n_cols_comp, cudaStream_t stream); 25 | 26 | long dgemm(int transa, int transb, int m, int n, 27 | int k, double alpha, double* A, int size_A, int lda, long B, 28 | int ldb, double beta, int ldc, int deviceID); 29 | 30 | void dgemmCov(int transa, int transb, int m, int n, int k, double alpha, long A, int lda, long B, 31 | int ldb, double beta, double* C, int ldc, int deviceID); 32 | -------------------------------------------------------------------------------- /deprecated/src/main/java/com/nvidia/spark/ml/linalg/JniRAPIDSML.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.spark.ml.linalg; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import java.nio.file.StandardCopyOption; 24 | import java.nio.file.attribute.PosixFilePermissions; 25 | 26 | public final class JniRAPIDSML { 27 | private static final JniRAPIDSML instance = new JniRAPIDSML(); 28 | private static boolean loaded = false; 29 | 30 | public static boolean depsLoaded() { 31 | return loaded; 32 | } 33 | 34 | private JniRAPIDSML() { 35 | String osArch = System.getProperty("os.arch"); 36 | if (osArch == null || osArch.isEmpty()) { 37 | throw new RuntimeException("Unable to load native implementation"); 38 | } 39 | String osName = System.getProperty("os.name"); 40 | if (osName == null || osName.isEmpty()) { 41 | throw new RuntimeException("Unable to load native implementation"); 42 | } 43 | 44 | Path temp; 45 | try (InputStream resource = this.getClass().getClassLoader().getResourceAsStream( 46 | String.format("%s/%s/librapidsml_jni.so", osArch, osName))) { 47 | assert resource != null; 48 | Files.copy(resource, temp = Files.createTempFile("librapidsml_jni.so", "", 49 | PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-x---"))), 50 | StandardCopyOption.REPLACE_EXISTING); 51 | temp.toFile().deleteOnExit(); 52 | } catch (IOException e) { 53 | throw new RuntimeException("Unable to load native implementation", e); 54 | } 55 | 56 | System.load(temp.toString()); 57 | loaded = true; 58 | } 59 | 60 | public static JniRAPIDSML getInstance() { 61 | return instance; 62 | } 63 | 64 | public native long dgemmCov(int transa, int transb, int m, int n, int k, double alpha, long A, int lda, long B, 65 | int ldb, double beta, double[] C, int ldc, int deviceID); 66 | 67 | public native long accumulateCov(long a, long b); 68 | 69 | /** Wrapper of the JNI entry point for the cuBLAS gemm routine. Most parameters are the same as the original gemm's: https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemm. 70 | * Differences are: 71 | * 1. transa and transb are int values instead of enums. 72 | * 2. B is a long value that represents the `cudf::lists_column_view *` holding the matrix data on the device. 73 | * 3. An extra deviceID indicates which GPU device will perform this computation. 74 | */ 75 | public native long dgemmWithColumnViewPtr(int transa, int transb, int m, int n, int k, double alpha, double[] A, 76 | int lda, long B, int ldb, double beta, int ldc, int deviceID); 77 | public native void calSVD(int m, double[] A, double[] U, double[] S, int deviceID); 78 | } 79 | -------------------------------------------------------------------------------- /deprecated/src/main/scala/com/nvidia/spark/ml/feature/PCA.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.spark.ml.feature 18 | 19 | import org.apache.spark.ml.feature.RapidsPCA 20 | import org.apache.spark.ml.param.ParamMap 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} 22 | 23 | /** 24 | * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k` 25 | * principal components. 26 | */ 27 | class PCA(override val uid: String) extends RapidsPCA { 28 | 29 | def this() = this(Identifiable.randomUID("pca")) 30 | 31 | override def copy(extra: ParamMap): PCA = defaultCopy(extra) 32 | } 33 | 34 | object PCA extends DefaultParamsReadable[PCA] { 35 | 36 | override def load(path: String): PCA = super.load(path) 37 | } 38 | -------------------------------------------------------------------------------- /deprecated/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # Set everything to be logged to the file target/unit-tests.log 18 | log4j.rootCategory=INFO, file 19 | log4j.appender.file=org.apache.log4j.FileAppender 20 | log4j.appender.file.append=true 21 | log4j.appender.file.file=target/unit-tests.log 22 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 24 | 25 | # Ignore messages below warning level from Jetty, because it's a bit verbose 26 | log4j.logger.org.sparkproject.jetty=WARN 27 | 28 | -------------------------------------------------------------------------------- /deprecated/src/test/scala/org/apache/spark/ml/util/RapidsMLTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.apache.spark.ml.util 18 | 19 | import org.apache.spark.SparkConf 20 | import org.apache.spark.sql.DataFrame 21 | 22 | trait RapidsMLTest extends MLTest { 23 | override def sparkConf: SparkConf = { 24 | super.sparkConf.set("spark.rapids.sql.enabled", "true") 25 | } 26 | 27 | override def checkVectorSizeOnDF( 28 | dataframe: DataFrame, 29 | vecColName: String, 30 | vecSize: Int): Unit = { 31 | super.checkVectorSizeOnDF(dataframe, vecColName, vecSize) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /docker/Dockerfile.pip: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | ARG CUDA_VERSION=11.8.0 18 | FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 19 | 20 | ARG PYSPARK_VERSION=3.3.1 21 | ARG RAPIDS_VERSION=25.6.0 22 | ARG ARCH=amd64 23 | #ARG ARCH=arm64 24 | 25 | # ubuntu22 26 | RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ 27 | -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \ 28 | /etc/apt/sources.list 29 | # ubuntu24+ 30 | RUN find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \ 31 | -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \ 32 | -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} + 33 | # Install packages to build spark-rapids-ml 34 | RUN apt-get update -y \ 35 | && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-17-jdk \ 36 | && rm -rf /var/lib/apt/lists 37 | 38 | RUN apt-get update -y \ 39 | && apt install -y git numactl python3.10-venv python3-pip python-is-python3 software-properties-common wget zip \ 40 | && python -m pip install --upgrade pip \ 41 | && rm -rf /var/lib/apt/lists 42 | 43 | RUN apt-get update -y \ 44 | && apt install -y python3.10-dev cmake curl \ 45 | && rm -rf /var/lib/apt/lists 46 | 47 | # install RAPIDS 48 | # using ~= pulls in micro version patches 49 | RUN pip install --no-cache-dir \ 50 | cudf-cu11~=${RAPIDS_VERSION} \ 51 | cuml-cu11~=${RAPIDS_VERSION} \ 52 | cuvs-cu11~=${RAPIDS_VERSION} \ 53 | numpy~=1.0 \ 54 | --extra-index-url=https://pypi.nvidia.com 55 | 56 | # install python dependencies 57 | RUN pip install --no-cache-dir pyspark==${PYSPARK_VERSION} "scikit-learn>=1.2.1" \ 58 | && pip install --no-cache-dir "black>=23.1.0" "build>=0.10.0" "isort>=5.12.0" "mypy>=1.0.0" \ 59 | numpydoc pydata-sphinx-theme pylint pytest "sphinx<6.0" "twine>=4.0.0" 60 | 61 | # Config JAVA_HOME 62 | ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-$ARCH 63 | 64 | ### END OF CACHE ### 65 | 66 | #ARG RAPIDS_ML_VER=main 67 | #RUN git clone -b branch-$RAPIDS_ML_VER https://github.com/NVIDIA/spark-rapids-ml.git 68 | COPY . 
/spark-rapids-ml 69 | WORKDIR /spark-rapids-ml/python 70 | 71 | # install spark-rapids-ml with requirements_dev.txt (in case it has diverged from cache) 72 | RUN pip install --no-cache-dir -r requirements_dev.txt \ 73 | && pip install --no-cache-dir -e . 74 | 75 | SHELL ["/bin/bash", "-c"] 76 | -------------------------------------------------------------------------------- /docker/Dockerfile.python: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | ARG CUDA_VERSION=11.8.0 18 | FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 19 | 20 | ARG CUML_VERSION=25.06 21 | 22 | # ubuntu22 23 | RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ 24 | -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \ 25 | /etc/apt/sources.list 26 | # ubuntu24+ 27 | RUN find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \ 28 | -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \ 29 | -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} + 30 | # Install packages to build spark-rapids-ml 31 | RUN apt update -y \ 32 | && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-17-jdk \ 33 | && apt install -y git numactl software-properties-common wget zip \ 34 | && rm -rf /var/lib/apt/lists 35 | 36 | # Config JAVA_HOME 37 | ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-amd64 38 | 39 | # Install conda 40 | ENV PATH="/root/miniconda3/bin:${PATH}" 41 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh \ 42 | && mkdir /root/.conda \ 43 | && bash Miniconda3-py38_4.10.3-Linux-x86_64.sh -b \ 44 | && rm -f Miniconda3-py38_4.10.3-Linux-x86_64.sh \ 45 | && conda init 46 | 47 | # install cuML 48 | 49 | RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=11.8 cuml=$CUML_VERSION numpy~=1.0 \ 50 | && conda clean --all -f -y 51 | 52 | # install python dependencies 53 | RUN pip install --no-cache-dir "pyspark>=3.2.1" "scikit-learn>=1.2.1" \ 54 | && pip install --no-cache-dir "black>=23.1.0" "build>=0.10.0" "isort>=5.12.0" "mypy>=1.0.0" \ 55 | numpydoc pydata-sphinx-theme pylint pytest "sphinx<6.0" "twine>=4.0.0" 56 | 57 | ### END OF CACHE ### 58 | 59 | #ARG RAPIDS_ML_VER=main 60 | #RUN git clone -b branch-$RAPIDS_ML_VER https://github.com/NVIDIA/spark-rapids-ml.git 61 | COPY . /spark-rapids-ml 62 | WORKDIR /spark-rapids-ml/python 63 | 64 | # install spark-rapids-ml with requirements_dev.txt (in case it has diverged from cache) 65 | RUN pip install --no-cache-dir -r requirements_dev.txt \ 66 | && pip install --no-cache-dir -e . 
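# NOTE: the SHELL directive below wraps shell-form commands in "conda run", so anything
# executed through the image's default shell runs inside the conda base environment
# where cuML was installed above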
67 | 68 | SHELL ["conda", "run", "--no-capture-output", "-n", "base", "/bin/bash", "-c"] 69 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Build in Docker 2 | 3 | We provide the following Dockerfiles: 4 | - [Dockerfile](./Dockerfile) - for building the Scala API. 5 | - [Dockerfile.python](./Dockerfile.python) - for building the Python API (using conda for RAPIDS dependencies). 6 | - [Dockerfile.pip](./Dockerfile.pip) - for building the Python API (using pip for RAPIDS dependencies). 7 | 8 | ## Python API 9 | 10 | First, build the development image. 11 | ```bash 12 | docker build -t spark-rapids-ml:python -f Dockerfile.python .. 13 | # OPTIONAL: docker build -t spark-rapids-ml:pip -f Dockerfile.pip .. 14 | ``` 15 | 16 | Launch the container 17 | ```bash 18 | nvidia-docker run -it --rm spark-rapids-ml:python 19 | # OPTIONAL: nvidia-docker run -it --rm spark-rapids-ml:pip 20 | ``` 21 | Run the unit tests inside the container. 22 | ```bash 23 | ./run_test.sh --runslow 24 | ``` 25 | 26 | Run the benchmarks inside the container. 27 | ```bash 28 | ./run_benchmark.sh 29 | ``` 30 | 31 | Build the pip package. 32 | ```bash 33 | python -m build 34 | ``` 35 | 36 | Build the documentation. 37 | ``` 38 | cd ../docs 39 | make html 40 | cp -r build/html site/api/python 41 | # copy site/* to 'gh-pages' branch to publish 42 | ``` 43 | 44 | ## Scala API (Deprecated) 45 | 46 | First, build the development image. **Note**: see the Dockerfile for configurable build arguments. 47 | ```bash 48 | docker build -t spark-rapids-ml:jvm -f Dockerfile .. 49 | ``` 50 | 51 | Run the container. 52 | ```bash 53 | nvidia-docker run -it --rm spark-rapids-ml:jvm 54 | ``` 55 | 56 | Then, inside the container, build the Scala API [as usual](../jvm/README.md#build-target-jar). 57 | ```bash 58 | mvn clean package 59 | ``` 60 | 61 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | clean: Makefile 18 | rm -rf $(BUILDDIR) 19 | rm -rf $(SOURCEDIR)/api 20 | 21 | # Catch-all target: route all unknown targets to Sphinx using the new 22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 23 | %: Makefile 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | numpydoc 16 | pydata-sphinx-theme 17 | -------------------------------------------------------------------------------- /docs/site/404.html: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /404.html 3 | layout: default 4 | --- 5 | 6 | 19 | 20 |
<div class="container"> 21 | <h1>404</h1> 22 | 23 | <p><strong>Page not found :(</strong></p> 24 | <p>The requested page could not be found.</p> 25 | </div>
26 | -------------------------------------------------------------------------------- /docs/site/FAQ.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Frequently Asked Questions 3 | nav_order: 4 4 | --- 5 | # Frequently Asked Questions 6 | 7 | * TOC 8 | {:toc} 9 | 10 | ### What versions of Apache Spark are supported? 11 | 12 | Apache Spark version 3.3.1 or higher. 13 | 14 | ### What versions of Python are supported? 15 | 16 | Python 3.10 or higher. 17 | -------------------------------------------------------------------------------- /docs/site/_config.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | title: Spark Rapids ML 18 | description: >- # this means to ignore newlines until "baseurl:" 19 | Spark Rapids ML enables GPU accelerated distributed machine learning on Apache Spark 20 | powered by the RAPIDS cuML library. 21 | baseurl: "/spark-rapids-ml" # the subpath of your site, e.g. /blog 22 | url: "nvidia.github.io" # the base hostname & protocol for your site, e.g. http://example.com 23 | 24 | aux_links: 25 | "Spark Rapids ML on GitHub": 26 | - "//github.com/nvidia/spark-rapids-ml" 27 | 28 | # Build settings 29 | remote_theme: pmarsceill/just-the-docs 30 | plugins: 31 | - jekyll-feed 32 | 33 | # Exclude from processing. 34 | # The following items will not be processed, by default. 35 | # Any item listed under the `exclude:` key here will be automatically added to 36 | # the internal "default list". 37 | # 38 | # Excluded items can be processed by explicitly listing the directories or 39 | # their entries' file path in the `include:` list. 40 | # 41 | exclude: 42 | - .sass-cache/ 43 | - .jekyll-cache/ 44 | - gemfiles/ 45 | - Gemfile 46 | - Gemfile.lock 47 | - node_modules/ 48 | - vendor/bundle/ 49 | - vendor/cache/ 50 | - vendor/gems/ 51 | - vendor/ruby/ 52 | 53 | include: 54 | - _static 55 | - _sphinx* 56 | 57 | -------------------------------------------------------------------------------- /docs/site/api/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API 3 | nav_order: 5 4 | --- 5 | # API Documentation 6 | 7 | - Python API 8 | - [Stable Release](python) 9 | - [Draft](python-draft) 10 | -------------------------------------------------------------------------------- /docs/site/compatibility.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Compatibility 3 | nav_order: 3 4 | --- 5 | # Compatibility with Apache Spark 6 | 7 | ## Supported Algorithms 8 | 9 | The following table shows the currently supported algorithms. The goal is to expand this over time with support from the underlying RAPIDS cuML libraries.
If you would like support for a specific algorithm, please file a [git issue](https://github.com/NVIDIA/spark-rapids-ml/issues) to help us prioritize. 10 | 11 | | Supported Algorithms | Python | 12 | | :--------------------- | :----: | 13 | | CrossValidator | √ | 14 | | DBSCAN (*) | √ | 15 | | KMeans | √ | 16 | | approx/exact k-NN (*) | √ | 17 | | LinearRegression | √ | 18 | | LogisticRegression | √ | 19 | | PCA | √ | 20 | | RandomForestClassifier | √ | 21 | | RandomForestRegressor | √ | 22 | | UMAP (*) | √ | 23 | 24 | (*) Notes: 25 | - As an alternative to KMeans, we also provide a Spark API for GPU accelerated Density-Based Spatial Clustering of Applications with Noise (DBSCAN), a density based clustering algorithm in the RAPIDS cuML library. 26 | - Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. 27 | - As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. 28 | 29 | 30 | ## Supported Versions 31 | 32 | | Spark Rapids ML | CUDA | Spark | Python | 33 | | :-------------- | :---- | :----- | :----- | 34 | | 1.0.0 | 11.4+ | 3.3+ | 3.10+ | 35 | 36 | 37 | ## Single vs Double precision inputs 38 | The underlying cuML implementations all accept single precision (e.g. Float or float32) input types and offer the best performance in this case. As a result, by default, Spark RAPIDs ML converts Spark DataFrames supplied to `fit` and `transform` methods having double precision data types (i.e. `VectorUDT`, `ArrayType(DoubleType())`, `DoubleType()` columns) to single precision before passing them down to the cuML layer. Most of the cuML algorithm implementations also support double precision inputs. The Estimator (for all algorithms) constructor parameter `float32_inputs` can be used to control this behavior. The default value is `True` which forces the conversion to single precision for all algorithms, but it can be set to `False` in which case double precision input data is passed to those cuML algorithms which support it. 39 | 40 | Currently all algorithms *except* the following support double precision: k-NN, UMAP. 41 | -------------------------------------------------------------------------------- /docs/site/configuration.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Configuration 3 | nav_order: 6 4 | --- 5 | # Configuration 6 | 7 | The following configurations can be supplied as Spark properties. 8 | 9 | | Property name | Default | Meaning | 10 | | :-------------- | :------ | :------- | 11 | | spark.rapids.ml.uvm.enabled | false | if set to true, enables [CUDA unified virtual memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) (aka managed memory) during estimator.fit() operations to allow processing of larger datasets than would fit in GPU memory | 12 | | spark.rapids.ml.gpuMemRatioForData | None | If set to a float value between 0 and 1, Spark Rapids ML will reserve a portion of free GPU memory on each GPU and incrementally append PySpark data batches into this reserved space. This setting is recommended for large datasets, as it prevents duplicating the entire dataset in GPU memory and reduces the risk of out-of-memory errors. 
| 13 | | spark.rapids.ml.cpu.fallback.enabled | false | if set to true and spark-rapids-ml estimator.fit() is invoked with unsupported parameters or parameter values, the pyspark.ml cpu based estimator.fit() and model.transform() will be run; if set to false, an exception is raised in this case (default). | 14 | | spark.rapids.ml.verbose | None | if set to a boolean value (true/false) or an integer between 0 and 6, controls the verbosity level for cuML logging during estimator.fit() operations. This parameter can be set globally in Spark configuration and will be used if not explicitly set in the estimator constructor. | 15 | | spark.rapids.ml.float32_inputs | None | if set to a boolean value (true/false), controls whether input data should be converted to float32 precision before being passed to cuML algorithms. Setting this to true can reduce memory usage and potentially improve performance, but may affect numerical precision. This parameter can be set globally in Spark configuration and will be used if not explicitly set in the estimator constructor. | 16 | | spark.rapids.ml.num_workers | None | if set to an integer value greater than 0, specifies the number of workers to use for distributed training. This parameter can be set globally in Spark configuration and will be used if not explicitly set in the estimator constructor. | 17 | 18 | 19 | Since the algorithms rely heavily on Pandas UDFs, we also require `spark.sql.execution.arrow.pyspark.enabled=true` to ensure efficient data transfer between the JVM and Python processes. -------------------------------------------------------------------------------- /docs/site/contact.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Contact Us 3 | nav_order: 7 4 | --- 5 | # Contact Us 6 | 7 | We use github to track bugs, feature requests, and answer questions. File an 8 | [issue](https://github.com/NVIDIA/spark-rapids-ml/issues/new) for a bug or feature request. 9 | 10 | For security issues, [report the vulnerability via email](security.md). 
-------------------------------------------------------------------------------- /docs/site/get-started/databricks.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Databricks 3 | parent: Getting Started 4 | --- 5 | # Getting Started on Databricks 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/notebooks/databricks/README.md) 8 | 9 | -------------------------------------------------------------------------------- /docs/site/get-started/dataproc.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Dataproc 3 | parent: Getting Started 4 | --- 5 | # Getting Started on Google Dataproc 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/notebooks/dataproc/README.md) 8 | 9 | -------------------------------------------------------------------------------- /docs/site/get-started/emr.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: EMR 3 | parent: Getting Started 4 | --- 5 | # Getting Started on AWS EMR 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/notebooks/aws-emr/README.md) 8 | 9 | -------------------------------------------------------------------------------- /docs/site/get-started/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | nav_order: 2 4 | has_children: true 5 | --- 6 | # Getting Started 7 | -------------------------------------------------------------------------------- /docs/site/get-started/local.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Local Server 3 | parent: Getting Started 4 | --- 5 | # Getting Started on a Local Server 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/python/README.md) 8 | -------------------------------------------------------------------------------- /docs/site/get-started/spark_connect.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Spark Connect 3 | parent: Getting Started 4 | --- 5 | # Getting Started on Spark Connect 6 | 7 | Spark Rapids ML supports Spark Connect via the [Spark Rapids ML Connect Plugin](https://github.com/NVIDIA/spark-rapids-ml/blob/main/jvm). A prebuilt plugin jar compatible with Spark Connect 4.0 is bundled with the `spark-rapids-ml` pip package. See the getting-started [guide](https://github.com/NVIDIA/spark-rapids-ml/blob/main/jvm/README.md) for more information. -------------------------------------------------------------------------------- /docs/site/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Home 3 | nav_order: 1 4 | --- 5 | # Spark Rapids ML 6 | 7 | Spark Rapids ML enables GPU accelerated distributed machine learning on [Apache Spark](https://spark.apache.org/). It provides several PySpark ML compatible algorithms powered by the [RAPIDS cuML](https://docs.rapids.ai/api/cuml/stable/) library. 
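8 | 
9 | As a minimal, hypothetical sketch (assuming a GPU-enabled Spark cluster with the `spark-rapids-ml` package installed, and a DataFrame `df` containing a "features" vector column), accelerating an existing PySpark ML workload is typically just an import change:
10 | 
11 | ``` python
12 | # Hypothetical drop-in replacement for the PySpark ML import:
13 | # from pyspark.ml.clustering import KMeans
14 | from spark_rapids_ml.clustering import KMeans
15 | 
16 | kmeans = KMeans(k=4).setFeaturesCol("features")
17 | model = kmeans.fit(df)             # fit runs on GPU via RAPIDS cuML
18 | predictions = model.transform(df)
19 | ```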
-------------------------------------------------------------------------------- /docs/site/performance.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Performance Tuning 3 | nav_order: 6 4 | --- 5 | # Performance Tuning 6 | 7 | * TOC 8 | {:toc} 9 | 10 | ## Stage-level scheduling 11 | 12 | Starting from spark-rapids-ml `23.10.0`, stage-level scheduling is automatically enabled. 13 | Therefore, if you are using a Spark **standalone** cluster, version **`3.4.0`** or higher, we strongly recommend 14 | configuring the `"spark.task.resource.gpu.amount"` as a fractional value. This will 15 | enable running multiple tasks in parallel during the ETL phase to improve performance. An example configuration 16 | would be `"spark.task.resource.gpu.amount=1/spark.executor.cores"`. For example, 17 | 18 | ``` bash 19 | spark-submit \ 20 | --master spark://<master>:7077 \ 21 | --conf spark.executor.cores=12 \ 22 | --conf spark.task.cpus=1 \ 23 | --conf spark.executor.resource.gpu.amount=1 \ 24 | --conf spark.task.resource.gpu.amount=0.08 \ 25 | ... 26 | ``` 27 | 28 | The above spark-submit command requests 1 GPU and 12 CPUs per executor. As a result, 29 | a total of 12 tasks per executor will run concurrently during the ETL phase. Stage-level scheduling 30 | is then used internally by the library to automatically carry out the ML training phases using the required 1 GPU per task. 31 | 32 | However, if you are using a spark-rapids-ml version earlier than 23.10.0 or a Spark 33 | standalone cluster version below 3.4.0, you need to make sure that only 1 task runs at any time per executor. 34 | You can set `spark.task.cpus` equal to `spark.executor.cores`, or `"spark.task.resource.gpu.amount"=1`. For example, 35 | 36 | ``` bash 37 | spark-submit \ 38 | --master spark://<master>:7077 \ 39 | --conf spark.executor.cores=12 \ 40 | --conf spark.task.cpus=1 \ 41 | --conf spark.executor.resource.gpu.amount=1 \ 42 | --conf spark.task.resource.gpu.amount=1 \ 43 | ... 44 | ``` 45 | -------------------------------------------------------------------------------- /docs/site/security.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Security 3 | nav_order: 6 4 | --- 5 | # Security 6 | 7 | NVIDIA is dedicated to the security and trust of our software products and services, including all 8 | source code repositories managed through our organization. 9 | 10 | If you need to report a security issue, please use the appropriate contact points outlined 11 | below. **Please do not report security vulnerabilities through GitHub.** 12 | 13 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 14 | 15 | To report a potential security vulnerability in any NVIDIA product: 16 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 17 | - E-Mail: psirt@nvidia.com 18 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 19 | - Please include the following information: 20 | - Product/Driver name and version/branch that contains the vulnerability. -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. 
Licensed to the Apache Software Foundation (ASF) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The ASF licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | software distributed under the License is distributed on an 13 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | KIND, either express or implied. See the License for the 15 | specific language governing permissions and limitations 16 | under the License. 17 | 18 | .. Workaround to avoid documenting __init__. 19 | 20 | {% extends "!autosummary/class.rst" %} 21 | 22 | {% if '__init__' in methods %} 23 | {% set caught_result = methods.remove('__init__') %} 24 | {% endif %} 25 | 26 | {% block methods %} 27 | {% if methods %} 28 | 29 | .. rubric:: Methods 30 | 31 | .. autosummary:: 32 | {% for item in methods %} 33 | ~{{ name }}.{{ item }} 34 | {%- endfor %} 35 | 36 | {% endif %} 37 | {% endblock %} 38 | 39 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/class_with_docs.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The ASF licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | software distributed under the License is distributed on an 13 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | KIND, either express or implied. See the License for the 15 | specific language governing permissions and limitations 16 | under the License. 17 | 18 | 19 | {{ objname }} 20 | {{ underline }} 21 | 22 | .. currentmodule:: {{ module }} 23 | 24 | .. autoclass:: {{ objname }} 25 | :no-private-members: 26 | 27 | {% if '__init__' in methods %} 28 | {% set caught_result = methods.remove('__init__') %} 29 | {% endif %} 30 | 31 | {% block methods_summary %} 32 | {% if methods %} 33 | 34 | .. rubric:: Methods 35 | 36 | .. autosummary:: 37 | {% for item in methods %} 38 | ~{{ name }}.{{ item }} 39 | {%- endfor %} 40 | 41 | {% endif %} 42 | {% endblock %} 43 | 44 | {% block attributes_summary %} 45 | {% if attributes %} 46 | 47 | .. rubric:: Attributes 48 | 49 | .. autosummary:: 50 | {% for item in attributes %} 51 | {%- if item != "uid" %} 52 | ~{{ name }}.{{ item }} 53 | {%- endif %} 54 | {%- endfor %} 55 | 56 | {% endif %} 57 | {% endblock %} 58 | 59 | {% block methods_documentation %} 60 | {% if methods %} 61 | 62 | .. rubric:: Methods Documentation 63 | 64 | {% for item in methods %} 65 | .. automethod:: {{ item }} 66 | {%- endfor %} 67 | 68 | {% endif %} 69 | {% endblock %} 70 | 71 | {% block attributes_documentation %} 72 | {% if attributes %} 73 | 74 | .. 
rubric:: Attributes Documentation 75 | 76 | {% for item in attributes %} 77 | {%- if item != "uid" %} 78 | .. autoattribute:: {{ item }} 79 | {%- endif %} 80 | {%- endfor %} 81 | 82 | {% endif %} 83 | {% endblock %} 84 | 85 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Configuration file for the Sphinx documentation builder. 16 | # 17 | # For the full list of built-in configuration values, see the documentation: 18 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 19 | 20 | # -- Project information ----------------------------------------------------- 21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 22 | 23 | project = 'spark-rapids-ml' 24 | copyright = '2025, NVIDIA' 25 | author = 'NVIDIA' 26 | release = '25.06.0' 27 | 28 | # -- General configuration --------------------------------------------------- 29 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 30 | 31 | 32 | extensions = [ 33 | 'numpydoc', 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.autosummary', 36 | 'sphinx.ext.doctest', 37 | 'sphinx.ext.githubpages', 38 | 'sphinx.ext.intersphinx', 39 | ] 40 | 41 | numpydoc_show_class_members = False 42 | 43 | autodoc_inherit_docstrings = False 44 | 45 | templates_path = ['_templates'] 46 | exclude_patterns = [] 47 | 48 | intersphinx_mapping = { 49 | 'pyspark': ('https://spark.apache.org/docs/latest/api/python', None), 50 | 'cuml': ('https://docs.rapids.ai/api/cuml/stable', None), 51 | } 52 | 53 | # -- Options for HTML output ------------------------------------------------- 54 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 55 | 56 | html_theme = 'pydata_sphinx_theme' 57 | 58 | html_show_sourcelink = False 59 | 60 | import inspect 61 | from spark_rapids_ml.utils import _unsupported_methods_attributes 62 | 63 | _unsupported_by_class = {} 64 | def autodoc_skip_member(app, what, name, obj, skip, options): 65 | # adapted from this https://github.com/sphinx-doc/sphinx/issues/9533#issuecomment-962007846 66 | doc_class=None 67 | for frame in inspect.stack(): 68 | if frame.function == "get_members": 69 | doc_class = frame.frame.f_locals["obj"] 70 | break 71 | 72 | exclude = skip 73 | if doc_class: 74 | if doc_class not in _unsupported_by_class: 75 | _unsupported_by_class[doc_class] = _unsupported_methods_attributes(doc_class) 76 | 77 | exclude = name in _unsupported_by_class[doc_class] 78 | 79 | # return True if (skip or exclude) else None # Can interfere with subsequent skip functions. 
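# Note (added for clarity): per Sphinx autodoc-skip-member semantics, returning True skips
# the member, while returning None defers the decision to other handlers and autodoc's default.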
80 | return True if exclude or skip else None 81 | 82 | def setup(app): 83 | app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") 84 | app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") 85 | app.connect('autodoc-skip-member', autodoc_skip_member) 86 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. spark-rapids-ml documentation master file, created by 2 | sphinx-quickstart on Thu Jan 19 13:20:52 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to spark-rapids-ml's documentation! 7 | =========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | spark_rapids_ml 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /docs/source/spark_rapids_ml.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The ASF licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | software distributed under the License is distributed on an 13 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | KIND, either express or implied. See the License for the 15 | specific language governing permissions and limitations 16 | under the License. 17 | 18 | 19 | Spark Rapids ML 20 | =============== 21 | 22 | .. toctree:: 23 | :maxdepth: 4 24 | 25 | Feature 26 | ------- 27 | 28 | .. currentmodule:: spark_rapids_ml.feature 29 | 30 | .. autosummary:: 31 | :template: autosummary/class_with_docs.rst 32 | :toctree: api/feature 33 | 34 | PCA 35 | PCAModel 36 | 37 | Classification 38 | -------------- 39 | 40 | .. currentmodule:: spark_rapids_ml.classification 41 | 42 | .. autosummary:: 43 | :template: autosummary/class_with_docs.rst 44 | :toctree: api 45 | 46 | LogisticRegression 47 | LogisticRegressionModel 48 | RandomForestClassifier 49 | RandomForestClassificationModel 50 | 51 | Clustering 52 | ---------- 53 | 54 | .. currentmodule:: spark_rapids_ml.clustering 55 | 56 | .. autosummary:: 57 | :template: autosummary/class_with_docs.rst 58 | :toctree: api 59 | 60 | DBSCAN 61 | DBSCANModel 62 | KMeans 63 | KMeansModel 64 | 65 | 66 | Regression 67 | ---------- 68 | 69 | .. currentmodule:: spark_rapids_ml.regression 70 | 71 | .. autosummary:: 72 | :template: autosummary/class_with_docs.rst 73 | :toctree: api 74 | 75 | LinearRegression 76 | LinearRegressionModel 77 | RandomForestRegressor 78 | RandomForestRegressionModel 79 | 80 | 81 | Nearest Neighbors 82 | ----------------- 83 | 84 | .. currentmodule:: spark_rapids_ml.knn 85 | 86 | .. 
autosummary:: 87 | :template: autosummary/class_with_docs.rst 88 | :toctree: api 89 | 90 | ApproximateNearestNeighbors 91 | ApproximateNearestNeighborsModel 92 | NearestNeighbors 93 | NearestNeighborsModel 94 | 95 | 96 | Tuning 97 | ------ 98 | 99 | .. currentmodule:: spark_rapids_ml.tuning 100 | 101 | .. autosummary:: 102 | :template: autosummary/class_with_docs.rst 103 | :toctree: api 104 | 105 | CrossValidator 106 | 107 | 108 | UMAP 109 | ---- 110 | 111 | .. currentmodule:: spark_rapids_ml.umap 112 | 113 | .. autosummary:: 114 | :template: autosummary/class_with_docs.rst 115 | :toctree: api 116 | 117 | UMAP 118 | UMAPModel 119 | -------------------------------------------------------------------------------- /jvm/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target 3 | -------------------------------------------------------------------------------- /jvm/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Spark Connect ML uses ServiceLoader to discover the supported Spark ML estimators. 21 | com.nvidia.rapids.ml.RapidsLogisticRegression 22 | com.nvidia.rapids.ml.RapidsRandomForestClassifier 23 | com.nvidia.rapids.ml.RapidsPCA 24 | com.nvidia.rapids.ml.RapidsRandomForestRegressor 25 | com.nvidia.rapids.ml.RapidsLinearRegression 26 | com.nvidia.rapids.ml.RapidsKMeans 27 | -------------------------------------------------------------------------------- /jvm/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | # Spark Connect ML uses ServiceLoader to discover the supported Spark ML models. 21 | org.apache.spark.ml.rapids.RapidsLogisticRegressionModel 22 | org.apache.spark.ml.rapids.RapidsRandomForestClassificationModel 23 | org.apache.spark.ml.rapids.RapidsPCAModel 24 | org.apache.spark.ml.rapids.RapidsRandomForestRegressionModel 25 | org.apache.spark.ml.rapids.RapidsLinearRegressionModel 26 | org.apache.spark.ml.clustering.rapids.RapidsKMeansModel 27 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/Plugin.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.nvidia.rapids.ml 17 | 18 | import org.apache.spark.sql.connect.plugin.MLBackendPlugin 19 | 20 | import java.util.Optional 21 | 22 | /** 23 | * The Spark Connect ML plugin replaces the Spark built-in algorithms with 24 | * spark-rapids-ml Python implementations. 25 | */ 26 | class Plugin extends MLBackendPlugin { 27 | 28 | override def transform(mlName: String): Optional[String] = { 29 | mlName match { 30 | case "org.apache.spark.ml.classification.LogisticRegression" => 31 | Optional.of("com.nvidia.rapids.ml.RapidsLogisticRegression") 32 | case "org.apache.spark.ml.classification.LogisticRegressionModel" => 33 | Optional.of("org.apache.spark.ml.rapids.RapidsLogisticRegressionModel") 34 | case "org.apache.spark.ml.classification.RandomForestClassifier" => 35 | Optional.of("com.nvidia.rapids.ml.RapidsRandomForestClassifier") 36 | case "org.apache.spark.ml.classification.RandomForestClassificationModel" => 37 | Optional.of("org.apache.spark.ml.rapids.RapidsRandomForestClassificationModel") 38 | case "org.apache.spark.ml.feature.PCA" => 39 | Optional.of("com.nvidia.rapids.ml.RapidsPCA") 40 | case "org.apache.spark.ml.feature.PCAModel" => 41 | Optional.of("org.apache.spark.ml.rapids.RapidsPCAModel") 42 | case "org.apache.spark.ml.regression.RandomForestRegressor" => 43 | Optional.of("com.nvidia.rapids.ml.RapidsRandomForestRegressor") 44 | case "org.apache.spark.ml.regression.RandomForestRegressionModel" => 45 | Optional.of("org.apache.spark.ml.rapids.RapidsRandomForestRegressionModel") 46 | case "org.apache.spark.ml.regression.LinearRegression" => 47 | Optional.of("com.nvidia.rapids.ml.RapidsLinearRegression") 48 | case "org.apache.spark.ml.regression.LinearRegressionModel" => 49 | Optional.of("org.apache.spark.ml.rapids.RapidsLinearRegressionModel") 50 | case "org.apache.spark.ml.clustering.KMeans" => 51 | Optional.of("com.nvidia.rapids.ml.RapidsKMeans") 52 | case "org.apache.spark.ml.clustering.KMeansModel" => 53 | Optional.of("org.apache.spark.ml.clustering.rapids.RapidsKMeansModel") 54 | case _ => Optional.empty() 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- 
/jvm/src/main/scala/com/nvidia/rapids/ml/RapidsKMeans.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.clustering.rapids.RapidsKMeansModel 20 | import org.apache.spark.ml.clustering.KMeans 21 | import org.apache.spark.ml.rapids.ModelHelper 22 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 23 | import org.apache.spark.sql.Dataset 24 | import org.apache.spark.sql.types.StructType 25 | 26 | /** 27 | * RapidsKMeans is a JVM wrapper of KMeans in the spark-rapids-ml Python package. 28 | * 29 | * The training process launches a Python process that runs the spark-rapids-ml 30 | * KMeans and returns the corresponding model. 31 | * 32 | * @param uid unique ID of the estimator 33 | */ 34 | class RapidsKMeans(override val uid: String) extends KMeans with DefaultParamsWritable 35 | with RapidsEstimator { 36 | 37 | def this() = this(Identifiable.randomUID("kmeans")) 38 | 39 | override def fit(dataset: Dataset[_]): RapidsKMeansModel = { 40 | val trainedModel = trainOnPython(dataset) 41 | val parentModel = ModelHelper.createKMeansModel(trainedModel.modelAttributes) 42 | copyValues(new RapidsKMeansModel(uid, parentModel, trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "KMeans" 52 | } 53 | 54 | object RapidsKMeans extends DefaultParamsReadable[RapidsKMeans] { 55 | 56 | override def load(path: String): RapidsKMeans = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsLinearRegression.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.rapids.{ModelHelper, RapidsLinearRegressionModel} 20 | import org.apache.spark.ml.regression.LinearRegression 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsLinearRegression is a JVM wrapper of LinearRegression in the spark-rapids-ml Python package. 27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * LinearRegression and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsLinearRegression(override val uid: String) extends LinearRegression 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("linReg")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsLinearRegressionModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (coef, intercept, scale) = ModelHelper.createLinearRegressionModel(trainedModel.modelAttributes) 41 | copyValues(new RapidsLinearRegressionModel(uid, coef, intercept, scale, 42 | trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "LinearRegression" 52 | } 53 | 54 | object RapidsLinearRegression extends DefaultParamsReadable[RapidsLinearRegression] { 55 | 56 | override def load(path: String): RapidsLinearRegression = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsLogisticRegression.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 20 | import org.apache.spark.ml.classification.LogisticRegression 21 | import org.apache.spark.ml.rapids.{ModelHelper, RapidsLogisticRegressionModel} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsLogisticRegression is a JVM wrapper of LogisticRegression in the spark-rapids-ml Python package. 
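 * A hypothetical usage sketch (assuming a DataFrame `training` with "features" and "label"
 * columns; the estimator exposes the same public API as Spark's LogisticRegression):
 * {{{
 *   val lr = new RapidsLogisticRegression().setMaxIter(10)
 *   val model = lr.fit(training)
 * }}}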
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * LogisticRegression and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsLogisticRegression(override val uid: String) extends LogisticRegression 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("logreg")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsLogisticRegressionModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (coef, intercept, numClasses) = 41 | ModelHelper.createLogisticRegressionModel(trainedModel.modelAttributes) 42 | copyValues(new RapidsLogisticRegressionModel(uid, coef, intercept, numClasses, trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "LogisticRegression" 52 | } 53 | 54 | object RapidsLogisticRegression extends DefaultParamsReadable[RapidsLogisticRegression] { 55 | 56 | override def load(path: String): RapidsLogisticRegression = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsPCA.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.feature.PCA 20 | import org.apache.spark.ml.rapids.{ModelHelper, RapidsPCAModel} 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsPCA is a JVM wrapper of PCA in the spark-rapids-ml Python package. 
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * PCA and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsPCA(override val uid: String) extends PCA with DefaultParamsWritable 34 | with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("pca")) 37 | 38 | override def fit(dataset: Dataset[_]): RapidsPCAModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (pc, explainedVariance) = ModelHelper.createPCAModel(trainedModel.modelAttributes) 41 | copyValues(new RapidsPCAModel(uid, pc, explainedVariance, trainedModel.modelAttributes)) 42 | } 43 | 44 | // Override this function to allow the features column to be an array type 45 | override def transformSchema(schema: StructType): StructType = schema 46 | 47 | /** 48 | * The estimator name 49 | */ 50 | override def name: String = "PCA" 51 | } 52 | 53 | object RapidsPCA extends DefaultParamsReadable[RapidsPCA] { 54 | 55 | override def load(path: String): RapidsPCA = super.load(path) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsRandomForestClassifier.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.classification.RandomForestClassifier 20 | import org.apache.spark.ml.rapids.{RapidsRandomForestClassificationModel, ModelHelper} 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsRandomForestClassifier is a JVM wrapper of RandomForestClassifier in the spark-rapids-ml Python package. 
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * RandomForestClassifier and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsRandomForestClassifier(override val uid: String) extends RandomForestClassifier 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("rfc")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsRandomForestClassificationModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (trees, numFeatures, numClasses) = ModelHelper.createRandomForestClassificationModel( 41 | trainedModel.modelAttributes, getImpurity, uid) 42 | copyValues(new RapidsRandomForestClassificationModel(uid, trees, numFeatures, numClasses, 43 | trainedModel.modelAttributes)) 44 | } 45 | 46 | // Override this function to allow the features column to be an array type 47 | override def transformSchema(schema: StructType): StructType = schema 48 | 49 | /** 50 | * The estimator name 51 | */ 52 | override def name: String = "RandomForestClassifier" 53 | } 54 | 55 | object RapidsRandomForestClassifier extends DefaultParamsReadable[RapidsRandomForestClassifier] { 56 | 57 | override def load(path: String): RapidsRandomForestClassifier = super.load(path) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsRandomForestRegressor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.rapids.{RapidsRandomForestRegressionModel, ModelHelper} 20 | import org.apache.spark.ml.regression.RandomForestRegressor 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsRandomForestRegressor is a JVM wrapper of RandomForestRegressor in the spark-rapids-ml Python package. 
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * RandomForestRegressor and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsRandomForestRegressor(override val uid: String) extends RandomForestRegressor 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("rfr")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsRandomForestRegressionModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (trees, numFeatures) = ModelHelper.createRandomForestRegressionModel( 41 | trainedModel.modelAttributes, getImpurity, uid) 42 | copyValues(new RapidsRandomForestRegressionModel(uid, trees, numFeatures, trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "RandomForestRegressor" 52 | } 53 | 54 | object RapidsRandomForestRegressor extends DefaultParamsReadable[RapidsRandomForestRegressor] { 55 | 56 | override def load(path: String): RapidsRandomForestRegressor = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsTraits.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.commons.logging.LogFactory 20 | import org.apache.spark.ml.param.Params 21 | import org.apache.spark.ml.rapids.{Fit, PythonEstimatorRunner, RapidsUtils, TrainedModel} 22 | import org.apache.spark.sql.Dataset 23 | 24 | /** Implementation of the automatic-resource-management pattern */ 25 | object Arm { 26 | /** Executes the provided code block and then closes the resource */ 27 | def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { 28 | try { 29 | block(r) 30 | } finally { 31 | r.close() 32 | } 33 | } 34 | } 35 | 36 | trait RapidsEstimator extends Params { 37 | protected val logger = LogFactory.getLog("Spark-Rapids-ML Plugin") 38 | 39 | /** 40 | * The estimator name 41 | * 42 | * @return 43 | */ 44 | def name: String 45 | 46 | def trainOnPython(dataset: Dataset[_]): TrainedModel = { 47 | logger.info(s"Training $name ...") 48 | // Get the user-defined parameters and pass them to the Python process as a dictionary 49 | val params = RapidsUtils.getUserDefinedParams(this) 50 | 51 | val runner = new PythonEstimatorRunner( 52 | Fit(name, params), 53 | dataset.toDF) 54 | 55 | val trainedModel = Arm.withResource(runner) { _ => 56 | runner.runInPython(useDaemon = false) 57 | } 58 | 59 | logger.info(s"Finished $name training.") 60 | trainedModel 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/clustering/rapids/RapidsKMeansModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.clustering.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.clustering.KMeansModel 21 | import org.apache.spark.ml.param.ParamMap 22 | import org.apache.spark.ml.rapids.{RapidsModel, RapidsModelWriter} 23 | import org.apache.spark.ml.util.{GeneralMLWriter, MLReadable, MLReader} 24 | import org.apache.spark.mllib.clustering.{KMeansModel => MLlibKMeansModel} 25 | import org.apache.spark.sql.{DataFrame, Dataset} 26 | 27 | /** 28 | * Model fitted by RapidsKMeans. 29 | * 30 | * RapidsKMeansModel extends the Spark KMeansModel and stores 31 | * the model attributes trained by the spark-rapids-ml Python package in string format. 
32 | */ 33 | class RapidsKMeansModel(override val uid: String, 34 | override private[clustering] val parentModel: MLlibKMeansModel, 35 | override val modelAttributes: String) 36 | extends KMeansModel(uid, parentModel) with RapidsModel { 37 | 38 | private[ml] def this() = this("", null, null) 39 | 40 | override def transform(dataset: Dataset[_]): DataFrame = { 41 | transformOnPython(dataset, super.transform) 42 | } 43 | 44 | /** 45 | * The model name 46 | */ 47 | override def name: String = "KMeansModel" 48 | 49 | override def copy(extra: ParamMap): RapidsKMeansModel = { 50 | val newModel = copyValues( 51 | new RapidsKMeansModel(uid, parentModel, modelAttributes), extra) 52 | newModel 53 | } 54 | 55 | override def write: GeneralMLWriter = new RapidsModelWriter(this) 56 | 57 | override def cpu: KMeansModel = { 58 | copyValues(new KMeansModel(uid, parentModel)) 59 | } 60 | } 61 | 62 | object RapidsKMeansModel extends MLReadable[RapidsKMeansModel] { 63 | 64 | override def read: MLReader[RapidsKMeansModel] = new RapidsKMeansModelReader 65 | 66 | override def load(path: String): RapidsKMeansModel = super.load(path) 67 | 68 | private class RapidsKMeansModelReader extends MLReader[RapidsKMeansModel] { 69 | 70 | override def load(path: String): RapidsKMeansModel = { 71 | val cpuModel = KMeansModel.load(path) 72 | val attributesPath = new Path(path, "attributes").toString 73 | val row = sparkSession.read.parquet(attributesPath).first() 74 | val model = new RapidsKMeansModel(row.getString(0), 75 | cpuModel.parentModel, row.getString(1)) 76 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 77 | model 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/PythonEstimatorRunner.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import java.io.{DataInputStream, DataOutputStream} 20 | 21 | import net.razorvine.pickle.Pickler 22 | 23 | import org.apache.spark.api.java.JavaSparkContext 24 | import org.apache.spark.api.python.{PythonFunction, PythonRDD, PythonWorkerUtils} 25 | import org.apache.spark.sql.DataFrame 26 | import org.apache.spark.sql.execution.python.PythonPlannerRunner 27 | 28 | 29 | case class Fit(name: String, params: String) 30 | 31 | case class TrainedModel(modelAttributes: String) 32 | 33 | /** 34 | * PythonEstimatorRunner is a bridge that launches and manages the Python process. It sends the 35 | * estimator-related messages to the Python process and runs it. 
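 * (Concretely, writeToPython below sends the auth token, the estimator name, its parameter
 * string, and the py4j registry keys for the JavaSparkContext and the input DataFrame;
 * receiveFromPython reads back the trained model's attributes.)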
36 | * 37 | * @param fit the fit information 38 | * @param dataset input dataset 39 | */ 40 | class PythonEstimatorRunner(fit: Fit, 41 | dataset: DataFrame, 42 | func: PythonFunction = PythonRunnerUtils.RAPIDS_PYTHON_FUNC) 43 | extends PythonPlannerRunner[TrainedModel](func) with AutoCloseable { 44 | 45 | private val datasetKey = PythonRunnerUtils.putNewObjectToPy4j(dataset) 46 | private val jscKey = PythonRunnerUtils.putNewObjectToPy4j(new JavaSparkContext(dataset.sparkSession.sparkContext)) 47 | 48 | override protected val workerModule: String = "spark_rapids_ml.connect_plugin" 49 | 50 | override protected def writeToPython(dataOut: DataOutputStream, pickler: Pickler): Unit = { 51 | PythonRDD.writeUTF(PythonRunnerUtils.AUTH_TOKEN, dataOut) 52 | PythonRDD.writeUTF(fit.name, dataOut) 53 | PythonRDD.writeUTF(fit.params, dataOut) 54 | PythonRDD.writeUTF(jscKey, dataOut) 55 | PythonRDD.writeUTF(datasetKey, dataOut) 56 | } 57 | 58 | override protected def receiveFromPython(dataIn: DataInputStream): TrainedModel = { 59 | val modelAttributes = PythonWorkerUtils.readUTF(dataIn) 60 | TrainedModel(modelAttributes) 61 | } 62 | 63 | override def close(): Unit = { 64 | PythonRunnerUtils.deleteObject(jscKey) 65 | PythonRunnerUtils.deleteObject(datasetKey) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/PythonModelRunner.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import java.io.{DataInputStream, DataOutputStream} 20 | 21 | import net.razorvine.pickle.Pickler 22 | 23 | import org.apache.spark.api.java.JavaSparkContext 24 | import org.apache.spark.api.python.{PythonFunction, PythonRDD, PythonWorkerUtils} 25 | import org.apache.spark.sql.DataFrame 26 | import org.apache.spark.sql.execution.python.PythonPlannerRunner 27 | 28 | 29 | case class Transform(name: String, params: String, modelAttributes: String) 30 | 31 | /** 32 | * PythonModelRunner is a bridge that launches and manages the Python process. It sends the 33 | * model-related messages to the Python process and runs it. 
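 * (In addition to what PythonEstimatorRunner sends, writeToPython below also writes the
 * serialized model attributes, and receiveFromPython reads back the py4j id of the
 * transformed DataFrame.)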
34 | * 35 | * @param transform the transform information 36 | * @param dataset input dataset 37 | */ 38 | class PythonModelRunner(transform: Transform, 39 | dataset: DataFrame, 40 | func: PythonFunction = PythonRunnerUtils.RAPIDS_PYTHON_FUNC) 41 | extends PythonPlannerRunner[DataFrame](func) with AutoCloseable { 42 | 43 | private val datasetKey = PythonRunnerUtils.putNewObjectToPy4j(dataset) 44 | private val jscKey = PythonRunnerUtils.putNewObjectToPy4j(new JavaSparkContext(dataset.sparkSession.sparkContext)) 45 | 46 | override protected val workerModule: String = "spark_rapids_ml.connect_plugin" 47 | 48 | override protected def writeToPython(dataOut: DataOutputStream, pickler: Pickler): Unit = { 49 | PythonRDD.writeUTF(PythonRunnerUtils.AUTH_TOKEN, dataOut) 50 | PythonRDD.writeUTF(transform.name, dataOut) 51 | PythonRDD.writeUTF(transform.params, dataOut) 52 | PythonRDD.writeUTF(jscKey, dataOut) 53 | PythonRDD.writeUTF(datasetKey, dataOut) 54 | PythonRDD.writeUTF(transform.modelAttributes, dataOut) 55 | } 56 | 57 | override protected def receiveFromPython(dataIn: DataInputStream): DataFrame = { 58 | // Read the dataset target id in py4j server 59 | val dfId = PythonWorkerUtils.readUTF(dataIn) 60 | PythonRunnerUtils.getObjectAndDeref(dfId).asInstanceOf[DataFrame] 61 | } 62 | 63 | override def close(): Unit = { 64 | PythonRunnerUtils.deleteObject(jscKey) 65 | PythonRunnerUtils.deleteObject(datasetKey) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsLinearRegressionModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.linalg.Vector 21 | import org.apache.spark.ml.param.ParamMap 22 | import org.apache.spark.ml.regression.LinearRegressionModel 23 | import org.apache.spark.ml.util.{GeneralMLWriter, MLReadable, MLReader} 24 | import org.apache.spark.sql.{DataFrame, Dataset} 25 | 26 | /** 27 | * Model produced by RapidsLinearRegression. 28 | * 29 | * RapidsLinearRegressionModel extends from the Spark LinearRegressionModel and stores 30 | * the model attributes trained by spark-rapids-ml python in string format. 
31 | */ 32 | class RapidsLinearRegressionModel(override val uid: String, 33 | override val coefficients: Vector, 34 | override val intercept: Double, 35 | override val scale: Double, 36 | override val modelAttributes: String) 37 | extends LinearRegressionModel(uid, coefficients, intercept, scale) with RapidsModel { 38 | 39 | private[ml] def this() = this("", null, 1.0, 1.0, "") 40 | 41 | override def transform(dataset: Dataset[_]): DataFrame = { 42 | transformOnPython(dataset, super.transform) 43 | } 44 | 45 | /** 46 | * The model name 47 | */ 48 | override def name: String = "LinearRegressionModel" 49 | 50 | override def copy(extra: ParamMap): RapidsLinearRegressionModel = { 51 | copyValues( 52 | new RapidsLinearRegressionModel(uid, coefficients, intercept, scale, modelAttributes), extra) 53 | } 54 | 55 | override def cpu: LinearRegressionModel = { 56 | copyValues(new LinearRegressionModel(uid, coefficients, intercept, scale)) 57 | } 58 | 59 | override def write: GeneralMLWriter = new RapidsModelWriter(this) 60 | } 61 | 62 | object RapidsLinearRegressionModel extends MLReadable[RapidsLinearRegressionModel] { 63 | 64 | override def read: MLReader[RapidsLinearRegressionModel] = new RapidsLinearRegressionModelReader 65 | 66 | override def load(path: String): RapidsLinearRegressionModel = super.load(path) 67 | 68 | private class RapidsLinearRegressionModelReader extends MLReader[RapidsLinearRegressionModel] { 69 | 70 | override def load(path: String): RapidsLinearRegressionModel = { 71 | val cpuModel = LinearRegressionModel.load(path) 72 | val attributesPath = new Path(path, "attributes").toString 73 | val row = sparkSession.read.parquet(attributesPath).first() 74 | val model = new RapidsLinearRegressionModel(row.getString(0), cpuModel.coefficients, 75 | cpuModel.intercept, cpuModel.scale, row.getString(1)) 76 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 77 | model 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsLogisticRegressionModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.classification.LogisticRegressionModel 21 | import org.apache.spark.ml.linalg.{Matrix, Vector} 22 | import org.apache.spark.ml.param.ParamMap 23 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable} 24 | import org.apache.spark.sql.{DataFrame, Dataset} 25 | 26 | /** 27 | * Model produced by RapidsLogisticRegression. 28 | * 29 | * RapidsLogisticRegressionModel extends the Spark LogisticRegressionModel and stores 30 | * the model attributes trained by the spark-rapids-ml Python package in string format.
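 * A hypothetical conversion sketch (`rapidsModel` denotes an instance of this class):
 * {{{
 *   // recovers a plain Spark model, dropping the GPU-side attributes
 *   val cpuModel: LogisticRegressionModel = rapidsModel.cpu
 * }}}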
31 | */ 32 | class RapidsLogisticRegressionModel(override val uid: String, 33 | override val coefficientMatrix: Matrix, 34 | override val interceptVector: Vector, 35 | override val numClasses: Int, 36 | override val modelAttributes: String) 37 | extends LogisticRegressionModel(uid, coefficientMatrix, interceptVector, 38 | numClasses, numClasses != 2) with MLWritable with RapidsModel { 39 | 40 | private[ml] def this() = this("", null, null, 2, null) 41 | 42 | override def transform(dataset: Dataset[_]): DataFrame = { 43 | transformOnPython(dataset, super.transform) 44 | } 45 | 46 | /** 47 | * The model name 48 | */ 49 | override def name: String = "LogisticRegressionModel" 50 | 51 | override def copy(extra: ParamMap): RapidsLogisticRegressionModel = { 52 | val newModel = copyValues( 53 | new RapidsLogisticRegressionModel(uid, coefficientMatrix, interceptVector, 54 | numClasses, modelAttributes), extra) 55 | newModel.setSummary(trainingSummary).setParent(parent) 56 | newModel 57 | } 58 | 59 | override def cpu: LogisticRegressionModel = { 60 | copyValues( 61 | new LogisticRegressionModel(uid, coefficientMatrix, interceptVector, numClasses, numClasses != 2)) 62 | } 63 | } 64 | 65 | object RapidsLogisticRegressionModel extends MLReadable[RapidsLogisticRegressionModel] { 66 | 67 | override def read: MLReader[RapidsLogisticRegressionModel] = new RapidsLogisticRegressionModelReader 68 | 69 | override def load(path: String): RapidsLogisticRegressionModel = super.load(path) 70 | 71 | private class RapidsLogisticRegressionModelReader extends MLReader[RapidsLogisticRegressionModel] { 72 | 73 | override def load(path: String): RapidsLogisticRegressionModel = { 74 | val cpuModel = LogisticRegressionModel.load(path) 75 | val attributesPath = new Path(path, "attributes").toString 76 | val row = sparkSession.read.parquet(attributesPath).first() 77 | val model = new RapidsLogisticRegressionModel(row.getString(0), 78 | cpuModel.coefficientMatrix, cpuModel.interceptVector, cpuModel.numClasses, row.getString(1)) 79 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 80 | model 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import com.nvidia.rapids.ml.Arm 20 | import org.apache.commons.logging.LogFactory 21 | import org.apache.hadoop.fs.Path 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.ml.Model 24 | import org.apache.spark.ml.linalg.VectorUDT 25 | import org.apache.spark.ml.param.Params 26 | import org.apache.spark.ml.param.shared.HasFeaturesCol 27 | import org.apache.spark.ml.util.{GeneralMLWriter, MLWritable, MLWriter} 28 | import org.apache.spark.sql.{DataFrame, Dataset} 29 | 30 | trait RapidsModel extends MLWritable with Params with HasFeaturesCol { 31 | 32 | /** 33 | * The attributes of the corresponding spark-rapids-ml model, encoded in JSON 34 | * format. The JVM side does not inspect them; they are only passed through to Python. 35 | */ 36 | protected[ml] val modelAttributes: String 37 | 38 | /** 39 | * The model name 40 | */ 41 | def name: String 42 | 43 | def featureName: String = getFeaturesCol 44 | 45 | protected val logger = LogFactory.getLog("Spark-Rapids-ML Plugin") 46 | 47 | def transformOnPython(dataset: Dataset[_], 48 | cpuTransformFunc: Dataset[_] => DataFrame): DataFrame = { 49 | val usePython = dataset.sparkSession.conf.get("spark.rapids.ml.python.transform.enabled", "true").toBoolean 50 | val isVector = dataset.schema(featureName).dataType.isInstanceOf[VectorUDT] 51 | if (!isVector && !usePython) { 52 | throw new IllegalArgumentException("Please enable spark.rapids.ml.python.transform.enabled to " + 53 | "transform datasets in Python for non-vector input.") 54 | } 55 | 56 | if (usePython) { 57 | logger.info("Transforming in Python") 58 | // Get the user-defined parameters and pass them to the Python process as a dictionary 59 | val params = RapidsUtils.getUserDefinedParams(this) 60 | 61 | val runner = new PythonModelRunner( 62 | Transform(name, params, modelAttributes), 63 | dataset.toDF) 64 | 65 | Arm.withResource(runner) { _ => 66 | runner.runInPython(useDaemon = false) 67 | } 68 | } else { 69 | logger.info(s"Transforming using CPU $name") 70 | cpuTransformFunc(dataset) 71 | } 72 | } 73 | 74 | override def write: MLWriter = new RapidsModelWriter(this) 75 | 76 | def cpu: Model[_] 77 | } 78 | 79 | class RapidsModelWriter(instance: RapidsModel) extends 80 | GeneralMLWriter(instance.asInstanceOf[Model[_]]) with Logging { 81 | 82 | override protected def saveImpl(path: String): Unit = { 83 | val writer = instance.cpu.asInstanceOf[MLWritable].write 84 | if (shouldOverwrite) { 85 | writer.overwrite() 86 | } 87 | optionMap.foreach { case (k, v) => writer.option(k, v) } 88 | writer.save(path) 89 | 90 | val attributesPath = new Path(path, "attributes").toString 91 | sparkSession.createDataFrame( 92 | Seq(Tuple2(instance.uid, instance.modelAttributes)) 93 | ).write.parquet(attributesPath) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsPCAModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.feature.PCAModel 21 | import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector} 22 | import org.apache.spark.ml.param.ParamMap 23 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWriter} 24 | import org.apache.spark.sql.{DataFrame, Dataset} 25 | 26 | /** 27 | * Model produced by RapidsPCA. 28 | * 29 | * RapidsPCAModel extends the Spark PCAModel and stores 30 | * the model attributes trained by the spark-rapids-ml Python package in string format. 31 | */ 32 | class RapidsPCAModel(override val uid: String, 33 | override val pc: DenseMatrix, 34 | override val explainedVariance: DenseVector, 35 | override val modelAttributes: String) 36 | extends PCAModel(uid, pc, explainedVariance) with RapidsModel { 37 | 38 | private[ml] def this() = this("", null, null, "") 39 | 40 | override def transform(dataset: Dataset[_]): DataFrame = { 41 | transformOnPython(dataset, super.transform) 42 | } 43 | 44 | /** 45 | * The model name 46 | */ 47 | override def name: String = "PCAModel" 48 | 49 | override def copy(extra: ParamMap): RapidsPCAModel = { 50 | copyValues( 51 | new RapidsPCAModel(uid, pc, explainedVariance, modelAttributes), extra) 52 | } 53 | 54 | override def featureName: String = getInputCol 55 | 56 | override def write: MLWriter = super.write 57 | 58 | override def cpu: PCAModel = { 59 | copyValues(new PCAModel(uid, pc, explainedVariance)) 60 | } 61 | } 62 | 63 | 64 | object RapidsPCAModel extends MLReadable[RapidsPCAModel] { 65 | 66 | override def read: MLReader[RapidsPCAModel] = new RapidsPCAModelReader 67 | 68 | override def load(path: String): RapidsPCAModel = super.load(path) 69 | 70 | private class RapidsPCAModelReader extends MLReader[RapidsPCAModel] { 71 | 72 | override def load(path: String): RapidsPCAModel = { 73 | val cpuModel = PCAModel.load(path) 74 | val attributesPath = new Path(path, "attributes").toString 75 | val row = sparkSession.read.parquet(attributesPath).first() 76 | val model = new RapidsPCAModel(row.getString(0), 77 | cpuModel.pc, cpuModel.explainedVariance, row.getString(1)) 78 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 79 | model 80 | } 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsRandomForestClassificationModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, RandomForestClassificationModel} 21 | import org.apache.spark.ml.param.ParamMap 22 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable} 23 | import org.apache.spark.sql.{DataFrame, Dataset} 24 | 25 | /** 26 | * Model produced by RapidsRandomForestClassifier. 27 | * 28 | * RapidsRandomForestClassificationModel extends the Spark RandomForestClassificationModel and stores 29 | * the model attributes trained by the spark-rapids-ml Python package in string format. 30 | */ 31 | class RapidsRandomForestClassificationModel(override val uid: String, 32 | private val _trees: Array[DecisionTreeClassificationModel], 33 | override val numFeatures: Int, 34 | override val numClasses: Int, 35 | override val modelAttributes: String) 36 | extends RandomForestClassificationModel(uid, _trees, numFeatures, numClasses) 37 | with MLWritable with RapidsModel { 38 | 39 | private[ml] def this() = this("", null, 1, 1, "") 40 | 41 | override def transform(dataset: Dataset[_]): DataFrame = { 42 | transformOnPython(dataset, super.transform) 43 | } 44 | 45 | /** 46 | * The model name 47 | */ 48 | override def name: String = "RandomForestClassificationModel" 49 | 50 | override def copy(extra: ParamMap): RapidsRandomForestClassificationModel = { 51 | copyValues(new RapidsRandomForestClassificationModel(uid, _trees, numFeatures, 52 | numClasses, modelAttributes), extra) 53 | } 54 | 55 | override def cpu: RandomForestClassificationModel = { 56 | copyValues(new RandomForestClassificationModel(uid, _trees, numFeatures, numClasses)) 57 | } 58 | } 59 | 60 | object RapidsRandomForestClassificationModel extends MLReadable[RapidsRandomForestClassificationModel] { 61 | 62 | override def read: MLReader[RapidsRandomForestClassificationModel] = new RapidsRandomForestClassificationModelReader 63 | 64 | override def load(path: String): RapidsRandomForestClassificationModel = super.load(path) 65 | 66 | private class RapidsRandomForestClassificationModelReader extends MLReader[RapidsRandomForestClassificationModel] { 67 | 68 | override def load(path: String): RapidsRandomForestClassificationModel = { 69 | val cpuModel = RandomForestClassificationModel.load(path) 70 | val attributesPath = new Path(path, "attributes").toString 71 | val row = sparkSession.read.parquet(attributesPath).first() 72 | val model = new RapidsRandomForestClassificationModel(row.getString(0), 73 | cpuModel.trees, cpuModel.numFeatures, cpuModel.numClasses, row.getString(1)) 74 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 75 | model 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsRandomForestRegressionModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.param.ParamMap 21 | import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, RandomForestRegressionModel} 22 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable} 23 | import org.apache.spark.sql.{DataFrame, Dataset} 24 | 25 | /** 26 | * Model produced by RapidsRandomForestRegressor. 27 | * 28 | * RapidsRandomForestRegressionModel extends the Spark RandomForestRegressionModel and stores 29 | * the model attributes trained by the spark-rapids-ml Python package in string format. 30 | */ 31 | class RapidsRandomForestRegressionModel(override val uid: String, 32 | private val _trees: Array[DecisionTreeRegressionModel], 33 | override val numFeatures: Int, 34 | override val modelAttributes: String) 35 | extends RandomForestRegressionModel(uid, _trees, numFeatures) 36 | with MLWritable with RapidsModel { 37 | 38 | private[ml] def this() = this("", null, 1, "") 39 | 40 | override def transform(dataset: Dataset[_]): DataFrame = { 41 | transformOnPython(dataset, super.transform) 42 | } 43 | 44 | /** 45 | * The model name 46 | */ 47 | override def name: String = "RandomForestRegressionModel" 48 | 49 | override def copy(extra: ParamMap): RapidsRandomForestRegressionModel = { 50 | copyValues( 51 | new RapidsRandomForestRegressionModel(uid, _trees, numFeatures, modelAttributes), extra) 52 | } 53 | 54 | override def cpu: RandomForestRegressionModel = { 55 | copyValues(new RandomForestRegressionModel(uid, _trees, numFeatures)) 56 | } 57 | } 58 | 59 | object RapidsRandomForestRegressionModel extends MLReadable[RapidsRandomForestRegressionModel] { 60 | 61 | override def read: MLReader[RapidsRandomForestRegressionModel] = new RapidsRandomForestRegressionModelReader 62 | 63 | override def load(path: String): RapidsRandomForestRegressionModel = super.load(path) 64 | 65 | private class RapidsRandomForestRegressionModelReader extends MLReader[RapidsRandomForestRegressionModel] { 66 | 67 | override def load(path: String): RapidsRandomForestRegressionModel = { 68 | val cpuModel = RandomForestRegressionModel.load(path) 69 | val attributesPath = new Path(path, "attributes").toString 70 | val row = sparkSession.read.parquet(attributesPath).first() 71 | val model = new RapidsRandomForestRegressionModel(row.getString(0), cpuModel.trees, 72 | cpuModel.numFeatures, row.getString(1)) 73 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 74 | model 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/Utils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import java.security.SecureRandom 20 | import java.util.Base64 21 | import java.io.File 22 | import scala.collection.mutable.ArrayBuffer 23 | import scala.jdk.CollectionConverters._ 24 | import scala.sys.process.Process 25 | import py4j.GatewayServer.GatewayServerBuilder 26 | import org.apache.spark.api.python.SimplePythonFunction 27 | import org.apache.spark.ml.linalg 28 | import org.apache.spark.ml.linalg.{DenseVector, Vectors} 29 | import org.apache.spark.ml.param.{ParamPair, Params} 30 | import org.apache.spark.util.Utils 31 | import org.json4s.{DefaultFormats, JArray} 32 | import org.json4s.JsonDSL._ 33 | import org.json4s.jackson.JsonMethods.{compact, parse, render} 34 | 35 | object RapidsUtils { 36 | 37 | def getUserDefinedParams(instance: Params): String = { 38 | compact(render(instance.paramMap.toSeq.map { case ParamPair(p, v) => 39 | p.name -> parse(p.jsonEncode(v)) 40 | }.toList)) 41 | } 42 | 43 | def createTempDir(namePrefix: String = "spark"): File = { 44 | Utils.createTempDir(namePrefix) 45 | } 46 | 47 | def deleteRecursively(file: File): Unit = { 48 | Utils.deleteRecursively(file) 49 | } 50 | 51 | } 52 | 53 | object PythonRunnerUtils { 54 | private def generateSecrets = { 55 | val rnd = new SecureRandom() 56 | val token = new Array[Byte](32) 57 | rnd.nextBytes(token) 58 | Base64.getEncoder.encodeToString(token) 59 | } 60 | 61 | private[rapids] lazy val AUTH_TOKEN: String = generateSecrets 62 | 63 | private[rapids] lazy val RAPIDS_PYTHON_FUNC = { 64 | val defaultPythonExec: String = sys.env.getOrElse( 65 | "PYSPARK_DRIVER_PYTHON", sys.env.getOrElse("PYSPARK_PYTHON", "python3")) 66 | val pythonVer: String = 67 | Process( 68 | Seq(defaultPythonExec, "-c", "import sys; print('%d.%d' % sys.version_info[:2])")).!!.trim() 69 | 70 | new SimplePythonFunction( 71 | command = Array[Byte](), 72 | envVars = Map( 73 | "PYSPARK_PYTHON" -> defaultPythonExec, 74 | "PYSPARK_DRIVER_PYTHON" -> defaultPythonExec, 75 | ).asJava, 76 | pythonIncludes = ArrayBuffer("").asJava, 77 | pythonExec = defaultPythonExec, 78 | pythonVer = pythonVer, 79 | broadcastVars = List.empty.asJava, 80 | accumulator = null 81 | ) 82 | } 83 | 84 | private val gwLock = new Object() // Lock object 85 | 86 | private lazy val gw: py4j.Gateway = gwLock.synchronized { 87 | val server = new GatewayServerBuilder().authToken(AUTH_TOKEN).build() 88 | server.start() 89 | server.getGateway 90 | } 91 | 92 | def putNewObjectToPy4j(o: Object): String = gwLock.synchronized { 93 | gw.putNewObject(o) 94 | } 95 | 96 | def deleteObject(key: String): Unit = gwLock.synchronized { 97 | gw.deleteObject(key) 98 | } 99 | 100 | /** 101 | * Get the object from the py4j server and remove its reference there 102 | */ 103 | def getObjectAndDeref(id: String): Object = gwLock.synchronized { 104 | val o = gw.getObject(id) 105 | gw.deleteObject(id) 106 | o 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /notebooks/README.md: 
-------------------------------------------------------------------------------- 1 | # Running notebooks locally 2 | 3 | To run notebooks using Spark local mode on a server with one or more NVIDIA GPUs: 4 | 1. Follow the [installation instructions](../python/README.md#installation) to set up your environment. 5 | 2. Install `jupyter` into the conda environment. 6 | ```bash 7 | pip install jupyter 8 | ``` 9 | 3. Set `SPARK_HOME`. 10 | ```bash 11 | export SPARK_HOME=$( pip show pyspark | grep Location | grep -o '/.*' )/pyspark 12 | ls $SPARK_HOME/bin/pyspark 13 | ``` 14 | 4. In the notebooks directory, start PySpark in local mode with the Jupyter UI. 15 | ```bash 16 | cd spark-rapids-ml/notebooks 17 | 18 | PYSPARK_DRIVER_PYTHON=jupyter \ 19 | PYSPARK_DRIVER_PYTHON_OPTS='notebook --ip=0.0.0.0' \ 20 | CUDA_VISIBLE_DEVICES=0 \ 21 | $SPARK_HOME/bin/pyspark --master local[12] \ 22 | --driver-memory 128g \ 23 | --conf spark.sql.execution.arrow.pyspark.enabled=true 24 | ``` 25 | 5. Follow the instructions printed by the above command to browse to the Jupyter notebook server. 26 | 6. In the Jupyter file browser, open and run any of the notebooks. 27 | 7. **OPTIONAL**: If your server is remote with no direct `http` access, but you have `ssh` access, you can connect via an `ssh` tunnel, as follows: 28 | ```bash 29 | export REMOTE_USER= 30 | export REMOTE_HOST= 31 | ssh -A -L 8888:127.0.0.1:8888 -L 4040:127.0.0.1:4040 ${REMOTE_USER}@${REMOTE_HOST} 32 | ``` 33 | Then, browse to the `127.0.0.1` URL printed by the command in step 4. Note that a tunnel is also opened to the Spark UI server on port 4040. Once a notebook is opened, you can view its Spark UI by browsing to http://127.0.0.1:4040 in another tab or window. 34 | 8. **OPTIONAL**: If you have multiple GPUs in your server, replace the `CUDA_VISIBLE_DEVICES` setting in step 4 with a comma-separated list of the corresponding indices. For example, for two GPUs use `CUDA_VISIBLE_DEVICES=0,1`. 35 | 36 | ## No import change 37 | In the default notebooks, the GPU accelerated implementations of algorithms in Spark MLlib are enabled via import statements from the `spark_rapids_ml` package. 38 | 39 | Alternatively, acceleration can also be enabled by executing the following import statement at the start of a notebook: 40 | ```python 41 | import spark_rapids_ml.install 42 | ``` 43 | or by modifying the PySpark/Jupyter launch command above to use the `pyspark-rapids` CLI, installed by our `pip` package, to start Jupyter with PySpark as follows: 44 | ```bash 45 | cd spark-rapids-ml/notebooks 46 | 47 | PYSPARK_DRIVER_PYTHON=jupyter \ 48 | PYSPARK_DRIVER_PYTHON_OPTS='notebook --ip=0.0.0.0' \ 49 | CUDA_VISIBLE_DEVICES=0 \ 50 | pyspark-rapids --master local[12] \ 51 | --driver-memory 128g \ 52 | --conf spark.sql.execution.arrow.pyspark.enabled=true 53 | ``` 54 | 55 | After executing either of the above, all subsequent imports and accesses of supported accelerated classes from `pyspark.ml` will automatically redirect and return their counterparts in `spark_rapids_ml`. Unaccelerated classes will import from `pyspark.ml` as usual. Thus, all supported acceleration in an existing `pyspark` notebook is enabled with no additional import statement or code changes. Directly importing from `spark_rapids_ml` also still works (needed for non-MLlib algorithms like UMAP). 56 | 57 | For an example, see the notebook [kmeans-no-import-change.ipynb](kmeans-no-import-change.ipynb).
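To quickly verify that the redirection is active, you can inspect the module of an imported class. The following is a hypothetical sanity-check snippet; it assumes the `spark_rapids_ml` package is installed in the active environment:
```python
# Enable the no-import-change redirection, then confirm that a supported
# class imported from pyspark.ml resolves to its accelerated counterpart.
import spark_rapids_ml.install  # patches subsequent pyspark.ml imports

from pyspark.ml.clustering import KMeans

# For a supported class, this is expected to report a spark_rapids_ml module;
# unaccelerated classes continue to report a pyspark.ml module.
print(KMeans.__module__)
```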
58 | 59 | *Note*: As of this release, in this mode, methods and attributes that remain unsupported on accelerated classes and objects will still raise exceptions. 60 | 61 | ## Running notebooks on Databricks 62 | See [these instructions](databricks/README.md) for running the notebooks in a Databricks Spark cluster. 63 | 64 | ## Running notebooks on Google Dataproc 65 | See [these instructions](dataproc/README.md) for running the notebooks in a Dataproc Spark cluster. 66 | 67 | ## Running notebooks on AWS EMR 68 | See [these instructions](aws-emr/README.md) for running the notebooks in an AWS-EMR cluster. 69 | 70 | -------------------------------------------------------------------------------- /notebooks/aws-emr/init-bootstrap-action.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | set -ex 18 | 19 | sudo mkdir -p /spark-rapids-cgroup/devices 20 | sudo mount -t cgroup -o devices cgroupv1-devices /spark-rapids-cgroup/devices 21 | sudo chmod a+rwx -R /spark-rapids-cgroup 22 | 23 | sudo yum update -y 24 | sudo yum install -y gcc bzip2-devel libffi-devel tar gzip wget make 25 | sudo yum install -y mysql-devel --skip-broken 26 | sudo bash -c "wget https://www.python.org/ftp/python/3.10.9/Python-3.10.9.tgz && \ 27 | tar xzf Python-3.10.9.tgz && cd Python-3.10.9 && \ 28 | ./configure --enable-optimizations && make altinstall" 29 | 30 | RAPIDS_VERSION=25.6.0 31 | 32 | sudo /usr/local/bin/pip3.10 install --upgrade pip 33 | 34 | # install scikit-learn 35 | sudo /usr/local/bin/pip3.10 install scikit-learn 36 | 37 | # install cudf and cuml 38 | sudo /usr/local/bin/pip3.10 install --no-cache-dir cudf-cu12~=${RAPIDS_VERSION} \ 39 | cuml-cu12~=${RAPIDS_VERSION} \ 40 | cuvs-cu12~=${RAPIDS_VERSION} \ 41 | --extra-index-url=https://pypi.nvidia.com --verbose 42 | sudo /usr/local/bin/pip3.10 install spark-rapids-ml 43 | sudo /usr/local/bin/pip3.10 list 44 | 45 | # set up no-import-change for cluster if enabled 46 | if [[ $1 == "--no-import-enabled" && $2 == 1 ]]; then 47 | echo "enabling no import change in cluster" 1>&2 48 | cd /usr/lib/livy/repl_2.12-jars 49 | sudo jar xf livy-repl_2.12*.jar fake_shell.py 50 | sudo sed -i fake_shell.py -e '/from __future__/ s/\(.*\)/\1\ntry:\n import spark_rapids_ml.install\nexcept:\n pass\n/g' 51 | sudo jar uf livy-repl_2.12*.jar fake_shell.py 52 | sudo rm fake_shell.py 53 | fi 54 | 55 | # ensure the notebook comes up in Python 3.10 by using a background script that waits for an 56 | # application file to be installed before modifying it. 57 | cat <<EOF >/tmp/mod_start_kernel.sh 58 | #!/bin/bash 59 | set -ex 60 | while [ ! 
-f /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh ]; do 61 | echo "waiting for /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh" 62 | sleep 10 63 | done 64 | echo "done waiting" 65 | sleep 10 66 | sudo sed -i /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh -e 's#"spark.pyspark.python": "python3"#"spark.pyspark.python": "/usr/local/bin/python3.10"#g' 67 | sudo sed -i /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh -e 's#"spark.pyspark.virtualenv.enabled": "true"#"spark.pyspark.virtualenv.enabled": "false"#g' 68 | exit 0 69 | EOF 70 | sudo bash /tmp/mod_start_kernel.sh & 71 | exit 0 72 | 73 | -------------------------------------------------------------------------------- /notebooks/databricks/init-pip-cuda-11.8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -ex 17 | 18 | # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 19 | # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) 20 | # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 
23.08.2 and not 23.8.2) 21 | RAPIDS_VERSION=25.6.0 22 | SPARK_RAPIDS_VERSION=25.04.0 23 | 24 | curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar 25 | 26 | # install cudatoolkit 11.8 via runfile approach 27 | wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run 28 | sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit 29 | 30 | # reset symlink and update library loading paths 31 | rm /usr/local/cuda 32 | ln -s /usr/local/cuda-11.8 /usr/local/cuda 33 | 34 | # upgrade pip 35 | /databricks/python/bin/pip install --upgrade pip 36 | 37 | # install cudf, cuml and their rapids dependencies 38 | # using ~= pulls in latest micro version patches 39 | /databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \ 40 | cuml-cu11~=${RAPIDS_VERSION} \ 41 | cuvs-cu11~=${RAPIDS_VERSION} \ 42 | --extra-index-url=https://pypi.nvidia.com 43 | 44 | # install spark-rapids-ml 45 | /databricks/python/bin/pip install spark-rapids-ml 46 | 47 | # set up no-import-change for cluster if enabled 48 | if [[ $SPARK_RAPIDS_ML_NO_IMPORT_ENABLED == 1 ]]; then 49 | echo "enabling no import change in cluster" 1>&2 50 | mkdir -p /root/.ipython/profile_default/startup 51 | echo "import spark_rapids_ml.install" >/root/.ipython/profile_default/startup/00-spark-rapids-ml.py 52 | fi 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /notebooks/dataproc/README.md: -------------------------------------------------------------------------------- 1 | ## Running notebooks on Dataproc 2 | 3 | If you already have a Dataproc account, you can run the example notebooks on a Dataproc cluster, as follows: 4 | - Install the [gcloud CLI](https://cloud.google.com/sdk/docs/install) and initialize it via `gcloud init`. 5 | - Configure the following settings: 6 | ``` 7 | export PROJECT= 8 | export DATAPROC_REGION= 9 | export COMPUTE_REGION= 10 | export COMPUTE_ZONE= 11 | 12 | gcloud config set project ${PROJECT} 13 | gcloud config set dataproc/region ${DATAPROC_REGION} 14 | gcloud config set compute/region ${COMPUTE_REGION} 15 | gcloud config set compute/zone ${COMPUTE_ZONE} 16 | ``` 17 | - Create a GCS bucket if you don't already have one: 18 | ``` 19 | export GCS_BUCKET= 20 | 21 | gcloud storage buckets create gs://${GCS_BUCKET} 22 | ``` 23 | - Upload the initialization scripts to your GCS bucket: 24 | ``` 25 | gsutil cp spark_rapids_ml.sh gs://${GCS_BUCKET} 26 | curl -LO https://raw.githubusercontent.com/GoogleCloudDataproc/initialization-actions/master/spark-rapids/spark-rapids.sh 27 | gsutil cp spark-rapids.sh gs://${GCS_BUCKET}/spark-rapids.sh 28 | ``` 29 | - Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS. 30 | 31 | If you wish to enable [no-import-change](../README.md#no-import-change) UX for the cluster, change the `spark-rapids-ml-no-import-enabled` metadata value to `1` in the command. The initialization script `spark_rapids_ml.sh` checks this metadata value and modifies the run time accordingly. 
32 | 33 | ``` 34 | export RAPIDS_VERSION=25.6.0 35 | 36 | gcloud dataproc clusters create $USER-spark-rapids-ml \ 37 | --image-version=2.2-ubuntu22 \ 38 | --region ${COMPUTE_REGION} \ 39 | --master-machine-type n1-standard-16 \ 40 | --master-accelerator type=nvidia-tesla-t4,count=1 \ 41 | --num-workers 2 \ 42 | --worker-min-cpu-platform=Intel\ Skylake \ 43 | --worker-accelerator type=nvidia-tesla-t4,count=1 \ 44 | --worker-machine-type n1-standard-16 \ 45 | --num-worker-local-ssds 4 \ 46 | --worker-local-ssd-interface=NVME \ 47 | --initialization-actions gs://${GCS_BUCKET}/spark-rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \ 48 | --initialization-action-timeout=20m \ 49 | --optional-components=JUPYTER \ 50 | --metadata gpu-driver-provider="NVIDIA" \ 51 | --metadata rapids-runtime=SPARK \ 52 | --metadata rapids-version=${RAPIDS_VERSION} \ 53 | --metadata spark-rapids-ml-no-import-enabled=0 \ 54 | --properties spark:spark.executor.resource.gpu.amount=1,\ 55 | spark:spark.task.resource.gpu.amount=0.0625,\ 56 | spark:spark.executorEnv.CUPY_CACHE_DIR=/tmp/.cupy,\ 57 | spark:spark.locality.wait=0,\ 58 | spark:spark.sql.execution.arrow.pyspark.enabled=true,\ 59 | spark:spark.sql.execution.arrow.maxRecordsPerBatch=100000,\ 60 | spark:spark.rapids.memory.gpu.pooling.enabled=false \ 61 | --bucket ${GCS_BUCKET} \ 62 | --enable-component-gateway \ 63 | --subnet=default \ 64 | --no-shielded-secure-boot 65 | ``` 66 | **Note**: the `properties` settings are for demonstration purposes only. Additional tuning may be required for optimal performance. 67 | - In the [Dataproc console](https://console.cloud.google.com/dataproc/clusters), select your cluster, go to the "Web Interfaces" tab, and click on the "JupyterLab" link. 68 | - In JupyterLab, upload the desired [notebook](../) via the `Upload Files` button. For the no-import-change UX, you can try the example [kmeans-no-import-change.ipynb](../kmeans-no-import-change.ipynb). 69 | 70 | Open the notebook and select the `PySpark` kernel, e.g., via the drop-down that appears after clicking the kernel name in the top right corner of the notebook view. 71 | 72 | - Run the notebook cells. **Note**: you may need to change file paths to use `hdfs://` paths. 73 | -------------------------------------------------------------------------------- /notebooks/dataproc/spark_rapids_ml.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | set -ex 17 | 18 | RAPIDS_VERSION=25.6.0 19 | 20 | 21 | # install cudf and cuml 22 | pip install --upgrade pip 23 | pip install cudf-cu12~=${RAPIDS_VERSION} cuml-cu12~=${RAPIDS_VERSION} cuvs-cu12~=${RAPIDS_VERSION} \ 24 | --extra-index-url=https://pypi.nvidia.com 25 | 26 | # install spark-rapids-ml 27 | pip install spark-rapids-ml 28 | 29 | # set up no-import-change for cluster if enabled 30 | no_import_change=$(/usr/share/google/get_metadata_value attributes/spark-rapids-ml-no-import-enabled) 31 | if [[ $no_import_change == 1 ]]; then 32 | echo "enabling no import change in cluster" 1>&2 33 | mkdir -p /root/.ipython/profile_default/startup 34 | echo "import spark_rapids_ml.install" >/root/.ipython/profile_default/startup/00-spark-rapids-ml.py 35 | fi 36 | -------------------------------------------------------------------------------- /python/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking 2 | This directory contains Python scripts for benchmarking the supported algorithms. 3 | 4 | ## Local 5 | This [script](../run_benchmark.sh) can be used to run them locally. 6 | 7 | ## Databricks 8 | They can also be run on the Databricks AWS-hosted Spark service. See [these instructions](databricks/README.md) and accompanying scripts for running a set of high compute workloads on comparable CPU and GPU clusters. The graph below shows the resulting Spark ML CPU and Spark Rapids ML GPU average running times. 9 | 10 | ![Databricks AWS benchmarking results](databricks/results/running_times.png) 11 | 12 | ## Other CSPs 13 | Click the links below for instructions on running the benchmarking scripts in the respective CSP Spark environments: 14 | - [GCP Dataproc](dataproc/README.md) 15 | - [AWS EMR](aws-emr/README.md) 16 | 17 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking on AWS EMR 2 | 3 | This directory contains shell scripts for running larger-scale benchmarks on an AWS EMR cluster. You will need an AWS account to run them. The benchmarks use datasets synthetically generated using [gen_data.py](../gen_data.py). For convenience, these have been precomputed and are available in the public S3 bucket `spark-rapids-ml-bm-datasets-public`. The benchmark scripts are currently configured to read the data from there. 4 | 5 | ## Setup 6 | 7 | - Install the [AWS CLI](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-cli.html) and initialize it via `aws configure`. You may need to obtain your [access keys and region code](../../../notebooks/aws-emr/README.md). 8 | 9 | - Create an S3 bucket if you don't already have one. 10 | ``` 11 | export S3_BUCKET= 12 | aws s3 mb s3://${S3_BUCKET} 13 | ``` 14 | 15 | - Upload the benchmarking files to your S3 bucket: 16 | ``` 17 | # path to store benchmarking files inside your S3 bucket 18 | export BENCHMARK_HOME=${S3_BUCKET}/benchmark 19 | 20 | ./setup.sh 21 | ``` 22 | **Note**: this step should be repeated for each new version of the spark-rapids-ml package that you want to test. 23 | 24 | ## Create an ssh key pair 25 | - The benchmark script needs ssh access to the EMR cluster, which requires creating an [EC2 key pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/create-key-pairs.html). Choose the **pem** format.
After saving the private key locally with `.pem` as the file extension, set the following environment variable to point to its location. 26 | ``` 27 | export KEYPAIR=/path/to/private/key.pem 28 | ``` 29 | 30 | ## Prepare Subnet 31 | - Print the available subnets via the CLI, then pick a SubnetId in your region (e.g. subnet-0744566f in AvailabilityZone us-east-2a of the Ohio region). A subnet is required to start an EMR cluster. Make sure that your selected subnet allows SSH access (port 22) from your local host where you will be invoking the benchmarking script. The public subnet in the default VPC in your account might be a suitable choice. See AWS EMR documentation for more info on [VPCs for EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-vpc-host-job-flows.html) and related info on SSH access in [managed security groups used by EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html). 32 | 33 | ``` 34 | aws ec2 describe-subnets 35 | export SUBNET_ID= 36 | ``` 37 | ## Run Benchmarks 38 | 39 | - Start the cpu or gpu cluster and run all benchmarks. 40 | ``` 41 | ./run_benchmark.sh [cpu|gpu] 2>&1 | tee benchmark.log 42 | ``` 43 | **Note**: the created cluster is configured to automatically terminate after 30 minutes of idle time, but it can still be manually terminated or deleted via the AWS EMR Console. 44 | 45 | **Note**: monitor benchmark progress periodically in case of a possible hang, to avoid incurring cloud costs in such cases. 46 | 47 | - Extract timing information. To view the original EMR log files, log in to the [AWS EMR console](https://console.aws.amazon.com/emr/). Click "Clusters", choose the created cluster, click "Steps", then click "stdout" for each spark-submit application. 48 | ``` 49 | egrep -e "[0-9.]* seconds" *.log 50 | ``` 51 | 52 | - Stop the cluster via the AWS EMR Console, or via command line. 53 | ``` 54 | cluster_id=$(grep "cluster-id" benchmark.log | grep -o 'j-[0-9|A-Z]*' | head -n 1) 55 | aws emr terminate-clusters --cluster-ids ${cluster_id} 56 | ``` 57 | - **OPTIONAL**: To run a single benchmark manually, search the `benchmark.log` for the `aws emr add-steps` command line associated with the target benchmark. If needed, start the cluster first and obtain its cluster_id. Then, just copy-and-paste that command line into your shell with the correct cluster_id. 58 | ``` 59 | ./start_cluster.sh [cpu|gpu] 60 | 61 | ``` 62 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/cpu-init-configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification":"spark-defaults", 4 | "Properties":{ 5 | "spark.pyspark.python":"/usr/local/bin/python3.10", 6 | "spark.pyspark.driver.python":"/usr/local/bin/python3.10" 7 | } 8 | } 9 | ] 10 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # copies files to s3 bucket 18 | 19 | if [[ -z $BENCHMARK_HOME ]]; then 20 | echo "please export BENCHMARK_HOME per README.md" 21 | exit 1 22 | fi 23 | 24 | SPARK_RAPIDS_ML_HOME='../..' 25 | 26 | echo "**** copying benchmarking related files to ${BENCHMARK_HOME} ****" 27 | 28 | aws s3 cp ../../../notebooks/aws-emr/init-bootstrap-action.sh s3://${BENCHMARK_HOME}/init-bootstrap-action.sh 29 | 30 | pushd ${SPARK_RAPIDS_ML_HOME}/benchmark 31 | zip -r - benchmark > benchmark.zip 32 | aws s3 cp benchmark.zip s3://${BENCHMARK_HOME}/benchmark.zip 33 | popd 34 | 35 | pushd ${SPARK_RAPIDS_ML_HOME} 36 | aws s3 cp benchmark/benchmark_runner.py s3://${BENCHMARK_HOME}/benchmark_runner.py 37 | popd 38 | 39 | pushd ${SPARK_RAPIDS_ML_HOME}/src 40 | zip -r - spark_rapids_ml >spark_rapids_ml.zip 41 | aws s3 cp spark_rapids_ml.zip s3://${BENCHMARK_HOME}/spark_rapids_ml.zip 42 | popd 43 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/start_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | set -o pipefail 17 | 18 | cluster_type=${1:-gpu} 19 | 20 | # configure arguments 21 | if [[ -z ${SUBNET_ID} ]]; then 22 | echo "Please export SUBNET_ID per README.md" 23 | exit 1 24 | fi 25 | 26 | if [[ -z ${BENCHMARK_HOME} ]]; then 27 | echo "Please export BENCHMARK_HOME per README.md" 28 | exit 1 29 | fi 30 | 31 | if [[ -z ${KEYPAIR} ]]; then 32 | echo "Please export KEYPAIR per README.md" 33 | exit 1 34 | fi 35 | 36 | cluster_name=spark-rapids-ml-${cluster_type} 37 | cur_dir=$(pwd) 38 | 39 | if [[ ${cluster_type} == "gpu" ]]; then 40 | core_type=g5.2xlarge 41 | config_json="file://${cur_dir}/../../../notebooks/aws-emr/init-configurations.json" 42 | bootstrap_actions="--bootstrap-actions Name='Spark Rapids ML Bootstrap action',Path=s3://${BENCHMARK_HOME}/init-bootstrap-action.sh" 43 | elif [[ ${cluster_type} == "cpu" ]]; then 44 | core_type=m6gd.2xlarge 45 | config_json="file://${cur_dir}/cpu-init-configurations.json" 46 | bootstrap_actions="" 47 | else 48 | echo "unknown cluster type ${cluster_type}" 49 | echo "usage: $(basename $0) cpu|gpu" 50 | exit 1 51 | fi 52 | 53 | start_cmd="aws emr create-cluster \ 54 | --name ${cluster_name} \ 55 | --release-label emr-7.3.0 \ 56 | --applications Name=Hadoop Name=Spark \ 57 | --service-role EMR_DefaultRole \ 58 | --log-uri s3://${BENCHMARK_HOME}/logs \ 59 | --ec2-attributes KeyName=$(basename ${KEYPAIR} | sed -e 's/\.pem//g' ),SubnetId=${SUBNET_ID},InstanceProfile=EMR_EC2_DefaultRole \ 60 | --ebs-root-volume-size=32 \ 61 | --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m4.2xlarge \ 62 | InstanceGroupType=CORE,InstanceCount=3,InstanceType=${core_type} \ 63 | --configurations ${config_json} $bootstrap_actions 64 | " 65 | 66 | CLUSTER_ID=$( eval ${start_cmd} | tee /dev/tty | grep "ClusterId" | grep -o 'j-[0-9|A-Z]*') 67 | aws emr put-auto-termination-policy --cluster-id ${CLUSTER_ID} --auto-termination-policy IdleTimeout=1800 68 | echo "waiting for cluster ${CLUSTER_ID} to start ... " 1>&2 69 | 70 | aws emr wait cluster-running --cluster-id $CLUSTER_ID 71 | 72 | echo "cluster started." 1>&2 73 | echo $CLUSTER_ID 74 | -------------------------------------------------------------------------------- /python/benchmark/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /python/benchmark/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022-2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import inspect 17 | from distutils.util import strtobool 18 | from time import time 19 | from typing import Any, Callable, Dict, List 20 | 21 | from pyspark.sql import SparkSession 22 | 23 | 24 | class WithSparkSession(object): 25 | def __init__(self, confs: List[str], shutdown: bool = True) -> None: 26 | builder = SparkSession.builder 27 | for conf in confs: 28 | key, value = (conf.split("=")[0], "=".join(conf.split("=")[1:])) 29 | print(key, value) 30 | builder = builder.config(key, value) 31 | self.spark = builder.getOrCreate() 32 | self.shutdown = shutdown 33 | 34 | def __enter__(self) -> SparkSession: 35 | return self.spark 36 | 37 | def __exit__(self, *args: Any) -> None: 38 | if self.shutdown: 39 | print("stopping spark session") 40 | self.spark.stop() 41 | 42 | 43 | def with_benchmark(phrase: str, action: Callable) -> Any: 44 | start = time() 45 | result = action() 46 | end = time() 47 | print("-" * 100) 48 | duration = round(end - start, 2) 49 | print("{}: {} seconds".format(phrase, duration)) 50 | print("-" * 100) 51 | return result, duration 52 | 53 | 54 | def inspect_default_params_from_func( 55 | func: Callable, unsupported_set: List[str] = [] 56 | ) -> Dict[str, Any]: 57 | """ 58 | Returns a dictionary of the parameters of function ``func`` and their default values. 59 | Only parameters with a default value are included. 60 | """ 61 | sig = inspect.signature(func) 62 | filtered_params_dict = {} 63 | for parameter in sig.parameters.values(): 64 | # Remove parameters without a default value and those in the unsupported_set 65 | if ( 66 | parameter.default is not parameter.empty 67 | and parameter.default is not None 68 | and parameter.name not in unsupported_set 69 | ): 70 | filtered_params_dict[parameter.name] = parameter.default 71 | return filtered_params_dict 72 | 73 | 74 | def to_bool(literal: str) -> bool: 75 | return bool(strtobool(literal)) 76 | 77 | 78 | def is_remote() -> bool: 79 | try: 80 | # pyspark.sql.utils.is_remote is not available in older versions of pyspark, in which case remote is not supported 81 | from pyspark.sql.utils import is_remote  # type: ignore 82 | 83 | return is_remote() 84 | except: 85 | return False 86 | -------------------------------------------------------------------------------- /python/benchmark/benchmark/utils_knn.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022-2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
15 | # 16 | from typing import Optional, Tuple 17 | 18 | from pyspark.sql import DataFrame 19 | 20 | from spark_rapids_ml.core import ( 21 | EvalMetricInfo, 22 | _ConstructFunc, 23 | _EvaluateFunc, 24 | _TransformFunc, 25 | ) 26 | from spark_rapids_ml.knn import ApproximateNearestNeighborsModel 27 | 28 | 29 | class CPUNearestNeighborsModel(ApproximateNearestNeighborsModel): 30 | def __init__(self, item_df: DataFrame): 31 | super().__init__(item_df) 32 | 33 | def kneighbors( 34 | self, query_df: DataFrame, sort_knn_df_by_query_id: bool = True 35 | ) -> Tuple[DataFrame, DataFrame, DataFrame]: 36 | self._item_df_withid = self._ensureIdCol(self._item_df_withid) 37 | return super().kneighbors( 38 | query_df, sort_knn_df_by_query_id=sort_knn_df_by_query_id 39 | ) 40 | 41 | def _get_cuml_transform_func( 42 | self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None 43 | ) -> Tuple[ 44 | _ConstructFunc, 45 | _TransformFunc, 46 | Optional[_EvaluateFunc], 47 | ]: 48 | self._cuml_params["algorithm"] = "brute" 49 | _, _transform_internal, _ = super()._get_cuml_transform_func( 50 | dataset, eval_metric_info 51 | ) 52 | 53 | from sklearn.neighbors import NearestNeighbors as SKNN 54 | 55 | n_neighbors = self.getK() 56 | 57 | def _construct_sknn() -> SKNN: 58 | nn_object = SKNN(algorithm="brute", n_neighbors=n_neighbors) 59 | return nn_object 60 | 61 | return _construct_sknn, _transform_internal, None 62 | 63 | def _concate_pdf_batches(self) -> bool: 64 | return False 65 | -------------------------------------------------------------------------------- /python/benchmark/benchmark_runner.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022-2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import argparse 17 | import sys 18 | 19 | from benchmark.bench_approximate_nearest_neighbors import ( 20 | BenchmarkApproximateNearestNeighbors, 21 | ) 22 | from benchmark.bench_dbscan import BenchmarkDBSCAN 23 | from benchmark.bench_kmeans import BenchmarkKMeans 24 | from benchmark.bench_linear_regression import BenchmarkLinearRegression 25 | from benchmark.bench_logistic_regression import BenchmarkLogisticRegression 26 | from benchmark.bench_nearest_neighbors import BenchmarkNearestNeighbors 27 | from benchmark.bench_pca import BenchmarkPCA 28 | from benchmark.bench_random_forest import ( 29 | BenchmarkRandomForestClassifier, 30 | BenchmarkRandomForestRegressor, 31 | ) 32 | from benchmark.bench_umap import BenchmarkUMAP 33 | 34 | 35 | class BenchmarkRunner: 36 | def __init__(self) -> None: 37 | registered_algorithms = { 38 | "approximate_nearest_neighbors": BenchmarkApproximateNearestNeighbors, 39 | "dbscan": BenchmarkDBSCAN, 40 | "kmeans": BenchmarkKMeans, 41 | "knn": BenchmarkNearestNeighbors, 42 | "linear_regression": BenchmarkLinearRegression, 43 | "pca": BenchmarkPCA, 44 | "random_forest_classifier": BenchmarkRandomForestClassifier, 45 | "random_forest_regressor": BenchmarkRandomForestRegressor, 46 | "logistic_regression": BenchmarkLogisticRegression, 47 | "umap": BenchmarkUMAP, 48 | } 49 | algorithms = "\n ".join(registered_algorithms.keys()) 50 | parser = argparse.ArgumentParser( 51 | description="Benchmark Spark Rapids ML algorithms", 52 | usage=f"""benchmark_runner.py [] 53 | 54 | Supported algorithms are: 55 | {algorithms} 56 | """, 57 | ) 58 | parser.add_argument("algorithm", help="benchmark the ML algorithms") 59 | # parse_args defaults to [1:] for args, but you need to 60 | # exclude the rest of the args too, or validation will fail 61 | args = parser.parse_args(sys.argv[1:2]) 62 | 63 | if args.algorithm not in registered_algorithms: 64 | print("Unrecognized algorithm: ", args.algorithm) 65 | parser.print_help() 66 | exit(1) 67 | 68 | self._runner: BenchmarkBase = registered_algorithms[args.algorithm](  # type: ignore 69 | sys.argv[2:] 70 | ) 71 | 72 | def run(self) -> None: 73 | self._runner.run() 74 | 75 | 76 | if __name__ == "__main__": 77 | """ 78 | There are two ways to run the benchmarks. 79 | 80 | 1. 81 | python benchmark_runner.py [linear_regression] \ 82 | --num_gpus=2 \ 83 | --train_path=xxx \ 84 | --spark_confs="spark.master=local[12]" \ 85 | 86 | 2. 87 | spark-submit --master local[12] benchmark_runner.py --num_gpus=2 --train_path=xxx 88 | """ 89 | BenchmarkRunner().run() 90 | -------------------------------------------------------------------------------- /python/benchmark/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
15 | #
16 | 
17 | import os
18 | import sys
19 | 
20 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21 | from tests.conftest import (
22 |     pytest_addoption,
23 |     pytest_collection_modifyitems,
24 |     pytest_configure,
25 | )
26 | 
-------------------------------------------------------------------------------- /python/benchmark/databricks/cpu_cluster_spec.sh: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # needed for bm script arguments
16 | cat <<EOF
[lines 17-72 lost in extraction: the remainder of this heredoc and the beginning of a following databricks init-script upload loop]
73 | ... ${init_script}.updated && \
74 | databricks workspace import --format AUTO --content $(${base64_cmd} ${init_script}.updated) ${INIT_SCRIPT_DIR}/${init_script} --profile ${DB_PROFILE} ${DB_OVERWRITE}
75 | done
76 | 
-------------------------------------------------------------------------------- /python/benchmark/dataproc/README.md: --------------------------------------------------------------------------------
1 | # Benchmarking on Dataproc
2 | 
3 | This directory contains shell scripts for running larger scale benchmarks on a Google Dataproc cluster. You will need a Google Cloud account with Dataproc enabled to run them. The benchmarks use datasets synthetically generated using [gen_data.py](../gen_data.py). For internal users, these have been precomputed and stored in the GCS bucket `gs://spark-rapids-ml-benchmarking/datasets`. By default, the benchmark scripts read the data from this location. External users will need to generate the datasets and then set the `BENCHMARK_DATA_HOME` environment variable in `run_benchmark.sh` to point at them.
4 | 
5 | ## Setup
6 | 
7 | - Install the [gcloud CLI](https://cloud.google.com/sdk/docs/install) and initialize it via `gcloud init`.
8 | 
9 | - Configure the following settings:
10 | ```
11 | export PROJECT=<your_project>
12 | export DATAPROC_REGION=<your_dataproc_region>
13 | export COMPUTE_REGION=<your_compute_region>
14 | export COMPUTE_ZONE=<your_compute_zone>
15 | 
16 | gcloud config set project ${PROJECT}
17 | gcloud config set dataproc/region ${DATAPROC_REGION}
18 | gcloud config set compute/region ${COMPUTE_REGION}
19 | gcloud config set compute/zone ${COMPUTE_ZONE}
20 | ```
21 | 
22 | - Create a GCS bucket if you don't already have one:
23 | ```
24 | export GCS_BUCKET=<your_gcs_bucket>
25 | 
26 | gcloud storage buckets create gs://${GCS_BUCKET}
27 | ```
28 | 
29 | - Upload the benchmarking files to your GCS bucket:
30 | ```
31 | # path to store benchmarking files inside your GCS bucket
32 | export BENCHMARK_HOME=${GCS_BUCKET}/benchmark
33 | 
34 | ./setup.sh
35 | ```
36 | **Note**: this step should be repeated for each new version of the spark-rapids-ml package that you want to test.
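
As a condensed, copy-pasteable recap of the Setup steps above (all values are the placeholders from those steps, not real project or bucket names):

```
export PROJECT=<your_project>
export DATAPROC_REGION=<your_dataproc_region>
export COMPUTE_REGION=<your_compute_region>
export COMPUTE_ZONE=<your_compute_zone>
gcloud config set project ${PROJECT}
gcloud config set dataproc/region ${DATAPROC_REGION}
gcloud config set compute/region ${COMPUTE_REGION}
gcloud config set compute/zone ${COMPUTE_ZONE}

export GCS_BUCKET=<your_gcs_bucket>
gcloud storage buckets create gs://${GCS_BUCKET}

export BENCHMARK_HOME=${GCS_BUCKET}/benchmark
./setup.sh   # repeat this last step for each new spark-rapids-ml version
```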
37 | 
38 | ## Run Benchmarks
39 | 
40 | - Start the cpu or gpu cluster and run all benchmarks:
41 | ```
42 | ./run_benchmark.sh [cpu|gpu] 2>&1 | tee benchmark.log
43 | ```
44 | **Note**: the created cluster is configured to automatically terminate after 30 minutes of idle time, but it can still be manually terminated or deleted via the Dataproc UI.
45 | 
46 | **Note**: monitor benchmark progress periodically; a hung benchmark left unattended will continue to incur cloud costs.
47 | 
48 | - Extract timing information:
49 | ```
50 | egrep -e "[0-9.]* seconds" *.out
51 | ```
52 | 
53 | - Delete the cluster via the Dataproc UI, or via this command line:
54 | ```
55 | gcloud dataproc clusters delete ${USER}-spark-rapids-ml-[cpu|gpu] --region ${COMPUTE_REGION}
56 | ```
57 | 
58 | - **OPTIONAL**: To run a single benchmark manually, search `benchmark.log` for the `gcloud` command line associated with the target benchmark. If needed, start the cluster first. Then copy and paste that command line into your shell.
59 | ```
60 | ./start_cluster.sh [cpu|gpu]
61 | <gcloud command line copied from benchmark.log>
62 | ```
63 | 
-------------------------------------------------------------------------------- /python/benchmark/dataproc/init_benchmark.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | set -euxo pipefail
18 | 
19 | function get_metadata_attribute() {
20 |   local -r attribute_name=$1
21 |   local -r default_value=$2
22 |   /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
23 | }
24 | 
25 | RAPIDS_VERSION=$(get_metadata_attribute rapids-version 25.6.0)
26 | 
27 | 
28 | # install cudf, cuml, and related RAPIDS packages
29 | # using ~= pulls in the latest micro version patches
30 | pip install --upgrade pip
31 | 
32 | pip install cudf-cu12~=${RAPIDS_VERSION} cuml-cu12~=${RAPIDS_VERSION} cuvs-cu12~=${RAPIDS_VERSION} \
33 |     pylibraft-cu12~=${RAPIDS_VERSION} \
34 |     rmm-cu12~=${RAPIDS_VERSION} \
35 |     --extra-index-url=https://pypi.nvidia.com
36 | 
37 | # install benchmark files
38 | BENCHMARK_HOME=$(get_metadata_attribute benchmark-home UNSET)
39 | if [[ ${BENCHMARK_HOME} == "UNSET" ]]; then
40 |   echo "Please set --metadata benchmark-home"
41 |   exit 1
42 | fi
43 | 
44 | gsutil cp gs://${BENCHMARK_HOME}/benchmark_runner.py .
45 | gsutil cp gs://${BENCHMARK_HOME}/spark_rapids_ml.zip .
46 | gsutil cp gs://${BENCHMARK_HOME}/benchmark.zip .
47 | 
48 | python_ver=`python --version | grep -oP '3\.[0-9]+'`
49 | unzip spark_rapids_ml.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
50 | unzip benchmark.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
-------------------------------------------------------------------------------- /python/benchmark/dataproc/setup.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 | # Copyright (c) 2024, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | # copies benchmark files to the GCS bucket
18 | 
19 | if [[ -z $BENCHMARK_HOME ]]; then
20 |   echo "Please export BENCHMARK_HOME per README.md"
21 |   exit 1
22 | fi
23 | 
24 | SPARK_RAPIDS_ML_HOME='../..'
25 | 
26 | echo "**** copying benchmarking related files to ${BENCHMARK_HOME} ****"
27 | 
28 | gsutil cp init_benchmark.sh gs://${BENCHMARK_HOME}/init_benchmark.sh
29 | curl -LO https://raw.githubusercontent.com/GoogleCloudDataproc/initialization-actions/master/spark-rapids/spark-rapids.sh
30 | gsutil cp spark-rapids.sh gs://${BENCHMARK_HOME}/spark-rapids.sh
31 | 
32 | pushd ${SPARK_RAPIDS_ML_HOME}/benchmark
33 | zip -r - benchmark >benchmark.zip
34 | gsutil cp benchmark.zip gs://${BENCHMARK_HOME}/benchmark.zip
35 | popd
36 | 
37 | pushd ${SPARK_RAPIDS_ML_HOME}
38 | gsutil cp benchmark/benchmark_runner.py gs://${BENCHMARK_HOME}/benchmark_runner.py
39 | popd
40 | 
41 | pushd ${SPARK_RAPIDS_ML_HOME}/src
42 | zip -r - spark_rapids_ml >spark_rapids_ml.zip
43 | gsutil cp spark_rapids_ml.zip gs://${BENCHMARK_HOME}/spark_rapids_ml.zip
44 | popd
45 | 
-------------------------------------------------------------------------------- /python/benchmark/dataproc/start_cluster.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | cluster_type=${1:-gpu}
17 | 
18 | # configure arguments
19 | if [[ -z ${COMPUTE_REGION} ]]; then
20 |   echo "Please export COMPUTE_REGION per README.md"
21 |   exit 1
22 | fi
23 | 
24 | if [[ -z ${GCS_BUCKET} ]]; then
25 |   echo "Please export GCS_BUCKET per README.md"
26 |   exit 1
27 | fi
28 | 
29 | BENCHMARK_HOME=${BENCHMARK_HOME:-${GCS_BUCKET}/benchmark}
30 | 
31 | gpu_args=$(cat <<EOF
[lines 32 onward of this file, and the beginning of /python/pyproject.toml, were lost in extraction]
-------------------------------------------------------------------------------- /python/pyproject.toml: --------------------------------------------------------------------------------
52 | [build-system]
53 | requires = ["setuptools>=61.0"]
54 | build-backend = "setuptools.build_meta"
55 | 
56 | [tool.setuptools.package-data]
57 | "spark_rapids_ml.jars" = ["*.jar"]
58 | 
-------------------------------------------------------------------------------- /python/requirements.txt: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | pyspark>=3.2.1,<3.5
16 | scikit-learn>=1.2.1
-------------------------------------------------------------------------------- /python/requirements_dev.txt: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | -r requirements.txt
16 | black>=23.1.0
17 | build>=0.10.0
18 | isort>=5.12.0
19 | mypy>=1.0.0
20 | numpydoc
21 | pydata-sphinx-theme
22 | pylint
23 | pytest
24 | pytest-xdist
25 | sphinx<6.0
26 | twine>=4.0.0
-------------------------------------------------------------------------------- /python/run_plugin_test.sh: --------------------------------------------------------------------------------
1 | #! /bin/bash -e
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # the JVM plugin tests run against pyspark 4.x
17 | pip install pyspark~=4.0
18 | pushd ../jvm
19 | mvn clean test
20 | popd
21 | # restore the pinned pyspark version from the dev requirements
22 | pip install -r requirements_dev.txt
-------------------------------------------------------------------------------- /python/run_test.sh: --------------------------------------------------------------------------------
1 | #! /bin/bash -e
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | unset SPARK_HOME
17 | 
18 | python ../ci/lint_python.py --format --type-check || exit 1
19 | 
20 | total_num_gpus=$(python -c 'import cupy; print(cupy.cuda.runtime.getDeviceCount())')
21 | if [ ${total_num_gpus} -gt 4 ]
22 | then
23 |   echo "Tests use at most 4 GPUs. If they fail, try setting CUDA_VISIBLE_DEVICES."
24 | fi
25 | 
26 | # no package import change tests
27 | # runs on gpu
28 | python -m spark_rapids_ml tests_no_import_change/test_no_import_change.py 0.2
29 | # runs on cpu
30 | python tests_no_import_change/test_no_import_change.py 0.2
31 | # runs on gpu with spark-submit (note: local[1] hangs with spark-rapids-submit, probably due to barrier rdd timer threads; root cause TBD)
32 | spark-rapids-submit --master local-cluster[1,1,1024] tests_no_import_change/test_no_import_change.py 0.2
33 | # runs on cpu with spark-submit
34 | spark-submit --master local-cluster[1,1,1024] tests_no_import_change/test_no_import_change.py 0.2
35 | 
36 | 
37 | # calculate pytest parallelism by following https://github.com/NVIDIA/spark-rapids/tree/main/integration_tests/run_pyspark_from_build.sh
38 | MAX_PARALLEL=3
39 | NVIDIA_SMI_ARGS=""
40 | if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
41 |   NVIDIA_SMI_ARGS="${NVIDIA_SMI_ARGS} -i ${CUDA_VISIBLE_DEVICES}"
42 | fi
43 | GPU_MEM_PARALLEL=`nvidia-smi ${NVIDIA_SMI_ARGS} --query-gpu=memory.free --format=csv,noheader | awk 'NR == 1 { MIN = $1 } { if ($1 < MIN) { MIN = $1 } } END { print int((MIN - 2 * 1024) / ((3 * 1024) + 750)) }'`
44 | CPU_CORES=`nproc`
45 | TMP_PARALLEL=$(( $GPU_MEM_PARALLEL > $CPU_CORES ? $CPU_CORES : $GPU_MEM_PARALLEL ))
46 | TMP_PARALLEL=$(( $TMP_PARALLEL > $MAX_PARALLEL ? $MAX_PARALLEL : $TMP_PARALLEL ))
47 | if (( $TMP_PARALLEL <= 1 )); then
48 |   TEST_PARALLEL=1
49 | else
50 |   TEST_PARALLEL=$TMP_PARALLEL
51 | fi
52 | echo "Test functions in benchmark/test_gen_data.py and tests/ directory will be executed in parallel with ${TEST_PARALLEL} pytest workers"
53 | 
54 | echo "use --runslow to run all tests"
55 | pytest "$@" -n ${TEST_PARALLEL} benchmark/test_gen_data.py
56 | PYTHONPATH=`pwd`/benchmark pytest -ra "$@" -n ${TEST_PARALLEL} --durations=10 tests
57 | #PYTHONPATH=`pwd`/benchmark pytest -ra --runslow -n ${TEST_PARALLEL} --durations=10 tests
58 | #PYTHONPATH=`pwd`/benchmark pytest -ra "$@" --durations=10 tests_large
-------------------------------------------------------------------------------- /python/setup.cfg: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | [mypy]
16 | ignore_missing_imports = True
17 | disallow_untyped_defs = True
18 | follow_imports = silent
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | __version__ = "25.06.0"
17 | 
18 | import pandas as pd
19 | import pyspark
20 | 
21 | # patch pandas 2.0+ for backward compatibility with pyspark < 3.4
22 | from packaging import version
23 | 
24 | if version.parse(pyspark.__version__) < version.parse("3.4.0") and version.parse(
25 |     pd.__version__
26 | ) >= version.parse("2.0.0"):
27 |     pd.DataFrame.iteritems = pd.DataFrame.items
28 |     pd.Series.iteritems = pd.Series.items
29 | 
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/__main__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2024, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import argparse
18 | import runpy
19 | import sys
20 | 
21 | import spark_rapids_ml.install
22 | 
23 | 
24 | # borrowed from rapids cudf.pandas
25 | def main() -> None:
26 |     parser = argparse.ArgumentParser(
27 |         prog="python -m spark_rapids_ml",
28 |         description=(
29 |             "Run a Python script with Spark RAPIDS ML enabled. "
30 |             "In this mode, supported pyspark.ml estimator imports will automatically use GPU accelerated implementations."
31 |         ),
32 |     )
33 | 
34 |     parser.add_argument(
35 |         "-m",
36 |         dest="module",
37 |         nargs=1,
38 |     )
39 |     parser.add_argument(
40 |         "args",
41 |         nargs=argparse.REMAINDER,
42 |         help="Arguments to pass on to the script",
43 |     )
44 | 
45 |     args = parser.parse_args()
46 | 
47 |     if args.module:
48 |         (module,) = args.module
49 |         # run the module passing the remaining arguments
50 |         # as if it were run with python -m
51 |         sys.argv[:] = [module] + args.args  # not thread safe?
52 |         runpy.run_module(module, run_name="__main__")
53 |     elif len(args.args) >= 1:
54 |         # Remove ourselves from argv and continue
55 |         sys.argv[:] = args.args
56 |         runpy.run_path(args.args[0], run_name="__main__")
57 |     else:
58 |         parser.print_help()
59 |         exit(1)
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     main()
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/common/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2022, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/install.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2024-2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import importlib
18 | import sys
19 | import types
20 | from typing import Any
21 | 
22 | _accelerated_attributes = {
23 |     "feature": ["PCA", "PCAModel"],
24 |     "clustering": ["KMeans", "KMeansModel"],
25 |     "classification": [
26 |         "LogisticRegression",
27 |         "LogisticRegressionModel",
28 |         "RandomForestClassifier",
29 |         "RandomForestClassificationModel",
30 |     ],
31 |     "regression": [
32 |         "LinearRegression",
33 |         "LinearRegressionModel",
34 |         "RandomForestRegressor",
35 |         "RandomForestRegressionModel",
36 |     ],
37 |     "tuning": ["CrossValidator"],
38 |     "pipeline": ["Pipeline"],
39 | }
40 | 
41 | 
42 | _rapids_modules = {
43 |     module_name: importlib.import_module(f"spark_rapids_ml.{module_name}")
44 |     for module_name in _accelerated_attributes.keys()
45 | }
46 | _pyspark_modules = {
47 |     module_name: importlib.import_module(f"pyspark.ml.{module_name}")
48 |     for module_name in _accelerated_attributes.keys()
49 | }
50 | 
51 | 
52 | def _set_pyspark_mod_getattr(mod_name: str) -> None:
53 |     proxy_module = types.ModuleType(f"pyspark.ml.{mod_name}")
54 | 
55 |     def _getattr(attr: str) -> Any:
56 |         frame = sys._getframe()
57 |         assert frame.f_back
58 |         calling_path = frame.f_back.f_code.co_filename
59 |         # calls made from within pyspark.ml or spark_rapids_ml themselves, and
60 |         # attributes without accelerated equivalents, resolve to original pyspark
61 |         if any(
62 |             f"pyspark/ml/{m}" in calling_path or f"spark_rapids_ml/{m}" in calling_path
63 |             for m in _accelerated_attributes.keys()
64 |         ) or (attr not in _accelerated_attributes[mod_name]):
65 |             try:
66 |                 attr_val = getattr(_pyspark_modules[mod_name], attr)
67 |             except AttributeError:
68 |                 raise AttributeError(f"No attribute '{attr}'")
69 | 
70 |             return attr_val
71 |         else:
72 |             return getattr(_rapids_modules[mod_name], attr)
73 | 
74 |     setattr(proxy_module, "__getattr__", _getattr)
75 |     sys.modules[f"pyspark.ml.{mod_name}"] = proxy_module
76 | 
77 | 
78 | for mod_name in _accelerated_attributes.keys():
79 |     _set_pyspark_mod_getattr(mod_name)
80 | 
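The proxy modules installed above are what power the package's no-import-change mode described in `__main__.py`. As a rough usage sketch (the script name is a placeholder; the `--master` value mirrors the invocation used in `run_test.sh`), unmodified `pyspark.ml` code can be launched through the provided entry points:

```
# run an existing pyspark script with supported pyspark.ml imports accelerated
python -m spark_rapids_ml my_script.py

# or launch it through the spark-submit wrapper installed with the package
spark-rapids-submit --master local-cluster[1,1,1024] my_script.py
```

-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/metrics/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2024, NVIDIA CORPORATION.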
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from collections import namedtuple
18 | from dataclasses import dataclass
19 | from typing import Optional
20 | 
21 | # Global parameter used by core and subclasses.
22 | TransformEvaluateMetric = namedtuple(
23 |     "TransformEvaluateMetric", ("accuracy_like", "log_loss", "regression")
24 | )
25 | transform_evaluate_metric = TransformEvaluateMetric(
26 |     "accuracy_like", "log_loss", "regression"
27 | )
28 | 
29 | 
30 | @dataclass
31 | class EvalMetricInfo:
32 |     """Holds information about Spark evaluators that is passed to
33 |     transform_evaluate local computations."""
34 | 
35 |     # MulticlassClassificationEvaluator
36 |     eps: float = 1.0e-15  # logLoss
37 |     # BinaryClassificationEvaluator - placeholder until it is supported
38 |     numBins: int = 1000
39 | 
40 |     eval_metric: Optional[str] = None
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/pyspark_rapids.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import os
18 | import subprocess
19 | import sys
20 | 
21 | import spark_rapids_ml
22 | 
23 | 
24 | def main_cli() -> None:
25 | 
26 |     # scan past leading pyspark options: boolean flags advance one token,
27 |     # options that take a value advance two
28 |     i = 1
29 |     while i < len(sys.argv) and sys.argv[i].startswith("-"):
30 |         if sys.argv[i] in ["--help", "-h", "--version"]:
31 |             output = subprocess.run(
32 |                 f"pyspark {sys.argv[i]}", shell=True, capture_output=True
33 |             ).stderr
34 |             output_str = output.decode("utf-8")
35 |             output_str = output_str.replace("pyspark", "pyspark-rapids")
36 |             print(output_str, file=sys.stderr)
37 |             exit(0)
38 |         elif sys.argv[i] in ["--verbose", "-v", "--supervise"]:
39 |             i += 1
40 |         else:
41 |             i += 2
42 | 
43 |     command_line = "pyspark " + " ".join(sys.argv[1:])
44 |     env = dict(os.environ)
45 |     # run install.py at interpreter startup so pyspark.ml imports are proxied
46 |     env["PYTHONSTARTUP"] = f"{spark_rapids_ml.__path__[0]}/install.py"
47 |     subprocess.run(command_line, shell=True, env=env)
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/spark_rapids_submit.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import subprocess 18 | import sys 19 | 20 | import spark_rapids_ml 21 | 22 | 23 | def main_cli() -> None: 24 | i = 1 25 | while i < len(sys.argv) and sys.argv[i].startswith("-"): 26 | if sys.argv[i] in ["--help", "-h", "--version"]: 27 | output = subprocess.run( 28 | f"spark-submit {sys.argv[i]}", shell=True, capture_output=True 29 | ).stderr 30 | output_str = output.decode("utf-8") 31 | output_str = output_str.replace("spark-submit", "spark-rapids-submit") 32 | print(output_str, file=sys.stderr) 33 | exit(0) 34 | elif sys.argv[i] in ["--verbose", "-v", "--supervise"]: 35 | i += 1 36 | else: 37 | i += 2 38 | 39 | if i >= len(sys.argv): 40 | raise ValueError("No application file supplied.") 41 | 42 | command_line = ( 43 | "spark-submit " 44 | + " ".join(sys.argv[1:i]) 45 | + f" {spark_rapids_ml.__path__[0]}/__main__.py " 46 | + " ".join(sys.argv[i:]) 47 | ) 48 | 49 | subprocess.run(command_line, shell=True) 50 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/tests/discover_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if ! 
command -v nvidia-smi &> /dev/null 18 | then 19 | # default to the first GPU 20 | echo "{\"name\":\"gpu\",\"addresses\":[\"0\"]}" 21 | exit 22 | else 23 | # https://github.com/apache/spark/blob/master/examples/src/main/scripts/getGpusResources.sh 24 | ADDRS=`nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e'$!ba' -e 's/\n/","/g'` 25 | echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]} 26 | fi -------------------------------------------------------------------------------- /python/tests/sparksession.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from typing import Any, Dict 17 | 18 | from pyspark.sql import SparkSession 19 | 20 | from .conftest import _default_conf, get_spark_i_know_what_i_am_doing 21 | 22 | 23 | # sparksession.py is copied from spark-rapids 24 | def _from_scala_map(scala_map) -> Dict[str, Any]: # type: ignore 25 | ret = {} 26 | # The value we get is a scala map, not a java map, so we need to jump through some hoops 27 | keys = scala_map.keys().iterator() # type: ignore 28 | while keys.hasNext(): # type: ignore 29 | key = keys.next() # type: ignore 30 | ret[key] = scala_map.get(key).get() # type: ignore 31 | return ret # type: ignore 32 | 33 | 34 | _spark = get_spark_i_know_what_i_am_doing() 35 | # Have to reach into a private member to get access to the API we need 36 | _orig_conf = _from_scala_map(_spark.conf._jconf.getAll()) # type: ignore 37 | _orig_conf_keys = _orig_conf.keys() # type: ignore 38 | 39 | 40 | class CleanSparkSession: 41 | """ 42 | A context manager to auto reset spark conf. 43 | """ 44 | 45 | def __init__(self, conf: Dict[str, Any] = {}) -> None: 46 | self.conf = conf 47 | self.spark = _spark 48 | 49 | def __enter__(self) -> SparkSession: 50 | self._reset_spark_session_conf() 51 | self._set_all_confs(self.conf) 52 | return self.spark 53 | 54 | def __exit__(self, *args: Any) -> None: 55 | self._reset_spark_session_conf() 56 | 57 | def _set_all_confs(self, conf: Dict[str, Any]) -> None: 58 | newconf = _default_conf.copy() 59 | newconf.update(conf) 60 | for key, value in newconf.items(): 61 | if self.spark.conf.get(key, None) != value: 62 | self.spark.conf.set(key, value) 63 | 64 | def _reset_spark_session_conf(self) -> None: 65 | """Reset all of the configs for a given spark session.""" 66 | self._set_all_confs(_orig_conf) 67 | # Have to reach into a private member to get access to the API we need 68 | current_keys = _from_scala_map(self.spark.conf._jconf.getAll()).keys() # type: ignore 69 | for key in current_keys: 70 | if key not in _orig_conf_keys: 71 | self.spark.conf.unset(key) 72 | -------------------------------------------------------------------------------- /python/tests/test_tuning.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from typing import Tuple, Union 17 | 18 | import numpy as np 19 | import pytest 20 | from pyspark.ml.evaluation import RegressionEvaluator 21 | from pyspark.ml.tuning import CrossValidatorModel, ParamGridBuilder 22 | 23 | from spark_rapids_ml.regression import RandomForestRegressor 24 | from spark_rapids_ml.tuning import CrossValidator 25 | 26 | from .sparksession import CleanSparkSession 27 | from .utils import ( 28 | create_pyspark_dataframe, 29 | feature_types, 30 | idfn, 31 | make_regression_dataset, 32 | ) 33 | 34 | 35 | @pytest.mark.parametrize("feature_type", [feature_types.vector]) 36 | @pytest.mark.parametrize("data_type", [np.float32]) 37 | @pytest.mark.parametrize("data_shape", [(100, 8)], ids=idfn) 38 | def test_crossvalidator( 39 | tmp_path: str, 40 | feature_type: str, 41 | data_type: np.dtype, 42 | data_shape: Tuple[int, int], 43 | ) -> None: 44 | X, _, y, _ = make_regression_dataset( 45 | datatype=data_type, 46 | nrows=data_shape[0], 47 | ncols=data_shape[1], 48 | ) 49 | 50 | with CleanSparkSession() as spark: 51 | df, features_col, label_col = create_pyspark_dataframe( 52 | spark, feature_type, data_type, X, y 53 | ) 54 | assert label_col is not None 55 | 56 | rfc = RandomForestRegressor() 57 | rfc.setFeaturesCol(features_col) 58 | rfc.setLabelCol(label_col) 59 | 60 | evaluator = RegressionEvaluator() 61 | evaluator.setLabelCol(label_col) 62 | 63 | grid = ParamGridBuilder().addGrid(rfc.maxBins, [3, 5]).build() 64 | 65 | cv = CrossValidator( 66 | estimator=rfc, 67 | estimatorParamMaps=grid, 68 | evaluator=evaluator, 69 | numFolds=2, 70 | seed=101, 71 | ) 72 | 73 | def check_cv(cv_est: Union[CrossValidator, CrossValidatorModel]) -> None: 74 | assert isinstance(cv_est, (CrossValidator, CrossValidatorModel)) 75 | assert isinstance(cv_est.getEstimator(), RandomForestRegressor) 76 | assert isinstance(cv_est.getEvaluator(), RegressionEvaluator) 77 | assert cv_est.getNumFolds() == 2 78 | assert cv_est.getSeed() == 101 79 | assert cv_est.getEstimatorParamMaps() == grid 80 | 81 | check_cv(cv) 82 | 83 | path = tmp_path + "/cv" 84 | cv_path = f"{path}/cv" 85 | 86 | cv.write().overwrite().save(cv_path) 87 | cv_loaded = CrossValidator.load(cv_path) 88 | 89 | check_cv(cv_loaded) 90 | 91 | cv_model = cv.fit(df) 92 | check_cv(cv_model) 93 | 94 | cv_model_path = f"{path}/cv-model" 95 | cv_model.write().overwrite().save(cv_model_path) 96 | cv_model_loaded = CrossValidatorModel.load(cv_model_path) 97 | 98 | check_cv(cv_model_loaded) 99 | assert evaluator.evaluate(cv_model.transform(df)) == evaluator.evaluate( 100 | cv_model_loaded.transform(df) 101 | ) 102 | -------------------------------------------------------------------------------- /python/tests_large/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/tests_large/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import logging 18 | import os 19 | 20 | from pyspark.sql import SparkSession 21 | 22 | _cpu_number = 32 23 | _default_conf = { 24 | "spark.master": f"local[{_cpu_number}]", 25 | "spark.python.worker.reuse": "false", 26 | "spark.driver.host": "127.0.0.1", 27 | "spark.task.maxFailures": "1", 28 | "spark.driver.memory": "128g", 29 | "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false", 30 | "spark.sql.pyspark.jvmStacktrace.enabled": "true", 31 | "spark.sql.execution.arrow.pyspark.enabled": "true", 32 | "spark.rapids.ml.uvm.enabled": True, 33 | } 34 | 35 | 36 | def _get_spark() -> SparkSession: 37 | builder = SparkSession.builder.appName( 38 | name="spark-rapids-ml with tests on large datasets" 39 | ) 40 | for k, v in _default_conf.items(): 41 | builder.config(k, v) 42 | spark = builder.getOrCreate() 43 | spark.sparkContext.setLogLevel("WARN") 44 | logging.getLogger("pyspark").setLevel(logging.WARN) 45 | return spark 46 | 47 | 48 | _spark = _get_spark() 49 | -------------------------------------------------------------------------------- /thirdparty/LICENSES/LICENSE.scikit_learn: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2007-2022 The scikit-learn developers. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | --------------------------------------------------------------------------------