├── .github └── workflows │ ├── auto-merge.yml │ ├── blossom-ci.yml │ ├── gcs-benchmark.yml │ ├── license-header-check.yml │ └── signoff-check.yml ├── .gitignore ├── .pylintrc ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── NOTICE-binary ├── README.md ├── SECURITY.md ├── ci ├── Dockerfile ├── Jenkinsfile.premerge ├── deploy.sh ├── docs.sh ├── lint_python.py └── test.sh ├── deprecated ├── README.md ├── native │ ├── CMakeLists.txt │ └── src │ │ ├── CMakeLists.txt │ │ ├── rapidsml_jni.cpp │ │ ├── rapidsml_jni.cu │ │ └── rapidsml_jni.hpp ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── nvidia │ │ │ └── spark │ │ │ └── ml │ │ │ └── linalg │ │ │ └── JniRAPIDSML.java │ └── scala │ │ ├── com │ │ └── nvidia │ │ │ └── spark │ │ │ └── ml │ │ │ └── feature │ │ │ └── PCA.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ ├── feature │ │ └── RapidsPCA.scala │ │ └── linalg │ │ ├── RAPIDSML.scala │ │ └── distributed │ │ └── RapidsRowMatrix.scala │ └── test │ ├── resources │ └── log4j.properties │ └── scala │ ├── com │ └── nvidia │ │ └── spark │ │ └── ml │ │ └── feature │ │ └── PCASuite.scala │ └── org │ └── apache │ └── spark │ └── ml │ └── util │ └── RapidsMLTest.scala ├── docker ├── Dockerfile ├── Dockerfile.pip ├── Dockerfile.python └── README.md ├── docs ├── Makefile ├── requirements.txt ├── site │ ├── 404.html │ ├── FAQ.md │ ├── _config.yml │ ├── api │ │ └── index.md │ ├── compatibility.md │ ├── configuration.md │ ├── contact.md │ ├── get-started │ │ ├── databricks.md │ │ ├── dataproc.md │ │ ├── emr.md │ │ ├── index.md │ │ ├── local.md │ │ └── spark_connect.md │ ├── index.md │ ├── performance.md │ └── security.md └── source │ ├── _templates │ └── autosummary │ │ ├── class.rst │ │ └── class_with_docs.rst │ ├── conf.py │ ├── index.rst │ └── spark_rapids_ml.rst ├── jvm ├── .gitignore ├── README.md ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ ├── org.apache.spark.ml.Estimator │ │ │ └── org.apache.spark.ml.Transformer │ └── scala │ │ ├── com │ │ └── nvidia │ │ │ └── rapids │ │ │ └── ml │ │ │ ├── Plugin.scala │ │ │ ├── RapidsKMeans.scala │ │ │ ├── RapidsLinearRegression.scala │ │ │ ├── RapidsLogisticRegression.scala │ │ │ ├── RapidsPCA.scala │ │ │ ├── RapidsRandomForestClassifier.scala │ │ │ ├── RapidsRandomForestRegressor.scala │ │ │ └── RapidsTraits.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ ├── clustering │ │ └── rapids │ │ │ └── RapidsKMeansModel.scala │ │ └── rapids │ │ ├── ModelHelper.scala │ │ ├── PythonEstimatorRunner.scala │ │ ├── PythonModelRunner.scala │ │ ├── RapidsLinearRegressionModel.scala │ │ ├── RapidsLogisticRegressionModel.scala │ │ ├── RapidsModel.scala │ │ ├── RapidsPCAModel.scala │ │ ├── RapidsRandomForestClassificationModel.scala │ │ ├── RapidsRandomForestRegressionModel.scala │ │ └── Utils.scala │ └── test │ └── scala │ └── com │ └── nvidia │ └── rapids │ └── ml │ └── SparkRapidsMLSuite.scala ├── notebooks ├── README.md ├── approx-nearest-neighbors.ipynb ├── aws-emr │ ├── README.md │ ├── init-bootstrap-action.sh │ └── init-configurations.json ├── cv-rf-regressor.ipynb ├── databricks │ ├── README.md │ └── init-pip-cuda-11.8.sh ├── dataproc │ ├── README.md │ └── spark_rapids_ml.sh ├── dbscan.ipynb ├── kmeans-no-import-change.ipynb ├── kmeans.ipynb ├── knn.ipynb ├── linear-regression.ipynb ├── logistic-regression.ipynb ├── pca.ipynb ├── random-forest-classification.ipynb ├── random-forest-regression.ipynb ├── spark-compat.ipynb └── umap.ipynb ├── python ├── README.md ├── benchmark │ ├── README.md │ 
├── aws-emr │ │ ├── README.md │ │ ├── cpu-init-configurations.json │ │ ├── run_benchmark.sh │ │ ├── setup.sh │ │ └── start_cluster.sh │ ├── benchmark │ │ ├── __init__.py │ │ ├── base.py │ │ ├── bench_approximate_nearest_neighbors.py │ │ ├── bench_dbscan.py │ │ ├── bench_kmeans.py │ │ ├── bench_linear_regression.py │ │ ├── bench_logistic_regression.py │ │ ├── bench_nearest_neighbors.py │ │ ├── bench_pca.py │ │ ├── bench_random_forest.py │ │ ├── bench_umap.py │ │ ├── utils.py │ │ └── utils_knn.py │ ├── benchmark_runner.py │ ├── conftest.py │ ├── databricks │ │ ├── README.md │ │ ├── benchmark_utils.sh │ │ ├── cpu_cluster_spec.sh │ │ ├── gpu_cluster_spec.sh │ │ ├── gpu_etl_cluster_spec.sh │ │ ├── init-cpu.sh │ │ ├── init-pip-cuda-11.8.sh │ │ ├── process_bm_log.sh │ │ ├── results │ │ │ └── running_times.png │ │ ├── run_benchmark.sh │ │ └── setup.sh │ ├── dataproc │ │ ├── README.md │ │ ├── init_benchmark.sh │ │ ├── run_benchmark.sh │ │ ├── setup.sh │ │ └── start_cluster.sh │ ├── gen_data.py │ ├── gen_data_distributed.py │ └── test_gen_data.py ├── pyproject.toml ├── requirements.txt ├── requirements_dev.txt ├── run_benchmark.sh ├── run_plugin_test.sh ├── run_test.sh ├── setup.cfg ├── src │ └── spark_rapids_ml │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── classification.py │ │ ├── clustering.py │ │ ├── common │ │ ├── __init__.py │ │ └── cuml_context.py │ │ ├── connect_plugin.py │ │ ├── core.py │ │ ├── feature.py │ │ ├── install.py │ │ ├── knn.py │ │ ├── metrics │ │ ├── MulticlassMetrics.py │ │ ├── RegressionMetrics.py │ │ └── __init__.py │ │ ├── params.py │ │ ├── pipeline.py │ │ ├── pyspark_rapids.py │ │ ├── regression.py │ │ ├── spark_rapids_submit.py │ │ ├── tree.py │ │ ├── tuning.py │ │ ├── umap.py │ │ └── utils.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── discover_gpu.sh │ ├── sparksession.py │ ├── test_approximate_nearest_neighbors.py │ ├── test_benchmark.py │ ├── test_common_estimator.py │ ├── test_dbscan.py │ ├── test_kmeans.py │ ├── test_linear_model.py │ ├── test_logistic_regression.py │ ├── test_metrics.py │ ├── test_nearest_neighbors.py │ ├── test_pca.py │ ├── test_pipeline.py │ ├── test_random_forest.py │ ├── test_tuning.py │ ├── test_ucx.py │ ├── test_umap.py │ ├── test_utils.py │ └── utils.py ├── tests_large │ ├── __init__.py │ ├── conftest.py │ └── test_large_logistic_regression.py └── tests_no_import_change │ └── test_no_import_change.py └── thirdparty └── LICENSES ├── LICENSE.cuml ├── LICENSE.scikit_learn ├── LICENSE.spark └── LICENSE.xgboost /.github/workflows/auto-merge.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # A workflow to keep BASE branch up-to-date from HEAD branch 16 | name: auto-merge HEAD to BASE 17 | 18 | on: 19 | pull_request_target: 20 | branches: 21 | - branch-* 22 | types: [closed] 23 | 24 | jobs: 25 | auto-merge: 26 | if: github.event.pull_request.merged == true 27 | uses: NVIDIA/spark-rapids-common/.github/workflows/auto-merge.yml@main 28 | with: 29 | owner: ${{ github.repository_owner }} 30 | repo: spark-rapids-ml 31 | branch: ${{ github.event.pull_request.base.ref }} 32 | secrets: 33 | token: ${{ secrets.AUTOMERGE_TOKEN }} 34 | -------------------------------------------------------------------------------- /.github/workflows/gcs-benchmark.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023-2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # A workflow to trigger gcs tests 16 | name: GCS Benchmark Test 17 | 18 | on: 19 | workflow_dispatch: 20 | inputs: 21 | computer_region: 22 | required: true 23 | default: 'us-central1' 24 | description: 'COMPUTER REGION' 25 | schedule: 26 | - cron: "0 13 * * 1" 27 | 28 | jobs: 29 | Benchmark: 30 | if: github.repository == 'NVIDIA/spark-rapids-ml' 31 | runs-on: ubuntu-latest 32 | env: 33 | PROJECT: rapids-spark 34 | DATAPROC_REGION: us-central1 35 | COMPUTE_REGION: ${{ inputs.computer_region || 'us-central1' }} 36 | COMPUTE_ZONE: us-central1-a 37 | GCS_BUCKET: spark-rapids-ml-benchmarking 38 | KEY_FILE_CONTENT: ${{ secrets.GCLOUD_PRIVATE_KEY }} 39 | SERVICE_ACCOUNT: ${{ secrets.GCLOUD_SERVICE_ACCOUNT }} 40 | CLUSTER_NAME: github-spark-rapids-ml-${{github.run_number}} 41 | steps: 42 | - uses: actions/checkout@v4 43 | 44 | - name: run benchmark 45 | shell: bash 46 | run: | 47 | set -x 48 | cat <<< $KEY_FILE_CONTENT > key.json 49 | gcloud auth activate-service-account $SERVICE_ACCOUNT --key-file key.json 50 | gcloud config set project $PROJECT 51 | gcloud config set dataproc/region $DATAPROC_REGION 52 | gcloud config set compute/region $COMPUTE_REGION 53 | gcloud config set compute/zone $COMPUTE_ZONE 54 | export BENCHMARK_HOME=$GCS_BUCKET/benchmark 55 | cd python/benchmark/dataproc 56 | ./setup.sh 57 | ./run_benchmark.sh 58 | 59 | - name: delete cluster 60 | if: ${{ always() }} 61 | shell: bash 62 | continue-on-error: true 63 | run: | 64 | set -x 65 | cat <<< $KEY_FILE_CONTENT > key.json 66 | gcloud auth activate-service-account $SERVICE_ACCOUNT --key-file key.json 67 | gcloud config set project $PROJECT 68 | echo y | gcloud dataproc clusters delete $CLUSTER_NAME --region $COMPUTE_REGION 69 | -------------------------------------------------------------------------------- /.github/workflows/license-header-check.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # A workflow to check copyright/license header 16 | name: license header check 17 | 18 | on: 19 | pull_request: 20 | types: [opened, synchronize, reopened] 21 | 22 | jobs: 23 | license-header-check: 24 | runs-on: ubuntu-latest 25 | if: "!contains(github.event.pull_request.title, '[bot]')" 26 | steps: 27 | - name: Get checkout depth 28 | run: | 29 | echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV 30 | 31 | - name: Checkout code 32 | uses: actions/checkout@v4 33 | with: 34 | fetch-depth: ${{ env.PR_FETCH_DEPTH }} 35 | 36 | - name: license-header-check 37 | uses: NVIDIA/spark-rapids-common/license-header-check@main 38 | with: 39 | included_file_patterns: | 40 | *.sh, 41 | *.py, 42 | *.toml, 43 | *.cfg, 44 | *Dockerfile*, 45 | *Jenkinsfile*, 46 | *.yml, 47 | *.txt, 48 | *.xml 49 | excluded_file_patterns: | 50 | thirdparty/* 51 | -------------------------------------------------------------------------------- /.github/workflows/signoff-check.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # A workflow to check if PR got sign-off 16 | name: signoff check 17 | 18 | on: 19 | pull_request_target: 20 | types: [opened, synchronize, reopened] 21 | 22 | jobs: 23 | signoff-check: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: signoff 27 | uses: NVIDIA/spark-rapids-common/signoff-check@main 28 | with: 29 | owner: ${{ github.repository_owner }} 30 | repo: spark-rapids-ml 31 | pull_number: ${{ github.event.number }} 32 | token: ${{ secrets.GITHUB_TOKEN }} 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#*# 2 | *.#* 3 | *.iml 4 | *.ipr 5 | *.iws 6 | *.pyc 7 | *.pyo 8 | *.swp 9 | *~ 10 | .DS_Store 11 | .cache 12 | .classpath 13 | .ensime 14 | .ensime_cache/ 15 | .ensime_lucene 16 | .generated-mima* 17 | .idea/ 18 | .idea_modules/ 19 | .project 20 | .pydevproject 21 | .scala_dependencies 22 | .settings 23 | hs_err*.log 24 | dependency-reduced-pom.xml 25 | scalastyle-on-compile.generated.xml 26 | scalastyle-output.xml 27 | scalastyle.txt 28 | target/ 29 | */metastore_db 30 | */spark-warehouse 31 | */.vscode 32 | */.clang-format 33 | __pycache__/ 34 | dist/ 35 | docs/build/ 36 | */.ipynb_checkpoints/ 37 | *.egg-info/ 38 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | ignore=tests 4 | 5 | extension-pkg-whitelist=numpy 6 | 7 | disable=unexpected-special-method-signature,too-many-nested-blocks,useless-object-inheritance,import-outside-toplevel,unsubscriptable-object,attribute-defined-outside-init 8 | 9 | dummy-variables-rgx=(unused|)_.* 10 | 11 | reports=no 12 | 13 | [BASIC] 14 | 15 | # Enforce naming convention 16 | const-naming-style=UPPER_CASE 17 | class-naming-style=PascalCase 18 | function-naming-style=snake_case 19 | method-naming-style=snake_case 20 | attr-naming-style=snake_case 21 | argument-naming-style=snake_case 22 | variable-naming-style=snake_case 23 | class-attribute-naming-style=snake_case 24 | 25 | # Allow single-letter variables 26 | variable-rgx=[a-zA-Z_][a-z0-9_]{0,30}$ 27 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | RAPIDS ML for Apache Spark 2 | Copyright (c) 2021, NVIDIA CORPORATION 3 | 4 | // ------------------------------------------------------------------ 5 | // NOTICE file corresponding to the section 4d of The Apache License, 6 | // Version 2.0, in this case for 7 | // ------------------------------------------------------------------ 8 | 9 | Apache Spark 10 | Copyright 2014 and onwards The Apache Software Foundation 11 | 12 | This product includes software developed at 13 | The Apache Software Foundation (http://www.apache.org/). 
14 | 15 | --------------------------------------------------------------------- 16 | 17 | raft - RAPIDS Analytics Frameworks Toolset 18 | Copyright 2020 NVIDIA Corporation 19 | 20 | --------------------------------------------------------------------- 21 | 22 | cuML - RAPIDS Machine Learning Library 23 | Copyright 2018 NVIDIA CORPORATION -------------------------------------------------------------------------------- /NOTICE-binary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/spark-rapids-ml/7267705b4f226f0b579844116f4cb72249e64a27/NOTICE-binary -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Rapids ML 2 | 3 | Spark Rapids ML enables GPU accelerated distributed machine learning on [Apache Spark](https://spark.apache.org/). It provides several PySpark ML compatible algorithms powered by the [RAPIDS cuML](https://docs.rapids.ai/api/cuml/stable/) library. 4 | 5 | These APIs seek to minimize any code changes to end-user Spark code. After your environment is configured to support GPUs (with drivers, CUDA toolkit, and RAPIDS dependencies), you should be able to just change an import statement or class name to take advantage of GPU acceleration. See [here](./python/README.md#clis-enabling-no-package-import-change) for experimental CLIs that enable GPU acceleration without the need for changing the `pyspark.ml` package names in an existing PySpark ML application. 6 | 7 | [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/NVIDIA/spark-rapids-ml) 8 | 9 | **Python** 10 | ```python 11 | # from pyspark.ml.feature import PCA 12 | from spark_rapids_ml.feature import PCA 13 | 14 | pca = ( 15 | PCA() 16 | .setK(3) 17 | .setInputCol("features") 18 | .setOutputCol("pca_features") 19 | ) 20 | pca.fit(df) 21 | ``` 22 | 23 | ## Supported Algorithms 24 | 25 | The following table shows the currently supported algorithms. The goal is to expand this over time with support from the underlying RAPIDS cuML libraries. If you would like support for a specific algorithm, please file a [GitHub issue](https://github.com/NVIDIA/spark-rapids-ml/issues) to help us prioritize. 26 | 27 | | Supported Algorithms | Python | 28 | | :--------------------- | :----: | 29 | | CrossValidator | √ | 30 | | DBSCAN (*) | √ | 31 | | KMeans | √ | 32 | | approx/exact k-NN (*) | √ | 33 | | LinearRegression | √ | 34 | | LogisticRegression | √ | 35 | | PCA | √ | 36 | | RandomForestClassifier | √ | 37 | | RandomForestRegressor | √ | 38 | | UMAP (*) | √ | 39 | 40 | (*) Notes: 41 | - As an alternative to KMeans, we also provide a Spark API for GPU accelerated Density-Based Spatial Clustering of Applications with Noise (DBSCAN), a density-based clustering algorithm in the RAPIDS cuML library. 42 | - Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. 43 | - As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. 44 | 45 | ## Getting started 46 | 47 | For PySpark (Python) users, see [this guide](python/README.md).
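The same import-swap pattern shown above carries over to the other algorithms in the table. As a further illustration, here is a minimal sketch using KMeans; it assumes an existing SparkSession, a DataFrame `df` with a `features` vector column, and a GPU-enabled environment configured as described in the guide above:

```python
# from pyspark.ml.clustering import KMeans
from spark_rapids_ml.clustering import KMeans

kmeans = (
    KMeans()
    .setK(4)
    .setFeaturesCol("features")
    .setPredictionCol("cluster")
)
model = kmeans.fit(df)           # fit runs on GPU via cuML
clustered = model.transform(df)  # appends the "cluster" prediction column
```

As with the PCA example, only the import line differs from baseline PySpark ML code.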
48 | 49 | ## Performance 50 | 51 | GPU acceleration can provide significant performance and cost benefits. Benchmarking instructions and results can be found [here](python/benchmark/README.md). 52 | 53 | ## Contributing 54 | 55 | We welcome community contributions! Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) to get started. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all 4 | source code repositories managed through our organization. 5 | 6 | If you need to report a security issue, please use the appropriate contact points outlined 7 | below. **Please do not report security vulnerabilities through GitHub/GitLab.** 8 | 9 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 10 | 11 | To report a potential security vulnerability in any NVIDIA product: 12 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 13 | - E-Mail: psirt@nvidia.com 14 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 15 | - Please include the following information: 16 | - Product/Driver name and version/branch that contains the vulnerability 17 | -------------------------------------------------------------------------------- /ci/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | ARG CUDA_VERSION=11.8.0 18 | FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 19 | 20 | # ubuntu22 21 | RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ 22 | -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \ 23 | /etc/apt/sources.list 24 | # ubuntu24+ 25 | RUN find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \ 26 | -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \ 27 | -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} + 28 | # Install packages to build spark-rapids-ml 29 | RUN chmod 1777 /tmp 30 | RUN apt update -y \ 31 | && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-17-jdk \ 32 | && apt install -y git numactl software-properties-common wget zip maven \ 33 | && rm -rf /var/lib/apt/lists 34 | 35 | # Config JAVA_HOME 36 | ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-amd64 37 | 38 | # Install conda 39 | ENV PATH="/root/miniconda3/bin:${PATH}" 40 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 41 | && mkdir /root/.conda \ 42 | && bash Miniconda3-latest-Linux-x86_64.sh -b \ 43 | && rm -f Miniconda3-latest-Linux-x86_64.sh \ 44 | && conda init && conda update -n base conda \ 45 | && conda install -n base conda-libmamba-solver \ 46 | && conda config --set solver libmamba 47 | 48 | # install cuML 49 | ARG CUML_VER=25.06 50 | RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=11.8 numpy~=1.0 \ 51 | && conda clean --all -f -y 52 | -------------------------------------------------------------------------------- /ci/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | set -ex 19 | 20 | # build plugin jar 21 | pushd jvm 22 | mvn clean package -DskipTests 23 | popd 24 | 25 | # copy plugin jar to python package 26 | JARS_DIR=python/src/spark_rapids_ml/jars 27 | mkdir -p $JARS_DIR 28 | rm -f $JARS_DIR/*.jar 29 | cp jvm/target/*.jar $JARS_DIR 30 | 31 | # build whl package 32 | pushd python 33 | pip install -r requirements_dev.txt && pip install -e . 34 | python -m build 35 | popd 36 | -------------------------------------------------------------------------------- /ci/docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | if [[ $1 == "nightly" ]]; then 19 | TAG=$(git log -1 --format="%h") 20 | BRANCH=$(git branch --show-current) 21 | else 22 | # get version tag 23 | TAG="v$VERSION" 24 | fi 25 | 26 | set -ex 27 | 28 | # build and publish docs 29 | pushd docs 30 | make clean 31 | make html 32 | git worktree add --track -b gh-pages _site origin/gh-pages 33 | 34 | pushd _site 35 | if [[ $1 == "nightly" ]]; then 36 | # draft copy 37 | api_dest=api/python-draft 38 | else 39 | # release copy 40 | api_dest=api/python 41 | # also copy site wide changes for release 42 | cp -r ../site/* . 43 | fi 44 | 45 | # in _site 46 | mkdir -p $api_dest 47 | cp -r ../build/html/* $api_dest/ 48 | 49 | git add --all 50 | dff=$(git diff --staged --stat) 51 | repo_url=$(git config --get remote.origin.url) 52 | url=${repo_url#https://} 53 | github_account=${GITHUB_ACCOUNT:-nvauto} 54 | if [[ -n $dff ]]; then 55 | git commit -m "Update draft api docs to commit ${TAG} on ${BRANCH}" 56 | git push -f https://${github_account}:${GITHUB_TOKEN}@${url} gh-pages 57 | fi 58 | 59 | popd #_site 60 | git worktree remove _site --force 61 | popd 62 | -------------------------------------------------------------------------------- /ci/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | set -ex 19 | 20 | type="$1" 21 | case $type in 22 | "pre-merge" | "") 23 | ut_args="" 24 | ;; 25 | "nightly" | "release") 26 | ut_args="--runslow" 27 | ;; 28 | *) 29 | echo "Unknown test type: $type"; exit 1;; 30 | esac 31 | bench_args="" 32 | 33 | # environment 34 | nvidia-smi 35 | which python 36 | 37 | # spark-rapids-ml and dependencies 38 | cd python 39 | pip install -r requirements_dev.txt && pip install -e . 40 | 41 | # unit tests 42 | ./run_test.sh $ut_args 43 | 44 | # benchmark 45 | ./run_benchmark.sh $bench_args 46 | 47 | # plugin tests 48 | ./run_plugin_test.sh 49 | 50 | # check compatibility with Spark 3.3 in nightly run 51 | # also push draft release docs to gh-pages 52 | if [[ $type == "nightly" ]]; then 53 | pip uninstall pyspark -y 54 | pip install pyspark~=3.3.0 55 | ./run_test.sh 56 | ./run_benchmark.sh $bench_args 57 | # if everything passed till now update draft release docs in gh-pages 58 | # need to invoke docs.sh from top level of repo 59 | cd .. 
# top level of repo 60 | ci/docs.sh nightly 61 | fi 62 | -------------------------------------------------------------------------------- /deprecated/native/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #============================================================================= 2 | # Copyright (c) 2021-2023, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #============================================================================= 16 | cmake_minimum_required(VERSION 3.20) 17 | 18 | file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.04/RAPIDS.cmake 19 | ${CMAKE_BINARY_DIR}/RAPIDS.cmake) 20 | include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) 21 | 22 | include(rapids-cuda) 23 | 24 | rapids_cuda_init_architectures(SPARK_RAPIDS_ML) 25 | project(SPARK_RAPIDS_ML LANGUAGES CXX CUDA C) 26 | 27 | # Build options. 28 | option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON) 29 | 30 | # Set C++ standard. 31 | set(CMAKE_CXX_STANDARD 17) 32 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 33 | set(CMAKE_CXX_EXTENSIONS OFF) 34 | 35 | # Set CUDA C++ standard. 36 | set(CMAKE_CUDA_STANDARD 17) 37 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 38 | set(CMAKE_CUDA_EXTENSIONS OFF) 39 | 40 | find_package(JNI REQUIRED) 41 | find_package(CUDAToolkit REQUIRED) 42 | 43 | # Add the project. 44 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 45 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 46 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 47 | add_subdirectory(src) 48 | -------------------------------------------------------------------------------- /deprecated/native/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #============================================================================= 2 | # Copyright (c) 2021, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | #============================================================================= 16 | 17 | 18 | # Install cuDF nightly via Conda, only for local development, will remove in CI. 
19 | find_package(cudf) 20 | 21 | set (CMAKE_CUDA_FLAGS "--extended-lambda") 22 | 23 | if(DEFINED ENV{RAFT_PATH}) 24 | message(STATUS "RAFT_PATH environment variable detected.") 25 | message(STATUS "RAFT_DIR set to $ENV{RAFT_PATH}") 26 | set(RAFT_DIR $ENV{RAFT_PATH}) 27 | 28 | else(DEFINED ENV{RAFT_PATH}) 29 | message(STATUS "RAFT_PATH environment variable NOT detected, cloning RAFT") 30 | set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") 31 | 32 | ExternalProject_Add(raft 33 | GIT_REPOSITORY git@github.com:rapidsai/raft.git 34 | GIT_TAG pinned_commit/git_tag/branch 35 | PREFIX ${RAFT_GIT_DIR} 36 | CONFIGURE_COMMAND "" 37 | BUILD_COMMAND "" 38 | INSTALL_COMMAND "") 39 | 40 | set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") 41 | endif(DEFINED ENV{RAFT_PATH}) 42 | 43 | 44 | 45 | ################################################################################################# 46 | # - CPM ----------------------------------------------------------------------------------------- 47 | 48 | set(CPM_DOWNLOAD_VERSION 0.27.2) 49 | set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 50 | 51 | if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) 52 | message(STATUS "Downloading CPM.cmake") 53 | file(DOWNLOAD https://github.com/TheLartians/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake ${CPM_DOWNLOAD_LOCATION}) 54 | endif() 55 | 56 | include(${CPM_DOWNLOAD_LOCATION}) 57 | ################################################################################################# 58 | 59 | # pull cuDF sources, to use jni_utils.hpp 60 | # cmake options should be added here for CI build. 61 | CPMAddPackage(NAME cudf 62 | VERSION "22.04.00" 63 | GIT_REPOSITORY https://github.com/rapidsai/cudf.git 64 | GIT_TAG branch-22.04 65 | ) 66 | 67 | add_library(rapidsml_jni SHARED rapidsml_jni.cpp 68 | rapidsml_jni.cu 69 | ) 70 | 71 | target_link_options(rapidsml_jni PRIVATE "-Wl,-as-needed") 72 | 73 | target_include_directories(rapidsml_jni PRIVATE 74 | "${JNI_INCLUDE_DIRS}" 75 | "${CUDAToolkit_INCLUDE_DIRS}" 76 | "$ENV{RAFT_PATH}/cpp/include" 77 | "${CMAKE_SOURCE_DIR}/src/utils" 78 | "${cudf_SOURCE_DIR}/java/src/main/native/include" 79 | ) 80 | 81 | target_link_libraries(rapidsml_jni PRIVATE 82 | libcudart_static.a 83 | libcusparse_static.a 84 | libcusolver_static.a 85 | libculibos.a 86 | liblapack_static.a 87 | CUDA::cublas 88 | cudf::cudf) 89 | 90 | if(PER_THREAD_DEFAULT_STREAM) 91 | target_compile_definitions(rapidsml_jni PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) 92 | endif() 93 | 94 | -------------------------------------------------------------------------------- /deprecated/native/src/rapidsml_jni.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "rapidsml_jni.hpp" 31 | #include "jni_utils.hpp" 32 | 33 | extern "C" { 34 | 35 | JNIEXPORT jlong JNICALL Java_com_nvidia_spark_ml_linalg_JniRAPIDSML_dgemmWithColumnViewPtr( 36 | JNIEnv *env, jclass, jint transa, jint transb, jint m, jint n, jint k, jdouble alpha, 37 | jdoubleArray A, jint lda, jlong B, jint ldb, jdouble beta, jint ldc, jint deviceID) { 38 | try { 39 | cudf::jni::native_jdoubleArray native_A(env, A); 40 | auto ret_column = dgemm(transa, transb, m, n, k, alpha, native_A.data(), native_A.size(), lda, 41 | B, ldb, beta, ldc, deviceID); 42 | return ret_column; 43 | } 44 | CATCH_STD(env, 0); 45 | } 46 | 47 | JNIEXPORT jlong JNICALL Java_com_nvidia_spark_ml_linalg_JniRAPIDSML_dgemmCov(JNIEnv *env, jclass, 48 | jint transa, jint transb, jint m, jint n, jint k, jdouble alpha, jlong A, jint lda, jlong B, 49 | jint ldb, jdouble beta, jdoubleArray C, jint ldc, jint deviceID) { 50 | try { 51 | cudf::jni::native_jdoubleArray native_C(env, C); 52 | dgemmCov(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, native_C.data(), ldc, deviceID); 53 | return 0; 54 | } 55 | CATCH_STD(env, 0); 56 | } 57 | 58 | } // extern "C" 59 | -------------------------------------------------------------------------------- /deprecated/native/src/rapidsml_jni.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cublas_v2.h> 20 | 21 | cublasOperation_t convertToCublasOpEnum(int int_type); 22 | 23 | void signFlip(double* input, int n_rows, int n_cols, double* components, 24 | int n_cols_comp, cudaStream_t stream); 25 | 26 | long dgemm(int transa, int transb, int m, int n, 27 | int k, double alpha, double* A, int size_A, int lda, long B, 28 | int ldb, double beta, int ldc, int deviceID); 29 | 30 | void dgemmCov(int transa, int transb, int m, int n, int k, double alpha, long A, int lda, long B, 31 | int ldb, double beta, double* C, int ldc, int deviceID); 32 | -------------------------------------------------------------------------------- /deprecated/src/main/java/com/nvidia/spark/ml/linalg/JniRAPIDSML.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.spark.ml.linalg; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.nio.file.Files; 22 | import java.nio.file.Path; 23 | import java.nio.file.StandardCopyOption; 24 | import java.nio.file.attribute.PosixFilePermissions; 25 | 26 | public final class JniRAPIDSML { 27 | private static final JniRAPIDSML instance = new JniRAPIDSML(); 28 | private static boolean loaded = false; 29 | 30 | public static boolean depsLoaded() { 31 | return loaded; 32 | } 33 | 34 | private JniRAPIDSML() { 35 | String osArch = System.getProperty("os.arch"); 36 | if (osArch == null || osArch.isEmpty()) { 37 | throw new RuntimeException("Unable to load native implementation"); 38 | } 39 | String osName = System.getProperty("os.name"); 40 | if (osName == null || osName.isEmpty()) { 41 | throw new RuntimeException("Unable to load native implementation"); 42 | } 43 | 44 | Path temp; 45 | try (InputStream resource = this.getClass().getClassLoader().getResourceAsStream( 46 | String.format("%s/%s/librapidsml_jni.so", osArch, osName))) { 47 | assert resource != null; 48 | Files.copy(resource, temp = Files.createTempFile("librapidsml_jni.so", "", 49 | PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxr-x---"))), 50 | StandardCopyOption.REPLACE_EXISTING); 51 | temp.toFile().deleteOnExit(); 52 | } catch (IOException e) { 53 | throw new RuntimeException("Unable to load native implementation", e); 54 | } 55 | 56 | System.load(temp.toString()); 57 | loaded = true; 58 | } 59 | 60 | public static JniRAPIDSML getInstance() { 61 | return instance; 62 | } 63 | 64 | public native long dgemmCov(int transa, int transb, int m, int n, int k, double alpha, long A, int lda, long B, 65 | int ldb, double beta, double[] C, int ldc, int deviceID); 66 | 67 | public native long accumulateCov(long a, long b); 68 | 69 | /** Wrapper of the JNI entry point for the cuBLAS gemm routine. Most parameters are the same as the original gemm's: https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gemm. 70 | * Differences are: 71 | * 1. transa and transb are int values instead of enums. 72 | * 2. B is a long value that represents the `cudf::lists_column_view *` holding the matrix data on the device. 73 | * 3. An extra deviceID indicates which GPU device will perform this computation. 74 | */ 75 | public native long dgemmWithColumnViewPtr(int transa, int transb, int m, int n, int k, double alpha, double[] A, 76 | int lda, long B, int ldb, double beta, int ldc, int deviceID); 77 | public native void calSVD(int m, double[] A, double[] U, double[] S, int deviceID); 78 | } 79 | -------------------------------------------------------------------------------- /deprecated/src/main/scala/com/nvidia/spark/ml/feature/PCA.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.spark.ml.feature 18 | 19 | import org.apache.spark.ml.feature.RapidsPCA 20 | import org.apache.spark.ml.param.ParamMap 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} 22 | 23 | /** 24 | * PCA trains a model to project vectors to a lower dimensional space of the top `PCA!.k` 25 | * principal components. 26 | */ 27 | class PCA(override val uid: String) extends RapidsPCA { 28 | 29 | def this() = this(Identifiable.randomUID("pca")) 30 | 31 | override def copy(extra: ParamMap): PCA = defaultCopy(extra) 32 | } 33 | 34 | object PCA extends DefaultParamsReadable[PCA] { 35 | 36 | override def load(path: String): PCA = super.load(path) 37 | } 38 | -------------------------------------------------------------------------------- /deprecated/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # Set everything to be logged to the file target/unit-tests.log 18 | log4j.rootCategory=INFO, file 19 | log4j.appender.file=org.apache.log4j.FileAppender 20 | log4j.appender.file.append=true 21 | log4j.appender.file.file=target/unit-tests.log 22 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n 24 | 25 | # Ignore messages below warning level from Jetty, because it's a bit verbose 26 | log4j.logger.org.sparkproject.jetty=WARN 27 | 28 | -------------------------------------------------------------------------------- /deprecated/src/test/scala/org/apache/spark/ml/util/RapidsMLTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.apache.spark.ml.util 18 | 19 | import org.apache.spark.SparkConf 20 | import org.apache.spark.sql.DataFrame 21 | 22 | trait RapidsMLTest extends MLTest { 23 | override def sparkConf: SparkConf = { 24 | super.sparkConf.set("spark.rapids.sql.enabled", "true") 25 | } 26 | 27 | override def checkVectorSizeOnDF( 28 | dataframe: DataFrame, 29 | vecColName: String, 30 | vecSize: Int): Unit = { 31 | super.checkVectorSizeOnDF(dataframe, vecColName, vecSize) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /docker/Dockerfile.pip: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | ARG CUDA_VERSION=11.8.0 18 | FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 19 | 20 | ARG PYSPARK_VERSION=3.3.1 21 | ARG RAPIDS_VERSION=25.6.0 22 | ARG ARCH=amd64 23 | #ARG ARCH=arm64 24 | 25 | # ubuntu22 26 | RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ 27 | -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \ 28 | /etc/apt/sources.list 29 | # ubuntu24+ 30 | RUN find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \ 31 | -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \ 32 | -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} + 33 | # Install packages to build spark-rapids-ml 34 | RUN apt-get update -y \ 35 | && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-17-jdk \ 36 | && rm -rf /var/lib/apt/lists 37 | 38 | RUN apt-get update -y \ 39 | && apt install -y git numactl python3.10-venv python3-pip python-is-python3 software-properties-common wget zip \ 40 | && python -m pip install --upgrade pip \ 41 | && rm -rf /var/lib/apt/lists 42 | 43 | RUN apt-get update -y \ 44 | && apt install -y python3.10-dev cmake curl \ 45 | && rm -rf /var/lib/apt/lists 46 | 47 | # install RAPIDS 48 | # using ~= pulls in micro version patches 49 | RUN pip install --no-cache-dir \ 50 | cudf-cu11~=${RAPIDS_VERSION} \ 51 | cuml-cu11~=${RAPIDS_VERSION} \ 52 | cuvs-cu11~=${RAPIDS_VERSION} \ 53 | numpy~=1.0 \ 54 | --extra-index-url=https://pypi.nvidia.com 55 | 56 | # install python dependencies 57 | RUN pip install --no-cache-dir pyspark==${PYSPARK_VERSION} "scikit-learn>=1.2.1" \ 58 | && pip install --no-cache-dir "black>=23.1.0" "build>=0.10.0" "isort>=5.12.0" "mypy>=1.0.0" \ 59 | numpydoc pydata-sphinx-theme pylint pytest "sphinx<6.0" "twine>=4.0.0" 60 | 61 | # Config JAVA_HOME 62 | ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-$ARCH 63 | 64 | ### END OF CACHE ### 65 | 66 | #ARG RAPIDS_ML_VER=main 67 | #RUN git clone -b branch-$RAPIDS_ML_VER https://github.com/NVIDIA/spark-rapids-ml.git 68 | COPY . 
/spark-rapids-ml 69 | WORKDIR /spark-rapids-ml/python 70 | 71 | # install spark-rapids-ml with requirements_dev.txt (in case it has diverged from cache) 72 | RUN pip install --no-cache-dir -r requirements_dev.txt \ 73 | && pip install --no-cache-dir -e . 74 | 75 | SHELL ["/bin/bash", "-c"] 76 | -------------------------------------------------------------------------------- /docker/Dockerfile.python: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | ARG CUDA_VERSION=11.8.0 18 | FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 19 | 20 | ARG CUML_VERSION=25.06 21 | 22 | # ubuntu22 23 | RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ 24 | -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \ 25 | /etc/apt/sources.list 26 | # ubuntu24+ 27 | RUN find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \ 28 | -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \ 29 | -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} + 30 | # Install packages to build spark-rapids-ml 31 | RUN apt update -y \ 32 | && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-17-jdk \ 33 | && apt install -y git numactl software-properties-common wget zip \ 34 | && rm -rf /var/lib/apt/lists 35 | 36 | # Config JAVA_HOME 37 | ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-amd64 38 | 39 | # Install conda 40 | ENV PATH="/root/miniconda3/bin:${PATH}" 41 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh \ 42 | && mkdir /root/.conda \ 43 | && bash Miniconda3-py38_4.10.3-Linux-x86_64.sh -b \ 44 | && rm -f Miniconda3-py38_4.10.3-Linux-x86_64.sh \ 45 | && conda init 46 | 47 | # install cuML 48 | 49 | RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=11.8 cuml=$CUML_VERSION numpy~=1.0 \ 50 | && conda clean --all -f -y 51 | 52 | # install python dependencies 53 | RUN pip install --no-cache-dir "pyspark>=3.2.1" "scikit-learn>=1.2.1" \ 54 | && pip install --no-cache-dir "black>=23.1.0" "build>=0.10.0" "isort>=5.12.0" "mypy>=1.0.0" \ 55 | numpydoc pydata-sphinx-theme pylint pytest "sphinx<6.0" "twine>=4.0.0" 56 | 57 | ### END OF CACHE ### 58 | 59 | #ARG RAPIDS_ML_VER=main 60 | #RUN git clone -b branch-$RAPIDS_ML_VER https://github.com/NVIDIA/spark-rapids-ml.git 61 | COPY . /spark-rapids-ml 62 | WORKDIR /spark-rapids-ml/python 63 | 64 | # install spark-rapids-ml with requirements_dev.txt (in case it has diverged from cache) 65 | RUN pip install --no-cache-dir -r requirements_dev.txt \ 66 | && pip install --no-cache-dir -e . 
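# NOTE: the SHELL directive below wraps shell-form commands in "conda run", so anything
# executed through the image's default shell runs inside the conda base environment
# where cuML was installed above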
67 | 68 | SHELL ["conda", "run", "--no-capture-output", "-n", "base", "/bin/bash", "-c"] 69 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Build in Docker 2 | 3 | We provide the following Dockerfiles: 4 | - [Dockerfile](./Dockerfile) - for building the Scala API. 5 | - [Dockerfile.python](./Dockerfile.python) - for building the Python API (using conda for RAPIDS dependencies). 6 | - [Dockerfile.pip](./Dockerfile.pip) - for building the Python API (using pip for RAPIDS dependencies). 7 | 8 | ## Python API 9 | 10 | First, build the development image. 11 | ```bash 12 | docker build -t spark-rapids-ml:python -f Dockerfile.python .. 13 | # OPTIONAL: docker build -t spark-rapids-ml:pip -f Dockerfile.pip .. 14 | ``` 15 | 16 | Launch the container 17 | ```bash 18 | nvidia-docker run -it --rm spark-rapids-ml:python 19 | # OPTIONAL: nvidia-docker run -it --rm spark-rapids-ml:pip 20 | ``` 21 | Run the unit tests inside the container. 22 | ```bash 23 | ./run_test.sh --runslow 24 | ``` 25 | 26 | Run the benchmarks inside the container. 27 | ```bash 28 | ./run_benchmark.sh 29 | ``` 30 | 31 | Build the pip package. 32 | ```bash 33 | python -m build 34 | ``` 35 | 36 | Build the documentation. 37 | ``` 38 | cd ../docs 39 | make html 40 | cp -r build/html site/api/python 41 | # copy site/* to 'gh-pages' branch to publish 42 | ``` 43 | 44 | ## Scala API (Deprecated) 45 | 46 | First, build the development image. **Note**: see the Dockerfile for configurable build arguments. 47 | ```bash 48 | docker build -t spark-rapids-ml:jvm -f Dockerfile .. 49 | ``` 50 | 51 | Run the container. 52 | ```bash 53 | nvidia-docker run -it --rm spark-rapids-ml:jvm 54 | ``` 55 | 56 | Then, inside the container, build the Scala API [as usual](../jvm/README.md#build-target-jar). 57 | ```bash 58 | mvn clean package 59 | ``` 60 | 61 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | clean: Makefile 18 | rm -rf $(BUILDDIR) 19 | rm -rf $(SOURCEDIR)/api 20 | 21 | # Catch-all target: route all unknown targets to Sphinx using the new 22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 23 | %: Makefile 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | numpydoc 16 | pydata-sphinx-theme 17 | -------------------------------------------------------------------------------- /docs/site/404.html: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /404.html 3 | layout: default 4 | --- 5 | 6 | 19 | 20 |
<div class="container"> 21 | <h1>404</h1> 22 | 23 | <p><strong>Page not found :(</strong></p> 24 | <p>The requested page could not be found.</p> 25 | </div>
26 | -------------------------------------------------------------------------------- /docs/site/FAQ.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Frequently Asked Questions 3 | nav_order: 4 4 | --- 5 | # Frequently Asked Questions 6 | 7 | * TOC 8 | {:toc} 9 | 10 | ### What versions of Apache Spark are supported? 11 | 12 | Apache Spark version 3.3.1 or higher. 13 | 14 | ### What versions of Python are supported? 15 | 16 | Python 3.10 or higher. 17 | -------------------------------------------------------------------------------- /docs/site/_config.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | title: Spark Rapids ML 18 | description: >- # this means to ignore newlines until "baseurl:" 19 | Spark Rapids ML enables GPU accelerated distributed machine learning on Apache Spark 20 | powered by the RAPIDS cuML library. 21 | baseurl: "/spark-rapids-ml" # the subpath of your site, e.g. /blog 22 | url: "nvidia.github.io" # the base hostname & protocol for your site, e.g. http://example.com 23 | 24 | aux_links: 25 | "Spark Rapids ML on GitHub": 26 | - "//github.com/nvidia/spark-rapids-ml" 27 | 28 | # Build settings 29 | remote_theme: pmarsceill/just-the-docs 30 | plugins: 31 | - jekyll-feed 32 | 33 | # Exclude from processing. 34 | # The following items will not be processed, by default. 35 | # Any item listed under the `exclude:` key here will be automatically added to 36 | # the internal "default list". 37 | # 38 | # Excluded items can be processed by explicitly listing the directories or 39 | # their entries' file path in the `include:` list. 40 | # 41 | exclude: 42 | - .sass-cache/ 43 | - .jekyll-cache/ 44 | - gemfiles/ 45 | - Gemfile 46 | - Gemfile.lock 47 | - node_modules/ 48 | - vendor/bundle/ 49 | - vendor/cache/ 50 | - vendor/gems/ 51 | - vendor/ruby/ 52 | 53 | include: 54 | - _static 55 | - _sphinx* 56 | 57 | -------------------------------------------------------------------------------- /docs/site/api/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API 3 | nav_order: 5 4 | --- 5 | # API Documentation 6 | 7 | - Python API 8 | - [Stable Release](python) 9 | - [Draft](python-draft) 10 | -------------------------------------------------------------------------------- /docs/site/compatibility.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Compatibility 3 | nav_order: 3 4 | --- 5 | # Compatibility with Apache Spark 6 | 7 | ## Supported Algorithms 8 | 9 | The following table shows the currently supported algorithms. The goal is to expand this over time with support from the underlying RAPIDS cuML libraries.
If you would like support for a specific algorithm, please file a [git issue](https://github.com/NVIDIA/spark-rapids-ml/issues) to help us prioritize. 10 | 11 | | Supported Algorithms | Python | 12 | | :--------------------- | :----: | 13 | | CrossValidator | √ | 14 | | DBSCAN (*) | √ | 15 | | KMeans | √ | 16 | | approx/exact k-NN (*) | √ | 17 | | LinearRegression | √ | 18 | | LogisticRegression | √ | 19 | | PCA | √ | 20 | | RandomForestClassifier | √ | 21 | | RandomForestRegressor | √ | 22 | | UMAP (*) | √ | 23 | 24 | (*) Notes: 25 | - As an alternative to KMeans, we also provide a Spark API for GPU accelerated Density-Based Spatial Clustering of Applications with Noise (DBSCAN), a density based clustering algorithm in the RAPIDS cuML library. 26 | - Spark does not provide a k-Nearest Neighbors (k-NN) implementation, but it does have an [LSH-based Approximate Nearest Neighbor](https://spark.apache.org/docs/latest/ml-features.html#approximate-nearest-neighbor-search) implementation. 27 | - As an alternative to PCA, we also provide a Spark API for GPU accelerated Uniform Manifold Approximation and Projection (UMAP), a non-linear dimensionality reduction algorithm in the RAPIDS cuML library. 28 | 29 | 30 | ## Supported Versions 31 | 32 | | Spark Rapids ML | CUDA | Spark | Python | 33 | | :-------------- | :---- | :----- | :----- | 34 | | 1.0.0 | 11.4+ | 3.3+ | 3.10+ | 35 | 36 | 37 | ## Single vs Double precision inputs 38 | The underlying cuML implementations all accept single precision (e.g. Float or float32) input types and offer the best performance in this case. As a result, by default, Spark RAPIDs ML converts Spark DataFrames supplied to `fit` and `transform` methods having double precision data types (i.e. `VectorUDT`, `ArrayType(DoubleType())`, `DoubleType()` columns) to single precision before passing them down to the cuML layer. Most of the cuML algorithm implementations also support double precision inputs. The Estimator (for all algorithms) constructor parameter `float32_inputs` can be used to control this behavior. The default value is `True` which forces the conversion to single precision for all algorithms, but it can be set to `False` in which case double precision input data is passed to those cuML algorithms which support it. 39 | 40 | Currently all algorithms *except* the following support double precision: k-NN, UMAP. 41 | -------------------------------------------------------------------------------- /docs/site/configuration.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Configuration 3 | nav_order: 6 4 | --- 5 | # Configuration 6 | 7 | The following configurations can be supplied as Spark properties. 8 | 9 | | Property name | Default | Meaning | 10 | | :-------------- | :------ | :------- | 11 | | spark.rapids.ml.uvm.enabled | false | if set to true, enables [CUDA unified virtual memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) (aka managed memory) during estimator.fit() operations to allow processing of larger datasets than would fit in GPU memory | 12 | | spark.rapids.ml.gpuMemRatioForData | None | If set to a float value between 0 and 1, Spark Rapids ML will reserve a portion of free GPU memory on each GPU and incrementally append PySpark data batches into this reserved space. This setting is recommended for large datasets, as it prevents duplicating the entire dataset in GPU memory and reduces the risk of out-of-memory errors. 
| 13 | | spark.rapids.ml.cpu.fallback.enabled | false | if set to true and spark-rapids-ml estimator.fit() is invoked with unsupported parameters or parameter values, the pyspark.ml cpu based estimator.fit() and model.transform() will be run; if set to false, an exception is raised in this case (default). | 14 | | spark.rapids.ml.verbose | None | if set to a boolean value (true/false) or an integer between 0 and 6, controls the verbosity level for cuML logging during estimator.fit() operations. This parameter can be set globally in Spark configuration and will be used if not explicitly set in the estimator constructor. | 15 | | spark.rapids.ml.float32_inputs | None | if set to a boolean value (true/false), controls whether input data should be converted to float32 precision before being passed to cuML algorithms. Setting this to true can reduce memory usage and potentially improve performance, but may affect numerical precision. This parameter can be set globally in Spark configuration and will be used if not explicitly set in the estimator constructor. | 16 | | spark.rapids.ml.num_workers | None | if set to an integer value greater than 0, specifies the number of workers to use for distributed training. This parameter can be set globally in Spark configuration and will be used if not explicitly set in the estimator constructor. | 17 | 18 | 19 | Since the algorithms rely heavily on Pandas UDFs, we also require `spark.sql.execution.arrow.pyspark.enabled=true` to ensure efficient data transfer between the JVM and Python processes. -------------------------------------------------------------------------------- /docs/site/contact.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Contact Us 3 | nav_order: 7 4 | --- 5 | # Contact Us 6 | 7 | We use github to track bugs, feature requests, and answer questions. File an 8 | [issue](https://github.com/NVIDIA/spark-rapids-ml/issues/new) for a bug or feature request. 9 | 10 | For security issues, [report the vulnerability via email](security.md). 
-------------------------------------------------------------------------------- /docs/site/get-started/databricks.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Databricks 3 | parent: Getting Started 4 | --- 5 | # Getting Started on Databricks 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/notebooks/databricks/README.md) 8 | 9 | -------------------------------------------------------------------------------- /docs/site/get-started/dataproc.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Dataproc 3 | parent: Getting Started 4 | --- 5 | # Getting Started on Google Dataproc 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/notebooks/dataproc/README.md) 8 | 9 | -------------------------------------------------------------------------------- /docs/site/get-started/emr.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: EMR 3 | parent: Getting Started 4 | --- 5 | # Getting Started on AWS EMR 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/notebooks/aws-emr/README.md) 8 | 9 | -------------------------------------------------------------------------------- /docs/site/get-started/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | nav_order: 2 4 | has_children: true 5 | --- 6 | # Getting Started 7 | -------------------------------------------------------------------------------- /docs/site/get-started/local.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Local Server 3 | parent: Getting Started 4 | --- 5 | # Getting Started on a Local Server 6 | 7 | See [these instructions](https://github.com/NVIDIA/spark-rapids-ml/blob/main/python/README.md) 8 | -------------------------------------------------------------------------------- /docs/site/get-started/spark_connect.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Spark Connect 3 | parent: Getting Started 4 | --- 5 | # Getting Started on Spark Connect 6 | 7 | Spark Rapids ML supports Spark Connect via the [Spark Rapids ML Connect Plugin](https://github.com/NVIDIA/spark-rapids-ml/blob/main/jvm). A prebuilt plugin jar compatible with Spark Connect 4.0 is bundled with the `spark-rapids-ml` pip package. See the getting-started [guide](https://github.com/NVIDIA/spark-rapids-ml/blob/main/jvm/README.md) for more information. -------------------------------------------------------------------------------- /docs/site/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Home 3 | nav_order: 1 4 | --- 5 | # Spark Rapids ML 6 | 7 | Spark Rapids ML enables GPU accelerated distributed machine learning on [Apache Spark](https://spark.apache.org/). It provides several PySpark ML compatible algorithms powered by the [RAPIDS cuML](https://docs.rapids.ai/api/cuml/stable/) library. 
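8 | 
9 | As a minimal, hypothetical sketch (assuming a GPU-enabled Spark cluster with the `spark-rapids-ml` package installed, and a DataFrame `df` containing a "features" vector column), accelerating an existing PySpark ML workload is typically just an import change:
10 | 
11 | ``` python
12 | # Hypothetical drop-in replacement for the PySpark ML import:
13 | # from pyspark.ml.clustering import KMeans
14 | from spark_rapids_ml.clustering import KMeans
15 | 
16 | kmeans = KMeans(k=4).setFeaturesCol("features")
17 | model = kmeans.fit(df)             # fit runs on GPU via RAPIDS cuML
18 | predictions = model.transform(df)
19 | ```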
-------------------------------------------------------------------------------- /docs/site/performance.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Performance Tuning 3 | nav_order: 6 4 | --- 5 | # Performance Tuning 6 | 7 | * TOC 8 | {:toc} 9 | 10 | ## Stage-level scheduling 11 | 12 | Starting from spark-rapids-ml `23.10.0`, stage-level scheduling is automatically enabled. 13 | Therefore, if you are using a Spark **standalone** cluster, version **`3.4.0`** or higher, we strongly recommend 14 | configuring the `"spark.task.resource.gpu.amount"` as a fractional value. This will 15 | enable running multiple tasks in parallel during the ETL phase to improve performance. An example configuration 16 | would be `"spark.task.resource.gpu.amount=1/spark.executor.cores"`. For example, 17 | 18 | ``` bash 19 | spark-submit \ 20 | --master spark://<master>:7077 \ 21 | --conf spark.executor.cores=12 \ 22 | --conf spark.task.cpus=1 \ 23 | --conf spark.executor.resource.gpu.amount=1 \ 24 | --conf spark.task.resource.gpu.amount=0.08 \ 25 | ... 26 | ``` 27 | 28 | The above spark-submit command requests 1 GPU and 12 CPUs per executor. As a result, 29 | a total of 12 tasks per executor will run concurrently during the ETL phase. Stage-level scheduling 30 | is then used internally by the library to automatically carry out the ML training phases using the required 1 GPU per task. 31 | 32 | However, if you are using a spark-rapids-ml version earlier than 23.10.0 or a Spark 33 | standalone cluster version below 3.4.0, you need to make sure that only 1 task runs at any time per executor. 34 | You can set `spark.task.cpus` equal to `spark.executor.cores`, or `"spark.task.resource.gpu.amount"=1`. For example, 35 | 36 | ``` bash 37 | spark-submit \ 38 | --master spark://<master>:7077 \ 39 | --conf spark.executor.cores=12 \ 40 | --conf spark.task.cpus=1 \ 41 | --conf spark.executor.resource.gpu.amount=1 \ 42 | --conf spark.task.resource.gpu.amount=1 \ 43 | ... 44 | ``` 45 | -------------------------------------------------------------------------------- /docs/site/security.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Security 3 | nav_order: 6 4 | --- 5 | # Security 6 | 7 | NVIDIA is dedicated to the security and trust of our software products and services, including all 8 | source code repositories managed through our organization. 9 | 10 | If you need to report a security issue, please use the appropriate contact points outlined 11 | below. **Please do not report security vulnerabilities through GitHub.** 12 | 13 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 14 | 15 | To report a potential security vulnerability in any NVIDIA product: 16 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 17 | - E-Mail: psirt@nvidia.com 18 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 19 | - Please include the following information: 20 | - Product/Driver name and version/branch that contains the vulnerability. -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. 
Licensed to the Apache Software Foundation (ASF) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The ASF licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | software distributed under the License is distributed on an 13 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | KIND, either express or implied. See the License for the 15 | specific language governing permissions and limitations 16 | under the License. 17 | 18 | .. Workaround to avoid documenting __init__. 19 | 20 | {% extends "!autosummary/class.rst" %} 21 | 22 | {% if '__init__' in methods %} 23 | {% set caught_result = methods.remove('__init__') %} 24 | {% endif %} 25 | 26 | {% block methods %} 27 | {% if methods %} 28 | 29 | .. rubric:: Methods 30 | 31 | .. autosummary:: 32 | {% for item in methods %} 33 | ~{{ name }}.{{ item }} 34 | {%- endfor %} 35 | 36 | {% endif %} 37 | {% endblock %} 38 | 39 | -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/class_with_docs.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The ASF licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | software distributed under the License is distributed on an 13 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | KIND, either express or implied. See the License for the 15 | specific language governing permissions and limitations 16 | under the License. 17 | 18 | 19 | {{ objname }} 20 | {{ underline }} 21 | 22 | .. currentmodule:: {{ module }} 23 | 24 | .. autoclass:: {{ objname }} 25 | :no-private-members: 26 | 27 | {% if '__init__' in methods %} 28 | {% set caught_result = methods.remove('__init__') %} 29 | {% endif %} 30 | 31 | {% block methods_summary %} 32 | {% if methods %} 33 | 34 | .. rubric:: Methods 35 | 36 | .. autosummary:: 37 | {% for item in methods %} 38 | ~{{ name }}.{{ item }} 39 | {%- endfor %} 40 | 41 | {% endif %} 42 | {% endblock %} 43 | 44 | {% block attributes_summary %} 45 | {% if attributes %} 46 | 47 | .. rubric:: Attributes 48 | 49 | .. autosummary:: 50 | {% for item in attributes %} 51 | {%- if item != "uid" %} 52 | ~{{ name }}.{{ item }} 53 | {%- endif %} 54 | {%- endfor %} 55 | 56 | {% endif %} 57 | {% endblock %} 58 | 59 | {% block methods_documentation %} 60 | {% if methods %} 61 | 62 | .. rubric:: Methods Documentation 63 | 64 | {% for item in methods %} 65 | .. automethod:: {{ item }} 66 | {%- endfor %} 67 | 68 | {% endif %} 69 | {% endblock %} 70 | 71 | {% block attributes_documentation %} 72 | {% if attributes %} 73 | 74 | .. 
rubric:: Attributes Documentation 75 | 76 | {% for item in attributes %} 77 | {%- if item != "uid" %} 78 | .. autoattribute:: {{ item }} 79 | {%- endif %} 80 | {%- endfor %} 81 | 82 | {% endif %} 83 | {% endblock %} 84 | 85 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Configuration file for the Sphinx documentation builder. 16 | # 17 | # For the full list of built-in configuration values, see the documentation: 18 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 19 | 20 | # -- Project information ----------------------------------------------------- 21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 22 | 23 | project = 'spark-rapids-ml' 24 | copyright = '2025, NVIDIA' 25 | author = 'NVIDIA' 26 | release = '25.06.0' 27 | 28 | # -- General configuration --------------------------------------------------- 29 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 30 | 31 | 32 | extensions = [ 33 | 'numpydoc', 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.autosummary', 36 | 'sphinx.ext.doctest', 37 | 'sphinx.ext.githubpages', 38 | 'sphinx.ext.intersphinx', 39 | ] 40 | 41 | numpydoc_show_class_members = False 42 | 43 | autodoc_inherit_docstrings = False 44 | 45 | templates_path = ['_templates'] 46 | exclude_patterns = [] 47 | 48 | intersphinx_mapping = { 49 | 'pyspark': ('https://spark.apache.org/docs/latest/api/python', None), 50 | 'cuml': ('https://docs.rapids.ai/api/cuml/stable', None), 51 | } 52 | 53 | # -- Options for HTML output ------------------------------------------------- 54 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 55 | 56 | html_theme = 'pydata_sphinx_theme' 57 | 58 | html_show_sourcelink = False 59 | 60 | import inspect 61 | from spark_rapids_ml.utils import _unsupported_methods_attributes 62 | 63 | _unsupported_by_class = {} 64 | def autodoc_skip_member(app, what, name, obj, skip, options): 65 | # adapted from this https://github.com/sphinx-doc/sphinx/issues/9533#issuecomment-962007846 66 | doc_class=None 67 | for frame in inspect.stack(): 68 | if frame.function == "get_members": 69 | doc_class = frame.frame.f_locals["obj"] 70 | break 71 | 72 | exclude = skip 73 | if doc_class: 74 | if doc_class not in _unsupported_by_class: 75 | _unsupported_by_class[doc_class] = _unsupported_methods_attributes(doc_class) 76 | 77 | exclude = name in _unsupported_by_class[doc_class] 78 | 79 | # return True if (skip or exclude) else None # Can interfere with subsequent skip functions. 
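# Note (added for clarity): per Sphinx autodoc-skip-member semantics, returning True skips
# the member, while returning None defers the decision to other handlers and autodoc's default.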
80 | return True if exclude or skip else None 81 | 82 | def setup(app): 83 | app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") 84 | app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") 85 | app.connect('autodoc-skip-member', autodoc_skip_member) 86 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. spark-rapids-ml documentation master file, created by 2 | sphinx-quickstart on Thu Jan 19 13:20:52 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to spark-rapids-ml's documentation! 7 | =========================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | spark_rapids_ml 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /docs/source/spark_rapids_ml.rst: -------------------------------------------------------------------------------- 1 | .. Licensed to the Apache Software Foundation (ASF) under one 2 | or more contributor license agreements. See the NOTICE file 3 | distributed with this work for additional information 4 | regarding copyright ownership. The ASF licenses this file 5 | to you under the Apache License, Version 2.0 (the 6 | "License"); you may not use this file except in compliance 7 | with the License. You may obtain a copy of the License at 8 | 9 | .. http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | .. Unless required by applicable law or agreed to in writing, 12 | software distributed under the License is distributed on an 13 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | KIND, either express or implied. See the License for the 15 | specific language governing permissions and limitations 16 | under the License. 17 | 18 | 19 | Spark Rapids ML 20 | =============== 21 | 22 | .. toctree:: 23 | :maxdepth: 4 24 | 25 | Feature 26 | ------- 27 | 28 | .. currentmodule:: spark_rapids_ml.feature 29 | 30 | .. autosummary:: 31 | :template: autosummary/class_with_docs.rst 32 | :toctree: api/feature 33 | 34 | PCA 35 | PCAModel 36 | 37 | Classification 38 | -------------- 39 | 40 | .. currentmodule:: spark_rapids_ml.classification 41 | 42 | .. autosummary:: 43 | :template: autosummary/class_with_docs.rst 44 | :toctree: api 45 | 46 | LogisticRegression 47 | LogisticRegressionModel 48 | RandomForestClassifier 49 | RandomForestClassificationModel 50 | 51 | Clustering 52 | ---------- 53 | 54 | .. currentmodule:: spark_rapids_ml.clustering 55 | 56 | .. autosummary:: 57 | :template: autosummary/class_with_docs.rst 58 | :toctree: api 59 | 60 | DBSCAN 61 | DBSCANModel 62 | KMeans 63 | KMeansModel 64 | 65 | 66 | Regression 67 | ---------- 68 | 69 | .. currentmodule:: spark_rapids_ml.regression 70 | 71 | .. autosummary:: 72 | :template: autosummary/class_with_docs.rst 73 | :toctree: api 74 | 75 | LinearRegression 76 | LinearRegressionModel 77 | RandomForestRegressor 78 | RandomForestRegressionModel 79 | 80 | 81 | Nearest Neighbors 82 | ----------------- 83 | 84 | .. currentmodule:: spark_rapids_ml.knn 85 | 86 | .. 
autosummary:: 87 | :template: autosummary/class_with_docs.rst 88 | :toctree: api 89 | 90 | ApproximateNearestNeighbors 91 | ApproximateNearestNeighborsModel 92 | NearestNeighbors 93 | NearestNeighborsModel 94 | 95 | 96 | Tuning 97 | ------ 98 | 99 | .. currentmodule:: spark_rapids_ml.tuning 100 | 101 | .. autosummary:: 102 | :template: autosummary/class_with_docs.rst 103 | :toctree: api 104 | 105 | CrossValidator 106 | 107 | 108 | UMAP 109 | ---- 110 | 111 | .. currentmodule:: spark_rapids_ml.umap 112 | 113 | .. autosummary:: 114 | :template: autosummary/class_with_docs.rst 115 | :toctree: api 116 | 117 | UMAP 118 | UMAPModel 119 | -------------------------------------------------------------------------------- /jvm/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target 3 | -------------------------------------------------------------------------------- /jvm/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # Spark Connect ML uses ServiceLoader to discover the supported Spark ML estimators. 21 | com.nvidia.rapids.ml.RapidsLogisticRegression 22 | com.nvidia.rapids.ml.RapidsRandomForestClassifier 23 | com.nvidia.rapids.ml.RapidsPCA 24 | com.nvidia.rapids.ml.RapidsRandomForestRegressor 25 | com.nvidia.rapids.ml.RapidsLinearRegression 26 | com.nvidia.rapids.ml.RapidsKMeans 27 | -------------------------------------------------------------------------------- /jvm/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | # Spark Connect ML uses ServiceLoader to discover the supported Spark ML models. 21 | org.apache.spark.ml.rapids.RapidsLogisticRegressionModel 22 | org.apache.spark.ml.rapids.RapidsRandomForestClassificationModel 23 | org.apache.spark.ml.rapids.RapidsPCAModel 24 | org.apache.spark.ml.rapids.RapidsRandomForestRegressionModel 25 | org.apache.spark.ml.rapids.RapidsLinearRegressionModel 26 | org.apache.spark.ml.clustering.rapids.RapidsKMeansModel 27 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/Plugin.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.nvidia.rapids.ml 17 | 18 | import org.apache.spark.sql.connect.plugin.MLBackendPlugin 19 | 20 | import java.util.Optional 21 | 22 | /** 23 | * The Spark Connect ML plugin replaces the Spark built-in algorithms with 24 | * spark-rapids-ml Python implementations. 25 | */ 26 | class Plugin extends MLBackendPlugin { 27 | 28 | override def transform(mlName: String): Optional[String] = { 29 | mlName match { 30 | case "org.apache.spark.ml.classification.LogisticRegression" => 31 | Optional.of("com.nvidia.rapids.ml.RapidsLogisticRegression") 32 | case "org.apache.spark.ml.classification.LogisticRegressionModel" => 33 | Optional.of("org.apache.spark.ml.rapids.RapidsLogisticRegressionModel") 34 | case "org.apache.spark.ml.classification.RandomForestClassifier" => 35 | Optional.of("com.nvidia.rapids.ml.RapidsRandomForestClassifier") 36 | case "org.apache.spark.ml.classification.RandomForestClassificationModel" => 37 | Optional.of("org.apache.spark.ml.rapids.RapidsRandomForestClassificationModel") 38 | case "org.apache.spark.ml.feature.PCA" => 39 | Optional.of("com.nvidia.rapids.ml.RapidsPCA") 40 | case "org.apache.spark.ml.feature.PCAModel" => 41 | Optional.of("org.apache.spark.ml.rapids.RapidsPCAModel") 42 | case "org.apache.spark.ml.regression.RandomForestRegressor" => 43 | Optional.of("com.nvidia.rapids.ml.RapidsRandomForestRegressor") 44 | case "org.apache.spark.ml.regression.RandomForestRegressionModel" => 45 | Optional.of("org.apache.spark.ml.rapids.RapidsRandomForestRegressionModel") 46 | case "org.apache.spark.ml.regression.LinearRegression" => 47 | Optional.of("com.nvidia.rapids.ml.RapidsLinearRegression") 48 | case "org.apache.spark.ml.regression.LinearRegressionModel" => 49 | Optional.of("org.apache.spark.ml.rapids.RapidsLinearRegressionModel") 50 | case "org.apache.spark.ml.clustering.KMeans" => 51 | Optional.of("com.nvidia.rapids.ml.RapidsKMeans") 52 | case "org.apache.spark.ml.clustering.KMeansModel" => 53 | Optional.of("org.apache.spark.ml.clustering.rapids.RapidsKMeansModel") 54 | case _ => Optional.empty() 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- 
/jvm/src/main/scala/com/nvidia/rapids/ml/RapidsKMeans.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.clustering.rapids.RapidsKMeansModel 20 | import org.apache.spark.ml.clustering.KMeans 21 | import org.apache.spark.ml.rapids.ModelHelper 22 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 23 | import org.apache.spark.sql.Dataset 24 | import org.apache.spark.sql.types.StructType 25 | 26 | /** 27 | * RapidsKMeans is a JVM wrapper of KMeans in the spark-rapids-ml Python package. 28 | * 29 | * The training process launches a Python process that runs the spark-rapids-ml 30 | * KMeans and returns the corresponding model. 31 | * 32 | * @param uid unique ID of the estimator 33 | */ 34 | class RapidsKMeans(override val uid: String) extends KMeans with DefaultParamsWritable 35 | with RapidsEstimator { 36 | 37 | def this() = this(Identifiable.randomUID("kmeans")) 38 | 39 | override def fit(dataset: Dataset[_]): RapidsKMeansModel = { 40 | val trainedModel = trainOnPython(dataset) 41 | val parentModel = ModelHelper.createKMeansModel(trainedModel.modelAttributes) 42 | copyValues(new RapidsKMeansModel(uid, parentModel, trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "KMeans" 52 | } 53 | 54 | object RapidsKMeans extends DefaultParamsReadable[RapidsKMeans] { 55 | 56 | override def load(path: String): RapidsKMeans = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsLinearRegression.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.rapids.{ModelHelper, RapidsLinearRegressionModel} 20 | import org.apache.spark.ml.regression.LinearRegression 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsLinearRegression is a JVM wrapper of LinearRegression in the spark-rapids-ml Python package. 27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * LinearRegression and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsLinearRegression(override val uid: String) extends LinearRegression 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("linReg")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsLinearRegressionModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (coef, intercept, scale) = ModelHelper.createLinearRegressionModel(trainedModel.modelAttributes) 41 | copyValues(new RapidsLinearRegressionModel(uid, coef, intercept, scale, 42 | trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "LinearRegression" 52 | } 53 | 54 | object RapidsLinearRegression extends DefaultParamsReadable[RapidsLinearRegression] { 55 | 56 | override def load(path: String): RapidsLinearRegression = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsLogisticRegression.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 20 | import org.apache.spark.ml.classification.LogisticRegression 21 | import org.apache.spark.ml.rapids.{ModelHelper, RapidsLogisticRegressionModel} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsLogisticRegression is a JVM wrapper of LogisticRegression in the spark-rapids-ml Python package. 
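 * A hypothetical usage sketch (assuming a DataFrame `training` with "features" and "label"
 * columns; the estimator exposes the same public API as Spark's LogisticRegression):
 * {{{
 *   val lr = new RapidsLogisticRegression().setMaxIter(10)
 *   val model = lr.fit(training)
 * }}}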
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * LogisticRegression and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsLogisticRegression(override val uid: String) extends LogisticRegression 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("logreg")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsLogisticRegressionModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (coef, intercept, numClasses) = 41 | ModelHelper.createLogisticRegressionModel(trainedModel.modelAttributes) 42 | copyValues(new RapidsLogisticRegressionModel(uid, coef, intercept, numClasses, trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "LogisticRegression" 52 | } 53 | 54 | object RapidsLogisticRegression extends DefaultParamsReadable[RapidsLogisticRegression] { 55 | 56 | override def load(path: String): RapidsLogisticRegression = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsPCA.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.feature.PCA 20 | import org.apache.spark.ml.rapids.{ModelHelper, RapidsPCAModel} 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsPCA is a JVM wrapper of PCA in the spark-rapids-ml Python package. 
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * PCA and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsPCA(override val uid: String) extends PCA with DefaultParamsWritable 34 | with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("pca")) 37 | 38 | override def fit(dataset: Dataset[_]): RapidsPCAModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (pc, explainedVariance) = ModelHelper.createPCAModel(trainedModel.modelAttributes) 41 | copyValues(new RapidsPCAModel(uid, pc, explainedVariance, trainedModel.modelAttributes)) 42 | } 43 | 44 | // Override this function to allow the features column to be an array type 45 | override def transformSchema(schema: StructType): StructType = schema 46 | 47 | /** 48 | * The estimator name 49 | */ 50 | override def name: String = "PCA" 51 | } 52 | 53 | object RapidsPCA extends DefaultParamsReadable[RapidsPCA] { 54 | 55 | override def load(path: String): RapidsPCA = super.load(path) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsRandomForestClassifier.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.classification.RandomForestClassifier 20 | import org.apache.spark.ml.rapids.{RapidsRandomForestClassificationModel, ModelHelper} 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsRandomForestClassifier is a JVM wrapper of RandomForestClassifier in the spark-rapids-ml Python package. 
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * RandomForestClassifier and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsRandomForestClassifier(override val uid: String) extends RandomForestClassifier 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("rfc")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsRandomForestClassificationModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (trees, numFeatures, numClasses) = ModelHelper.createRandomForestClassificationModel( 41 | trainedModel.modelAttributes, getImpurity, uid) 42 | copyValues(new RapidsRandomForestClassificationModel(uid, trees, numFeatures, numClasses, 43 | trainedModel.modelAttributes)) 44 | } 45 | 46 | // Override this function to allow the features column to be an array type 47 | override def transformSchema(schema: StructType): StructType = schema 48 | 49 | /** 50 | * The estimator name 51 | */ 52 | override def name: String = "RandomForestClassifier" 53 | } 54 | 55 | object RapidsRandomForestClassifier extends DefaultParamsReadable[RapidsRandomForestClassifier] { 56 | 57 | override def load(path: String): RapidsRandomForestClassifier = super.load(path) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsRandomForestRegressor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.spark.ml.rapids.{RapidsRandomForestRegressionModel, ModelHelper} 20 | import org.apache.spark.ml.regression.RandomForestRegressor 21 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 22 | import org.apache.spark.sql.Dataset 23 | import org.apache.spark.sql.types.StructType 24 | 25 | /** 26 | * RapidsRandomForestRegressor is a JVM wrapper of RandomForestRegressor in the spark-rapids-ml Python package. 
27 | * 28 | * The training process launches a Python process that runs the spark-rapids-ml 29 | * RandomForestRegressor and returns the corresponding model. 30 | * 31 | * @param uid unique ID of the estimator 32 | */ 33 | class RapidsRandomForestRegressor(override val uid: String) extends RandomForestRegressor 34 | with DefaultParamsWritable with RapidsEstimator { 35 | 36 | def this() = this(Identifiable.randomUID("rfr")) 37 | 38 | override def train(dataset: Dataset[_]): RapidsRandomForestRegressionModel = { 39 | val trainedModel = trainOnPython(dataset) 40 | val (trees, numFeatures) = ModelHelper.createRandomForestRegressionModel( 41 | trainedModel.modelAttributes, getImpurity, uid) 42 | copyValues(new RapidsRandomForestRegressionModel(uid, trees, numFeatures, trainedModel.modelAttributes)) 43 | } 44 | 45 | // Override this function to allow the features column to be an array type 46 | override def transformSchema(schema: StructType): StructType = schema 47 | 48 | /** 49 | * The estimator name 50 | */ 51 | override def name: String = "RandomForestRegressor" 52 | } 53 | 54 | object RapidsRandomForestRegressor extends DefaultParamsReadable[RapidsRandomForestRegressor] { 55 | 56 | override def load(path: String): RapidsRandomForestRegressor = super.load(path) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jvm/src/main/scala/com/nvidia/rapids/ml/RapidsTraits.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.nvidia.rapids.ml 18 | 19 | import org.apache.commons.logging.LogFactory 20 | import org.apache.spark.ml.param.Params 21 | import org.apache.spark.ml.rapids.{Fit, PythonEstimatorRunner, RapidsUtils, TrainedModel} 22 | import org.apache.spark.sql.Dataset 23 | 24 | /** Implementation of the automatic-resource-management pattern */ 25 | object Arm { 26 | /** Executes the provided code block and then closes the resource */ 27 | def withResource[T <: AutoCloseable, V](r: T)(block: T => V): V = { 28 | try { 29 | block(r) 30 | } finally { 31 | r.close() 32 | } 33 | } 34 | } 35 | 36 | trait RapidsEstimator extends Params { 37 | protected val logger = LogFactory.getLog("Spark-Rapids-ML Plugin") 38 | 39 | /** 40 | * The estimator name 41 | * 42 | * @return 43 | */ 44 | def name: String 45 | 46 | def trainOnPython(dataset: Dataset[_]): TrainedModel = { 47 | logger.info(s"Training $name ...") 48 | // Get the user-defined parameters and pass them to the Python process as a dictionary 49 | val params = RapidsUtils.getUserDefinedParams(this) 50 | 51 | val runner = new PythonEstimatorRunner( 52 | Fit(name, params), 53 | dataset.toDF) 54 | 55 | val trainedModel = Arm.withResource(runner) { _ => 56 | runner.runInPython(useDaemon = false) 57 | } 58 | 59 | logger.info(s"Finished $name training.") 60 | trainedModel 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/clustering/rapids/RapidsKMeansModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.clustering.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.clustering.KMeansModel 21 | import org.apache.spark.ml.param.ParamMap 22 | import org.apache.spark.ml.rapids.{RapidsModel, RapidsModelWriter} 23 | import org.apache.spark.ml.util.{GeneralMLWriter, MLReadable, MLReader} 24 | import org.apache.spark.mllib.clustering.{KMeansModel => MLlibKMeansModel} 25 | import org.apache.spark.sql.{DataFrame, Dataset} 26 | 27 | /** 28 | * Model fitted by RapidsKMeans. 29 | * 30 | * RapidsKMeansModel extends the Spark KMeansModel and stores 31 | * the model attributes trained by the spark-rapids-ml Python package in string format. 
32 | */ 33 | class RapidsKMeansModel(override val uid: String, 34 | override private[clustering] val parentModel: MLlibKMeansModel, 35 | override val modelAttributes: String) 36 | extends KMeansModel(uid, parentModel) with RapidsModel { 37 | 38 | private[ml] def this() = this("", null, null) 39 | 40 | override def transform(dataset: Dataset[_]): DataFrame = { 41 | transformOnPython(dataset, super.transform) 42 | } 43 | 44 | /** 45 | * The model name 46 | */ 47 | override def name: String = "KMeansModel" 48 | 49 | override def copy(extra: ParamMap): RapidsKMeansModel = { 50 | val newModel = copyValues( 51 | new RapidsKMeansModel(uid, parentModel, modelAttributes), extra) 52 | newModel 53 | } 54 | 55 | override def write: GeneralMLWriter = new RapidsModelWriter(this) 56 | 57 | override def cpu: KMeansModel = { 58 | copyValues(new KMeansModel(uid, parentModel)) 59 | } 60 | } 61 | 62 | object RapidsKMeansModel extends MLReadable[RapidsKMeansModel] { 63 | 64 | override def read: MLReader[RapidsKMeansModel] = new RapidsKMeansModelReader 65 | 66 | override def load(path: String): RapidsKMeansModel = super.load(path) 67 | 68 | private class RapidsKMeansModelReader extends MLReader[RapidsKMeansModel] { 69 | 70 | override def load(path: String): RapidsKMeansModel = { 71 | val cpuModel = KMeansModel.load(path) 72 | val attributesPath = new Path(path, "attributes").toString 73 | val row = sparkSession.read.parquet(attributesPath).first() 74 | val model = new RapidsKMeansModel(row.getString(0), 75 | cpuModel.parentModel, row.getString(1)) 76 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 77 | model 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/PythonEstimatorRunner.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import java.io.{DataInputStream, DataOutputStream} 20 | 21 | import net.razorvine.pickle.Pickler 22 | 23 | import org.apache.spark.api.java.JavaSparkContext 24 | import org.apache.spark.api.python.{PythonFunction, PythonRDD, PythonWorkerUtils} 25 | import org.apache.spark.sql.DataFrame 26 | import org.apache.spark.sql.execution.python.PythonPlannerRunner 27 | 28 | 29 | case class Fit(name: String, params: String) 30 | 31 | case class TrainedModel(modelAttributes: String) 32 | 33 | /** 34 | * PythonEstimatorRunner is a bridge that launches and manages the Python process. It sends the 35 | * estimator-related messages to the Python process and runs it. 
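 * (Concretely, writeToPython below sends the auth token, the estimator name, its parameter
 * string, and the py4j registry keys for the JavaSparkContext and the input DataFrame;
 * receiveFromPython reads back the trained model's attributes.)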
36 | * 37 | * @param fit the fit information 38 | * @param dataset input dataset 39 | */ 40 | class PythonEstimatorRunner(fit: Fit, 41 | dataset: DataFrame, 42 | func: PythonFunction = PythonRunnerUtils.RAPIDS_PYTHON_FUNC) 43 | extends PythonPlannerRunner[TrainedModel](func) with AutoCloseable { 44 | 45 | private val datasetKey = PythonRunnerUtils.putNewObjectToPy4j(dataset) 46 | private val jscKey = PythonRunnerUtils.putNewObjectToPy4j(new JavaSparkContext(dataset.sparkSession.sparkContext)) 47 | 48 | override protected val workerModule: String = "spark_rapids_ml.connect_plugin" 49 | 50 | override protected def writeToPython(dataOut: DataOutputStream, pickler: Pickler): Unit = { 51 | PythonRDD.writeUTF(PythonRunnerUtils.AUTH_TOKEN, dataOut) 52 | PythonRDD.writeUTF(fit.name, dataOut) 53 | PythonRDD.writeUTF(fit.params, dataOut) 54 | PythonRDD.writeUTF(jscKey, dataOut) 55 | PythonRDD.writeUTF(datasetKey, dataOut) 56 | } 57 | 58 | override protected def receiveFromPython(dataIn: DataInputStream): TrainedModel = { 59 | val modelAttributes = PythonWorkerUtils.readUTF(dataIn) 60 | TrainedModel(modelAttributes) 61 | } 62 | 63 | override def close(): Unit = { 64 | PythonRunnerUtils.deleteObject(jscKey) 65 | PythonRunnerUtils.deleteObject(datasetKey) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/PythonModelRunner.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import java.io.{DataInputStream, DataOutputStream} 20 | 21 | import net.razorvine.pickle.Pickler 22 | 23 | import org.apache.spark.api.java.JavaSparkContext 24 | import org.apache.spark.api.python.{PythonFunction, PythonRDD, PythonWorkerUtils} 25 | import org.apache.spark.sql.DataFrame 26 | import org.apache.spark.sql.execution.python.PythonPlannerRunner 27 | 28 | 29 | case class Transform(name: String, params: String, modelAttributes: String) 30 | 31 | /** 32 | * PythonModelRunner is a bridge that launches and manages the Python process. It sends the 33 | * model-related messages to the Python process and runs it. 
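 * (In addition to what PythonEstimatorRunner sends, writeToPython below also writes the
 * serialized model attributes, and receiveFromPython reads back the py4j id of the
 * transformed DataFrame.)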
34 | * 35 | * @param transform the transform information 36 | * @param dataset input dataset 37 | */ 38 | class PythonModelRunner(transform: Transform, 39 | dataset: DataFrame, 40 | func: PythonFunction = PythonRunnerUtils.RAPIDS_PYTHON_FUNC) 41 | extends PythonPlannerRunner[DataFrame](func) with AutoCloseable { 42 | 43 | private val datasetKey = PythonRunnerUtils.putNewObjectToPy4j(dataset) 44 | private val jscKey = PythonRunnerUtils.putNewObjectToPy4j(new JavaSparkContext(dataset.sparkSession.sparkContext)) 45 | 46 | override protected val workerModule: String = "spark_rapids_ml.connect_plugin" 47 | 48 | override protected def writeToPython(dataOut: DataOutputStream, pickler: Pickler): Unit = { 49 | PythonRDD.writeUTF(PythonRunnerUtils.AUTH_TOKEN, dataOut) 50 | PythonRDD.writeUTF(transform.name, dataOut) 51 | PythonRDD.writeUTF(transform.params, dataOut) 52 | PythonRDD.writeUTF(jscKey, dataOut) 53 | PythonRDD.writeUTF(datasetKey, dataOut) 54 | PythonRDD.writeUTF(transform.modelAttributes, dataOut) 55 | } 56 | 57 | override protected def receiveFromPython(dataIn: DataInputStream): DataFrame = { 58 | // Read the dataset target id in py4j server 59 | val dfId = PythonWorkerUtils.readUTF(dataIn) 60 | PythonRunnerUtils.getObjectAndDeref(dfId).asInstanceOf[DataFrame] 61 | } 62 | 63 | override def close(): Unit = { 64 | PythonRunnerUtils.deleteObject(jscKey) 65 | PythonRunnerUtils.deleteObject(datasetKey) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsLinearRegressionModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.linalg.Vector 21 | import org.apache.spark.ml.param.ParamMap 22 | import org.apache.spark.ml.regression.LinearRegressionModel 23 | import org.apache.spark.ml.util.{GeneralMLWriter, MLReadable, MLReader} 24 | import org.apache.spark.sql.{DataFrame, Dataset} 25 | 26 | /** 27 | * Model produced by RapidsLinearRegression. 28 | * 29 | * RapidsLinearRegressionModel extends from the Spark LinearRegressionModel and stores 30 | * the model attributes trained by spark-rapids-ml python in string format. 
31 | */ 32 | class RapidsLinearRegressionModel(override val uid: String, 33 | override val coefficients: Vector, 34 | override val intercept: Double, 35 | override val scale: Double, 36 | override val modelAttributes: String) 37 | extends LinearRegressionModel(uid, coefficients, intercept, scale) with RapidsModel { 38 | 39 | private[ml] def this() = this("", null, 1.0, 1.0, "") 40 | 41 | override def transform(dataset: Dataset[_]): DataFrame = { 42 | transformOnPython(dataset, super.transform) 43 | } 44 | 45 | /** 46 | * The model name 47 | */ 48 | override def name: String = "LinearRegressionModel" 49 | 50 | override def copy(extra: ParamMap): RapidsLinearRegressionModel = { 51 | copyValues( 52 | new RapidsLinearRegressionModel(uid, coefficients, intercept, scale, modelAttributes), extra) 53 | } 54 | 55 | override def cpu: LinearRegressionModel = { 56 | copyValues(new LinearRegressionModel(uid, coefficients, intercept, scale)) 57 | } 58 | 59 | override def write: GeneralMLWriter = new RapidsModelWriter(this) 60 | } 61 | 62 | object RapidsLinearRegressionModel extends MLReadable[RapidsLinearRegressionModel] { 63 | 64 | override def read: MLReader[RapidsLinearRegressionModel] = new RapidsLinearRegressionModelReader 65 | 66 | override def load(path: String): RapidsLinearRegressionModel = super.load(path) 67 | 68 | private class RapidsLinearRegressionModelReader extends MLReader[RapidsLinearRegressionModel] { 69 | 70 | override def load(path: String): RapidsLinearRegressionModel = { 71 | val cpuModel = LinearRegressionModel.load(path) 72 | val attributesPath = new Path(path, "attributes").toString 73 | val row = sparkSession.read.parquet(attributesPath).first() 74 | val model = new RapidsLinearRegressionModel(row.getString(0), cpuModel.coefficients, 75 | cpuModel.intercept, cpuModel.scale, row.getString(1)) 76 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 77 | model 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsLogisticRegressionModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.classification.LogisticRegressionModel 21 | import org.apache.spark.ml.linalg.{Matrix, Vector} 22 | import org.apache.spark.ml.param.ParamMap 23 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable} 24 | import org.apache.spark.sql.{DataFrame, Dataset} 25 | 26 | /** 27 | * Model produced by RapidsLogisticRegression. 28 | * 29 | * RapidsLogisticRegressionModel extends the Spark LogisticRegressionModel and stores 30 | * the model attributes trained by the spark-rapids-ml Python package in string format.
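 * A hypothetical conversion sketch (`rapidsModel` denotes an instance of this class):
 * {{{
 *   // recovers a plain Spark model, dropping the GPU-side attributes
 *   val cpuModel: LogisticRegressionModel = rapidsModel.cpu
 * }}}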
31 | */ 32 | class RapidsLogisticRegressionModel(override val uid: String, 33 | override val coefficientMatrix: Matrix, 34 | override val interceptVector: Vector, 35 | override val numClasses: Int, 36 | override val modelAttributes: String) 37 | extends LogisticRegressionModel(uid, coefficientMatrix, interceptVector, 38 | numClasses, numClasses != 2) with MLWritable with RapidsModel { 39 | 40 | private[ml] def this() = this("", null, null, 2, null) 41 | 42 | override def transform(dataset: Dataset[_]): DataFrame = { 43 | transformOnPython(dataset, super.transform) 44 | } 45 | 46 | /** 47 | * The model name 48 | */ 49 | override def name: String = "LogisticRegressionModel" 50 | 51 | override def copy(extra: ParamMap): RapidsLogisticRegressionModel = { 52 | val newModel = copyValues( 53 | new RapidsLogisticRegressionModel(uid, coefficientMatrix, interceptVector, 54 | numClasses, modelAttributes), extra) 55 | newModel.setSummary(trainingSummary).setParent(parent) 56 | newModel 57 | } 58 | 59 | override def cpu: LogisticRegressionModel = { 60 | copyValues( 61 | new LogisticRegressionModel(uid, coefficientMatrix, interceptVector, numClasses, numClasses != 2)) 62 | } 63 | } 64 | 65 | object RapidsLogisticRegressionModel extends MLReadable[RapidsLogisticRegressionModel] { 66 | 67 | override def read: MLReader[RapidsLogisticRegressionModel] = new RapidsLogisticRegressionModelReader 68 | 69 | override def load(path: String): RapidsLogisticRegressionModel = super.load(path) 70 | 71 | private class RapidsLogisticRegressionModelReader extends MLReader[RapidsLogisticRegressionModel] { 72 | 73 | override def load(path: String): RapidsLogisticRegressionModel = { 74 | val cpuModel = LogisticRegressionModel.load(path) 75 | val attributesPath = new Path(path, "attributes").toString 76 | val row = sparkSession.read.parquet(attributesPath).first() 77 | val model = new RapidsLogisticRegressionModel(row.getString(0), 78 | cpuModel.coefficientMatrix, cpuModel.interceptVector, cpuModel.numClasses, row.getString(1)) 79 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 80 | model 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import com.nvidia.rapids.ml.Arm 20 | import org.apache.commons.logging.LogFactory 21 | import org.apache.hadoop.fs.Path 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.ml.Model 24 | import org.apache.spark.ml.linalg.VectorUDT 25 | import org.apache.spark.ml.param.Params 26 | import org.apache.spark.ml.param.shared.HasFeaturesCol 27 | import org.apache.spark.ml.util.{GeneralMLWriter, MLWritable, MLWriter} 28 | import org.apache.spark.sql.{DataFrame, Dataset} 29 | 30 | trait RapidsModel extends MLWritable with Params with HasFeaturesCol { 31 | 32 | /** 33 | * The attributes of the corresponding spark-rapids-ml model, encoded in JSON 34 | * format. The JVM side does not inspect them; they are only passed through to Python. 35 | */ 36 | protected[ml] val modelAttributes: String 37 | 38 | /** 39 | * The model name 40 | */ 41 | def name: String 42 | 43 | def featureName: String = getFeaturesCol 44 | 45 | protected val logger = LogFactory.getLog("Spark-Rapids-ML Plugin") 46 | 47 | def transformOnPython(dataset: Dataset[_], 48 | cpuTransformFunc: Dataset[_] => DataFrame): DataFrame = { 49 | val usePython = dataset.sparkSession.conf.get("spark.rapids.ml.python.transform.enabled", "true").toBoolean 50 | val isVector = dataset.schema(featureName).dataType.isInstanceOf[VectorUDT] 51 | if (!isVector && !usePython) { 52 | throw new IllegalArgumentException("Please enable spark.rapids.ml.python.transform.enabled to " + 53 | "transform datasets in Python for non-vector input.") 54 | } 55 | 56 | if (usePython) { 57 | logger.info("Transforming in Python") 58 | // Get the user-defined parameters and pass them to the Python process as a dictionary 59 | val params = RapidsUtils.getUserDefinedParams(this) 60 | 61 | val runner = new PythonModelRunner( 62 | Transform(name, params, modelAttributes), 63 | dataset.toDF) 64 | 65 | Arm.withResource(runner) { _ => 66 | runner.runInPython(useDaemon = false) 67 | } 68 | } else { 69 | logger.info(s"Transforming using CPU $name") 70 | cpuTransformFunc(dataset) 71 | } 72 | } 73 | 74 | override def write: MLWriter = new RapidsModelWriter(this) 75 | 76 | def cpu: Model[_] 77 | } 78 | 79 | class RapidsModelWriter(instance: RapidsModel) extends 80 | GeneralMLWriter(instance.asInstanceOf[Model[_]]) with Logging { 81 | 82 | override protected def saveImpl(path: String): Unit = { 83 | val writer = instance.cpu.asInstanceOf[MLWritable].write 84 | if (shouldOverwrite) { 85 | writer.overwrite() 86 | } 87 | optionMap.foreach { case (k, v) => writer.option(k, v) } 88 | writer.save(path) 89 | 90 | val attributesPath = new Path(path, "attributes").toString 91 | sparkSession.createDataFrame( 92 | Seq(Tuple2(instance.uid, instance.modelAttributes)) 93 | ).write.parquet(attributesPath) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsPCAModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.feature.PCAModel 21 | import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector} 22 | import org.apache.spark.ml.param.ParamMap 23 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWriter} 24 | import org.apache.spark.sql.{DataFrame, Dataset} 25 | 26 | /** 27 | * Model produced by RapidsPCA. 28 | * 29 | * RapidsPCAModel extends the Spark PCAModel and stores 30 | * the model attributes trained by the spark-rapids-ml Python package in string format. 31 | */ 32 | class RapidsPCAModel(override val uid: String, 33 | override val pc: DenseMatrix, 34 | override val explainedVariance: DenseVector, 35 | override val modelAttributes: String) 36 | extends PCAModel(uid, pc, explainedVariance) with RapidsModel { 37 | 38 | private[ml] def this() = this("", null, null, "") 39 | 40 | override def transform(dataset: Dataset[_]): DataFrame = { 41 | transformOnPython(dataset, super.transform) 42 | } 43 | 44 | /** 45 | * The model name 46 | */ 47 | override def name: String = "PCAModel" 48 | 49 | override def copy(extra: ParamMap): RapidsPCAModel = { 50 | copyValues( 51 | new RapidsPCAModel(uid, pc, explainedVariance, modelAttributes), extra) 52 | } 53 | 54 | override def featureName: String = getInputCol 55 | 56 | override def write: MLWriter = super.write 57 | 58 | override def cpu: PCAModel = { 59 | copyValues(new PCAModel(uid, pc, explainedVariance)) 60 | } 61 | } 62 | 63 | 64 | object RapidsPCAModel extends MLReadable[RapidsPCAModel] { 65 | 66 | override def read: MLReader[RapidsPCAModel] = new RapidsPCAModelReader 67 | 68 | override def load(path: String): RapidsPCAModel = super.load(path) 69 | 70 | private class RapidsPCAModelReader extends MLReader[RapidsPCAModel] { 71 | 72 | override def load(path: String): RapidsPCAModel = { 73 | val cpuModel = PCAModel.load(path) 74 | val attributesPath = new Path(path, "attributes").toString 75 | val row = sparkSession.read.parquet(attributesPath).first() 76 | val model = new RapidsPCAModel(row.getString(0), 77 | cpuModel.pc, cpuModel.explainedVariance, row.getString(1)) 78 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 79 | model 80 | } 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsRandomForestClassificationModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, RandomForestClassificationModel} 21 | import org.apache.spark.ml.param.ParamMap 22 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable} 23 | import org.apache.spark.sql.{DataFrame, Dataset} 24 | 25 | /** 26 | * Model produced by RapidsRandomForestClassifier. 27 | * 28 | * RapidsRandomForestClassificationModel extends the Spark RandomForestClassificationModel and stores 29 | * the model attributes trained by the spark-rapids-ml Python package in string format. 30 | */ 31 | class RapidsRandomForestClassificationModel(override val uid: String, 32 | private val _trees: Array[DecisionTreeClassificationModel], 33 | override val numFeatures: Int, 34 | override val numClasses: Int, 35 | override val modelAttributes: String) 36 | extends RandomForestClassificationModel(uid, _trees, numFeatures, numClasses) 37 | with MLWritable with RapidsModel { 38 | 39 | private[ml] def this() = this("", null, 1, 1, "") 40 | 41 | override def transform(dataset: Dataset[_]): DataFrame = { 42 | transformOnPython(dataset, super.transform) 43 | } 44 | 45 | /** 46 | * The model name 47 | */ 48 | override def name: String = "RandomForestClassificationModel" 49 | 50 | override def copy(extra: ParamMap): RapidsRandomForestClassificationModel = { 51 | copyValues(new RapidsRandomForestClassificationModel(uid, _trees, numFeatures, 52 | numClasses, modelAttributes), extra) 53 | } 54 | 55 | override def cpu: RandomForestClassificationModel = { 56 | copyValues(new RandomForestClassificationModel(uid, _trees, numFeatures, numClasses)) 57 | } 58 | } 59 | 60 | object RapidsRandomForestClassificationModel extends MLReadable[RapidsRandomForestClassificationModel] { 61 | 62 | override def read: MLReader[RapidsRandomForestClassificationModel] = new RapidsRandomForestClassificationModelReader 63 | 64 | override def load(path: String): RapidsRandomForestClassificationModel = super.load(path) 65 | 66 | private class RapidsRandomForestClassificationModelReader extends MLReader[RapidsRandomForestClassificationModel] { 67 | 68 | override def load(path: String): RapidsRandomForestClassificationModel = { 69 | val cpuModel = RandomForestClassificationModel.load(path) 70 | val attributesPath = new Path(path, "attributes").toString 71 | val row = sparkSession.read.parquet(attributesPath).first() 72 | val model = new RapidsRandomForestClassificationModel(row.getString(0), 73 | cpuModel.trees, cpuModel.numFeatures, cpuModel.numClasses, row.getString(1)) 74 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 75 | model 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/RapidsRandomForestRegressionModel.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.ml.param.ParamMap 21 | import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, RandomForestRegressionModel} 22 | import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable} 23 | import org.apache.spark.sql.{DataFrame, Dataset} 24 | 25 | /** 26 | * Model produced by RapidsRandomForestRegressor. 27 | * 28 | * RapidsRandomForestRegressionModel extends the Spark RandomForestRegressionModel and stores 29 | * the model attributes trained by the spark-rapids-ml Python package in string format. 30 | */ 31 | class RapidsRandomForestRegressionModel(override val uid: String, 32 | private val _trees: Array[DecisionTreeRegressionModel], 33 | override val numFeatures: Int, 34 | override val modelAttributes: String) 35 | extends RandomForestRegressionModel(uid, _trees, numFeatures) 36 | with MLWritable with RapidsModel { 37 | 38 | private[ml] def this() = this("", null, 1, "") 39 | 40 | override def transform(dataset: Dataset[_]): DataFrame = { 41 | transformOnPython(dataset, super.transform) 42 | } 43 | 44 | /** 45 | * The model name 46 | */ 47 | override def name: String = "RandomForestRegressionModel" 48 | 49 | override def copy(extra: ParamMap): RapidsRandomForestRegressionModel = { 50 | copyValues( 51 | new RapidsRandomForestRegressionModel(uid, _trees, numFeatures, modelAttributes), extra) 52 | } 53 | 54 | override def cpu: RandomForestRegressionModel = { 55 | copyValues(new RandomForestRegressionModel(uid, _trees, numFeatures)) 56 | } 57 | } 58 | 59 | object RapidsRandomForestRegressionModel extends MLReadable[RapidsRandomForestRegressionModel] { 60 | 61 | override def read: MLReader[RapidsRandomForestRegressionModel] = new RapidsRandomForestRegressionModelReader 62 | 63 | override def load(path: String): RapidsRandomForestRegressionModel = super.load(path) 64 | 65 | private class RapidsRandomForestRegressionModelReader extends MLReader[RapidsRandomForestRegressionModel] { 66 | 67 | override def load(path: String): RapidsRandomForestRegressionModel = { 68 | val cpuModel = RandomForestRegressionModel.load(path) 69 | val attributesPath = new Path(path, "attributes").toString 70 | val row = sparkSession.read.parquet(attributesPath).first() 71 | val model = new RapidsRandomForestRegressionModel(row.getString(0), cpuModel.trees, 72 | cpuModel.numFeatures, row.getString(1)) 73 | cpuModel.paramMap.toSeq.foreach(p => model.set(p.param.name, p.value)) 74 | model 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /jvm/src/main/scala/org/apache/spark/ml/rapids/Utils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025, NVIDIA CORPORATION. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.ml.rapids 18 | 19 | import java.security.SecureRandom 20 | import java.util.Base64 21 | import java.io.File 22 | import scala.collection.mutable.ArrayBuffer 23 | import scala.jdk.CollectionConverters._ 24 | import scala.sys.process.Process 25 | import py4j.GatewayServer.GatewayServerBuilder 26 | import org.apache.spark.api.python.SimplePythonFunction 27 | import org.apache.spark.ml.linalg 28 | import org.apache.spark.ml.linalg.{DenseVector, Vectors} 29 | import org.apache.spark.ml.param.{ParamPair, Params} 30 | import org.apache.spark.util.Utils 31 | import org.json4s.{DefaultFormats, JArray} 32 | import org.json4s.JsonDSL._ 33 | import org.json4s.jackson.JsonMethods.{compact, parse, render} 34 | 35 | object RapidsUtils { 36 | 37 | def getUserDefinedParams(instance: Params): String = { 38 | compact(render(instance.paramMap.toSeq.map { case ParamPair(p, v) => 39 | p.name -> parse(p.jsonEncode(v)) 40 | }.toList)) 41 | } 42 | 43 | def createTempDir(namePrefix: String = "spark"): File = { 44 | Utils.createTempDir(namePrefix) 45 | } 46 | 47 | def deleteRecursively(file: File): Unit = { 48 | Utils.deleteRecursively(file) 49 | } 50 | 51 | } 52 | 53 | object PythonRunnerUtils { 54 | private def generateSecrets = { 55 | val rnd = new SecureRandom() 56 | val token = new Array[Byte](32) 57 | rnd.nextBytes(token) 58 | Base64.getEncoder.encodeToString(token) 59 | } 60 | 61 | private[rapids] lazy val AUTH_TOKEN: String = generateSecrets 62 | 63 | private[rapids] lazy val RAPIDS_PYTHON_FUNC = { 64 | val defaultPythonExec: String = sys.env.getOrElse( 65 | "PYSPARK_DRIVER_PYTHON", sys.env.getOrElse("PYSPARK_PYTHON", "python3")) 66 | val pythonVer: String = 67 | Process( 68 | Seq(defaultPythonExec, "-c", "import sys; print('%d.%d' % sys.version_info[:2])")).!!.trim() 69 | 70 | new SimplePythonFunction( 71 | command = Array[Byte](), 72 | envVars = Map( 73 | "PYSPARK_PYTHON" -> defaultPythonExec, 74 | "PYSPARK_DRIVER_PYTHON" -> defaultPythonExec, 75 | ).asJava, 76 | pythonIncludes = ArrayBuffer("").asJava, 77 | pythonExec = defaultPythonExec, 78 | pythonVer = pythonVer, 79 | broadcastVars = List.empty.asJava, 80 | accumulator = null 81 | ) 82 | } 83 | 84 | private val gwLock = new Object() // Lock object 85 | 86 | private lazy val gw: py4j.Gateway = gwLock.synchronized { 87 | val server = new GatewayServerBuilder().authToken(AUTH_TOKEN).build() 88 | server.start() 89 | server.getGateway 90 | } 91 | 92 | def putNewObjectToPy4j(o: Object): String = gwLock.synchronized { 93 | gw.putNewObject(o) 94 | } 95 | 96 | def deleteObject(key: String): Unit = gwLock.synchronized { 97 | gw.deleteObject(key) 98 | } 99 | 100 | /** 101 | * Get the object from the py4j server and remove its reference there 102 | */ 103 | def getObjectAndDeref(id: String): Object = gwLock.synchronized { 104 | val o = gw.getObject(id) 105 | gw.deleteObject(id) 106 | o 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /notebooks/README.md: 
-------------------------------------------------------------------------------- 1 | # Running notebooks locally 2 | 3 | To run notebooks using Spark local mode on a server with one or more NVIDIA GPUs: 4 | 1. Follow the [installation instructions](../python/README.md#installation) to set up your environment. 5 | 2. Install `jupyter` into the conda environment. 6 | ```bash 7 | pip install jupyter 8 | ``` 9 | 3. Set `SPARK_HOME`. 10 | ```bash 11 | export SPARK_HOME=$( pip show pyspark | grep Location | grep -o '/.*' )/pyspark 12 | ls $SPARK_HOME/bin/pyspark 13 | ``` 14 | 4. In the notebooks directory, start PySpark in local mode with the Jupyter UI. 15 | ```bash 16 | cd spark-rapids-ml/notebooks 17 | 18 | PYSPARK_DRIVER_PYTHON=jupyter \ 19 | PYSPARK_DRIVER_PYTHON_OPTS='notebook --ip=0.0.0.0' \ 20 | CUDA_VISIBLE_DEVICES=0 \ 21 | $SPARK_HOME/bin/pyspark --master local[12] \ 22 | --driver-memory 128g \ 23 | --conf spark.sql.execution.arrow.pyspark.enabled=true 24 | ``` 25 | 5. Follow the instructions printed by the above command to browse to the Jupyter notebook server. 26 | 6. In the Jupyter file browser, open and run any of the notebooks. 27 | 7. **OPTIONAL**: If your server is remote with no direct `http` access, but you have `ssh` access, you can connect via an `ssh` tunnel, as follows: 28 | ```bash 29 | export REMOTE_USER= 30 | export REMOTE_HOST= 31 | ssh -A -L 8888:127.0.0.1:8888 -L 4040:127.0.0.1:4040 ${REMOTE_USER}@${REMOTE_HOST} 32 | ``` 33 | Then, browse to the `127.0.0.1` URL printed by the command in step 4. Note that a tunnel is also opened to the Spark UI server on port 4040. Once a notebook is opened, you can view its Spark UI by browsing to http://127.0.0.1:4040 in another tab or window. 34 | 8. **OPTIONAL**: If you have multiple GPUs in your server, replace the `CUDA_VISIBLE_DEVICES` setting in step 4 with a comma-separated list of the corresponding indices. For example, for two GPUs use `CUDA_VISIBLE_DEVICES=0,1`. 35 | 36 | ## No import change 37 | In the default notebooks, the GPU accelerated implementations of algorithms in Spark MLlib are enabled via import statements from the `spark_rapids_ml` package. 38 | 39 | Alternatively, acceleration can also be enabled by executing the following import statement at the start of a notebook: 40 | ```python 41 | import spark_rapids_ml.install 42 | ``` 43 | or by modifying the PySpark/Jupyter launch command above to use the `pyspark-rapids` CLI, installed by our `pip` package, to start Jupyter with PySpark as follows: 44 | ```bash 45 | cd spark-rapids-ml/notebooks 46 | 47 | PYSPARK_DRIVER_PYTHON=jupyter \ 48 | PYSPARK_DRIVER_PYTHON_OPTS='notebook --ip=0.0.0.0' \ 49 | CUDA_VISIBLE_DEVICES=0 \ 50 | pyspark-rapids --master local[12] \ 51 | --driver-memory 128g \ 52 | --conf spark.sql.execution.arrow.pyspark.enabled=true 53 | ``` 54 | 55 | After executing either of the above, all subsequent imports and accesses of supported accelerated classes from `pyspark.ml` will automatically redirect and return their counterparts in `spark_rapids_ml`. Unaccelerated classes will import from `pyspark.ml` as usual. Thus, all supported acceleration in an existing `pyspark` notebook is enabled with no additional import statement or code changes. Directly importing from `spark_rapids_ml` also still works (needed for non-MLlib algorithms like UMAP). 56 | 57 | For an example, see the notebook [kmeans-no-import-change.ipynb](kmeans-no-import-change.ipynb).
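To quickly verify that the redirection is active, you can inspect the module of an imported class. The following is a hypothetical sanity-check snippet; it assumes the `spark_rapids_ml` package is installed in the active environment:
```python
# Enable the no-import-change redirection, then confirm that a supported
# class imported from pyspark.ml resolves to its accelerated counterpart.
import spark_rapids_ml.install  # patches subsequent pyspark.ml imports

from pyspark.ml.clustering import KMeans

# For a supported class, this is expected to report a spark_rapids_ml module;
# unaccelerated classes continue to report a pyspark.ml module.
print(KMeans.__module__)
```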
58 | 59 | *Note*: As of this release, in this mode, methods and attributes that remain unsupported on accelerated classes and objects will still raise exceptions. 60 | 61 | ## Running notebooks on Databricks 62 | See [these instructions](databricks/README.md) for running the notebooks in a Databricks Spark cluster. 63 | 64 | ## Running notebooks on Google Dataproc 65 | See [these instructions](dataproc/README.md) for running the notebooks in a Dataproc Spark cluster. 66 | 67 | ## Running notebooks on AWS EMR 68 | See [these instructions](aws-emr/README.md) for running the notebooks in an AWS-EMR cluster. 69 | 70 | -------------------------------------------------------------------------------- /notebooks/aws-emr/init-bootstrap-action.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | set -ex 18 | 19 | sudo mkdir -p /spark-rapids-cgroup/devices 20 | sudo mount -t cgroup -o devices cgroupv1-devices /spark-rapids-cgroup/devices 21 | sudo chmod a+rwx -R /spark-rapids-cgroup 22 | 23 | sudo yum update -y 24 | sudo yum install -y gcc bzip2-devel libffi-devel tar gzip wget make 25 | sudo yum install -y mysql-devel --skip-broken 26 | sudo bash -c "wget https://www.python.org/ftp/python/3.10.9/Python-3.10.9.tgz && \ 27 | tar xzf Python-3.10.9.tgz && cd Python-3.10.9 && \ 28 | ./configure --enable-optimizations && make altinstall" 29 | 30 | RAPIDS_VERSION=25.6.0 31 | 32 | sudo /usr/local/bin/pip3.10 install --upgrade pip 33 | 34 | # install scikit-learn 35 | sudo /usr/local/bin/pip3.10 install scikit-learn 36 | 37 | # install cudf and cuml 38 | sudo /usr/local/bin/pip3.10 install --no-cache-dir cudf-cu12~=${RAPIDS_VERSION} \ 39 | cuml-cu12~=${RAPIDS_VERSION} \ 40 | cuvs-cu12~=${RAPIDS_VERSION} \ 41 | --extra-index-url=https://pypi.nvidia.com --verbose 42 | sudo /usr/local/bin/pip3.10 install spark-rapids-ml 43 | sudo /usr/local/bin/pip3.10 list 44 | 45 | # set up no-import-change for cluster if enabled 46 | if [[ $1 == "--no-import-enabled" && $2 == 1 ]]; then 47 | echo "enabling no import change in cluster" 1>&2 48 | cd /usr/lib/livy/repl_2.12-jars 49 | sudo jar xf livy-repl_2.12*.jar fake_shell.py 50 | sudo sed -i fake_shell.py -e '/from __future__/ s/\(.*\)/\1\ntry:\n import spark_rapids_ml.install\nexcept:\n pass\n/g' 51 | sudo jar uf livy-repl_2.12*.jar fake_shell.py 52 | sudo rm fake_shell.py 53 | fi 54 | 55 | # ensure the notebook comes up in Python 3.10 by using a background script that waits for an 56 | # application file to be installed before modifying it. 57 | cat <<EOF >/tmp/mod_start_kernel.sh 58 | #!/bin/bash 59 | set -ex 60 | while [ ! 
-f /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh ]; do 61 | echo "waiting for /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh" 62 | sleep 10 63 | done 64 | echo "done waiting" 65 | sleep 10 66 | sudo sed -i /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh -e 's#"spark.pyspark.python": "python3"#"spark.pyspark.python": "/usr/local/bin/python3.10"#g' 67 | sudo sed -i /mnt/notebook-env/bin/start_kernel_as_emr_notebook.sh -e 's#"spark.pyspark.virtualenv.enabled": "true"#"spark.pyspark.virtualenv.enabled": "false"#g' 68 | exit 0 69 | EOF 70 | sudo bash /tmp/mod_start_kernel.sh & 71 | exit 0 72 | 73 | -------------------------------------------------------------------------------- /notebooks/databricks/init-pip-cuda-11.8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -ex 17 | 18 | # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 19 | # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) 20 | # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 
23.08.2 and not 23.8.2) 21 | RAPIDS_VERSION=25.6.0 22 | SPARK_RAPIDS_VERSION=25.04.0 23 | 24 | curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar 25 | 26 | # install cudatoolkit 11.8 via runfile approach 27 | wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run 28 | sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit 29 | 30 | # reset symlink and update library loading paths 31 | rm /usr/local/cuda 32 | ln -s /usr/local/cuda-11.8 /usr/local/cuda 33 | 34 | # upgrade pip 35 | /databricks/python/bin/pip install --upgrade pip 36 | 37 | # install cudf, cuml and their rapids dependencies 38 | # using ~= pulls in latest micro version patches 39 | /databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \ 40 | cuml-cu11~=${RAPIDS_VERSION} \ 41 | cuvs-cu11~=${RAPIDS_VERSION} \ 42 | --extra-index-url=https://pypi.nvidia.com 43 | 44 | # install spark-rapids-ml 45 | /databricks/python/bin/pip install spark-rapids-ml 46 | 47 | # set up no-import-change for cluster if enabled 48 | if [[ $SPARK_RAPIDS_ML_NO_IMPORT_ENABLED == 1 ]]; then 49 | echo "enabling no import change in cluster" 1>&2 50 | mkdir -p /root/.ipython/profile_default/startup 51 | echo "import spark_rapids_ml.install" >/root/.ipython/profile_default/startup/00-spark-rapids-ml.py 52 | fi 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /notebooks/dataproc/README.md: -------------------------------------------------------------------------------- 1 | ## Running notebooks on Dataproc 2 | 3 | If you already have a Dataproc account, you can run the example notebooks on a Dataproc cluster, as follows: 4 | - Install the [gcloud CLI](https://cloud.google.com/sdk/docs/install) and initialize it via `gcloud init`. 5 | - Configure the following settings: 6 | ``` 7 | export PROJECT= 8 | export DATAPROC_REGION= 9 | export COMPUTE_REGION= 10 | export COMPUTE_ZONE= 11 | 12 | gcloud config set project ${PROJECT} 13 | gcloud config set dataproc/region ${DATAPROC_REGION} 14 | gcloud config set compute/region ${COMPUTE_REGION} 15 | gcloud config set compute/zone ${COMPUTE_ZONE} 16 | ``` 17 | - Create a GCS bucket if you don't already have one: 18 | ``` 19 | export GCS_BUCKET= 20 | 21 | gcloud storage buckets create gs://${GCS_BUCKET} 22 | ``` 23 | - Upload the initialization scripts to your GCS bucket: 24 | ``` 25 | gsutil cp spark_rapids_ml.sh gs://${GCS_BUCKET} 26 | curl -LO https://raw.githubusercontent.com/GoogleCloudDataproc/initialization-actions/master/spark-rapids/spark-rapids.sh 27 | gsutil cp spark-rapids.sh gs://${GCS_BUCKET}/spark-rapids.sh 28 | ``` 29 | - Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS. 30 | 31 | If you wish to enable [no-import-change](../README.md#no-import-change) UX for the cluster, change the `spark-rapids-ml-no-import-enabled` metadata value to `1` in the command. The initialization script `spark_rapids_ml.sh` checks this metadata value and modifies the run time accordingly. 
32 | 33 | ``` 34 | export RAPIDS_VERSION=25.6.0 35 | 36 | gcloud dataproc clusters create $USER-spark-rapids-ml \ 37 | --image-version=2.2-ubuntu22 \ 38 | --region ${COMPUTE_REGION} \ 39 | --master-machine-type n1-standard-16 \ 40 | --master-accelerator type=nvidia-tesla-t4,count=1 \ 41 | --num-workers 2 \ 42 | --worker-min-cpu-platform=Intel\ Skylake \ 43 | --worker-accelerator type=nvidia-tesla-t4,count=1 \ 44 | --worker-machine-type n1-standard-16 \ 45 | --num-worker-local-ssds 4 \ 46 | --worker-local-ssd-interface=NVME \ 47 | --initialization-actions gs://${GCS_BUCKET}/spark-rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \ 48 | --initialization-action-timeout=20m \ 49 | --optional-components=JUPYTER \ 50 | --metadata gpu-driver-provider="NVIDIA" \ 51 | --metadata rapids-runtime=SPARK \ 52 | --metadata rapids-version=${RAPIDS_VERSION} \ 53 | --metadata spark-rapids-ml-no-import-enabled=0 \ 54 | --properties spark:spark.executor.resource.gpu.amount=1,\ 55 | spark:spark.task.resource.gpu.amount=0.0625,\ 56 | spark:spark.executorEnv.CUPY_CACHE_DIR=/tmp/.cupy,\ 57 | spark:spark.locality.wait=0,\ 58 | spark:spark.sql.execution.arrow.pyspark.enabled=true,\ 59 | spark:spark.sql.execution.arrow.maxRecordsPerBatch=100000,\ 60 | spark:spark.rapids.memory.gpu.pooling.enabled=false \ 61 | --bucket ${GCS_BUCKET} \ 62 | --enable-component-gateway \ 63 | --subnet=default \ 64 | --no-shielded-secure-boot 65 | ``` 66 | **Note**: the `properties` settings are for demonstration purposes only. Additional tuning may be required for optimal performance. 67 | - In the [Dataproc console](https://console.cloud.google.com/dataproc/clusters), select your cluster, go to the "Web Interfaces" tab, and click on the "JupyterLab" link. 68 | - In JupyterLab, upload the desired [notebook](../) via the `Upload Files` button. For the no-import-change UX, you can try the example [kmeans-no-import-change.ipynb](../kmeans-no-import-change.ipynb). 69 | 70 | Open the notebook and select the `PySpark` kernel, e.g., via the drop-down that appears after clicking the kernel name in the top right corner of the notebook view. 71 | 72 | - Run the notebook cells. **Note**: you may need to change file paths to use `hdfs://` paths. 73 | -------------------------------------------------------------------------------- /notebooks/dataproc/spark_rapids_ml.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | set -ex 17 | 18 | RAPIDS_VERSION=25.6.0 19 | 20 | 21 | # install cudf and cuml 22 | pip install --upgrade pip 23 | pip install cudf-cu12~=${RAPIDS_VERSION} cuml-cu12~=${RAPIDS_VERSION} cuvs-cu12~=${RAPIDS_VERSION} \ 24 | --extra-index-url=https://pypi.nvidia.com 25 | 26 | # install spark-rapids-ml 27 | pip install spark-rapids-ml 28 | 29 | # set up no-import-change for cluster if enabled 30 | no_import_change=$(/usr/share/google/get_metadata_value attributes/spark-rapids-ml-no-import-enabled) 31 | if [[ $no_import_change == 1 ]]; then 32 | echo "enabling no import change in cluster" 1>&2 33 | mkdir -p /root/.ipython/profile_default/startup 34 | echo "import spark_rapids_ml.install" >/root/.ipython/profile_default/startup/00-spark-rapids-ml.py 35 | fi 36 | -------------------------------------------------------------------------------- /python/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking 2 | This directory contains Python scripts for benchmarking the supported algorithms. 3 | 4 | ## Local 5 | This [script](../run_benchmark.sh) can be used to run them locally. 6 | 7 | ## Databricks 8 | They can also be run on the Databricks AWS-hosted Spark service. See [these instructions](databricks/README.md) and accompanying scripts for running a set of high compute workloads on comparable CPU and GPU clusters. The graph below shows the resulting Spark ML CPU and Spark Rapids ML GPU average running times. 9 | 10 | ![Databricks AWS benchmarking results](databricks/results/running_times.png) 11 | 12 | ## Other CSPs 13 | Click the links below for instructions on running the benchmarking scripts in the respective CSP Spark environments: 14 | - [GCP Dataproc](dataproc/README.md) 15 | - [AWS EMR](aws-emr/README.md) 16 | 17 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking on AWS EMR 2 | 3 | This directory contains shell scripts for running larger-scale benchmarks on an AWS EMR cluster. You will need an AWS account to run them. The benchmarks use datasets synthetically generated using [gen_data.py](../gen_data.py). For convenience, these have been precomputed and are available in the public S3 bucket `spark-rapids-ml-bm-datasets-public`. The benchmark scripts are currently configured to read the data from there. 4 | 5 | ## Setup 6 | 7 | - Install the [AWS CLI](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-cli.html) and initialize it via `aws configure`. You may need to obtain your [access keys and region code](../../../notebooks/aws-emr/README.md). 8 | 9 | - Create an S3 bucket if you don't already have one. 10 | ``` 11 | export S3_BUCKET= 12 | aws s3 mb s3://${S3_BUCKET} 13 | ``` 14 | 15 | - Upload the benchmarking files to your S3 bucket: 16 | ``` 17 | # path to store benchmarking files inside your S3 bucket 18 | export BENCHMARK_HOME=${S3_BUCKET}/benchmark 19 | 20 | ./setup.sh 21 | ``` 22 | **Note**: this step should be repeated for each new version of the spark-rapids-ml package that you want to test. 23 | 24 | ## Create an ssh key pair 25 | - The benchmark script needs ssh access to the EMR cluster, which requires creating an [EC2 key pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/create-key-pairs.html). Choose the **pem** format.
After saving the private key locally with `.pem` as the file extension, set the following environment variable to point to its location. 26 | ``` 27 | export KEYPAIR=/path/to/private/key.pem 28 | ``` 29 | 30 | ## Prepare Subnet 31 | - Print the available subnets via the CLI, then pick a SubnetId in your region (e.g. subnet-0744566f in AvailabilityZone us-east-2a of the Ohio region). A subnet is required to start an EMR cluster. Make sure that your selected subnet allows SSH access (port 22) from your local host where you will be invoking the benchmarking script. The public subnet in the default VPC in your account might be a suitable choice. See AWS EMR documentation for more info on [VPCs for EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-vpc-host-job-flows.html) and related info on SSH access in [managed security groups used by EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html). 32 | 33 | ``` 34 | aws ec2 describe-subnets 35 | export SUBNET_ID= 36 | ``` 37 | ## Run Benchmarks 38 | 39 | - Start the cpu or gpu cluster and run all benchmarks. 40 | ``` 41 | ./run_benchmark.sh [cpu|gpu] 2>&1 | tee benchmark.log 42 | ``` 43 | **Note**: the created cluster is configured to automatically terminate after 30 minutes of idle time, but it can still be manually terminated or deleted via the AWS EMR Console. 44 | 45 | **Note**: monitor benchmark progress periodically in case of a possible hang, to avoid incurring cloud costs in such cases. 46 | 47 | - Extract timing information. To view the original EMR log files, log in to the [AWS EMR console](https://console.aws.amazon.com/emr/). Click "Clusters", choose the created cluster, click "Steps", then click "stdout" for each spark-submit application. 48 | ``` 49 | egrep -e "[0-9.]* seconds" *.log 50 | ``` 51 | 52 | - Stop the cluster via the AWS EMR Console, or via command line. 53 | ``` 54 | cluster_id=$(grep "cluster-id" benchmark.log | grep -o 'j-[0-9|A-Z]*' | head -n 1) 55 | aws emr terminate-clusters --cluster-ids ${cluster_id} 56 | ``` 57 | - **OPTIONAL**: To run a single benchmark manually, search the `benchmark.log` for the `aws emr add-steps` command line associated with the target benchmark. If needed, start the cluster first and obtain its cluster_id. Then, just copy-and-paste that command line into your shell with the correct cluster_id. 58 | ``` 59 | ./start_cluster.sh [cpu|gpu] 60 | 61 | ``` 62 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/cpu-init-configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification":"spark-defaults", 4 | "Properties":{ 5 | "spark.pyspark.python":"/usr/local/bin/python3.10", 6 | "spark.pyspark.driver.python":"/usr/local/bin/python3.10" 7 | } 8 | } 9 | ] 10 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # copies files to s3 bucket 18 | 19 | if [[ -z $BENCHMARK_HOME ]]; then 20 | echo "please export BENCHMARK_HOME per README.md" 21 | exit 1 22 | fi 23 | 24 | SPARK_RAPIDS_ML_HOME='../..' 25 | 26 | echo "**** copying benchmarking related files to ${BENCHMARK_HOME} ****" 27 | 28 | aws s3 cp ../../../notebooks/aws-emr/init-bootstrap-action.sh s3://${BENCHMARK_HOME}/init-bootstrap-action.sh 29 | 30 | pushd ${SPARK_RAPIDS_ML_HOME}/benchmark 31 | zip -r - benchmark > benchmark.zip 32 | aws s3 cp benchmark.zip s3://${BENCHMARK_HOME}/benchmark.zip 33 | popd 34 | 35 | pushd ${SPARK_RAPIDS_ML_HOME} 36 | aws s3 cp benchmark/benchmark_runner.py s3://${BENCHMARK_HOME}/benchmark_runner.py 37 | popd 38 | 39 | pushd ${SPARK_RAPIDS_ML_HOME}/src 40 | zip -r - spark_rapids_ml >spark_rapids_ml.zip 41 | aws s3 cp spark_rapids_ml.zip s3://${BENCHMARK_HOME}/spark_rapids_ml.zip 42 | popd 43 | -------------------------------------------------------------------------------- /python/benchmark/aws-emr/start_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | set -o pipefail 17 | 18 | cluster_type=${1:-gpu} 19 | 20 | # configure arguments 21 | if [[ -z ${SUBNET_ID} ]]; then 22 | echo "Please export SUBNET_ID per README.md" 23 | exit 1 24 | fi 25 | 26 | if [[ -z ${BENCHMARK_HOME} ]]; then 27 | echo "Please export BENCHMARK_HOME per README.md" 28 | exit 1 29 | fi 30 | 31 | if [[ -z ${KEYPAIR} ]]; then 32 | echo "Please export KEYPAIR per README.md" 33 | exit 1 34 | fi 35 | 36 | cluster_name=spark-rapids-ml-${cluster_type} 37 | cur_dir=$(pwd) 38 | 39 | if [[ ${cluster_type} == "gpu" ]]; then 40 | core_type=g5.2xlarge 41 | config_json="file://${cur_dir}/../../../notebooks/aws-emr/init-configurations.json" 42 | bootstrap_actions="--bootstrap-actions Name='Spark Rapids ML Bootstrap action',Path=s3://${BENCHMARK_HOME}/init-bootstrap-action.sh" 43 | elif [[ ${cluster_type} == "cpu" ]]; then 44 | core_type=m6gd.2xlarge 45 | config_json="file://${cur_dir}/cpu-init-configurations.json" 46 | bootstrap_actions="" 47 | else 48 | echo "unknown cluster type ${cluster_type}" 49 | echo "usage: $(basename $0) cpu|gpu" 50 | exit 1 51 | fi 52 | 53 | start_cmd="aws emr create-cluster \ 54 | --name ${cluster_name} \ 55 | --release-label emr-7.3.0 \ 56 | --applications Name=Hadoop Name=Spark \ 57 | --service-role EMR_DefaultRole \ 58 | --log-uri s3://${BENCHMARK_HOME}/logs \ 59 | --ec2-attributes KeyName=$(basename ${KEYPAIR} | sed -e 's/\.pem//g' ),SubnetId=${SUBNET_ID},InstanceProfile=EMR_EC2_DefaultRole \ 60 | --ebs-root-volume-size=32 \ 61 | --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m4.2xlarge \ 62 | InstanceGroupType=CORE,InstanceCount=3,InstanceType=${core_type} \ 63 | --configurations ${config_json} $bootstrap_actions 64 | " 65 | 66 | CLUSTER_ID=$( eval ${start_cmd} | tee /dev/tty | grep "ClusterId" | grep -o 'j-[0-9|A-Z]*') 67 | aws emr put-auto-termination-policy --cluster-id ${CLUSTER_ID} --auto-termination-policy IdleTimeout=1800 68 | echo "waiting for cluster ${CLUSTER_ID} to start ... " 1>&2 69 | 70 | aws emr wait cluster-running --cluster-id $CLUSTER_ID 71 | 72 | echo "cluster started." 1>&2 73 | echo $CLUSTER_ID 74 | -------------------------------------------------------------------------------- /python/benchmark/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /python/benchmark/benchmark/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022-2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import inspect 17 | from distutils.util import strtobool 18 | from time import time 19 | from typing import Any, Callable, Dict, List 20 | 21 | from pyspark.sql import SparkSession 22 | 23 | 24 | class WithSparkSession(object): 25 | def __init__(self, confs: List[str], shutdown: bool = True) -> None: 26 | builder = SparkSession.builder 27 | for conf in confs: 28 | key, value = (conf.split("=")[0], "=".join(conf.split("=")[1:])) 29 | print(key, value) 30 | builder = builder.config(key, value) 31 | self.spark = builder.getOrCreate() 32 | self.shutdown = shutdown 33 | 34 | def __enter__(self) -> SparkSession: 35 | return self.spark 36 | 37 | def __exit__(self, *args: Any) -> None: 38 | if self.shutdown: 39 | print("stopping spark session") 40 | self.spark.stop() 41 | 42 | 43 | def with_benchmark(phrase: str, action: Callable) -> Any: 44 | start = time() 45 | result = action() 46 | end = time() 47 | print("-" * 100) 48 | duration = round(end - start, 2) 49 | print("{}: {} seconds".format(phrase, duration)) 50 | print("-" * 100) 51 | return result, duration 52 | 53 | 54 | def inspect_default_params_from_func( 55 | func: Callable, unsupported_set: List[str] = [] 56 | ) -> Dict[str, Any]: 57 | """ 58 | Returns a dictionary of the parameters of function ``func`` and their default values. 59 | Only parameters with a default value are included. 60 | """ 61 | sig = inspect.signature(func) 62 | filtered_params_dict = {} 63 | for parameter in sig.parameters.values(): 64 | # Remove parameters without a default value and those in the unsupported_set 65 | if ( 66 | parameter.default is not parameter.empty 67 | and parameter.default is not None 68 | and parameter.name not in unsupported_set 69 | ): 70 | filtered_params_dict[parameter.name] = parameter.default 71 | return filtered_params_dict 72 | 73 | 74 | def to_bool(literal: str) -> bool: 75 | return bool(strtobool(literal)) 76 | 77 | 78 | def is_remote() -> bool: 79 | try: 80 | # pyspark.sql.utils.is_remote is not available in older versions of pyspark, in which case remote is not supported 81 | from pyspark.sql.utils import is_remote  # type: ignore 82 | 83 | return is_remote() 84 | except: 85 | return False 86 | -------------------------------------------------------------------------------- /python/benchmark/benchmark/utils_knn.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022-2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
15 | # 16 | from typing import Optional, Tuple 17 | 18 | from pyspark.sql import DataFrame 19 | 20 | from spark_rapids_ml.core import ( 21 | EvalMetricInfo, 22 | _ConstructFunc, 23 | _EvaluateFunc, 24 | _TransformFunc, 25 | ) 26 | from spark_rapids_ml.knn import ApproximateNearestNeighborsModel 27 | 28 | 29 | class CPUNearestNeighborsModel(ApproximateNearestNeighborsModel): 30 | def __init__(self, item_df: DataFrame): 31 | super().__init__(item_df) 32 | 33 | def kneighbors( 34 | self, query_df: DataFrame, sort_knn_df_by_query_id: bool = True 35 | ) -> Tuple[DataFrame, DataFrame, DataFrame]: 36 | self._item_df_withid = self._ensureIdCol(self._item_df_withid) 37 | return super().kneighbors( 38 | query_df, sort_knn_df_by_query_id=sort_knn_df_by_query_id 39 | ) 40 | 41 | def _get_cuml_transform_func( 42 | self, dataset: DataFrame, eval_metric_info: Optional[EvalMetricInfo] = None 43 | ) -> Tuple[ 44 | _ConstructFunc, 45 | _TransformFunc, 46 | Optional[_EvaluateFunc], 47 | ]: 48 | self._cuml_params["algorithm"] = "brute" 49 | _, _transform_internal, _ = super()._get_cuml_transform_func( 50 | dataset, eval_metric_info 51 | ) 52 | 53 | from sklearn.neighbors import NearestNeighbors as SKNN 54 | 55 | n_neighbors = self.getK() 56 | 57 | def _construct_sknn() -> SKNN: 58 | nn_object = SKNN(algorithm="brute", n_neighbors=n_neighbors) 59 | return nn_object 60 | 61 | return _construct_sknn, _transform_internal, None 62 | 63 | def _concate_pdf_batches(self) -> bool: 64 | return False 65 | -------------------------------------------------------------------------------- /python/benchmark/benchmark_runner.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022-2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | import argparse 17 | import sys 18 | 19 | from benchmark.bench_approximate_nearest_neighbors import ( 20 | BenchmarkApproximateNearestNeighbors, 21 | ) 22 | from benchmark.bench_dbscan import BenchmarkDBSCAN 23 | from benchmark.bench_kmeans import BenchmarkKMeans 24 | from benchmark.bench_linear_regression import BenchmarkLinearRegression 25 | from benchmark.bench_logistic_regression import BenchmarkLogisticRegression 26 | from benchmark.bench_nearest_neighbors import BenchmarkNearestNeighbors 27 | from benchmark.bench_pca import BenchmarkPCA 28 | from benchmark.bench_random_forest import ( 29 | BenchmarkRandomForestClassifier, 30 | BenchmarkRandomForestRegressor, 31 | ) 32 | from benchmark.bench_umap import BenchmarkUMAP 33 | 34 | 35 | class BenchmarkRunner: 36 | def __init__(self) -> None: 37 | registered_algorithms = { 38 | "approximate_nearest_neighbors": BenchmarkApproximateNearestNeighbors, 39 | "dbscan": BenchmarkDBSCAN, 40 | "kmeans": BenchmarkKMeans, 41 | "knn": BenchmarkNearestNeighbors, 42 | "linear_regression": BenchmarkLinearRegression, 43 | "pca": BenchmarkPCA, 44 | "random_forest_classifier": BenchmarkRandomForestClassifier, 45 | "random_forest_regressor": BenchmarkRandomForestRegressor, 46 | "logistic_regression": BenchmarkLogisticRegression, 47 | "umap": BenchmarkUMAP, 48 | } 49 | algorithms = "\n ".join(registered_algorithms.keys()) 50 | parser = argparse.ArgumentParser( 51 | description="Benchmark Spark Rapids ML algorithms", 52 | usage=f"""benchmark_runner.py [] 53 | 54 | Supported algorithms are: 55 | {algorithms} 56 | """, 57 | ) 58 | parser.add_argument("algorithm", help="benchmark the ML algorithms") 59 | # parse_args defaults to [1:] for args, but you need to 60 | # exclude the rest of the args too, or validation will fail 61 | args = parser.parse_args(sys.argv[1:2]) 62 | 63 | if args.algorithm not in registered_algorithms: 64 | print("Unrecognized algorithm: ", args.algorithm) 65 | parser.print_help() 66 | exit(1) 67 | 68 | self._runner: BenchmarkBase = registered_algorithms[args.algorithm](  # type: ignore 69 | sys.argv[2:] 70 | ) 71 | 72 | def run(self) -> None: 73 | self._runner.run() 74 | 75 | 76 | if __name__ == "__main__": 77 | """ 78 | There are two ways to run the benchmarks. 79 | 80 | 1. 81 | python benchmark_runner.py [linear_regression] \ 82 | --num_gpus=2 \ 83 | --train_path=xxx \ 84 | --spark_confs="spark.master=local[12]" \ 85 | 86 | 2. 87 | spark-submit --master local[12] benchmark_runner.py --num_gpus=2 --train_path=xxx 88 | """ 89 | BenchmarkRunner().run() 90 | -------------------------------------------------------------------------------- /python/benchmark/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
15 | #
16 | 
17 | import os
18 | import sys
19 | 
20 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21 | from tests.conftest import (
22 |     pytest_addoption,
23 |     pytest_collection_modifyitems,
24 |     pytest_configure,
25 | )
26 | 
-------------------------------------------------------------------------------- /python/benchmark/databricks/cpu_cluster_spec.sh: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # needed for bm script arguments
16 | cat <<EOF
[lines 17-72 lost in extraction: the remainder of this heredoc and the beginning of a following databricks init-script upload loop]
73 | ... ${init_script}.updated && \
74 | databricks workspace import --format AUTO --content $(${base64_cmd} ${init_script}.updated) ${INIT_SCRIPT_DIR}/${init_script} --profile ${DB_PROFILE} ${DB_OVERWRITE}
75 | done
76 | 
-------------------------------------------------------------------------------- /python/benchmark/dataproc/README.md: --------------------------------------------------------------------------------
1 | # Benchmarking on Dataproc
2 | 
3 | This directory contains shell scripts for running larger scale benchmarks on a Google Dataproc cluster. You will need a Google Cloud account with Dataproc enabled to run them. The benchmarks use datasets synthetically generated using [gen_data.py](../gen_data.py). For internal users, these have been precomputed and stored in the GCS bucket `gs://spark-rapids-ml-benchmarking/datasets`. By default, the benchmark scripts read the data from this location. External users will need to generate the datasets and then set the `BENCHMARK_DATA_HOME` environment variable in `run_benchmark.sh` to point at them.
4 | 
5 | ## Setup
6 | 
7 | - Install the [gcloud CLI](https://cloud.google.com/sdk/docs/install) and initialize it via `gcloud init`.
8 | 
9 | - Configure the following settings:
10 | ```
11 | export PROJECT=<your_project>
12 | export DATAPROC_REGION=<your_dataproc_region>
13 | export COMPUTE_REGION=<your_compute_region>
14 | export COMPUTE_ZONE=<your_compute_zone>
15 | 
16 | gcloud config set project ${PROJECT}
17 | gcloud config set dataproc/region ${DATAPROC_REGION}
18 | gcloud config set compute/region ${COMPUTE_REGION}
19 | gcloud config set compute/zone ${COMPUTE_ZONE}
20 | ```
21 | 
22 | - Create a GCS bucket if you don't already have one:
23 | ```
24 | export GCS_BUCKET=<your_gcs_bucket>
25 | 
26 | gcloud storage buckets create gs://${GCS_BUCKET}
27 | ```
28 | 
29 | - Upload the benchmarking files to your GCS bucket:
30 | ```
31 | # path to store benchmarking files inside your GCS bucket
32 | export BENCHMARK_HOME=${GCS_BUCKET}/benchmark
33 | 
34 | ./setup.sh
35 | ```
36 | **Note**: this step should be repeated for each new version of the spark-rapids-ml package that you want to test.
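
As a condensed, copy-pasteable recap of the Setup steps above (all values are the placeholders from those steps, not real project or bucket names):

```
export PROJECT=<your_project>
export DATAPROC_REGION=<your_dataproc_region>
export COMPUTE_REGION=<your_compute_region>
export COMPUTE_ZONE=<your_compute_zone>
gcloud config set project ${PROJECT}
gcloud config set dataproc/region ${DATAPROC_REGION}
gcloud config set compute/region ${COMPUTE_REGION}
gcloud config set compute/zone ${COMPUTE_ZONE}

export GCS_BUCKET=<your_gcs_bucket>
gcloud storage buckets create gs://${GCS_BUCKET}

export BENCHMARK_HOME=${GCS_BUCKET}/benchmark
./setup.sh   # repeat this last step for each new spark-rapids-ml version
```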
37 | 
38 | ## Run Benchmarks
39 | 
40 | - Start the cpu or gpu cluster and run all benchmarks:
41 | ```
42 | ./run_benchmark.sh [cpu|gpu] 2>&1 | tee benchmark.log
43 | ```
44 | **Note**: the created cluster is configured to automatically terminate after 30 minutes of idle time, but it can still be manually terminated or deleted via the Dataproc UI.
45 | 
46 | **Note**: monitor benchmark progress periodically; a hung benchmark left unattended will continue to incur cloud costs.
47 | 
48 | - Extract timing information:
49 | ```
50 | egrep -e "[0-9.]* seconds" *.out
51 | ```
52 | 
53 | - Delete the cluster via the Dataproc UI, or via this command line:
54 | ```
55 | gcloud dataproc clusters delete ${USER}-spark-rapids-ml-[cpu|gpu] --region ${COMPUTE_REGION}
56 | ```
57 | 
58 | - **OPTIONAL**: To run a single benchmark manually, search `benchmark.log` for the `gcloud` command line associated with the target benchmark. If needed, start the cluster first. Then copy and paste that command line into your shell.
59 | ```
60 | ./start_cluster.sh [cpu|gpu]
61 | <gcloud command line copied from benchmark.log>
62 | ```
63 | 
-------------------------------------------------------------------------------- /python/benchmark/dataproc/init_benchmark.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | set -euxo pipefail
18 | 
19 | function get_metadata_attribute() {
20 |   local -r attribute_name=$1
21 |   local -r default_value=$2
22 |   /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
23 | }
24 | 
25 | RAPIDS_VERSION=$(get_metadata_attribute rapids-version 25.6.0)
26 | 
27 | 
28 | # install cudf, cuml, and related RAPIDS packages
29 | # using ~= pulls in the latest micro version patches
30 | pip install --upgrade pip
31 | 
32 | pip install cudf-cu12~=${RAPIDS_VERSION} cuml-cu12~=${RAPIDS_VERSION} cuvs-cu12~=${RAPIDS_VERSION} \
33 |     pylibraft-cu12~=${RAPIDS_VERSION} \
34 |     rmm-cu12~=${RAPIDS_VERSION} \
35 |     --extra-index-url=https://pypi.nvidia.com
36 | 
37 | # install benchmark files
38 | BENCHMARK_HOME=$(get_metadata_attribute benchmark-home UNSET)
39 | if [[ ${BENCHMARK_HOME} == "UNSET" ]]; then
40 |   echo "Please set --metadata benchmark-home"
41 |   exit 1
42 | fi
43 | 
44 | gsutil cp gs://${BENCHMARK_HOME}/benchmark_runner.py .
45 | gsutil cp gs://${BENCHMARK_HOME}/spark_rapids_ml.zip .
46 | gsutil cp gs://${BENCHMARK_HOME}/benchmark.zip .
47 | 
48 | python_ver=`python --version | grep -oP '3\.[0-9]+'`
49 | unzip spark_rapids_ml.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
50 | unzip benchmark.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
-------------------------------------------------------------------------------- /python/benchmark/dataproc/setup.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 | # Copyright (c) 2024, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
17 | # copies benchmark files to the GCS bucket
18 | 
19 | if [[ -z $BENCHMARK_HOME ]]; then
20 |   echo "Please export BENCHMARK_HOME per README.md"
21 |   exit 1
22 | fi
23 | 
24 | SPARK_RAPIDS_ML_HOME='../..'
25 | 
26 | echo "**** copying benchmarking related files to ${BENCHMARK_HOME} ****"
27 | 
28 | gsutil cp init_benchmark.sh gs://${BENCHMARK_HOME}/init_benchmark.sh
29 | curl -LO https://raw.githubusercontent.com/GoogleCloudDataproc/initialization-actions/master/spark-rapids/spark-rapids.sh
30 | gsutil cp spark-rapids.sh gs://${BENCHMARK_HOME}/spark-rapids.sh
31 | 
32 | pushd ${SPARK_RAPIDS_ML_HOME}/benchmark
33 | zip -r - benchmark >benchmark.zip
34 | gsutil cp benchmark.zip gs://${BENCHMARK_HOME}/benchmark.zip
35 | popd
36 | 
37 | pushd ${SPARK_RAPIDS_ML_HOME}
38 | gsutil cp benchmark/benchmark_runner.py gs://${BENCHMARK_HOME}/benchmark_runner.py
39 | popd
40 | 
41 | pushd ${SPARK_RAPIDS_ML_HOME}/src
42 | zip -r - spark_rapids_ml >spark_rapids_ml.zip
43 | gsutil cp spark_rapids_ml.zip gs://${BENCHMARK_HOME}/spark_rapids_ml.zip
44 | popd
45 | 
-------------------------------------------------------------------------------- /python/benchmark/dataproc/start_cluster.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | cluster_type=${1:-gpu}
17 | 
18 | # configure arguments
19 | if [[ -z ${COMPUTE_REGION} ]]; then
20 |   echo "Please export COMPUTE_REGION per README.md"
21 |   exit 1
22 | fi
23 | 
24 | if [[ -z ${GCS_BUCKET} ]]; then
25 |   echo "Please export GCS_BUCKET per README.md"
26 |   exit 1
27 | fi
28 | 
29 | BENCHMARK_HOME=${BENCHMARK_HOME:-${GCS_BUCKET}/benchmark}
30 | 
31 | gpu_args=$(cat <<EOF
[lines 32 onward of this file, and the beginning of /python/pyproject.toml, were lost in extraction]
-------------------------------------------------------------------------------- /python/pyproject.toml: --------------------------------------------------------------------------------
52 | [build-system]
53 | requires = ["setuptools>=61.0"]
54 | build-backend = "setuptools.build_meta"
55 | 
56 | [tool.setuptools.package-data]
57 | "spark_rapids_ml.jars" = ["*.jar"]
58 | 
-------------------------------------------------------------------------------- /python/requirements.txt: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | pyspark>=3.2.1,<3.5
16 | scikit-learn>=1.2.1
-------------------------------------------------------------------------------- /python/requirements_dev.txt: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | -r requirements.txt
16 | black>=23.1.0
17 | build>=0.10.0
18 | isort>=5.12.0
19 | mypy>=1.0.0
20 | numpydoc
21 | pydata-sphinx-theme
22 | pylint
23 | pytest
24 | pytest-xdist
25 | sphinx<6.0
26 | twine>=4.0.0
-------------------------------------------------------------------------------- /python/run_plugin_test.sh: --------------------------------------------------------------------------------
1 | #! /bin/bash -e
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # the JVM plugin tests run against pyspark 4.x
17 | pip install pyspark~=4.0
18 | pushd ../jvm
19 | mvn clean test
20 | popd
21 | # restore the pinned pyspark version from the dev requirements
22 | pip install -r requirements_dev.txt
-------------------------------------------------------------------------------- /python/run_test.sh: --------------------------------------------------------------------------------
1 | #! /bin/bash -e
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | unset SPARK_HOME
17 | 
18 | python ../ci/lint_python.py --format --type-check || exit 1
19 | 
20 | total_num_gpus=$(python -c 'import cupy; print(cupy.cuda.runtime.getDeviceCount())')
21 | if [ ${total_num_gpus} -gt 4 ]
22 | then
23 |   echo "Tests use at most 4 GPUs. If they fail, try setting CUDA_VISIBLE_DEVICES."
24 | fi
25 | 
26 | # no package import change tests
27 | # runs on gpu
28 | python -m spark_rapids_ml tests_no_import_change/test_no_import_change.py 0.2
29 | # runs on cpu
30 | python tests_no_import_change/test_no_import_change.py 0.2
31 | # runs on gpu with spark-submit (note: local[1] hangs with spark-rapids-submit, probably due to barrier rdd timer threads; root cause TBD)
32 | spark-rapids-submit --master local-cluster[1,1,1024] tests_no_import_change/test_no_import_change.py 0.2
33 | # runs on cpu with spark-submit
34 | spark-submit --master local-cluster[1,1,1024] tests_no_import_change/test_no_import_change.py 0.2
35 | 
36 | 
37 | # calculate pytest parallelism by following https://github.com/NVIDIA/spark-rapids/tree/main/integration_tests/run_pyspark_from_build.sh
38 | MAX_PARALLEL=3
39 | NVIDIA_SMI_ARGS=""
40 | if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
41 |   NVIDIA_SMI_ARGS="${NVIDIA_SMI_ARGS} -i ${CUDA_VISIBLE_DEVICES}"
42 | fi
43 | GPU_MEM_PARALLEL=`nvidia-smi ${NVIDIA_SMI_ARGS} --query-gpu=memory.free --format=csv,noheader | awk 'NR == 1 { MIN = $1 } { if ($1 < MIN) { MIN = $1 } } END { print int((MIN - 2 * 1024) / ((3 * 1024) + 750)) }'`
44 | CPU_CORES=`nproc`
45 | TMP_PARALLEL=$(( $GPU_MEM_PARALLEL > $CPU_CORES ? $CPU_CORES : $GPU_MEM_PARALLEL ))
46 | TMP_PARALLEL=$(( $TMP_PARALLEL > $MAX_PARALLEL ? $MAX_PARALLEL : $TMP_PARALLEL ))
47 | if (( $TMP_PARALLEL <= 1 )); then
48 |   TEST_PARALLEL=1
49 | else
50 |   TEST_PARALLEL=$TMP_PARALLEL
51 | fi
52 | echo "Test functions in benchmark/test_gen_data.py and tests/ directory will be executed in parallel with ${TEST_PARALLEL} pytest workers"
53 | 
54 | echo "use --runslow to run all tests"
55 | pytest "$@" -n ${TEST_PARALLEL} benchmark/test_gen_data.py
56 | PYTHONPATH=`pwd`/benchmark pytest -ra "$@" -n ${TEST_PARALLEL} --durations=10 tests
57 | #PYTHONPATH=`pwd`/benchmark pytest -ra --runslow -n ${TEST_PARALLEL} --durations=10 tests
58 | #PYTHONPATH=`pwd`/benchmark pytest -ra "$@" --durations=10 tests_large
-------------------------------------------------------------------------------- /python/setup.cfg: --------------------------------------------------------------------------------
1 | # Copyright (c) 2024, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | [mypy]
16 | ignore_missing_imports = True
17 | disallow_untyped_defs = True
18 | follow_imports = silent
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | __version__ = "25.06.0"
17 | 
18 | import pandas as pd
19 | import pyspark
20 | 
21 | # patch pandas 2.0+ for backward compatibility with pyspark < 3.4
22 | from packaging import version
23 | 
24 | if version.parse(pyspark.__version__) < version.parse("3.4.0") and version.parse(
25 |     pd.__version__
26 | ) >= version.parse("2.0.0"):
27 |     pd.DataFrame.iteritems = pd.DataFrame.items
28 |     pd.Series.iteritems = pd.Series.items
29 | 
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/__main__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2024, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import argparse
18 | import runpy
19 | import sys
20 | 
21 | import spark_rapids_ml.install
22 | 
23 | 
24 | # borrowed from rapids cudf.pandas
25 | def main() -> None:
26 |     parser = argparse.ArgumentParser(
27 |         prog="python -m spark_rapids_ml",
28 |         description=(
29 |             "Run a Python script with Spark RAPIDS ML enabled. "
30 |             "In this mode, supported pyspark.ml estimator imports will automatically use GPU accelerated implementations."
31 |         ),
32 |     )
33 | 
34 |     parser.add_argument(
35 |         "-m",
36 |         dest="module",
37 |         nargs=1,
38 |     )
39 |     parser.add_argument(
40 |         "args",
41 |         nargs=argparse.REMAINDER,
42 |         help="Arguments to pass on to the script",
43 |     )
44 | 
45 |     args = parser.parse_args()
46 | 
47 |     if args.module:
48 |         (module,) = args.module
49 |         # run the module passing the remaining arguments
50 |         # as if it were run with python -m
51 |         sys.argv[:] = [module] + args.args  # not thread safe?
52 |         runpy.run_module(module, run_name="__main__")
53 |     elif len(args.args) >= 1:
54 |         # Remove ourselves from argv and continue
55 |         sys.argv[:] = args.args
56 |         runpy.run_path(args.args[0], run_name="__main__")
57 |     else:
58 |         parser.print_help()
59 |         exit(1)
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     main()
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/common/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2022, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/install.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2024-2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import importlib
18 | import sys
19 | import types
20 | from typing import Any
21 | 
22 | _accelerated_attributes = {
23 |     "feature": ["PCA", "PCAModel"],
24 |     "clustering": ["KMeans", "KMeansModel"],
25 |     "classification": [
26 |         "LogisticRegression",
27 |         "LogisticRegressionModel",
28 |         "RandomForestClassifier",
29 |         "RandomForestClassificationModel",
30 |     ],
31 |     "regression": [
32 |         "LinearRegression",
33 |         "LinearRegressionModel",
34 |         "RandomForestRegressor",
35 |         "RandomForestRegressionModel",
36 |     ],
37 |     "tuning": ["CrossValidator"],
38 |     "pipeline": ["Pipeline"],
39 | }
40 | 
41 | 
42 | _rapids_modules = {
43 |     module_name: importlib.import_module(f"spark_rapids_ml.{module_name}")
44 |     for module_name in _accelerated_attributes.keys()
45 | }
46 | _pyspark_modules = {
47 |     module_name: importlib.import_module(f"pyspark.ml.{module_name}")
48 |     for module_name in _accelerated_attributes.keys()
49 | }
50 | 
51 | 
52 | def _set_pyspark_mod_getattr(mod_name: str) -> None:
53 |     proxy_module = types.ModuleType(f"pyspark.ml.{mod_name}")
54 | 
55 |     def _getattr(attr: str) -> Any:
56 |         frame = sys._getframe()
57 |         assert frame.f_back
58 |         calling_path = frame.f_back.f_code.co_filename
59 |         # calls made from within pyspark.ml or spark_rapids_ml themselves, and
60 |         # attributes without accelerated equivalents, resolve to original pyspark
61 |         if any(
62 |             f"pyspark/ml/{m}" in calling_path or f"spark_rapids_ml/{m}" in calling_path
63 |             for m in _accelerated_attributes.keys()
64 |         ) or (attr not in _accelerated_attributes[mod_name]):
65 |             try:
66 |                 attr_val = getattr(_pyspark_modules[mod_name], attr)
67 |             except AttributeError:
68 |                 raise AttributeError(f"No attribute '{attr}'")
69 | 
70 |             return attr_val
71 |         else:
72 |             return getattr(_rapids_modules[mod_name], attr)
73 | 
74 |     setattr(proxy_module, "__getattr__", _getattr)
75 |     sys.modules[f"pyspark.ml.{mod_name}"] = proxy_module
76 | 
77 | 
78 | for mod_name in _accelerated_attributes.keys():
79 |     _set_pyspark_mod_getattr(mod_name)
80 | 
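The proxy modules installed above are what power the package's no-import-change mode described in `__main__.py`. As a rough usage sketch (the script name is a placeholder; the `--master` value mirrors the invocation used in `run_test.sh`), unmodified `pyspark.ml` code can be launched through the provided entry points:

```
# run an existing pyspark script with supported pyspark.ml imports accelerated
python -m spark_rapids_ml my_script.py

# or launch it through the spark-submit wrapper installed with the package
spark-rapids-submit --master local-cluster[1,1,1024] my_script.py
```

-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/metrics/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2024, NVIDIA CORPORATION.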
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from collections import namedtuple
18 | from dataclasses import dataclass
19 | from typing import Optional
20 | 
21 | # Global parameter used by core and subclasses.
22 | TransformEvaluateMetric = namedtuple(
23 |     "TransformEvaluateMetric", ("accuracy_like", "log_loss", "regression")
24 | )
25 | transform_evaluate_metric = TransformEvaluateMetric(
26 |     "accuracy_like", "log_loss", "regression"
27 | )
28 | 
29 | 
30 | @dataclass
31 | class EvalMetricInfo:
32 |     """Holds information about Spark evaluators that is passed to
33 |     transform_evaluate local computations."""
34 | 
35 |     # MulticlassClassificationEvaluator
36 |     eps: float = 1.0e-15  # logLoss
37 |     # BinaryClassificationEvaluator - placeholder until it is supported
38 |     numBins: int = 1000
39 | 
40 |     eval_metric: Optional[str] = None
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/pyspark_rapids.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import os
18 | import subprocess
19 | import sys
20 | 
21 | import spark_rapids_ml
22 | 
23 | 
24 | def main_cli() -> None:
25 | 
26 |     # scan past leading pyspark options: boolean flags advance one token,
27 |     # options that take a value advance two
28 |     i = 1
29 |     while i < len(sys.argv) and sys.argv[i].startswith("-"):
30 |         if sys.argv[i] in ["--help", "-h", "--version"]:
31 |             output = subprocess.run(
32 |                 f"pyspark {sys.argv[i]}", shell=True, capture_output=True
33 |             ).stderr
34 |             output_str = output.decode("utf-8")
35 |             output_str = output_str.replace("pyspark", "pyspark-rapids")
36 |             print(output_str, file=sys.stderr)
37 |             exit(0)
38 |         elif sys.argv[i] in ["--verbose", "-v", "--supervise"]:
39 |             i += 1
40 |         else:
41 |             i += 2
42 | 
43 |     command_line = "pyspark " + " ".join(sys.argv[1:])
44 |     env = dict(os.environ)
45 |     # run install.py at interpreter startup so pyspark.ml imports are proxied
46 |     env["PYTHONSTARTUP"] = f"{spark_rapids_ml.__path__[0]}/install.py"
47 |     subprocess.run(command_line, shell=True, env=env)
-------------------------------------------------------------------------------- /python/src/spark_rapids_ml/spark_rapids_submit.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2025, NVIDIA CORPORATION.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import subprocess 18 | import sys 19 | 20 | import spark_rapids_ml 21 | 22 | 23 | def main_cli() -> None: 24 | i = 1 25 | while i < len(sys.argv) and sys.argv[i].startswith("-"): 26 | if sys.argv[i] in ["--help", "-h", "--version"]: 27 | output = subprocess.run( 28 | f"spark-submit {sys.argv[i]}", shell=True, capture_output=True 29 | ).stderr 30 | output_str = output.decode("utf-8") 31 | output_str = output_str.replace("spark-submit", "spark-rapids-submit") 32 | print(output_str, file=sys.stderr) 33 | exit(0) 34 | elif sys.argv[i] in ["--verbose", "-v", "--supervise"]: 35 | i += 1 36 | else: 37 | i += 2 38 | 39 | if i >= len(sys.argv): 40 | raise ValueError("No application file supplied.") 41 | 42 | command_line = ( 43 | "spark-submit " 44 | + " ".join(sys.argv[1:i]) 45 | + f" {spark_rapids_ml.__path__[0]}/__main__.py " 46 | + " ".join(sys.argv[i:]) 47 | ) 48 | 49 | subprocess.run(command_line, shell=True) 50 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/tests/discover_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | if ! 
command -v nvidia-smi &> /dev/null 18 | then 19 | # default to the first GPU 20 | echo "{\"name\":\"gpu\",\"addresses\":[\"0\"]}" 21 | exit 22 | else 23 | # https://github.com/apache/spark/blob/master/examples/src/main/scripts/getGpusResources.sh 24 | ADDRS=`nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e'$!ba' -e 's/\n/","/g'` 25 | echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]} 26 | fi -------------------------------------------------------------------------------- /python/tests/sparksession.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2025, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from typing import Any, Dict 17 | 18 | from pyspark.sql import SparkSession 19 | 20 | from .conftest import _default_conf, get_spark_i_know_what_i_am_doing 21 | 22 | 23 | # sparksession.py is copied from spark-rapids 24 | def _from_scala_map(scala_map) -> Dict[str, Any]: # type: ignore 25 | ret = {} 26 | # The value we get is a scala map, not a java map, so we need to jump through some hoops 27 | keys = scala_map.keys().iterator() # type: ignore 28 | while keys.hasNext(): # type: ignore 29 | key = keys.next() # type: ignore 30 | ret[key] = scala_map.get(key).get() # type: ignore 31 | return ret # type: ignore 32 | 33 | 34 | _spark = get_spark_i_know_what_i_am_doing() 35 | # Have to reach into a private member to get access to the API we need 36 | _orig_conf = _from_scala_map(_spark.conf._jconf.getAll()) # type: ignore 37 | _orig_conf_keys = _orig_conf.keys() # type: ignore 38 | 39 | 40 | class CleanSparkSession: 41 | """ 42 | A context manager to auto reset spark conf. 43 | """ 44 | 45 | def __init__(self, conf: Dict[str, Any] = {}) -> None: 46 | self.conf = conf 47 | self.spark = _spark 48 | 49 | def __enter__(self) -> SparkSession: 50 | self._reset_spark_session_conf() 51 | self._set_all_confs(self.conf) 52 | return self.spark 53 | 54 | def __exit__(self, *args: Any) -> None: 55 | self._reset_spark_session_conf() 56 | 57 | def _set_all_confs(self, conf: Dict[str, Any]) -> None: 58 | newconf = _default_conf.copy() 59 | newconf.update(conf) 60 | for key, value in newconf.items(): 61 | if self.spark.conf.get(key, None) != value: 62 | self.spark.conf.set(key, value) 63 | 64 | def _reset_spark_session_conf(self) -> None: 65 | """Reset all of the configs for a given spark session.""" 66 | self._set_all_confs(_orig_conf) 67 | # Have to reach into a private member to get access to the API we need 68 | current_keys = _from_scala_map(self.spark.conf._jconf.getAll()).keys() # type: ignore 69 | for key in current_keys: 70 | if key not in _orig_conf_keys: 71 | self.spark.conf.unset(key) 72 | -------------------------------------------------------------------------------- /python/tests/test_tuning.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from typing import Tuple, Union 17 | 18 | import numpy as np 19 | import pytest 20 | from pyspark.ml.evaluation import RegressionEvaluator 21 | from pyspark.ml.tuning import CrossValidatorModel, ParamGridBuilder 22 | 23 | from spark_rapids_ml.regression import RandomForestRegressor 24 | from spark_rapids_ml.tuning import CrossValidator 25 | 26 | from .sparksession import CleanSparkSession 27 | from .utils import ( 28 | create_pyspark_dataframe, 29 | feature_types, 30 | idfn, 31 | make_regression_dataset, 32 | ) 33 | 34 | 35 | @pytest.mark.parametrize("feature_type", [feature_types.vector]) 36 | @pytest.mark.parametrize("data_type", [np.float32]) 37 | @pytest.mark.parametrize("data_shape", [(100, 8)], ids=idfn) 38 | def test_crossvalidator( 39 | tmp_path: str, 40 | feature_type: str, 41 | data_type: np.dtype, 42 | data_shape: Tuple[int, int], 43 | ) -> None: 44 | X, _, y, _ = make_regression_dataset( 45 | datatype=data_type, 46 | nrows=data_shape[0], 47 | ncols=data_shape[1], 48 | ) 49 | 50 | with CleanSparkSession() as spark: 51 | df, features_col, label_col = create_pyspark_dataframe( 52 | spark, feature_type, data_type, X, y 53 | ) 54 | assert label_col is not None 55 | 56 | rfc = RandomForestRegressor() 57 | rfc.setFeaturesCol(features_col) 58 | rfc.setLabelCol(label_col) 59 | 60 | evaluator = RegressionEvaluator() 61 | evaluator.setLabelCol(label_col) 62 | 63 | grid = ParamGridBuilder().addGrid(rfc.maxBins, [3, 5]).build() 64 | 65 | cv = CrossValidator( 66 | estimator=rfc, 67 | estimatorParamMaps=grid, 68 | evaluator=evaluator, 69 | numFolds=2, 70 | seed=101, 71 | ) 72 | 73 | def check_cv(cv_est: Union[CrossValidator, CrossValidatorModel]) -> None: 74 | assert isinstance(cv_est, (CrossValidator, CrossValidatorModel)) 75 | assert isinstance(cv_est.getEstimator(), RandomForestRegressor) 76 | assert isinstance(cv_est.getEvaluator(), RegressionEvaluator) 77 | assert cv_est.getNumFolds() == 2 78 | assert cv_est.getSeed() == 101 79 | assert cv_est.getEstimatorParamMaps() == grid 80 | 81 | check_cv(cv) 82 | 83 | path = tmp_path + "/cv" 84 | cv_path = f"{path}/cv" 85 | 86 | cv.write().overwrite().save(cv_path) 87 | cv_loaded = CrossValidator.load(cv_path) 88 | 89 | check_cv(cv_loaded) 90 | 91 | cv_model = cv.fit(df) 92 | check_cv(cv_model) 93 | 94 | cv_model_path = f"{path}/cv-model" 95 | cv_model.write().overwrite().save(cv_model_path) 96 | cv_model_loaded = CrossValidatorModel.load(cv_model_path) 97 | 98 | check_cv(cv_model_loaded) 99 | assert evaluator.evaluate(cv_model.transform(df)) == evaluator.evaluate( 100 | cv_model_loaded.transform(df) 101 | ) 102 | -------------------------------------------------------------------------------- /python/tests_large/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /python/tests_large/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import logging 18 | import os 19 | 20 | from pyspark.sql import SparkSession 21 | 22 | _cpu_number = 32 23 | _default_conf = { 24 | "spark.master": f"local[{_cpu_number}]", 25 | "spark.python.worker.reuse": "false", 26 | "spark.driver.host": "127.0.0.1", 27 | "spark.task.maxFailures": "1", 28 | "spark.driver.memory": "128g", 29 | "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false", 30 | "spark.sql.pyspark.jvmStacktrace.enabled": "true", 31 | "spark.sql.execution.arrow.pyspark.enabled": "true", 32 | "spark.rapids.ml.uvm.enabled": True, 33 | } 34 | 35 | 36 | def _get_spark() -> SparkSession: 37 | builder = SparkSession.builder.appName( 38 | name="spark-rapids-ml with tests on large datasets" 39 | ) 40 | for k, v in _default_conf.items(): 41 | builder.config(k, v) 42 | spark = builder.getOrCreate() 43 | spark.sparkContext.setLogLevel("WARN") 44 | logging.getLogger("pyspark").setLevel(logging.WARN) 45 | return spark 46 | 47 | 48 | _spark = _get_spark() 49 | -------------------------------------------------------------------------------- /thirdparty/LICENSES/LICENSE.scikit_learn: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2007-2022 The scikit-learn developers. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | --------------------------------------------------------------------------------