├── Dockerfile ├── LICENSE ├── README.md ├── advanced-topics ├── multi-gpu.md ├── performance_tuning.md └── pics │ ├── multi-gpu-advanced-options.png │ ├── multi-gpu-bootstrap.png │ ├── multi-gpu-s3.png │ └── performance_benchmark.png ├── api-docs ├── python.md └── scala.md ├── datasets ├── ETL │ ├── MortgageETL.ipynb │ └── Taxi_ETL.ipynb ├── agaricus.tar.gz ├── mortgage-small.tar.gz ├── preparing_datasets.md └── taxi-small.tar.gz ├── examples ├── app-parameters │ ├── supported_xgboost_parameters_python.md │ └── supported_xgboost_parameters_scala.md ├── apps │ ├── python │ │ ├── .gitignore │ │ ├── ai │ │ │ ├── __init__.py │ │ │ └── rapids │ │ │ │ ├── __init__.py │ │ │ │ └── spark │ │ │ │ ├── __init__.py │ │ │ │ └── examples │ │ │ │ ├── __init__.py │ │ │ │ ├── agaricus │ │ │ │ ├── __init__.py │ │ │ │ ├── consts.py │ │ │ │ ├── cpu_main.py │ │ │ │ └── gpu_main.py │ │ │ │ ├── main.py │ │ │ │ ├── mortgage │ │ │ │ ├── __init__.py │ │ │ │ ├── consts.py │ │ │ │ ├── cpu_cross_validator_main.py │ │ │ │ ├── cpu_main.py │ │ │ │ ├── gpu_cross_validator_main.py │ │ │ │ └── gpu_main.py │ │ │ │ ├── taxi │ │ │ │ ├── __init__.py │ │ │ │ ├── consts.py │ │ │ │ ├── cpu_main.py │ │ │ │ └── gpu_main.py │ │ │ │ └── utility │ │ │ │ ├── __init__.py │ │ │ │ ├── args.py │ │ │ │ └── utils.py │ │ └── main.py │ └── scala │ │ ├── .gitignore │ │ ├── assembly │ │ └── assembly-no-scala.xml │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ └── ai │ │ └── rapids │ │ └── spark │ │ └── examples │ │ ├── agaricus │ │ ├── CPUMain.scala │ │ └── GPUMain.scala │ │ ├── mortgage │ │ ├── CPUCrossValidatorMain.scala │ │ ├── CPUMain.scala │ │ ├── GPUCrossValidatorMain.scala │ │ ├── GPUMain.scala │ │ └── Mortgage.scala │ │ ├── taxi │ │ ├── CPUCrossValidatorMain.scala │ │ ├── CPUMain.scala │ │ ├── GPUCrossValidatorMain.scala │ │ ├── GPUMain.scala │ │ └── Taxi.scala │ │ └── utility │ │ ├── Benchmark.scala │ │ ├── SparkSetup.scala │ │ ├── Vectorize.scala │ │ └── XGBoostArgs.scala └── notebooks │ ├── .gitignore │ ├── python │ ├── agaricus-gpu.ipynb │ ├── cv-mortgage-gpu.ipynb │ ├── mortgage-gpu.ipynb │ └── taxi-gpu.ipynb │ └── scala │ ├── agaricus-gpu.ipynb │ ├── mortgage-gpu-databricks.scala │ ├── mortgage-gpu.ipynb │ ├── mortgage_gpu_crossvalidation.ipynb │ ├── taxi-gpu.ipynb │ └── taxi_gpu_crossvalidation.ipynb ├── getting-started-guides ├── building-sample-apps │ ├── python.md │ └── scala.md ├── csp │ ├── aws │ │ ├── EMR_Mortgage_Example_G4dn.ipynb │ │ ├── adv_full_datasets.md │ │ ├── emr.md │ │ ├── pics │ │ │ ├── emr-cluster-details.png │ │ │ ├── emr-cluster-dns.png │ │ │ ├── emr-cluster-ssh.png │ │ │ ├── emr-cluster-waiting.png │ │ │ ├── emr-stdout.png │ │ │ ├── emr-step-four-security.png │ │ │ ├── emr-step-one-s3-copy.png │ │ │ ├── emr-step-one-software-and-steps.png │ │ │ ├── emr-step-three-general-cluster-settings.png │ │ │ ├── emr-step-two-hardware.png │ │ │ ├── emr-view-logs.png │ │ │ ├── sagemaker-config-move.png │ │ │ ├── sagemaker-config-updated.png │ │ │ ├── sagemaker-curl-output.png │ │ │ ├── sagemaker-info-output.png │ │ │ ├── sagemaker-jupyter-new.gif │ │ │ ├── sagemaker-kernel-restart.png │ │ │ ├── sagemaker-notebook-instance.png │ │ │ ├── sagemaker-output.png │ │ │ ├── sagemaker-permission.png │ │ │ └── sagemaker-tcp-port.png │ │ └── sagemaker.md │ ├── databricks │ │ ├── databricks.md │ │ ├── init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.3-5.4.ipynb │ │ ├── init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.5.ipynb │ │ └── xgb_python_gpu_perf_blog.ipynb │ └── gcp │ │ ├── gcp.md │ │ └── 
spark-gpu │ │ ├── README.md │ │ ├── internal │ │ ├── install-gpu-driver-debian.sh │ │ └── install-gpu-driver-ubuntu.sh │ │ └── rapids.sh ├── notebook │ ├── python-notebook.md │ └── toree.md └── on-prem-cluster │ ├── kubernetes.md │ ├── standalone-python.md │ ├── standalone-scala.md │ ├── yarn-python.md │ └── yarn-scala.md ├── gpu_executor_template.yaml └── tools ├── jupyter_gpu_count_estimation.ipynb └── jupyter_gpu_max_loadable_row.ipynb /Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | FROM nvidia/cuda:10.0-devel-ubuntu18.04 19 | ARG spark_uid=185 20 | 21 | # Install java dependencies 22 | RUN apt-get update && apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre 23 | ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 24 | ENV PATH $PATH:/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre/bin:/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin 25 | 26 | # Before building the docker image, first build and make a Spark distribution following 27 | # the instructions in http://spark.apache.org/docs/latest/building-spark.html. 28 | # If this docker file is being used in the context of building your images from a Spark 29 | # distribution, the docker build command should be invoked from the top level directory 30 | # of the Spark distribution. E.g.: 31 | # docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile . 
32 | 
33 | RUN set -ex && \
34 |     ln -s /lib /lib64 && \
35 |     mkdir -p /opt/spark && \
36 |     mkdir -p /opt/spark/examples && \
37 |     mkdir -p /opt/spark/work-dir && \
38 |     touch /opt/spark/RELEASE && \
39 |     rm /bin/sh && \
40 |     ln -sv /bin/bash /bin/sh && \
41 |     echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
42 |     chgrp root /etc/passwd && chmod ug+rw /etc/passwd
43 | 
44 | ENV DEBIAN_FRONTEND noninteractive
45 | RUN apt-get update && apt-get install -y --no-install-recommends apt-utils \
46 |     && apt-get install -y --no-install-recommends python libgomp1 \
47 |     && rm -rf /var/lib/apt/lists/*
48 | 
49 | COPY jars /opt/spark/jars
50 | COPY bin /opt/spark/bin
51 | COPY sbin /opt/spark/sbin
52 | COPY kubernetes/dockerfiles/spark/entrypoint.sh /opt/
53 | COPY examples /opt/spark/examples
54 | COPY kubernetes/tests /opt/spark/tests
55 | COPY data /opt/spark/data
56 | 
57 | ENV SPARK_HOME /opt/spark
58 | 
59 | WORKDIR /opt/spark/work-dir
60 | RUN chmod g+w /opt/spark/work-dir
61 | 
62 | ENV TINI_VERSION v0.18.0
63 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
64 | RUN chmod +rx /sbin/tini
65 | 
66 | ENTRYPOINT [ "/opt/entrypoint.sh" ]
67 | 
68 | # Specify the user that the actual main process will run as
69 | USER ${spark_uid}
70 | 
71 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Please note that this repo has been moved to the new repo [spark-xgboost-examples](https://github.com/NVIDIA/spark-xgboost-examples).
2 | 
3 | This repo provides docs and example applications that demonstrate the RAPIDS.ai GPU-accelerated XGBoost-Spark project.
4 | 
5 | ### Examples
6 | 
7 | - Mortgage: [Scala](/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage), [Python](/examples/apps/python/ai/rapids/spark/examples/mortgage)
8 | - Taxi: [Scala](/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi), [Python](/examples/apps/python/ai/rapids/spark/examples/taxi)
9 | - Agaricus: [Scala](/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/agaricus), [Python](/examples/apps/python/ai/rapids/spark/examples/agaricus)
10 | 
11 | ### Getting Started Guides
12 | 
13 | Try one of the Getting Started guides below. Please note that they target the Mortgage dataset as written, but with a few changes to `EXAMPLE_CLASS`, `trainDataPath`, and `evalDataPath`, they can be easily adapted to the Taxi or Agaricus datasets.
14 | 
15 | You can get small datasets for each example in the [datasets](/datasets) folder. These datasets are provided for convenience only; to test performance, please prepare a larger dataset by following [Preparing Datasets](/datasets/preparing_datasets.md). We also provide a larger dataset, the [Mortgage Dataset (1 GB uncompressed)](https://rapidsai-data.s3.us-east-2.amazonaws.com/spark/mortgage.zip), which is used in the guides below.
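If you just want to try an example quickly, the snippet below is a minimal sketch of fetching and unpacking one of the small datasets with plain Python; the URL is the raw GitHub link for `datasets/mortgage-small.tar.gz` that appears later in this listing, and the output directory is an arbitrary choice.

```python
import tarfile
import urllib.request

# Raw GitHub link for the small Mortgage dataset bundled with this repo
url = ('https://raw.githubusercontent.com/rapidsai/spark-examples/'
       '4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/datasets/mortgage-small.tar.gz')
archive = 'mortgage-small.tar.gz'

urllib.request.urlretrieve(url, archive)    # download the tarball
with tarfile.open(archive, 'r:gz') as tar:
    tar.extractall('data/mortgage-small')   # unpack the train/eval files
```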
16 | 
17 | - Building applications
18 |   - [Scala](/getting-started-guides/building-sample-apps/scala.md)
19 |   - [Python](/getting-started-guides/building-sample-apps/python.md)
20 | - Getting started on on-prem clusters
21 |   - [Standalone cluster for Scala](/getting-started-guides/on-prem-cluster/standalone-scala.md)
22 |   - [Standalone cluster for Python](/getting-started-guides/on-prem-cluster/standalone-python.md)
23 |   - [YARN for Scala](/getting-started-guides/on-prem-cluster/yarn-scala.md)
24 |   - [YARN for Python](/getting-started-guides/on-prem-cluster/yarn-python.md)
25 |   - [Kubernetes](/getting-started-guides/on-prem-cluster/kubernetes.md)
26 | - Getting started on cloud service providers
27 |   - Amazon AWS
28 |     - [EMR](/getting-started-guides/csp/aws/emr.md)
29 |     - [SageMaker](/getting-started-guides/csp/aws/sagemaker.md)
30 |   - [Databricks](/getting-started-guides/csp/databricks/databricks.md)
31 |   - [Google Cloud Platform](/getting-started-guides/csp/gcp/gcp.md)
32 | - Getting started for Jupyter Notebook applications
33 |   - [Apache Toree Notebook for Scala](/getting-started-guides/notebook/toree.md)
34 |   - [Jupyter Notebook for Python](/getting-started-guides/notebook/python-notebook.md)
35 | 
36 | These examples use default parameters for demo purposes. For a full list, please see the Supported XGBoost Parameters for [Scala](/examples/app-parameters/supported_xgboost_parameters_scala.md) or [Python](/examples/app-parameters/supported_xgboost_parameters_python.md).
37 | 
38 | ### XGBoost-Spark API
39 | 
40 | - [Scala API](/api-docs/scala.md)
41 | - [Python API](/api-docs/python.md)
42 | 
43 | ### Advanced Topics
44 | 
45 | - [Multi-GPU configuration](/advanced-topics/multi-gpu.md)
46 | - [Performance tuning](/advanced-topics/performance_tuning.md)
47 | 
48 | ### Contact Us
49 | 
50 | Please see the [RAPIDS](https://rapids.ai/community.html) website for contact information.
51 | 
52 | ### License
53 | 
54 | This content is licensed under the [Apache License 2.0](/LICENSE)
55 | 
-------------------------------------------------------------------------------- /advanced-topics/multi-gpu.md: --------------------------------------------------------------------------------
1 | 
2 | # Multi-GPU Configuration for XGBoost4J-Spark
3 | 
4 | This is an advanced guide on how to configure multiple GPUs on each Spark worker for running XGBoost4J-Spark applications.
5 | 
6 | ### General Rules
7 | 
8 | As a general rule, the number of Spark executors on each host must be less than or equal to the number of GPUs on that host. This ensures that each XGBoost Spark task can run on one GPU exclusively.
9 | 
10 | To enable this feature, *EXCLUSIVE_PROCESS* must be set for all GPUs on each host:
11 | 
12 | ```
13 | nvidia-smi -i [gpu index] -c EXCLUSIVE_PROCESS
14 | ```
15 | 
16 | For example:
17 | 
18 | ```
19 | nvidia-smi -i 0 -c EXCLUSIVE_PROCESS
20 | ```
21 | 
22 | sets *EXCLUSIVE_PROCESS* for GPU *0*.
23 | 
24 | ### Configuration for On-Prem Clusters
25 | 
26 | To set *EXCLUSIVE_PROCESS* for an on-prem cluster, please run the above *nvidia-smi* command for each GPU on each host before setting up the cluster. These commands can be added to a bootstrap script.
27 | 
28 | ### Configuration for Cloud Service Providers (CSP)
29 | 
30 | Each CSP has its own way to run a bootstrap script.
31 | 
32 | Below is an example of how to set *EXCLUSIVE_PROCESS* for AWS EMR:
33 | 
34 | - Create a cluster and select "go to advanced options".
35 | - At "Step 3: General Cluster Settings", add a bootstrap action to set up GPU exclusive mode.
36 | ![Advanced Options](pics/multi-gpu-advanced-options.png)
37 | - Create a shell script file and upload it to S3 (emr_gpu_set.sh).
38 | ```
39 | #!/bin/bash
40 | # Set exclusive mode for every GPU on the instance
41 | for i in $(seq 0 $(($(nvidia-smi -L | wc -l) - 1))); do
42 |   sudo nvidia-smi -i $i -c EXCLUSIVE_PROCESS
43 | done
44 | ```
45 | ![S3](pics/multi-gpu-s3.png)
46 | - In "Additional Options", select "Custom Action" to add a bootstrap action, choose "from S3 location", and select the script file (emr_gpu_set.sh). The script will be executed when EMR launches each instance.
47 | ![Bootstrap Script](pics/multi-gpu-bootstrap.png)
-------------------------------------------------------------------------------- /advanced-topics/performance_tuning.md: --------------------------------------------------------------------------------
1 | # Performance tuning for XGBoost4J-Spark
2 | 
3 | This is an advanced guide on how to tune the chunk size to achieve the best performance.
4 | 
5 | ## Chunk size
6 | 
7 | Chunked reading and incremental DMatrix building have been supported since the 0.2 release, and the DMatrix is now staged in CPU memory instead of GPU memory. This means that, by tuning the chunk size, XGBoost can load a dataset of **any size**, as long as there is enough CPU memory.
8 | 
9 | Currently, the chunk size is controlled by three values: the file size, the max partition size, and maxRowsPerChunk.
10 | 
11 | - maxRowsPerChunk
12 | 
13 |   The granularity of maxRowsPerChunk is measured in rows, and it defaults to Integer.MAX_VALUE. Because it is hard to estimate the in-memory size of a given number of rows, we suggest **leaving this value at its default**.
14 | 
15 | - max partition size
16 | 
17 |   The max partition size is controlled by `spark.sql.files.maxPartitionBytes`. Please refer to [this page](https://spark.apache.org/docs/latest/sql-performance-tuning.html).
18 | 
19 | With maxRowsPerChunk left at its default, the chunk size is equal to min{file size, max partition size}.
20 | 
21 | ### Benchmark
22 | 
23 | We ran several rounds of benchmarks against different max partition sizes.
24 | 
25 | Given the files below:
26 | ```
27 | 1.6G 2010_1.csv
28 | 1.6G 2010_2.csv
29 | 1.5G 2010_3.csv
30 | 1.3G 2010_4.csv
31 | 2.6G 2011_1.csv
32 | 1.3G 2011_2.csv
33 | 1.5G 2011_3.csv
34 | 2.8G 2011_4.csv
35 | 970M 2014_1.csv
36 | 822M 2016_1.csv
37 | 963M 2016_2.csv
38 | 978M 2016_3.csv
39 | 887M 2016_4.csv
40 | 19G total
41 | ```
42 | 
43 | ![performance benchmark](pics/performance_benchmark.png)
44 | 
45 | From this we can see that the best performance is achieved when the max partition size is greater than the largest file size.
46 | 
47 | However, there is a peak memory limitation when building the DMatrix for a single chunk: the chunk size should be less than TOTAL_GPU_MEMORY/3.
48 | 
49 | > In summary, when each file size is equal to the max partition size, and the max partition size is configured to TOTAL_GPU_MEMORY/3, XGBoost can achieve the best performance.
50 | 
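To make the rule of thumb concrete, here is a minimal PySpark sketch; the 16 GB figure is an assumed GPU memory size rather than a recommendation, so substitute the memory of your own device:

```python
from pyspark.sql import SparkSession

# Assumed hardware: a GPU with 16 GB of memory; adjust to your device
total_gpu_memory = 16 * 1024 ** 3
max_partition_bytes = total_gpu_memory // 3   # TOTAL_GPU_MEMORY/3 rule above

spark = (SparkSession.builder
    .appName('chunk-size-tuning')
    .config('spark.sql.files.maxPartitionBytes', str(max_partition_bytes))
    .getOrCreate())
```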
51 | ### How to use
52 | 
53 | Alternatively, you can configure the max partition size when you submit your application:
54 | 
55 | ```
56 | spark-submit --conf spark.sql.files.maxPartitionBytes=XXXXXXX
57 | ```
-------------------------------------------------------------------------------- /advanced-topics/pics/multi-gpu-advanced-options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/multi-gpu-advanced-options.png
-------------------------------------------------------------------------------- /advanced-topics/pics/multi-gpu-bootstrap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/multi-gpu-bootstrap.png
-------------------------------------------------------------------------------- /advanced-topics/pics/multi-gpu-s3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/multi-gpu-s3.png
-------------------------------------------------------------------------------- /advanced-topics/pics/performance_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/performance_benchmark.png
-------------------------------------------------------------------------------- /api-docs/python.md: --------------------------------------------------------------------------------
1 | # Python API for XGBoost-Spark
2 | 
3 | This doc focuses on the GPU-related Python API. Seven new classes are introduced:
4 | 
5 | - [CrossValidator](#crossvalidator)
6 | - [GpuDataset](#gpudataset)
7 | - [GpuDataReader](#gpudatareader)
8 | - [XGBoostClassifier](#xgboostclassifier)
9 | - [XGBoostClassificationModel](#xgboostclassificationmodel)
10 | - [XGBoostRegressor](#xgboostregressor)
11 | - [XGBoostRegressionModel](#xgboostregressionmodel)
12 | 
13 | ### CrossValidator
14 | 
15 | The full name is `ml.dmlc.xgboost4j.scala.spark.rapids.CrossValidator`, and it is a wrapper around [Scala CrossValidator](scala.md#crossvalidator).
16 | 
17 | ##### Constructors
18 | 
19 | + CrossValidator()
20 | 
21 | ##### Methods
22 | 
23 | *Note: Only GPU related methods are listed below.*
24 | 
25 | + fit(dataset): This method triggers the cross validation for hyperparameter tuning.
26 |   + dataset: a [GpuDataset](#gpudataset) used for cross validation
27 |   + returns the best [Model](https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.Model)[\_] for the given hyperparameters.
28 |   + Note: For the CPU version, you can still call `fit` by passing a [Dataset](https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.sql.Dataset).
29 | 
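Taken together, a typical invocation looks like the sketch below, which mirrors the cross-validator sample apps included later in this repo; `label`, `features`, and `train_data` (a GpuDataset, described below) are assumed to be defined elsewhere, and the grid values are illustrative only.

```python
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder

classifier = (XGBoostClassifier(objective='binary:logistic')
    .setLabelCol(label)
    .setFeaturesCols(features))
evaluator = MulticlassClassificationEvaluator().setLabelCol(label)
param_grid = (ParamGridBuilder()
    .addGrid(classifier.maxDepth, [5, 10])
    .addGrid(classifier.numRound, [100, 200])
    .build())
cross_validator = (CrossValidator()
    .setEstimator(classifier)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(param_grid)
    .setNumFolds(3))

best_model = cross_validator.fit(train_data)  # train_data is a GpuDataset
```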
30 | ### GpuDataset
31 | 
32 | The full name is `ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset`. A GpuDataset is an object that is produced by [GpuDataReader](#gpudatareader)s and consumed by [XGBoostClassifier](#xgboostclassifier)s and [XGBoostRegressor](#xgboostregressor)s. No constructors or methods are exposed for this class.
33 | 
34 | ### GpuDataReader
35 | 
36 | The full name is `ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader`. A GpuDataReader sets options and builds a [GpuDataset](#gpudataset) from data sources. Data loading is a lazy operation: it occurs when the data is processed later.
37 | 
38 | ##### Constructors
39 | 
40 | + GpuDataReader(spark_session)
41 |   + spark_session: a [SparkSession](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=sparksession#pyspark.sql.SparkSession) for data loading
42 | 
43 | ##### Methods
44 | 
45 | + format(source): This method sets the data format. Valid values include *csv*, *parquet* and *orc*.
46 |   + source: a String representing the data format to set
47 |   + returns the data reader itself
48 | + schema(schema): This method sets the data schema.
49 |   + schema: the data schema, either in [StructType](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=sparksession#pyspark.sql.types.StructType) format or as a DDL-formatted String (e.g., *a INT, b STRING, c DOUBLE*)
50 |   + returns the data reader itself
51 | + option(key, value): This method sets an option.
52 |   + key: a String representing the option key
53 |   + value: the option value; valid types include *Boolean*, *Integer*, *Float* and *String*
54 |   + returns the data reader itself
55 | + options(options): This method sets multiple options at once.
56 |   + options: an option Dictionary[String, String]
57 |   + returns the data reader itself
58 | + load(\*paths): This method builds a [GpuDataset](#gpudataset).
59 |   + paths: the data source paths; may be empty, one path, or a list of paths
60 |   + returns a [GpuDataset](#gpudataset) as the result
61 | + csv(\*paths): This method builds a [GpuDataset](#gpudataset).
62 |   + paths: the CSV data paths; may be one path or a list of paths
63 |   + returns a [GpuDataset](#gpudataset) as the result
64 | + parquet(\*paths): This method builds a [GpuDataset](#gpudataset).
65 |   + paths: the Parquet data paths; may be one path or a list of paths
66 |   + returns a [GpuDataset](#gpudataset) as the result
67 | + orc(\*paths): This method builds a [GpuDataset](#gpudataset).
68 |   + paths: the ORC data paths; may be one path or a list of paths
69 |   + returns a [GpuDataset](#gpudataset) as the result
70 | 
71 | ##### Options
72 | 
73 | + Common options
74 |   + asFloats: A Boolean flag indicating whether to cast all numeric values to floats. Default is True.
75 |   + maxRowsPerChunk: An Integer specifying the max rows per chunk. Default is 2147483647 (2^31-1).
76 | + Options for CSV
77 |   + comment: A single character used for skipping lines beginning with this character. Default is empty string, which disables skipping.
78 |   + header: A Boolean flag indicating whether the first line should be used as names of columns. Default is False.
79 |   + nullValue: The string representation of a null (None) value. Default is empty string.
80 |   + quote: A single character used for escaping quoted values where the separator can be part of the value. Default is `"`.
81 |   + sep: A single character used as a separator between adjacent values. Default is `,`.
82 | 
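As a usage example, the sketch below builds a GpuDataset from CSV files; the schema and path are placeholders for your own data, `spark` is an existing SparkSession, and the option values simply restate the defaults documented above:

```python
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader

# Placeholder schema and path: substitute your dataset's actual layout
train_data = (GpuDataReader(spark)
    .format('csv')
    .schema('label INT, f0 FLOAT, f1 FLOAT')  # DDL-formatted schema string
    .option('header', False)
    .option('asFloats', True)
    .load('/data/train'))                     # lazily returns a GpuDataset
```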
83 | ### XGBoostClassifier
84 | 
85 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier`. It is a wrapper around [Scala XGBoostClassifier](scala.md#xgboostclassifier).
86 | 
87 | ##### Constructors
88 | 
89 | + XGBoostClassifier(\*\*params)
90 |   + all [standard xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported, but please note a few differences:
91 |     + only camelCase is supported when specifying parameter names, e.g., *maxDepth*
92 |     + parameter *lambda* is renamed to *lambda_*, because *lambda* is a keyword in Python
93 | 
94 | ##### Methods
95 | 
96 | *Note: Only GPU related methods are listed below.*
97 | 
98 | + setFeaturesCols(features_cols): This method sets the feature columns for training.
99 |   + features_cols: a list of feature column names in String format to set
100 |   + returns the classifier itself
101 | + setEvalSets(eval_sets): This method sets eval sets for training.
102 |   + eval_sets: eval sets of type Dictionary[String, [GpuDataset](#gpudataset)] for training (For CPU training, the type is Dictionary[String, [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)])
103 |   + returns the classifier itself
104 | + fit(dataset): This method triggers the training.
105 |   + dataset: a [GpuDataset](#gpudataset) to train
106 |   + returns the training result as an [XGBoostClassificationModel](#xgboostclassificationmodel)
107 |   + Note: For CPU training, you can still call fit to train a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)
108 | 
109 | ### XGBoostClassificationModel
110 | 
111 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel`. It is a wrapper around [Scala XGBoostClassificationModel](scala.md#xgboostclassificationmodel).
112 | 
113 | ##### Methods
114 | 
115 | *Note: Only GPU related methods are listed below.*
116 | 
117 | + transform(dataset): This method predicts results based on the model.
118 |   + dataset: a [GpuDataset](#gpudataset) to run prediction on
119 |   + returns a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) with the prediction
120 | 
121 | ### XGBoostRegressor
122 | 
123 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor`. It is a wrapper around [Scala XGBoostRegressor](scala.md#xgboostregressor).
124 | 
125 | ##### Constructors
126 | 
127 | + XGBoostRegressor(\*\*params)
128 |   + all [standard xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported, but please note a few differences:
129 |     + only camelCase is supported when specifying parameter names, e.g., *maxDepth*
130 |     + parameter *lambda* is renamed to *lambda_*, because *lambda* is a keyword in Python
131 | 
132 | ##### Methods
133 | 
134 | *Note: Only GPU related methods are listed below.*
135 | 
136 | + setFeaturesCols(features_cols): This method sets the feature columns for training.
137 |   + features_cols: a list of feature column names in String format to set
138 |   + returns the regressor itself
139 | + setEvalSets(eval_sets): This method sets eval sets for training.
140 |   + eval_sets: eval sets of type Dictionary[String, [GpuDataset](#gpudataset)] for training (For CPU training, the type is Dictionary[String, [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)])
141 |   + returns the regressor itself
142 | + fit(dataset): This method triggers the training.
143 | + dataset: a [GpuDataset](#gpudataset) to train 144 | + returns the training result as a [XGBoostRegressionModel](#xgboostregressionmodel) 145 | + Note: For CPU training, you can still call fit to train a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) 146 | 147 | ### XGBoostRegressionModel 148 | 149 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel`. It is a wrapper around [Scala XGBoostRegressionModel](scala.md#xgboostregressionmodel). 150 | 151 | ##### Methods 152 | 153 | *Note: Only GPU related methods are listed below.* 154 | 155 | + transform(dataset:): This method predicts results based on the model. 156 | + dataset: a [GpuDataset](#gpudataset) to predicate 157 | + returns a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) with the prediction 158 | -------------------------------------------------------------------------------- /datasets/ETL/Taxi_ETL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import cudf\n", 10 | "import numpy as np\n", 11 | "from numba import cuda\n", 12 | "import math" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "PERCENT_TRAIN = 0.8" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#Numba Kernel to calculate Haversine distance\n", 31 | "@cuda.jit\n", 32 | "def haversine_kernel(lat1, lon1, lat2, lon2, outputCol):\n", 33 | " iRow = cuda.grid(1)\n", 34 | " p = 0.017453292519943295 # Pi/180\n", 35 | " if iRow < outputCol.size:\n", 36 | " a = 0.5 - math.cos((lat2[iRow] - lat1[iRow]) * p)/2 + math.cos(lat1[iRow] * p) * \\\n", 37 | " math.cos(lat2[iRow] * p) * (1 - math.cos((lon2[iRow] - lon1[iRow]) * p)) / 2 \n", 38 | " outputCol[iRow] = 12734 * math.asin(math.sqrt(a))\n", 39 | " \n", 40 | "def haversine_distance(gdf):\n", 41 | " nRows = gdf.shape[0]\n", 42 | " blockSize = 128\n", 43 | " blockCount = nRows // blockSize + 1\n", 44 | " lat1_arr = gdf['pickup_latitude'].to_gpu_array()\n", 45 | " lon1_arr = gdf['pickup_longitude'].to_gpu_array()\n", 46 | " lat2_arr = gdf['dropoff_latitude'].to_gpu_array()\n", 47 | " lon2_arr = gdf['dropoff_longitude'].to_gpu_array()\n", 48 | " \n", 49 | " outputCol = cuda.device_array ( shape=(nRows), dtype=lat1_arr.dtype.name)\n", 50 | " \n", 51 | " haversine_kernel[(blockCount),(blockSize)](lat1_arr, lon1_arr, lat2_arr, lon2_arr, outputCol)\n", 52 | " gdf.add_column(name='h_distance', data = outputCol)\n", 53 | " return gdf\n", 54 | "\n", 55 | "#Numba Kernel to calculate day of the week from Date\n", 56 | "@cuda.jit\n", 57 | "def day_of_the_week_kernel(output ,year, month, day):\n", 58 | " iRow = cuda.grid(1)\n", 59 | " if iRow < output.size:\n", 60 | " year[iRow] -= month[iRow] < 3\n", 61 | " month[iRow] = (month[iRow] + 9)%12 + 1\n", 62 | " output[iRow] = (year[iRow] + int(year[iRow]/4) - int(year[iRow]/100) + int(year[iRow]/400) + math.floor(2.6*month[iRow] - 0.2) + day[iRow] -1) % 7\n", 63 | " \n", 64 | "def day_of_week(gdf):\n", 65 | " nRows = gdf.shape[0]\n", 66 | " blockSize = 128\n", 67 | " blockCount = nRows // blockSize + 1\n", 68 | " year_arr = gdf['year'].to_gpu_array()\n", 69 | " month_arr = gdf['month'].to_gpu_array()\n", 70 | 
" day_arr = gdf['day'].to_gpu_array()\n", 71 | " outputCol = cuda.device_array ( shape=(nRows), dtype=day_arr.dtype.name)\n", 72 | " \n", 73 | " day_of_the_week_kernel[(blockCount),(blockSize)](outputCol, year_arr, month_arr, day_arr)\n", 74 | " gdf.add_column(name='day_of_week', data = outputCol)\n", 75 | " gdf['day_of_week'] = gdf['day_of_week'].astype('float32')\n", 76 | " return gdf\n", 77 | " \n", 78 | "import pandas as pd\n", 79 | "def gpu_read_csv(file_path):\n", 80 | " names = ['vendor_id','pickup_datetime','dropoff_datetime','passenger_count','trip_distance','pickup_longitude',\n", 81 | " 'pickup_latitude','rate_code','store_and_fwd','dropoff_longitude','dropoff_latitude','payment_type',\n", 82 | " 'fare_amount','surcharge','mta_tax','tip_amount','tolls_amount','total_amount']\n", 83 | " \n", 84 | " dtypes = ['category','date','date','int','float64','float64','float64','category','category','float64','float64',\n", 85 | " 'category','float64','float64','float64','float64','float64','float64']\n", 86 | "\n", 87 | " df = cudf.read_csv(file_path, dtype=dtypes, names=names,skiprows=1)\n", 88 | " return df\n", 89 | "\n", 90 | "def null_workaround(df, **kwargs):\n", 91 | " for column, data_type in df.dtypes.items():\n", 92 | " if str(data_type) in ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']:\n", 93 | " df[column] = df[column].fillna(-1)\n", 94 | " return df\n", 95 | "\n", 96 | "def clean_data(df):\n", 97 | " drop_list = [\n", 98 | " 'dropoff_datetime', 'payment_type', 'surcharge', 'mta_tax',\n", 99 | " 'tip_amount', 'tolls_amount', 'total_amount'\n", 100 | " ]\n", 101 | "\n", 102 | " for column in drop_list:\n", 103 | " df.drop_column(column)\n", 104 | " \n", 105 | " df = null_workaround(df)\n", 106 | " \n", 107 | " df_fare = df.query('fare_amount > 0 and fare_amount < 500')\n", 108 | " del(df)\n", 109 | " \n", 110 | " df_pass = df_fare.query('passenger_count > 0 and passenger_count < 6')\n", 111 | " del(df_fare)\n", 112 | " \n", 113 | " df_picklong = df_pass.query('pickup_longitude > -75 and pickup_longitude < -73')\n", 114 | " del(df_pass)\n", 115 | " \n", 116 | " df_droplong = df_picklong.query('dropoff_longitude > -75 and dropoff_longitude < -73')\n", 117 | " del(df_picklong)\n", 118 | " \n", 119 | " df_picklat = df_droplong.query('pickup_latitude > 40 and pickup_latitude < 42')\n", 120 | " del(df_droplong)\n", 121 | " \n", 122 | " df_droplat = df_picklat.query('dropoff_latitude > 40 and dropoff_latitude < 42')\n", 123 | " del(df_picklat)\n", 124 | " \n", 125 | " return df_droplat\n", 126 | " \n", 127 | "def add_features(df):\n", 128 | " df['hour'] = df['pickup_datetime'].dt.hour\n", 129 | " df['year'] = df['pickup_datetime'].dt.year\n", 130 | " df['month'] = df['pickup_datetime'].dt.month\n", 131 | " df['day'] = df['pickup_datetime'].dt.day\n", 132 | " \n", 133 | " df.drop_column('pickup_datetime')\n", 134 | " \n", 135 | " df = day_of_week(df)\n", 136 | " df['is_weekend'] = (df['day_of_week']/4).floor()\n", 137 | " df = haversine_distance(df)\n", 138 | " return df\n", 139 | " \n", 140 | "\n", 141 | "def process_data(train_path):\n", 142 | " df = gpu_read_csv(train_path)\n", 143 | " df = clean_data(df)\n", 144 | " df = add_features(df)\n", 145 | " return df" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "month = 1\n", 155 | "start = 2016\n", 156 | "end = 2016\n", 157 | "year = start\n", 158 | "\n", 159 | "# INPUT DIRECTORY (change this to point to where you 
downloaded the dataset)\n",
160 | "INPUT_DIRECTORY = \"/datasets/test/raw/taxi\"\n",
161 | "\n",
162 | "# OUTPUT DIRECTORY (change this to point to where you want the processed dataset output)\n",
163 | "OUTPUT_DIRECTORY = \"/datasets/test/taxi\"\n",
164 | "\n",
165 | "while year <= end:\n",
166 | "    current_part_path = INPUT_DIRECTORY + \"/yellow_tripdata_\" + str(year) + \"-\" + f\"{month:02d}\" + \".csv\"\n",
167 | "    \n",
168 | "    train_part_path_pq = OUTPUT_DIRECTORY + \"/parquet/train/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".parquet\" \n",
169 | "    test_part_path_pq = OUTPUT_DIRECTORY + \"/parquet/test/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".parquet\"\n",
170 | "    \n",
171 | "    train_part_path_csv = OUTPUT_DIRECTORY + \"/csv/train/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".csv\" \n",
172 | "    test_part_path_csv = OUTPUT_DIRECTORY + \"/csv/test/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".csv\"\n",
173 | "    \n",
174 | "    print(current_part_path)\n",
175 | "    df = process_data(current_part_path)\n",
176 | "    month += 1\n",
177 | "    \n",
178 | "    msk = np.random.rand(len(df)) < PERCENT_TRAIN\n",
179 | "    \n",
180 | "    train = df[msk]\n",
181 | "    test = df[~msk]\n",
182 | "    \n",
183 | "    print(train.shape)\n",
184 | "    print(test.shape)\n",
185 | "    \n",
186 | "    train.to_parquet(train_part_path_pq)\n",
187 | "    test.to_parquet(test_part_path_pq)\n",
188 | "    \n",
189 | "    train.to_pandas().to_csv(train_part_path_csv, header=False)\n",
190 | "    test.to_pandas().to_csv(test_part_path_csv, header=False)\n",
191 | "    \n",
192 | "    del train\n",
193 | "    del test\n",
194 | "    del df\n",
195 | "    \n",
196 | "    if month > 12:\n",
197 | "        month = 1\n",
198 | "        year += 1\n",
199 | "    "
200 | ]
201 | }
202 | ],
203 | "metadata": {
204 | "kernelspec": {
205 | "display_name": "Python 3",
206 | "language": "python",
207 | "name": "python3"
208 | },
209 | "language_info": {
210 | "codemirror_mode": {
211 | "name": "ipython",
212 | "version": 3
213 | },
214 | "file_extension": ".py",
215 | "mimetype": "text/x-python",
216 | "name": "python",
217 | "nbconvert_exporter": "python",
218 | "pygments_lexer": "ipython3",
219 | "version": "3.6.7"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 2
224 | }
225 | 
-------------------------------------------------------------------------------- /datasets/agaricus.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/datasets/agaricus.tar.gz
-------------------------------------------------------------------------------- /datasets/mortgage-small.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/datasets/mortgage-small.tar.gz
-------------------------------------------------------------------------------- /datasets/preparing_datasets.md: --------------------------------------------------------------------------------
1 | Preparing Datasets
2 | ==================
3 | 
4 | ## Mortgage Example
5 | 1. Set up an [Apache Toree Jupyter notebook](/getting-started-guides/notebook/toree.md).
6 | 2. Download the raw data from: https://rapidsai.github.io/demos/datasets/mortgage-data
7 | 3. Run the [Mortgage ETL job](/datasets/ETL/MortgageETL.ipynb).
8 | 
9 | ## Taxi Example
10 | 1. Set up an [Apache Toree Jupyter notebook](/getting-started-guides/notebook/toree.md).
11 | 2. Install `cudatoolkit` and `numba` (a `conda` example is provided, but you can also use `pip`):
12 | ```
13 | conda install numba
14 | conda install cudatoolkit
15 | ```
16 | 3. Download the raw data:
17 | ```
18 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_20{09..16}-{01..12}.csv
19 | ```
20 | 4. Run the [Taxi ETL job](/datasets/ETL/Taxi_ETL.ipynb).
21 | 
-------------------------------------------------------------------------------- /datasets/taxi-small.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/datasets/taxi-small.tar.gz
-------------------------------------------------------------------------------- /examples/app-parameters/supported_xgboost_parameters_python.md: --------------------------------------------------------------------------------
1 | Supported XGBoost Parameters
2 | ============================
3 | 
4 | This is a description of all the parameters available when you are running the examples in this repo:
5 | 
6 | 1. All [xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported.
7 |    * Please use camelCase, e.g., `--treeMethod=gpu_hist`.
8 |    * `lambda` is replaced with `lambda_`, because `lambda` is a keyword in Python.
9 | 2. `--format=[csv|parquet|orc]`: The format of the data for training/transforming; 'csv', 'parquet' and 'orc' are supported. *Required*.
10 | 3. `--mode=[all|train|transform]`: Controls the behavior of the sample app; the default is 'all' if not specified.
11 |    * all: Do both training and transforming; the model is saved to 'modelPath' if specified.
12 |    * train: Do training only; the model is saved to 'modelPath' if specified.
13 |    * transform: Do transforming only; 'modelPath' is required to locate the model data to be loaded.
14 | 4. `--trainDataPath=[path]`: Path to your training data file(s); required when mode is NOT 'transform'.
15 | 5. `--trainEvalDataPath=[path]`: Path to your data file(s) for training with evaluation. Optional.
16 | 6. `--evalDataPath=[path]`: Path to your test (evaluation) data file(s); required when mode is NOT 'train'.
17 | 7. `--modelPath=[path]`: Path to save the model after training, or where to load the model from when transforming only. Required only when mode is 'transform'.
18 | 8. `--overwrite=[true|false]`: Whether to overwrite the current model data under 'modelPath'. Default is false. You may need to set this to true to avoid an IOException when saving the model to a path that already exists.
19 | 9. `--hasHeader=[true|false]`: Indicates whether your CSV file has a header.
20 | 10. `--asFloats=[true|false]`: Whether to cast all numeric values to floats. Default is true.
21 | 11. `--maxRowsPerChunk=[value]`: Maximum number of rows to read per chunk. Default is 2147483647.
22 | 
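To illustrate the `--modelPath` and `--overwrite` parameters, here is a minimal sketch of the save-and-reload pattern the sample apps use; `model`, `eval_data`, and the path are placeholders standing in for a trained model, a prepared dataset, and your own `--modelPath` value.

```python
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel

model_path = '/tmp/models/mortgage'   # stands in for --modelPath

# --overwrite=true: replace any model data already stored at the path
model.write().overwrite().save(model_path)

# --mode=transform: reload the saved model instead of training again
loaded = XGBoostClassificationModel().load(model_path)
predictions = loaded.transform(eval_data)
```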
-------------------------------------------------------------------------------- /examples/app-parameters/supported_xgboost_parameters_scala.md: --------------------------------------------------------------------------------
1 | Supported XGBoost Parameters
2 | ============================
3 | 
4 | This is a description of all the parameters available when you are running the examples in this repo:
5 | 
6 | 1. All [xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported.
7 | 2. `-format=[csv|parquet|orc]`: The format of the data for training/transforming; 'csv', 'parquet' and 'orc' are supported. *Required*.
8 | 3. `-mode=[all|train|transform]`: Controls the behavior of the sample app; the default is 'all' if not specified.
9 |    * all: Do both training and transforming; the model is saved to 'modelPath' if specified.
10 |    * train: Do training only; the model is saved to 'modelPath' if specified.
11 |    * transform: Do transforming only; 'modelPath' is required to locate the model data to be loaded.
12 | 4. `-trainDataPath=[path]`: Path to your training data file(s); required when mode is NOT 'transform'.
13 | 5. `-trainEvalDataPath=[path]`: Path to your data file(s) for training with evaluation. Optional.
14 | 6. `-evalDataPath=[path]`: Path to your test (evaluation) data file(s); required when mode is NOT 'train'.
15 | 7. `-modelPath=[path]`: Path to save the model after training, or where to load the model from when transforming only. Required only when mode is 'transform'.
16 | 8. `-overwrite=[true|false]`: Whether to overwrite the current model data under 'modelPath'. Default is false. You may need to set this to true to avoid an IOException when saving the model to a path that already exists.
17 | 9. `-hasHeader=[true|false]`: Indicates whether your CSV file has a header.
18 | 10. `-asFloats=[true|false]`: Whether to cast all numeric values to floats. Default is true.
19 | 11. `-maxRowsPerChunk=[value]`: Maximum number of rows to read per chunk. Default is Integer.MAX_VALUE.
20 | 
-------------------------------------------------------------------------------- /examples/apps/python/.gitignore: --------------------------------------------------------------------------------
1 | samples.zip
2 | 
-------------------------------------------------------------------------------- /examples/apps/python/ai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/__init__.py
-------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/__init__.py
-------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/__init__.py
-------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/__init__.py
-------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/agaricus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/agaricus/__init__.py
-------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/agaricus/consts.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from pyspark.sql.types import * 18 | 19 | label = 'label' 20 | features = [ 'feature_' + str(i) for i in range(0, 126) ] 21 | schema = StructType([ StructField(x, FloatType()) for x in [label] + features ]) 22 | 23 | default_params = { 24 | 'eta': 0.1, 25 | 'missing': 0.0, 26 | 'maxDepth': 2, 27 | 'numWorkers': 1, 28 | } 29 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/agaricus/cpu_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.agaricus.consts import * 17 | from ai.rapids.spark.examples.utility.args import parse_arguments 18 | from ai.rapids.spark.examples.utility.utils import * 19 | from ml.dmlc.xgboost4j.scala.spark import * 20 | from pyspark.sql import SparkSession 21 | 22 | def main(args, xgboost_args): 23 | spark = (SparkSession 24 | .builder 25 | .appName(args.mainClass) 26 | .getOrCreate()) 27 | 28 | def prepare_data(path): 29 | reader = spark.read.format(args.format) 30 | if args.format == 'csv': 31 | reader.schema(schema).option('header', args.hasHeader) 32 | return vectorize(reader.load(path), label) 33 | 34 | if args.mode in [ 'all', 'train' ]: 35 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) 36 | .setLabelCol(label) 37 | .setFeaturesCol('features')) 38 | 39 | if args.trainEvalDataPath: 40 | train_eval_data = prepare_data(args.trainEvalDataPath) 41 | classifier.setEvalSets({ 'test': train_eval_data }) 42 | 43 | train_data = prepare_data(args.trainDataPath) 44 | model = with_benchmark('Training', lambda: classifier.fit(train_data)) 45 | 46 | if args.modelPath: 47 | writer = model.write().overwrite() if args.overwrite else model 48 | writer.save(args.modelPath) 49 | else: 50 | model = XGBoostClassificationModel().load(args.modelPath) 51 | 52 | if args.mode in [ 'all', 'transform' ]: 53 | eval_data = prepare_data(args.evalDataPath) 54 | 55 | def transform(): 56 | result = model.transform(eval_data).cache() 57 | result.foreachPartition(lambda _: None) 58 | return result 59 | 60 | result = with_benchmark('Transformation', transform) 61 | show_sample(args, result, label) 62 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) 63 | 64 | spark.stop() 65 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/agaricus/gpu_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.agaricus.consts import * 17 | from ai.rapids.spark.examples.utility.args import parse_arguments 18 | from ai.rapids.spark.examples.utility.utils import * 19 | from ml.dmlc.xgboost4j.scala.spark import * 20 | from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader 21 | from pyspark.sql import SparkSession 22 | 23 | def main(args, xgboost_args): 24 | spark = (SparkSession 25 | .builder 26 | .appName(args.mainClass) 27 | .getOrCreate()) 28 | 29 | def prepare_data(path): 30 | reader = (GpuDataReader(spark) 31 | .format(args.format) 32 | .option('asFloats', args.asFloats) 33 | .option('maxRowsPerChunk', args.maxRowsPerChunk)) 34 | if args.format == 'csv': 35 | reader.schema(schema).option('header', args.hasHeader) 36 | return reader.load(path) 37 | 38 | if args.mode in [ 'all', 'train' ]: 39 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) 40 | .setLabelCol(label) 41 | .setFeaturesCols(features)) 42 | 43 | if args.trainEvalDataPath: 44 | train_eval_data = prepare_data(args.trainEvalDataPath) 45 | classifier.setEvalSets({ 'test': train_eval_data }) 46 | 47 | train_data = prepare_data(args.trainDataPath) 48 | model = with_benchmark('Training', lambda: classifier.fit(train_data)) 49 | 50 | if args.modelPath: 51 | writer = model.write().overwrite() if args.overwrite else model 52 | writer.save(args.modelPath) 53 | else: 54 | model = XGBoostClassificationModel().load(args.modelPath) 55 | 56 | if args.mode in [ 'all', 'transform' ]: 57 | eval_data = prepare_data(args.evalDataPath) 58 | 59 | def transform(): 60 | result = model.transform(eval_data).cache() 61 | result.foreachPartition(lambda _: None) 62 | return result 63 | 64 | result = with_benchmark('Transformation', transform) 65 | show_sample(args, result, label) 66 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) 67 | 68 | spark.stop() 69 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.utility.args import parse_arguments 17 | from importlib import import_module 18 | 19 | def main(): 20 | args, xgboost_args = parse_arguments() 21 | getattr(import_module(args.mainClass), 'main')(args, xgboost_args) 22 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/mortgage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/mortgage/__init__.py -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/mortgage/consts.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from pyspark.sql.types import * 18 | 19 | label = 'delinquency_12' 20 | 21 | schema = StructType([ 22 | StructField('orig_channel', FloatType()), 23 | StructField('first_home_buyer', FloatType()), 24 | StructField('loan_purpose', FloatType()), 25 | StructField('property_type', FloatType()), 26 | StructField('occupancy_status', FloatType()), 27 | StructField('property_state', FloatType()), 28 | StructField('product_type', FloatType()), 29 | StructField('relocation_mortgage_indicator', FloatType()), 30 | StructField('seller_name', FloatType()), 31 | StructField('mod_flag', FloatType()), 32 | StructField('orig_interest_rate', FloatType()), 33 | StructField('orig_upb', IntegerType()), 34 | StructField('orig_loan_term', IntegerType()), 35 | StructField('orig_ltv', FloatType()), 36 | StructField('orig_cltv', FloatType()), 37 | StructField('num_borrowers', FloatType()), 38 | StructField('dti', FloatType()), 39 | StructField('borrower_credit_score', FloatType()), 40 | StructField('num_units', IntegerType()), 41 | StructField('zip', IntegerType()), 42 | StructField('mortgage_insurance_percent', FloatType()), 43 | StructField('current_loan_delinquency_status', IntegerType()), 44 | StructField('current_actual_upb', FloatType()), 45 | StructField('interest_rate', FloatType()), 46 | StructField('loan_age', FloatType()), 47 | StructField('msa', FloatType()), 48 | StructField('non_interest_bearing_upb', FloatType()), 49 | StructField(label, IntegerType()), 50 | ]) 51 | 52 | features = [ x.name for x in schema if x.name != label ] 53 | 54 | default_params = { 55 | 'eta': 0.1, 56 | 'gamma': 0.1, 57 | 'missing': 0.0, 58 | 'maxDepth': 10, 59 | 'maxLeaves': 256, 60 | 'growPolicy': 'depthwise', 61 | 'objective': 'binary:logistic', 62 | 'minChildWeight': 30.0, 63 | 'lambda_': 1.0, 64 | 'scalePosWeight': 2.0, 65 | 'subsample': 1.0, 66 | 'nthread': 1, 67 | 'numRound': 100, 68 | 'numWorkers': 1, 69 | } 70 | 
-------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/mortgage/cpu_cross_validator_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | from ai.rapids.spark.examples.mortgage.consts import * 17 | from ai.rapids.spark.examples.utility.utils import * 18 | from ml.dmlc.xgboost4j.scala.spark import * 19 | from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator 20 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 21 | from pyspark.ml.tuning import ParamGridBuilder 22 | from pyspark.sql import SparkSession 23 | 24 | def main(args, xgboost_args): 25 | spark = (SparkSession 26 | .builder 27 | .appName(args.mainClass) 28 | .getOrCreate()) 29 | 30 | def prepare_data(path): 31 | reader = spark.read.format(args.format) 32 | if args.format == 'csv': 33 | reader.schema(schema).option('header', args.hasHeader) 34 | return vectorize(reader.load(path), label) 35 | 36 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) 37 | .setLabelCol(label) 38 | .setFeaturesCol('features')) 39 | evaluator = (MulticlassClassificationEvaluator() 40 | .setLabelCol(label)) 41 | param_grid = (ParamGridBuilder() 42 | .addGrid(classifier.maxDepth, [5, 10]) 43 | .addGrid(classifier.numRound, [100, 200]) 44 | .build()) 45 | cross_validator = (CrossValidator() 46 | .setEstimator(classifier) 47 | .setEvaluator(evaluator) 48 | .setEstimatorParamMaps(param_grid) 49 | .setNumFolds(3)) 50 | 51 | train_data = prepare_data(args.trainDataPath) 52 | model = cross_validator.fit(train_data) 53 | 54 | spark.stop() 55 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/mortgage/cpu_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.mortgage.consts import * 17 | from ai.rapids.spark.examples.utility.args import parse_arguments 18 | from ai.rapids.spark.examples.utility.utils import * 19 | from ml.dmlc.xgboost4j.scala.spark import * 20 | from pyspark.sql import SparkSession 21 | 22 | def main(args, xgboost_args): 23 | spark = (SparkSession 24 | .builder 25 | .appName(args.mainClass) 26 | .getOrCreate()) 27 | 28 | def prepare_data(path): 29 | reader = spark.read.format(args.format) 30 | if args.format == 'csv': 31 | reader.schema(schema).option('header', args.hasHeader) 32 | return vectorize(reader.load(path), label) 33 | 34 | if args.mode in [ 'all', 'train' ]: 35 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) 36 | .setLabelCol(label) 37 | .setFeaturesCol('features')) 38 | 39 | if args.trainEvalDataPath: 40 | train_eval_data = prepare_data(args.trainEvalDataPath) 41 | classifier.setEvalSets({ 'test': train_eval_data }) 42 | 43 | train_data = prepare_data(args.trainDataPath) 44 | model = with_benchmark('Training', lambda: classifier.fit(train_data)) 45 | 46 | if args.modelPath: 47 | writer = model.write().overwrite() if args.overwrite else model 48 | writer.save(args.modelPath) 49 | else: 50 | model = XGBoostClassificationModel().load(args.modelPath) 51 | 52 | if args.mode in [ 'all', 'transform' ]: 53 | eval_data = prepare_data(args.evalDataPath) 54 | 55 | def transform(): 56 | result = model.transform(eval_data).cache() 57 | result.foreachPartition(lambda _: None) 58 | return result 59 | 60 | result = with_benchmark('Transformation', transform) 61 | show_sample(args, result, label) 62 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) 63 | 64 | spark.stop() 65 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/mortgage/gpu_cross_validator_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.mortgage.consts import * 17 | from ai.rapids.spark.examples.utility.utils import * 18 | from ml.dmlc.xgboost4j.scala.spark import * 19 | from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator, GpuDataReader 20 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 21 | from pyspark.ml.tuning import ParamGridBuilder 22 | from pyspark.sql import SparkSession 23 | 24 | def main(args, xgboost_args): 25 | spark = (SparkSession 26 | .builder 27 | .appName(args.mainClass) 28 | .getOrCreate()) 29 | 30 | def prepare_data(path): 31 | reader = (GpuDataReader(spark) 32 | .format(args.format) 33 | .option('asFloats', args.asFloats) 34 | .option('maxRowsPerChunk', args.maxRowsPerChunk)) 35 | if args.format == 'csv': 36 | reader.schema(schema).option('header', args.hasHeader) 37 | return reader.load(path) 38 | 39 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) 40 | .setLabelCol(label) 41 | .setFeaturesCols(features)) 42 | evaluator = (MulticlassClassificationEvaluator() 43 | .setLabelCol(label)) 44 | param_grid = (ParamGridBuilder() 45 | .addGrid(classifier.maxDepth, [5, 10]) 46 | .addGrid(classifier.numRound, [100, 200]) 47 | .build()) 48 | cross_validator = (CrossValidator() 49 | .setEstimator(classifier) 50 | .setEvaluator(evaluator) 51 | .setEstimatorParamMaps(param_grid) 52 | .setNumFolds(3)) 53 | 54 | train_data = prepare_data(args.trainDataPath) 55 | model = cross_validator.fit(train_data) 56 | 57 | spark.stop() 58 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/mortgage/gpu_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.mortgage.consts import * 17 | from ai.rapids.spark.examples.utility.args import parse_arguments 18 | from ai.rapids.spark.examples.utility.utils import * 19 | from ml.dmlc.xgboost4j.scala.spark import * 20 | from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader 21 | from pyspark.sql import SparkSession 22 | 23 | def main(args, xgboost_args): 24 | spark = (SparkSession 25 | .builder 26 | .appName(args.mainClass) 27 | .getOrCreate()) 28 | 29 | def prepare_data(path): 30 | reader = (GpuDataReader(spark) 31 | .format(args.format) 32 | .option('asFloats', args.asFloats) 33 | .option('maxRowsPerChunk', args.maxRowsPerChunk)) 34 | if args.format == 'csv': 35 | reader.schema(schema).option('header', args.hasHeader) 36 | return reader.load(path) 37 | 38 | if args.mode in [ 'all', 'train' ]: 39 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args)) 40 | .setLabelCol(label) 41 | .setFeaturesCols(features)) 42 | 43 | if args.trainEvalDataPath: 44 | train_eval_data = prepare_data(args.trainEvalDataPath) 45 | classifier.setEvalSets({ 'test': train_eval_data }) 46 | 47 | train_data = prepare_data(args.trainDataPath) 48 | model = with_benchmark('Training', lambda: classifier.fit(train_data)) 49 | 50 | if args.modelPath: 51 | writer = model.write().overwrite() if args.overwrite else model 52 | writer.save(args.modelPath) 53 | else: 54 | model = XGBoostClassificationModel().load(args.modelPath) 55 | 56 | if args.mode in [ 'all', 'transform' ]: 57 | eval_data = prepare_data(args.evalDataPath) 58 | 59 | def transform(): 60 | result = model.transform(eval_data).cache() 61 | result.foreachPartition(lambda _: None) 62 | return result 63 | 64 | result = with_benchmark('Transformation', transform) 65 | show_sample(args, result, label) 66 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label)) 67 | 68 | spark.stop() 69 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/taxi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/taxi/__init__.py -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/taxi/consts.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from pyspark.sql.types import * 18 | 19 | label = 'fare_amount' 20 | 21 | schema = StructType([ 22 | StructField('vendor_id', FloatType()), 23 | StructField('passenger_count', FloatType()), 24 | StructField('trip_distance', FloatType()), 25 | StructField('pickup_longitude', FloatType()), 26 | StructField('pickup_latitude', FloatType()), 27 | StructField('rate_code', FloatType()), 28 | StructField('store_and_fwd', FloatType()), 29 | StructField('dropoff_longitude', FloatType()), 30 | StructField('dropoff_latitude', FloatType()), 31 | StructField(label, FloatType()), 32 | StructField('hour', FloatType()), 33 | StructField('year', IntegerType()), 34 | StructField('month', IntegerType()), 35 | StructField('day', FloatType()), 36 | StructField('day_of_week', FloatType()), 37 | StructField('is_weekend', FloatType()), 38 | ]) 39 | 40 | features = [ x.name for x in schema if x.name != label ] 41 | 42 | default_params = { 43 | 'eta': 0.05, 44 | 'maxDepth': 8, 45 | 'subsample': 0.8, 46 | 'gamma': 1.0, 47 | 'numRound': 100, 48 | 'numWorkers': 1, 49 | } 50 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/taxi/cpu_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.taxi.consts import * 17 | from ai.rapids.spark.examples.utility.args import parse_arguments 18 | from ai.rapids.spark.examples.utility.utils import * 19 | from ml.dmlc.xgboost4j.scala.spark import * 20 | from pyspark.sql import SparkSession 21 | 22 | def main(args, xgboost_args): 23 | spark = (SparkSession 24 | .builder 25 | .appName(args.mainClass) 26 | .getOrCreate()) 27 | 28 | def prepare_data(path): 29 | reader = spark.read.format(args.format) 30 | if args.format == 'csv': 31 | reader.schema(schema).option('header', args.hasHeader) 32 | return vectorize(reader.load(path), label) 33 | 34 | if args.mode in [ 'all', 'train' ]: 35 | regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) 36 | .setLabelCol(label) 37 | .setFeaturesCol('features')) 38 | 39 | if args.trainEvalDataPath: 40 | train_eval_data = prepare_data(args.trainEvalDataPath) 41 | regressor.setEvalSets({ 'test': train_eval_data }) 42 | 43 | train_data = prepare_data(args.trainDataPath) 44 | model = with_benchmark('Training', lambda: regressor.fit(train_data)) 45 | 46 | if args.modelPath: 47 | writer = model.write().overwrite() if args.overwrite else model 48 | writer.save(args.modelPath) 49 | else: 50 | model = XGBoostRegressionModel().load(args.modelPath) 51 | 52 | if args.mode in [ 'all', 'transform' ]: 53 | eval_data = prepare_data(args.evalDataPath) 54 | 55 | def transform(): 56 | result = model.transform(eval_data).cache() 57 | result.foreachPartition(lambda _: None) 58 | return result 59 | 60 | result = with_benchmark('Transformation', transform) 61 | show_sample(args, result, label) 62 | with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) 63 | 64 | spark.stop() 65 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/taxi/gpu_main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.taxi.consts import * 17 | from ai.rapids.spark.examples.utility.args import parse_arguments 18 | from ai.rapids.spark.examples.utility.utils import * 19 | from ml.dmlc.xgboost4j.scala.spark import * 20 | from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader 21 | from pyspark.sql import SparkSession 22 | 23 | def main(args, xgboost_args): 24 | spark = (SparkSession 25 | .builder 26 | .appName(args.mainClass) 27 | .getOrCreate()) 28 | 29 | def prepare_data(path): 30 | reader = (GpuDataReader(spark) 31 | .format(args.format) 32 | .option('asFloats', args.asFloats) 33 | .option('maxRowsPerChunk', args.maxRowsPerChunk)) 34 | if args.format == 'csv': 35 | reader.schema(schema).option('header', args.hasHeader) 36 | return reader.load(path) 37 | 38 | if args.mode in [ 'all', 'train' ]: 39 | regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args)) 40 | .setLabelCol(label) 41 | .setFeaturesCols(features)) 42 | 43 | if args.trainEvalDataPath: 44 | train_eval_data = prepare_data(args.trainEvalDataPath) 45 | regressor.setEvalSets({ 'test': train_eval_data }) 46 | 47 | train_data = prepare_data(args.trainDataPath) 48 | model = with_benchmark('Training', lambda: regressor.fit(train_data)) 49 | 50 | if args.modelPath: 51 | writer = model.write().overwrite() if args.overwrite else model 52 | writer.save(args.modelPath) 53 | else: 54 | model = XGBoostRegressionModel().load(args.modelPath) 55 | 56 | if args.mode in [ 'all', 'transform' ]: 57 | eval_data = prepare_data(args.evalDataPath) 58 | 59 | def transform(): 60 | result = model.transform(eval_data).cache() 61 | result.foreachPartition(lambda _: None) 62 | return result 63 | 64 | result = with_benchmark('Transformation', transform) 65 | show_sample(args, result, label) 66 | with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label)) 67 | 68 | spark.stop() 69 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/utility/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/utility/__init__.py -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/utility/args.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from argparse import ArgumentParser 17 | from distutils.util import strtobool 18 | from sys import exit 19 | 20 | def _to_bool(literal): 21 | return bool(strtobool(literal)) 22 | 23 | MAX_CHUNK_SIZE = 2 ** 31 - 1 24 | 25 | _examples = [ 26 | 'ai.rapids.spark.examples.agaricus.cpu_main', 27 | 'ai.rapids.spark.examples.agaricus.gpu_main', 28 | 'ai.rapids.spark.examples.mortgage.cpu_main', 29 | 'ai.rapids.spark.examples.mortgage.gpu_main', 30 | 'ai.rapids.spark.examples.mortgage.cpu_cross_validator_main', 31 | 'ai.rapids.spark.examples.mortgage.gpu_cross_validator_main', 32 | 'ai.rapids.spark.examples.taxi.cpu_main', 33 | 'ai.rapids.spark.examples.taxi.gpu_main', 34 | ] 35 | 36 | _xgboost_simple_args = [ 37 | ('cacheTrainingSet', _to_bool), 38 | ('maximizeEvaluationMetrics', _to_bool), 39 | ('useExternalMemory', _to_bool), 40 | ('checkpointInterval', int), 41 | ('maxBins', int), 42 | ('maxDepth', int), 43 | ('maxLeaves', int), 44 | ('nthread', int), 45 | ('numClass', int), 46 | ('numEarlyStoppingRounds', int), 47 | ('numRound', int), 48 | ('numWorkers', int), 49 | ('seed', int), 50 | ('silent', int), 51 | ('timeoutRequestWorkers', int), 52 | ('treeLimit', int), 53 | ('verbosity', int), 54 | ('alpha', float), 55 | ('baseScore', float), 56 | ('colsampleBylevel', float), 57 | ('colsampleBytree', float), 58 | ('eta', float), 59 | ('gamma', float), 60 | ('lambda_', float), 61 | ('lambdaBias', float), 62 | ('maxDeltaStep', float), 63 | ('minChildWeight', float), 64 | ('missing', float), 65 | ('rateDrop', float), 66 | ('scalePosWeight', float), 67 | ('sketchEps', float), 68 | ('skipDrop', float), 69 | ('subsample', float), 70 | ('trainTestRatio', float), 71 | ('baseMarginCol', str), 72 | ('checkpointPath', str), 73 | ('contribPredictionCol', str), 74 | ('evalMetric', str), 75 | ('featuresCol', str), 76 | ('groupCol', str), 77 | ('growPolicy', str), 78 | ('interactionConstraints', str), 79 | ('labelCol', str), 80 | ('leafPredictionCol', str), 81 | ('monotoneConstraints', str), 82 | ('normalizeType', str), 83 | ('objective', str), 84 | ('objectiveType', str), 85 | ('predictionCol', str), 86 | ('probabilityCol', str), 87 | ('rawPredictionCol', str), 88 | ('sampleType', str), 89 | ('treeMethod', str), 90 | ('weightCol', str), 91 | ] 92 | 93 | _xgboost_array_args = [ 94 | ('thresholds', float), 95 | ] 96 | 97 | def _validate_args(args): 98 | usage = '' 99 | if args.mode in [ 'all', 'train' ] and not args.trainDataPath: 100 | usage += ' --trainDataPath is required for training\n' 101 | if args.mode in [ 'all', 'transform' ] and not args.evalDataPath: 102 | usage += ' --evalDataPath is required for transformation\n' 103 | if args.mode == 'transform' and not args.modelPath: 104 | usage += ' --modelPath is required for transformation\n' 105 | if not (1 <= args.maxRowsPerChunk <= MAX_CHUNK_SIZE): 106 | usage += ' --maxRowsPerChunk should be in range [1, {}]\n'.format(MAX_CHUNK_SIZE) 107 | if usage: 108 | print('-' * 80) 109 | print('Usage:\n' + usage) 110 | exit(1) 111 | 112 | def parse_arguments(): 113 | parser = ArgumentParser() 114 | 115 | # application arguments 116 | parser.add_argument('--mainClass', required=True, choices=_examples) 117 | parser.add_argument('--mode', choices=['all', 'train', 'transform'], default='all') 118 | parser.add_argument('--format', required=True, choices=['csv', 'parquet', 'orc']) 119 | parser.add_argument('--hasHeader', type=_to_bool, default=True) 120 | parser.add_argument('--asFloats', type=_to_bool, default=True) 121 | parser.add_argument('--maxRowsPerChunk', 
type=int, default=MAX_CHUNK_SIZE) 122 | parser.add_argument('--modelPath') 123 | parser.add_argument('--overwrite', type=_to_bool, default=False) 124 | parser.add_argument('--trainDataPath') 125 | parser.add_argument('--trainEvalDataPath') 126 | parser.add_argument('--evalDataPath') 127 | parser.add_argument('--numRows', type=int, default=5) 128 | parser.add_argument('--showFeatures', type=_to_bool, default=True) 129 | 130 | # xgboost simple args 131 | for arg, arg_type in _xgboost_simple_args: 132 | parser.add_argument('--' + arg, type=arg_type) 133 | 134 | # xgboost array args 135 | for arg, arg_type in _xgboost_array_args: 136 | parser.add_argument('--' + arg, type=arg_type, action='append') 137 | 138 | parsed_all = parser.parse_args() 139 | _validate_args(parsed_all) 140 | 141 | xgboost_args = [ arg for (arg, _) in _xgboost_simple_args + _xgboost_array_args ] 142 | parsed_xgboost = { 143 | k: v 144 | for k, v in vars(parsed_all).items() 145 | if k in xgboost_args and v is not None 146 | } 147 | 148 | return parsed_all, parsed_xgboost 149 | -------------------------------------------------------------------------------- /examples/apps/python/ai/rapids/spark/examples/utility/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # 16 | from pyspark.ml.evaluation import * 17 | from pyspark.ml.feature import VectorAssembler 18 | from pyspark.sql.functions import col 19 | from pyspark.sql.types import FloatType 20 | from time import time 21 | 22 | def merge_dicts(dict_x, dict_y): 23 | result = dict_x.copy() 24 | result.update(dict_y) 25 | return result 26 | 27 | def show_sample(args, data_frame, label): 28 | data_frame = data_frame if args.showFeatures else data_frame.select(label, 'prediction') 29 | data_frame.show(args.numRows) 30 | 31 | def vectorize(data_frame, label): 32 | features = [ x.name for x in data_frame.schema if x.name != label ] 33 | to_floats = [ col(x.name).cast(FloatType()) for x in data_frame.schema ] 34 | return (VectorAssembler() 35 | .setInputCols(features) 36 | .setOutputCol('features') 37 | .transform(data_frame.select(to_floats)) 38 | .select(col('features'), col(label))) 39 | 40 | def with_benchmark(phrase, action): 41 | start = time() 42 | result = action() 43 | end = time() 44 | print('-' * 100) 45 | print('{} takes {} seconds'.format(phrase, round(end - start, 2))) 46 | return result 47 | 48 | def check_classification_accuracy(data_frame, label): 49 | accuracy = (MulticlassClassificationEvaluator() 50 | .setLabelCol(label) 51 | .evaluate(data_frame)) 52 | print('-' * 100) 53 | print('Accuracy is ' + str(accuracy)) 54 | 55 | def check_regression_accuracy(data_frame, label): 56 | accuracy = (RegressionEvaluator() 57 | .setLabelCol(label) 58 | .evaluate(data_frame)) 59 | print('-' * 100) 60 | print('RMSE is ' + str(accuracy)) 61 | -------------------------------------------------------------------------------- /examples/apps/python/main.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | from ai.rapids.spark.examples.main import main 17 | 18 | main() 19 |
-------------------------------------------------------------------------------- /examples/apps/scala/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target 3 | *.iml 4 |
-------------------------------------------------------------------------------- /examples/apps/scala/assembly/assembly-no-scala.xml: --------------------------------------------------------------------------------
1 | <assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
2 |           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |           xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
4 |   <id>jar-with-dependencies</id>
5 |   <formats>
6 |     <format>jar</format>
7 |   </formats>
8 |   <includeBaseDirectory>false</includeBaseDirectory>
9 |   <dependencySets>
10 |     <dependencySet>
11 |       <excludes>
12 |         <exclude>org.scala-lang*:scala-*</exclude>
13 |       </excludes>
14 |       <outputDirectory>/</outputDirectory>
15 |       <useProjectArtifact>true</useProjectArtifact>
16 |       <unpack>true</unpack>
17 |       <scope>runtime</scope>
18 |     </dependencySet>
19 |   </dependencySets>
20 | </assembly>
-------------------------------------------------------------------------------- /examples/apps/scala/pom.xml: --------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>ai.rapids</groupId>
7 |   <artifactId>sample_xgboost_apps</artifactId>
8 |
9 |   <packaging>jar</packaging>
10 |   <description>Sample XGBoost4J-Spark applications</description>
11 |   <version>0.1.5</version>
12 |   <name>sample_xgboost_apps</name>
13 |
14 |   <properties>
15 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16 |     <xgboost.version>1.0.0-Beta5</xgboost.version>
17 |     <cudf.version>0.9.2</cudf.version>
18 |     <spark.version>2.4.0</spark.version>
19 |     <scala.version>2.11.6</scala.version>
20 |     <scala.binary.version>2.11</scala.binary.version>
21 |   </properties>
22 |
23 |   <dependencies>
24 |     <dependency>
25 |       <groupId>ai.rapids</groupId>
26 |       <artifactId>xgboost4j_2.x</artifactId>
27 |       <version>${xgboost.version}</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>ai.rapids</groupId>
31 |       <artifactId>cudf</artifactId>
32 |       <version>${cudf.version}</version>
33 |       <classifier>${cuda.classifier}</classifier>
34 |     </dependency>
35 |     <dependency>
36 |       <groupId>ai.rapids</groupId>
37 |       <artifactId>xgboost4j-spark_2.x</artifactId>
38 |       <version>${xgboost.version}</version>
39 |     </dependency>
40 |     <dependency>
41 |       <groupId>org.scala-lang</groupId>
42 |       <artifactId>scala-library</artifactId>
43 |       <version>${scala.version}</version>
44 |       <scope>provided</scope>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>org.apache.spark</groupId>
48 |       <artifactId>spark-sql_${scala.binary.version}</artifactId>
49 |       <version>${spark.version}</version>
50 |       <scope>provided</scope>
51 |     </dependency>
52 |     <dependency>
53 |       <groupId>org.apache.spark</groupId>
54 |       <artifactId>spark-mllib_${scala.binary.version}</artifactId>
55 |       <version>${spark.version}</version>
56 |       <scope>provided</scope>
57 |     </dependency>
58 |     <dependency>
59 |       <groupId>org.scalatest</groupId>
60 |       <artifactId>scalatest_${scala.binary.version}</artifactId>
61 |       <version>3.0.5</version>
62 |       <scope>test</scope>
63 |     </dependency>
64 |   </dependencies>
65 |
66 |   <build>
67 |     <plugins>
68 |       <plugin>
69 |         <groupId>org.scala-tools</groupId>
70 |         <artifactId>maven-scala-plugin</artifactId>
71 |         <version>2.15.2</version>
72 |         <executions>
73 |           <execution>
74 |             <goals>
75 |               <goal>compile</goal>
76 |               <goal>testCompile</goal>
77 |             </goals>
78 |           </execution>
79 |         </executions>
80 |       </plugin>
81 |       <plugin>
82 |         <groupId>org.scalatest</groupId>
83 |         <artifactId>scalatest-maven-plugin</artifactId>
84 |         <version>1.0</version>
85 |         <executions>
86 |           <execution>
87 |             <id>test</id>
88 |             <goals>
89 |               <goal>test</goal>
90 |             </goals>
91 |           </execution>
92 |         </executions>
93 |       </plugin>
94 |       <plugin>
95 |         <groupId>org.apache.maven.plugins</groupId>
96 |         <artifactId>maven-assembly-plugin</artifactId>
97 |         <version>2.6</version>
98 |         <configuration>
99 |           <descriptors>
100 |             <descriptor>assembly/assembly-no-scala.xml</descriptor>
101 |           </descriptors>
102 |         </configuration>
103 |         <executions>
104 |           <execution>
105 |             <id>assembly</id>
106 |             <phase>package</phase>
107 |             <goals>
108 |               <goal>single</goal>
109 |             </goals>
110 |           </execution>
111 |         </executions>
112 |       </plugin>
113 |     </plugins>
114 |   </build>
115 |
116 |   <profiles>
117 |     <profile>
118 |       <id>sonatype-repo</id>
119 |       <repositories>
120 |         <repository>
121 |           <id>sonatype-staging-repo</id>
122 |           <name>Sonatype staging repo</name>
123 |           <url>https://oss.sonatype.org/content/repositories/staging</url>
124 |         </repository>
125 |       </repositories>
126 |     </profile>
127 |   </profiles>
128 | </project>
-------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/agaricus/CPUMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | package ai.rapids.spark.examples.agaricus 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, SparkSetup, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader 20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} 21 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 22 | import org.apache.spark.sql.types.{FloatType, StructField, StructType} 23 | 24 | // Only 3 differences between CPU and GPU. Please refer to '=== diff ===' 25 | object CPUMain { 26 | def main(args: Array[String]): Unit = { 27 | 28 | val labelName = "label" 29 | def featureNames(length: Int): List[String] = 30 | 0.until(length).map(i => s"feature_$i").toList.+:(labelName) 31 | 32 | def schema(length: Int): StructType = 33 | StructType(featureNames(length).map(n => StructField(n, FloatType))) 34 | 35 | val dataSchema = schema(126) 36 | val xgboostArgs = XGBoostArgs.parse(args) 37 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 38 | val appInfo = Seq("Agaricus", processor, xgboostArgs.format) 39 | 40 | // build spark session 41 | val spark = SparkSetup(args, appInfo.mkString("-")) 42 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 43 | // === diff === 44 | // build data reader 45 | val dataReader = spark.read 46 | 47 | // load datasets, the order is (train, train-eval, eval) 48 | var datasets = xgboostArgs.dataPaths.map(_.map{ 49 | path => 50 | xgboostArgs.format match { 51 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(dataSchema).csv(path) 52 | case "parquet" => dataReader.parquet(path) 53 | case "orc" => dataReader.orc(path) 54 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 55 | } 56 | }) 57 | 58 | val featureCols = dataSchema.filter(_.name != labelName).map(_.name) 59 | 60 | // === diff === 61 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureCols, labelName))) 62 | 63 | val xgbClassificationModel = if (xgboostArgs.isToTrain) { 64 | // build XGBoost classifier 65 | val paramMap = xgboostArgs.xgboostParams(Map( 66 | "eta" -> 0.1, 67 | "missing" -> 0.0, 68 | "max_depth" -> 2, 69 | "eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty) 70 | )) 71 | val xgbClassifier = new XGBoostClassifier(paramMap) 72 | .setLabelCol(labelName) 73 | // === diff === 74 | .setFeaturesCol("features") 75 | 76 | println("\n------ Training ------") 77 | val (model, _) = benchmark.time("train") { 78 | xgbClassifier.fit(datasets(0).get) 79 | } 80 | // Save model if modelPath exists 81 | xgboostArgs.modelPath.foreach(path => 82 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 83 | model 84 | } else { 85 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get) 86 | } 87 | 88 | if (xgboostArgs.isToTransform) { 89 | // start transform 90 | println("\n------ Transforming ------") 91 | var (results, _) = benchmark.time("transform") { 92 | val ret = xgbClassificationModel.transform(datasets(2).get).cache() 93 | ret.foreachPartition(_ => ()) 94 | ret 95 | } 96 | results = if (xgboostArgs.isShowFeatures) { 97 | results 98 | } else { 99 | results.select(labelName, "rawPrediction", "probability", "prediction") 100 | } 101 | results.show(xgboostArgs.numRows) 102 | 103 | println("\n------Accuracy of Evaluation------") 104 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelName) 105 | evaluator.evaluate(results) match { 106 | case accuracy if 
!accuracy.isNaN => 107 | benchmark.value(accuracy, "Accuracy", "Accuracy for") 108 | // Throw an exception when NaN ? 109 | } 110 | } 111 | 112 | spark.close() 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/agaricus/GPUMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.agaricus 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, SparkSetup, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader 20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} 21 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 22 | import org.apache.spark.sql.types.{FloatType, StructField, StructType} 23 | 24 | // Only 3 differences between CPU and GPU. Please refer to '=== diff ===' 25 | object GPUMain { 26 | def main(args: Array[String]): Unit = { 27 | 28 | val labelName = "label" 29 | def featureNames(length: Int): List[String] = 30 | 0.until(length).map(i => s"feature_$i").toList.+:(labelName) 31 | 32 | def schema(length: Int): StructType = 33 | StructType(featureNames(length).map(n => StructField(n, FloatType))) 34 | 35 | val dataSchema = schema(126) 36 | val xgboostArgs = XGBoostArgs.parse(args) 37 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 38 | val appInfo = Seq("Agaricus", processor, xgboostArgs.format) 39 | 40 | // build spark session 41 | val spark = SparkSetup(args, appInfo.mkString("-")) 42 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 43 | // === diff === 44 | // build data reader 45 | val dataReader = new GpuDataReader(spark) 46 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk) 47 | 48 | // load datasets, the order is (train, train-eval, eval) 49 | var datasets = xgboostArgs.dataPaths.map(_.map{ 50 | path => 51 | xgboostArgs.format match { 52 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(dataSchema).csv(path) 53 | case "parquet" => dataReader.parquet(path) 54 | case "orc" => dataReader.orc(path) 55 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 56 | } 57 | }) 58 | 59 | val featureCols = dataSchema.filter(_.name != labelName).map(_.name) 60 | 61 | // === diff === 62 | // No need to vectorize data since GPU supports multiple feature columns via API 'setFeaturesCols' 63 | 64 | val xgbClassificationModel = if (xgboostArgs.isToTrain) { 65 | // build XGBoost classifier 66 | val paramMap = xgboostArgs.xgboostParams(Map( 67 | "eta" -> 0.1, 68 | "missing" -> 0.0, 69 | "max_depth" -> 2, 70 | "eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty) 71 | )) 72 | val
xgbClassifier = new XGBoostClassifier(paramMap) 73 | .setLabelCol(labelName) 74 | // === diff === 75 | .setFeaturesCols(featureCols) 76 | 77 | println("\n------ Training ------") 78 | val (model, _) = benchmark.time("train") { 79 | xgbClassifier.fit(datasets(0).get) 80 | } 81 | // Save model if modelPath exists 82 | xgboostArgs.modelPath.foreach(path => 83 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 84 | model 85 | } else { 86 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get) 87 | } 88 | 89 | if (xgboostArgs.isToTransform) { 90 | // start transform 91 | println("\n------ Transforming ------") 92 | var (results, _) = benchmark.time("transform") { 93 | val ret = xgbClassificationModel.transform(datasets(2).get).cache() 94 | ret.foreachPartition(_ => ()) 95 | ret 96 | } 97 | results = if (xgboostArgs.isShowFeatures) { 98 | results 99 | } else { 100 | results.select(labelName, "rawPrediction", "probability", "prediction") 101 | } 102 | results.show(xgboostArgs.numRows) 103 | 104 | println("\n------Accuracy of Evaluation------") 105 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelName) 106 | evaluator.evaluate(results) match { 107 | case accuracy if !accuracy.isNaN => 108 | benchmark.value(accuracy, "Accuracy", "Accuracy for") 109 | // Throw an exception when NaN ? 110 | } 111 | } 112 | 113 | spark.close() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/CPUCrossValidatorMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.mortgage 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} 20 | import ml.dmlc.xgboost4j.scala.spark.rapids.CrossValidator 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 23 | import org.apache.spark.ml.tuning.ParamGridBuilder 24 | 25 | // Only 3 differences between CPU and GPU. 
Please refer to '=== diff ===' 26 | object CPUCrossValidatorMain extends Mortgage { 27 | 28 | def main(args: Array[String]): Unit = { 29 | val xgboostArgs = XGBoostArgs.parse(args) 30 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 31 | val appInfo = Seq(appName, processor, xgboostArgs.format) 32 | 33 | // build spark session 34 | val spark = SparkSession.builder() 35 | .appName(appInfo.mkString("-")) 36 | .getOrCreate() 37 | 38 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 39 | 40 | // === diff === 41 | // build data reader 42 | val dataReader = spark.read 43 | 44 | // load datasets, the order is (train, train-eval, eval) 45 | var datasets = xgboostArgs.dataPaths.map(_.map{ 46 | path => 47 | xgboostArgs.format match { 48 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 49 | case "parquet" => dataReader.parquet(path) 50 | case "orc" => dataReader.orc(path) 51 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 52 | } 53 | }) 54 | 55 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 56 | 57 | // === diff === 58 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName))) 59 | 60 | val xgbClassificationModel = if (xgboostArgs.isToTrain) { 61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap) 62 | val classifier = new XGBoostClassifier(xgbParamFinal) 63 | .setLabelCol(labelColName) 64 | // === diff === 65 | .setFeaturesCol("features") 66 | 67 | // Tune model using cross validation 68 | val paramGrid = new ParamGridBuilder() 69 | .addGrid(classifier.maxDepth, Array(3, 8)) 70 | .addGrid(classifier.eta, Array(0.2, 0.6)) 71 | .build() 72 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName) 73 | 74 | val cv = new CrossValidator() 75 | .setEstimator(classifier) 76 | .setEvaluator(evaluator) 77 | .setEstimatorParamMaps(paramGrid) 78 | .setNumFolds(xgboostArgs.numFold) 79 | 80 | val (model, _) = benchmark.time("CrossValidation") { 81 | cv.fit(datasets.head.get).bestModel.asInstanceOf[XGBoostClassificationModel] 82 | } 83 | // Save model if modelPath exists 84 | xgboostArgs.modelPath.foreach(path => 85 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 86 | model 87 | } else { 88 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get) 89 | } 90 | 91 | if (xgboostArgs.isToTransform) { 92 | println("\n------ Transforming ------") 93 | var (results, _) = benchmark.time("transform") { 94 | val ret = xgbClassificationModel.transform(datasets(2).get).cache() 95 | // Trigger the transformation 96 | ret.foreachPartition(_ => ()) 97 | ret 98 | } 99 | results = if (xgboostArgs.isShowFeatures) { 100 | results 101 | } else { 102 | results.select(labelColName, "rawPrediction", "probability", "prediction") 103 | } 104 | results.show(xgboostArgs.numRows) 105 | 106 | println("\n------Accuracy of Evaluation------") 107 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName) 108 | evaluator.evaluate(results) match { 109 | case accuracy if !accuracy.isNaN => 110 | benchmark.value(accuracy, "Accuracy", "Accuracy for") 111 | // Throw an exception when NaN ? 
112 | } 113 | } 114 | spark.close() 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/CPUMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.mortgage 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} 20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 23 | 24 | // Only 3 differences between CPU and GPU. Please refer to '=== diff ===' 25 | object CPUMain extends Mortgage { 26 | 27 | def main(args: Array[String]): Unit = { 28 | val xgboostArgs = XGBoostArgs.parse(args) 29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 30 | val appInfo = Seq(appName, processor, xgboostArgs.format) 31 | 32 | // build spark session 33 | val spark = SparkSession.builder() 34 | .appName(appInfo.mkString("-")) 35 | .getOrCreate() 36 | 37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 38 | // === diff === 39 | // build data reader 40 | val dataReader = spark.read 41 | 42 | // load datasets, the order is (train, train-eval, eval) 43 | var datasets = xgboostArgs.dataPaths.map(_.map{ 44 | path => 45 | xgboostArgs.format match { 46 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 47 | case "parquet" => dataReader.parquet(path) 48 | case "orc" => dataReader.orc(path) 49 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 50 | } 51 | }) 52 | 53 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 54 | 55 | // === diff === 56 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName))) 57 | 58 | val xgbClassificationModel = if (xgboostArgs.isToTrain) { 59 | // build XGBoost classifier 60 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap + 61 | // Add train-eval dataset if specified 62 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)) 63 | ) 64 | val xgbClassifier = new XGBoostClassifier(xgbParamFinal) 65 | .setLabelCol(labelColName) 66 | // === diff === 67 | .setFeaturesCol("features") 68 | 69 | // Start training 70 | println("\n------ Training ------") 71 | // Shall we not log the time if it is abnormal, which is usually caused by training failure 72 | val (model, _) = benchmark.time("train") { 73 | xgbClassifier.fit(datasets(0).get) 74 | } 75 | // Save model if modelPath exists 76 | xgboostArgs.modelPath.foreach(path => 77 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else 
model.save(path)) 78 | model 79 | } else { 80 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get) 81 | } 82 | 83 | if (xgboostArgs.isToTransform) { 84 | println("\n------ Transforming ------") 85 | var (results, _) = benchmark.time("transform") { 86 | val ret = xgbClassificationModel.transform(datasets(2).get).cache() 87 | // Trigger the transformation 88 | ret.foreachPartition(_ => ()) 89 | ret 90 | } 91 | results = if (xgboostArgs.isShowFeatures) { 92 | results 93 | } else { 94 | results.select(labelColName, "rawPrediction", "probability", "prediction") 95 | } 96 | results.show(xgboostArgs.numRows) 97 | 98 | println("\n------Accuracy of Evaluation------") 99 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName) 100 | evaluator.evaluate(results) match { 101 | case accuracy if !accuracy.isNaN => 102 | benchmark.value(accuracy, "Accuracy", "Accuracy for") 103 | // Throw an exception when NaN ? 104 | } 105 | } 106 | 107 | spark.close() 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/GPUCrossValidatorMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.mortgage 17 | 18 | import ai.rapids.spark.examples.mortgage.GPUMain.labelColName 19 | import ai.rapids.spark.examples.utility.{Benchmark, XGBoostArgs} 20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} 21 | import ml.dmlc.xgboost4j.scala.spark.rapids.{CrossValidator, GpuDataReader} 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 24 | import org.apache.spark.ml.tuning.ParamGridBuilder 25 | 26 | 27 | // Only 3 differences between CPU and GPU. 
Please refer to '=== diff ===' 28 | object GPUCrossValidatorMain extends Mortgage { 29 | 30 | def main(args: Array[String]): Unit = { 31 | val xgboostArgs = XGBoostArgs.parse(args) 32 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 33 | val appInfo = Seq(appName, processor, xgboostArgs.format) 34 | 35 | // build spark session 36 | val spark = SparkSession.builder() 37 | .appName(appInfo.mkString("-")) 38 | .getOrCreate() 39 | 40 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 41 | // === diff === 42 | // build data reader 43 | val dataReader = new GpuDataReader(spark) 44 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk) 45 | 46 | // load datasets, the order is (train, train-eval, eval) 47 | val datasets = xgboostArgs.dataPaths.map(_.map{ 48 | path => 49 | xgboostArgs.format match { 50 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 51 | case "parquet" => dataReader.parquet(path) 52 | case "orc" => dataReader.orc(path) 53 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 54 | } 55 | }) 56 | 57 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 58 | 59 | val xgbClassificationModel = if (xgboostArgs.isToTrain) { 60 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap) 61 | val classifier = new XGBoostClassifier(xgbParamFinal) 62 | .setLabelCol(labelColName) 63 | // === diff === 64 | .setFeaturesCols(featureNames) 65 | 66 | // Tune model using cross validation 67 | val paramGrid = new ParamGridBuilder() 68 | .addGrid(classifier.maxDepth, Array(3, 10)) 69 | .addGrid(classifier.eta, Array(0.2, 0.6)) 70 | .build() 71 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName) 72 | 73 | val cv = new CrossValidator() 74 | .setEstimator(classifier) 75 | .setEvaluator(evaluator) 76 | .setEstimatorParamMaps(paramGrid) 77 | .setNumFolds(xgboostArgs.numFold) 78 | 79 | val (model, _) = benchmark.time("CrossValidation") { 80 | cv.fit(datasets.head.get).bestModel.asInstanceOf[XGBoostClassificationModel] 81 | } 82 | // Save model if modelPath exists 83 | xgboostArgs.modelPath.foreach(path => 84 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 85 | model 86 | } else { 87 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get) 88 | } 89 | 90 | if (xgboostArgs.isToTransform) { 91 | println("\n------ Transforming ------") 92 | var (results, _) = benchmark.time("transform") { 93 | val ret = xgbClassificationModel.transform(datasets(2).get).cache() 94 | // Trigger the transformation 95 | ret.foreachPartition(_ => ()) 96 | ret 97 | } 98 | results = if (xgboostArgs.isShowFeatures) { 99 | results 100 | } else { 101 | results.select(labelColName, "rawPrediction", "probability", "prediction") 102 | } 103 | results.show(xgboostArgs.numRows) 104 | 105 | println("\n------Accuracy of Evaluation------") 106 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName) 107 | evaluator.evaluate(results) match { 108 | case accuracy if !accuracy.isNaN => 109 | benchmark.value(accuracy, "Accuracy", "Accuracy for") 110 | // Throw an exception when NaN ?
111 | } 112 | } 113 | 114 | spark.close() 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/GPUMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.mortgage 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier} 20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader 21 | import org.apache.spark.sql.SparkSession 22 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 23 | 24 | // Only 3 differences between CPU and GPU. Please refer to '=== diff ===' 25 | object GPUMain extends Mortgage { 26 | 27 | def main(args: Array[String]): Unit = { 28 | val xgboostArgs = XGBoostArgs.parse(args) 29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 30 | val appInfo = Seq(appName, processor, xgboostArgs.format) 31 | 32 | // build spark session 33 | val spark = SparkSession.builder() 34 | .appName(appInfo.mkString("-")) 35 | .getOrCreate() 36 | 37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 38 | // === diff === 39 | // build data reader 40 | val dataReader = new GpuDataReader(spark) 41 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk) 42 | 43 | // load datasets, the order is (train, train-eval, eval) 44 | var datasets = xgboostArgs.dataPaths.map(_.map{ 45 | path => 46 | xgboostArgs.format match { 47 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 48 | case "parquet" => dataReader.parquet(path) 49 | case "orc" => dataReader.orc(path) 50 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 51 | } 52 | }) 53 | 54 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 55 | 56 | // === diff === 57 | // No need to vectorize data since GPU supports multiple feature columns via API 'setFeaturesCols' 58 | 59 | val xgbClassificationModel = if (xgboostArgs.isToTrain) { 60 | // build XGBoost classifier 61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap + 62 | // Add train-eval dataset if specified 63 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)) 64 | ) 65 | val xgbClassifier = new XGBoostClassifier(xgbParamFinal) 66 | .setLabelCol(labelColName) 67 | // === diff === 68 | .setFeaturesCols(featureNames) 69 | 70 | // Start training 71 | println("\n------ Training ------") 72 | // Shall we not log the time if it is abnormal, which is usually caused by training failure 73 | val (model, _) = benchmark.time("train") { 74 | xgbClassifier.fit(datasets(0).get) 75 | } 76 | // Save model if
modelPath exists 77 | xgboostArgs.modelPath.foreach(path => 78 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 79 | model 80 | } else { 81 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get) 82 | } 83 | 84 | if (xgboostArgs.isToTransform) { 85 | println("\n------ Transforming ------") 86 | var (results, _) = benchmark.time("transform") { 87 | val ret = xgbClassificationModel.transform(datasets(2).get).cache() 88 | // Trigger the transformation 89 | ret.foreachPartition(_ => ()) 90 | ret 91 | } 92 | results = if (xgboostArgs.isShowFeatures) { 93 | results 94 | } else { 95 | results.select(labelColName, "rawPrediction", "probability", "prediction") 96 | } 97 | results.show(xgboostArgs.numRows) 98 | 99 | println("\n------Accuracy of Evaluation------") 100 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName) 101 | evaluator.evaluate(results) match { 102 | case accuracy if !accuracy.isNaN => 103 | benchmark.value(accuracy, "Accuracy", "Accuracy for") 104 | // Throw an exception when NaN ? 105 | } 106 | } 107 | 108 | spark.close() 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/Mortgage.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package ai.rapids.spark.examples.mortgage 17 | 18 | import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType} 19 | 20 | private[mortgage] trait Mortgage { 21 | val appName = "Mortgage" 22 | val labelColName = "delinquency_12" 23 | 24 | val schema = StructType(List( 25 | StructField("orig_channel", FloatType), 26 | StructField("first_home_buyer", FloatType), 27 | StructField("loan_purpose", FloatType), 28 | StructField("property_type", FloatType), 29 | StructField("occupancy_status", FloatType), 30 | StructField("property_state", FloatType), 31 | StructField("product_type", FloatType), 32 | StructField("relocation_mortgage_indicator", FloatType), 33 | StructField("seller_name", FloatType), 34 | StructField("mod_flag", FloatType), 35 | StructField("orig_interest_rate", FloatType), 36 | StructField("orig_upb", IntegerType), 37 | StructField("orig_loan_term", IntegerType), 38 | StructField("orig_ltv", FloatType), 39 | StructField("orig_cltv", FloatType), 40 | StructField("num_borrowers", FloatType), 41 | StructField("dti", FloatType), 42 | StructField("borrower_credit_score", FloatType), 43 | StructField("num_units", IntegerType), 44 | StructField("zip", IntegerType), 45 | StructField("mortgage_insurance_percent", FloatType), 46 | StructField("current_loan_delinquency_status", IntegerType), 47 | StructField("current_actual_upb", FloatType), 48 | StructField("interest_rate", FloatType), 49 | StructField("loan_age", FloatType), 50 | StructField("msa", FloatType), 51 | StructField("non_interest_bearing_upb", FloatType), 52 | StructField(labelColName, IntegerType))) 53 | 54 | val commParamMap = Map( 55 | "eta" -> 0.1, 56 | "gamma" -> 0.1, 57 | "missing" -> 0.0, 58 | "max_depth" -> 10, 59 | "max_leaves" -> 256, 60 | "grow_policy" -> "depthwise", 61 | "objective" -> "binary:logistic", 62 | "min_child_weight" -> 30, 63 | "lambda" -> 1, 64 | "scale_pos_weight" -> 2, 65 | "subsample" -> 1, 66 | "nthread" -> 1, 67 | "num_round" -> 100) 68 | } 69 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/CPUCrossValidatorMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.taxi 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.rapids.CrossValidator 20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor} 21 | import org.apache.spark.ml.evaluation.{RegressionEvaluator} 22 | import org.apache.spark.ml.tuning.ParamGridBuilder 23 | import org.apache.spark.sql.SparkSession 24 | 25 | // Only 3 differences between CPU and GPU. 
Please refer to '=== diff ===' 26 | object CPUCrossValidatorMain extends Taxi { 27 | 28 | def main(args: Array[String]): Unit = { 29 | val xgboostArgs = XGBoostArgs.parse(args) 30 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 31 | val appInfo = Seq(appName, processor, xgboostArgs.format) 32 | 33 | // build spark session 34 | val spark = SparkSession.builder() 35 | .appName(appInfo.mkString("-")) 36 | .getOrCreate() 37 | 38 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 39 | 40 | // === diff === 41 | // build data reader 42 | val dataReader = spark.read 43 | 44 | // load datasets, the order is (train, train-eval, eval) 45 | var datasets = xgboostArgs.dataPaths.map(_.map{ 46 | path => 47 | xgboostArgs.format match { 48 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 49 | case "parquet" => dataReader.parquet(path) 50 | case "orc" => dataReader.orc(path) 51 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 52 | } 53 | }) 54 | 55 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 56 | 57 | // === diff === 58 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName))) 59 | 60 | val xgbRegressionModel = if (xgboostArgs.isToTrain) { 61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap) 62 | val regressor = new XGBoostRegressor(xgbParamFinal) 63 | .setLabelCol(labelColName) 64 | // === diff === 65 | .setFeaturesCol("features") 66 | 67 | // Tune model using cross validation 68 | val paramGrid = new ParamGridBuilder() 69 | .addGrid(regressor.maxDepth, Array(3, 8)) 70 | .addGrid(regressor.eta, Array(0.2, 0.6)) 71 | .build() 72 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName) 73 | 74 | val cv = new CrossValidator() 75 | .setEstimator(regressor) 76 | .setEvaluator(evaluator) 77 | .setEstimatorParamMaps(paramGrid) 78 | .setNumFolds(xgboostArgs.numFold) 79 | 80 | val (model, _) = benchmark.time("CrossValidation") { 81 | cv.fit(datasets.head.get).bestModel.asInstanceOf[XGBoostRegressionModel] 82 | } 83 | // Save model if modelPath exists 84 | xgboostArgs.modelPath.foreach(path => 85 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 86 | model 87 | } else { 88 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get) 89 | } 90 | 91 | if (xgboostArgs.isToTransform) { 92 | println("\n------ Transforming ------") 93 | var (prediction, _) = benchmark.time("transform") { 94 | val ret = xgbRegressionModel.transform(datasets(2).get).cache() 95 | ret.foreachPartition(_ => ()) 96 | ret 97 | } 98 | prediction = if (xgboostArgs.isShowFeatures) { 99 | prediction 100 | } else { 101 | prediction.select(labelColName, "prediction") 102 | } 103 | prediction.show(xgboostArgs.numRows) 104 | 105 | println("\n------Accuracy of Evaluation------") 106 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName) 107 | evaluator.evaluate(prediction) match { 108 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for") 109 | // Throw an exception when NaN ? 110 | } 111 | } 112 | 113 | spark.close() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/CPUMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.taxi 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor} 20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader 21 | import org.apache.spark.ml.evaluation.RegressionEvaluator 22 | import org.apache.spark.sql.SparkSession 23 | 24 | // Only 3 differences between CPU and GPU. Please refer to '=== diff ===' 25 | object CPUMain extends Taxi { 26 | 27 | def main(args: Array[String]): Unit = { 28 | val xgboostArgs = XGBoostArgs.parse(args) 29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 30 | val appInfo = Seq(appName, processor, xgboostArgs.format) 31 | 32 | // build spark session 33 | val spark = SparkSession.builder() 34 | .appName(appInfo.mkString("-")) 35 | .getOrCreate() 36 | 37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 38 | // === diff === 39 | // build data reader 40 | val dataReader = spark.read 41 | 42 | // load datasets, the order is (train, train-eval, eval) 43 | var datasets = xgboostArgs.dataPaths.map(_.map{ 44 | path => 45 | xgboostArgs.format match { 46 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 47 | case "parquet" => dataReader.parquet(path) 48 | case "orc" => dataReader.orc(path) 49 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 50 | } 51 | }) 52 | 53 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 54 | 55 | // === diff === 56 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName))) 57 | 58 | val xgbRegressionModel = if (xgboostArgs.isToTrain) { 59 | // build XGBoost XGBoostRegressor 60 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap + 61 | // Add train-eval dataset if specified 62 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)) 63 | ) 64 | val xgbRegressor = new XGBoostRegressor(xgbParamFinal) 65 | .setLabelCol(labelColName) 66 | // === diff === 67 | .setFeaturesCol("features") 68 | 69 | println("\n------ Training ------") 70 | // Shall we not log the time if it is abnormal, which is usually caused by training failure 71 | val (model, _) = benchmark.time("train") { 72 | xgbRegressor.fit(datasets(0).get) 73 | } 74 | // Save model if modelPath exists 75 | xgboostArgs.modelPath.foreach(path => 76 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 77 | model 78 | } else { 79 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get) 80 | } 81 | 82 | if (xgboostArgs.isToTransform) { 83 | println("\n------ Transforming ------") 84 | var (prediction, _) = benchmark.time("transform") { 85 | val ret = xgbRegressionModel.transform(datasets(2).get).cache() 86 | ret.foreachPartition(_ => ()) 87 | ret 88 | } 89 | prediction = if (xgboostArgs.isShowFeatures) { 90 | prediction 
91 | } else { 92 | prediction.select(labelColName, "prediction") 93 | } 94 | prediction.show(xgboostArgs.numRows) 95 | 96 | println("\n------Accuracy of Evaluation------") 97 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName) 98 | evaluator.evaluate(prediction) match { 99 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for") 100 | // Throw an exception when NaN ? 101 | } 102 | } 103 | 104 | spark.close() 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/GPUCrossValidatorMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.taxi 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.rapids.{CrossValidator, GpuDataReader} 20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor} 21 | import org.apache.spark.ml.evaluation.{RegressionEvaluator} 22 | import org.apache.spark.ml.tuning.ParamGridBuilder 23 | import org.apache.spark.sql.SparkSession 24 | 25 | 26 | // Only 3 differences between CPU and GPU. 
Please refer to '=== diff ===' 27 | object GPUCrossValidatorMain extends Taxi { 28 | 29 | def main(args: Array[String]): Unit = { 30 | val xgboostArgs = XGBoostArgs.parse(args) 31 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 32 | val appInfo = Seq(appName, processor, xgboostArgs.format) 33 | 34 | // build spark session 35 | val spark = SparkSession.builder() 36 | .appName(appInfo.mkString("-")) 37 | .getOrCreate() 38 | 39 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 40 | // === diff === 41 | // build data reader 42 | val dataReader = new GpuDataReader(spark) 43 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk) 44 | 45 | // load datasets, the order is (train, train-eval, eval) 46 | val datasets = xgboostArgs.dataPaths.map(_.map{ 47 | path => 48 | xgboostArgs.format match { 49 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 50 | case "parquet" => dataReader.parquet(path) 51 | case "orc" => dataReader.orc(path) 52 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 53 | } 54 | }) 55 | 56 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 57 | 58 | val xgbRegressionModel = if (xgboostArgs.isToTrain) { 59 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap) 60 | val regressor = new XGBoostRegressor(xgbParamFinal) 61 | .setLabelCol(labelColName) 62 | // === diff === 63 | .setFeaturesCols(featureNames) 64 | 65 | // Tune model using cross validation 66 | val paramGrid = new ParamGridBuilder() 67 | .addGrid(regressor.maxDepth, Array(3, 10)) 68 | .addGrid(regressor.eta, Array(0.2, 0.6)) 69 | .build() 70 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName) 71 | 72 | val cv = new CrossValidator() 73 | .setEstimator(regressor) 74 | .setEvaluator(evaluator) 75 | .setEstimatorParamMaps(paramGrid) 76 | .setNumFolds(xgboostArgs.numFold) 77 | 78 | val (model, _) = benchmark.time("CrossValidation") { 79 | cv.fit(datasets.head.get).asInstanceOf[XGBoostRegressionModel] 80 | } 81 | // Save model if modelPath exists 82 | xgboostArgs.modelPath.foreach(path => 83 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 84 | model 85 | } else { 86 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get) 87 | } 88 | 89 | if (xgboostArgs.isToTransform) { 90 | println("\n------ Transforming ------") 91 | var (prediction, _) = benchmark.time("transform") { 92 | val ret = xgbRegressionModel.transform(datasets(2).get).cache() 93 | ret.foreachPartition(_ => ()) 94 | ret 95 | } 96 | prediction = if (xgboostArgs.isShowFeatures) { 97 | prediction 98 | } else { 99 | prediction.select(labelColName, "prediction") 100 | } 101 | prediction.show(xgboostArgs.numRows) 102 | 103 | println("\n------Accuracy of Evaluation------") 104 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName) 105 | evaluator.evaluate(prediction) match { 106 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for") 107 | // Throw an exception when NaN ? 108 | } 109 | } 110 | 111 | spark.close() 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/GPUMain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.taxi 17 | 18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs} 19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor} 20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader 21 | import org.apache.spark.ml.evaluation.RegressionEvaluator 22 | import org.apache.spark.sql.SparkSession 23 | 24 | // Only 3 differences between CPU and GPU. Please refer to '=== diff ===' 25 | object GPUMain extends Taxi { 26 | 27 | def main(args: Array[String]): Unit = { 28 | val xgboostArgs = XGBoostArgs.parse(args) 29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3) 30 | val appInfo = Seq(appName, processor, xgboostArgs.format) 31 | 32 | // build spark session 33 | val spark = SparkSession.builder() 34 | .appName(appInfo.mkString("-")) 35 | .getOrCreate() 36 | 37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2)) 38 | // === diff === 39 | // build data reader 40 | val dataReader = new GpuDataReader(spark) 41 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk) 42 | 43 | // load datasets, the order is (train, train-eval, eval) 44 | var datasets = xgboostArgs.dataPaths.map(_.map{ 45 | path => 46 | xgboostArgs.format match { 47 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path) 48 | case "parquet" => dataReader.parquet(path) 49 | case "orc" => dataReader.orc(path) 50 | case _ => throw new IllegalArgumentException("Unsupported data file format!") 51 | } 52 | }) 53 | 54 | val featureNames = schema.filter(_.name != labelColName).map(_.name) 55 | 56 | // === diff === 57 | // No need to vectorize data since GPU support multiple feature columns via API 'setFeaturesCols' 58 | 59 | val xgbRegressionModel = if (xgboostArgs.isToTrain) { 60 | // build XGBoost XGBoostRegressor 61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap + 62 | // Add train-eval dataset if specified 63 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)) 64 | ) 65 | val xgbRegressor = new XGBoostRegressor(xgbParamFinal) 66 | .setLabelCol(labelColName) 67 | // === diff === 68 | .setFeaturesCols(featureNames) 69 | 70 | println("\n------ Training ------") 71 | // Shall we not log the time if it is abnormal, which is usually caused by training failure 72 | val (model, _) = benchmark.time("train") { 73 | xgbRegressor.fit(datasets(0).get) 74 | } 75 | // Save model if modelPath exists 76 | xgboostArgs.modelPath.foreach(path => 77 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path)) 78 | model 79 | } else { 80 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get) 81 | } 82 | 83 | if (xgboostArgs.isToTransform) { 84 | println("\n------ Transforming ------") 85 | var (prediction, _) = benchmark.time("transform") { 86 | val ret = 
xgbRegressionModel.transform(datasets(2).get).cache() 87 | ret.foreachPartition(_ => ()) 88 | ret 89 | } 90 | prediction = if (xgboostArgs.isShowFeatures) { 91 | prediction 92 | } else { 93 | prediction.select(labelColName, "prediction") 94 | } 95 | prediction.show(xgboostArgs.numRows) 96 | 97 | println("\n------Accuracy of Evaluation------") 98 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName) 99 | evaluator.evaluate(prediction) match { 100 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for") 101 | // Throw an exception when NaN ? 102 | } 103 | } 104 | 105 | spark.close() 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/Taxi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.taxi 17 | 18 | import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType} 19 | 20 | private[taxi] trait Taxi { 21 | val appName = "Taxi" 22 | val labelColName = "fare_amount" 23 | 24 | lazy val commParamMap = Map( 25 | "learning_rate" -> 0.05, 26 | "max_depth" -> 8, 27 | "subsample" -> 0.8, 28 | "gamma" -> 1 29 | ) 30 | 31 | lazy val schema = 32 | StructType(Array( 33 | StructField("vendor_id", FloatType), 34 | StructField("passenger_count", FloatType), 35 | StructField("trip_distance", FloatType), 36 | StructField("pickup_longitude", FloatType), 37 | StructField("pickup_latitude", FloatType), 38 | StructField("rate_code", FloatType), 39 | StructField("store_and_fwd", FloatType), 40 | StructField("dropoff_longitude", FloatType), 41 | StructField("dropoff_latitude", FloatType), 42 | StructField(labelColName, FloatType), 43 | StructField("hour", FloatType), 44 | StructField("year", IntegerType), 45 | StructField("month", IntegerType), 46 | StructField("day", FloatType), 47 | StructField("day_of_week", FloatType), 48 | StructField("is_weekend", FloatType) 49 | )) 50 | } 51 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/utility/Benchmark.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.utility 17 | 18 | import scala.util.Properties 19 | 20 | class Benchmark( 21 | appName: String, 22 | processor: String, 23 | dataFormat: String) { 24 | 25 | def time[R](phase: String, silent: (Any, Float) => Boolean = (_,_) => false) 26 | (block: => R): (R, Float) = { 27 | val t0 = System.currentTimeMillis 28 | val result = block // call-by-name 29 | val elapsedTimeSec = (System.currentTimeMillis - t0).toFloat / 1000 30 | logging(elapsedTimeSec, phase, "Elapsed time for", "s", silent(result, elapsedTimeSec)) 31 | (result, elapsedTimeSec) 32 | } 33 | 34 | def value(value: Any, name: String = "value", prefix: String="", suffix: String = "") = { 35 | logging(value, name, prefix, suffix, false) 36 | } 37 | 38 | private def logging(value: Any, name: String , prefix: String, suffix: String, silent: Boolean) = { 39 | if (!silent) { 40 | val logString = buildLogSimple(value, prefix, suffix, buildRuntimeInfo(name)) 41 | println("\n--------------") 42 | println("==> Benchmark: " + logString) 43 | println("--------------\n") 44 | } 45 | } 46 | 47 | private def buildRuntimeInfo(name: String): String = { 48 | // Get runtime information from Environment 49 | val osType = Properties.envOrElse("RAPIDS_XGB_EXAMPLE_OS_TYPE", "Unknown") 50 | val cudaVersion = Properties.envOrElse("RAPIDS_XGB_EXAMPLE_CUDA_VERSION", "Unknown") 51 | val sparkVersion = Properties.envOrElse("RAPIDS_XGB_EXAMPLE_SPARK_VERSION", "Unknown") 52 | Seq(appName, processor, name, dataFormat, "stub", cudaVersion, osType, sparkVersion) 53 | .mkString(" ") 54 | } 55 | 56 | private def buildLogSimple(value: Any, prefix: String, suffix: String, runtimeInfo: String): String = 57 | prefix + " [" + runtimeInfo + "]: " + value + suffix 58 | } 59 | 60 | object Benchmark { 61 | def apply(appName: String, processor: String, dataFormat: String) = 62 | new Benchmark(appName, processor, dataFormat) 63 | } 64 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/utility/SparkSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package ai.rapids.spark.examples.utility 17 | 18 | import org.apache.spark.sql.SparkSession 19 | 20 | object SparkSetup { 21 | def apply(args: Array[String], appName: String) = { 22 | val builder = SparkSession.builder() 23 | val masterBuilder = Option(System.getenv("SPARK_MASTER")).map{master => 24 | builder.master(master) 25 | }.getOrElse(builder) 26 | 27 | masterBuilder.appName(appName).getOrCreate() 28 | } 29 | 30 | def apply(args: Array[String]): SparkSession = SparkSetup(args, "default") 31 | 32 | } 33 | -------------------------------------------------------------------------------- /examples/apps/scala/src/main/scala/ai/rapids/spark/examples/utility/Vectorize.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package ai.rapids.spark.examples.utility 17 | 18 | import org.apache.spark.ml.feature.VectorAssembler 19 | import org.apache.spark.sql.DataFrame 20 | import org.apache.spark.sql.functions._ 21 | import org.apache.spark.sql.types.FloatType 22 | 23 | object Vectorize { 24 | def apply(df: DataFrame, labelName: String, changeLabelName: Boolean = true): DataFrame = { 25 | val features = df.schema.collect{case f if f.name != labelName => f.name} 26 | val toFloat = df.schema.map(f => col(f.name).cast(FloatType)) 27 | val labelCol = if (changeLabelName) col(labelName).alias("label") else col(labelName) 28 | new VectorAssembler() 29 | .setInputCols(features.toArray) 30 | .setOutputCol("features") 31 | .transform(df.select(toFloat:_*)) 32 | .select(col("features"), labelCol) 33 | } 34 | 35 | def apply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = { 36 | val toFloat = df.schema.map(f => col(f.name).cast(FloatType)) 37 | new VectorAssembler() 38 | .setInputCols(featureNames.toArray) 39 | .setOutputCol("features") 40 | .transform(df.select(toFloat:_*)) 41 | .select(col("features"), col(labelName)) 42 | } 43 | 44 | def criteoApply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = { 45 | val toFloat = df.schema.map(f => col(f.name).cast(FloatType)) 46 | new VectorAssembler() 47 | .setHandleInvalid("keep") 48 | .setInputCols(featureNames.toArray) 49 | .setOutputCol("features") 50 | .transform(df.select(toFloat:_*)) 51 | .select(col("features"), col(labelName)) 52 | } 53 | 54 | def apply(featureNames: Seq[String], df: DataFrame, otherNames: String*): DataFrame = { 55 | val resultCols = (otherNames :+ "features").map(col(_)) 56 | new VectorAssembler() 57 | .setInputCols(featureNames.toArray) 58 | .setOutputCol("features") 59 | .transform(df) 60 | .select(resultCols: _*) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /examples/notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | 
-------------------------------------------------------------------------------- /getting-started-guides/building-sample-apps/python.md: -------------------------------------------------------------------------------- 1 | # Build XGBoost Python Examples 2 | 3 | ##### Build Process 4 | 5 | Follow these steps to package the Python zip file: 6 | 7 | ``` 8 | git clone https://github.com/rapidsai/spark-examples.git 9 | cd spark-examples/examples/apps/python 10 | zip -r samples.zip ai 11 | ``` 12 | 13 | ##### Files Required by PySpark 14 | 15 | Two files are required by PySpark: 16 | 17 | + *samples.zip* : the package including all example code 18 | + *main.py*: the entry point for PySpark; you can copy it from the folder *spark-examples/examples/apps/python* 19 | -------------------------------------------------------------------------------- /getting-started-guides/building-sample-apps/scala.md: -------------------------------------------------------------------------------- 1 | # Build XGBoost Scala Examples 2 | 3 | Our examples rely on [cuDF](https://github.com/rapidsai/cudf) and [XGBoost](https://github.com/rapidsai/xgboost/tree/rapids-spark). 4 | 5 | ##### Build Process 6 | 7 | Follow these steps to build the Scala jars (CUDA 10.0 is used as an example here): 8 | 9 | ``` 10 | git clone https://github.com/rapidsai/spark-examples.git 11 | cd spark-examples/examples/apps/scala 12 | mvn package -Dcuda.classifier=cuda10 13 | ``` 14 | 15 | ##### Generated Jars 16 | 17 | The build process generates two jars: 18 | 19 | + *sample_xgboost_apps-0.1.5.jar* : contains only the example classes, so it must be submitted to Spark together with its dependent jars 20 | + *sample_xgboost_apps-0.1.5-jar-with-dependencies.jar*: contains both the example classes and the classes from the dependent jars 21 | 22 | ##### Build Options 23 | 24 | Classifiers: 25 | 26 | + *cuda.classifier* 27 | + For a CUDA 9.2 build, omit this classifier 28 | + For a CUDA 10.0 build, specify *cuda10* 29 | + For a CUDA 10.1 build, specify *cuda10-1* 30 | -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/EMR_Mortgage_Example_G4dn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%%configure -f\n", 10 | "{\n", 11 | " \"driverMemory\": \"8000M\",\n", 12 | " \"driverCores\": 2,\n", 13 | " \"executorMemory\": \"8000M\",\n", 14 | " \"conf\" : {\"spark.executor.instances\":2, \"spark.executor.cores\":4, \"spark.task.cpus\": 4, \"spark.yarn.maxAppAttempts\": 1, \"spark.dynamicAllocation.enabled\": false},\n", 15 | " \"jars\" : [\"https://repo1.maven.org/maven2/ai/rapids/cudf/0.9.2/cudf-0.9.2.jar\",\n", 16 | " \"https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar\",\n", 17 | " \"https://repo1.maven.org/maven2/ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar\"]\n", 18 | "}" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "sc.listJars.foreach(println)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "%%info" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [
45 | "// import notebook source\n", 46 | "import org.apache.spark.sql.SparkSession\n", 47 | "import org.apache.spark.ml.evaluation.RegressionEvaluator\n", 48 | "import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\n", 49 | "import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}\n", 50 | "import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}\n", 51 | "import ml.dmlc.xgboost4j.scala.spark.rapids.{GpuDataReader, GpuDataset}\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "val trainPath = \"s3://sagemaker-gpu-xgboost/mortgage/csv/train/\"\n", 61 | "val evalPath = \"s3://sagemaker-gpu-xgboost/mortgage/csv/test/\"\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "sc.listJars.foreach(println)\n", 71 | "\n", 72 | "val spark = SparkSession.builder.appName(\"mortgage-gpu\").getOrCreate\n", 73 | "\n", 74 | "val dataReader = new GpuDataReader(spark)\n", 75 | "\n", 76 | "val labelColName = \"delinquency_12\"\n" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "val schema = StructType(List(\n", 86 | " StructField(\"orig_channel\", DoubleType),\n", 87 | " StructField(\"first_home_buyer\", DoubleType),\n", 88 | " StructField(\"loan_purpose\", DoubleType),\n", 89 | " StructField(\"property_type\", DoubleType),\n", 90 | " StructField(\"occupancy_status\", DoubleType),\n", 91 | " StructField(\"property_state\", DoubleType),\n", 92 | " StructField(\"product_type\", DoubleType),\n", 93 | " StructField(\"relocation_mortgage_indicator\", DoubleType),\n", 94 | " StructField(\"seller_name\", DoubleType),\n", 95 | " StructField(\"mod_flag\", DoubleType),\n", 96 | " StructField(\"orig_interest_rate\", DoubleType),\n", 97 | " StructField(\"orig_upb\", IntegerType),\n", 98 | " StructField(\"orig_loan_term\", IntegerType),\n", 99 | " StructField(\"orig_ltv\", DoubleType),\n", 100 | " StructField(\"orig_cltv\", DoubleType),\n", 101 | " StructField(\"num_borrowers\", DoubleType),\n", 102 | " StructField(\"dti\", DoubleType),\n", 103 | " StructField(\"borrower_credit_score\", DoubleType),\n", 104 | " StructField(\"num_units\", IntegerType),\n", 105 | " StructField(\"zip\", IntegerType),\n", 106 | " StructField(\"mortgage_insurance_percent\", DoubleType),\n", 107 | " StructField(\"current_loan_delinquency_status\", IntegerType),\n", 108 | " StructField(\"current_actual_upb\", DoubleType),\n", 109 | " StructField(\"interest_rate\", DoubleType),\n", 110 | " StructField(\"loan_age\", DoubleType),\n", 111 | " StructField(\"msa\", DoubleType),\n", 112 | " StructField(\"non_interest_bearing_upb\", DoubleType),\n", 113 | " StructField(labelColName, IntegerType)))\n", 114 | "\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "val commParamMap = Map(\n", 124 | " \"eta\" -> 0.1,\n", 125 | " \"gamma\" -> 0.1,\n", 126 | " \"missing\" -> 0.0,\n", 127 | " \"max_depth\" -> 10,\n", 128 | " \"max_leaves\" -> 256,\n", 129 | " \"grow_policy\" -> \"depthwise\",\n", 130 | " \"min_child_weight\" -> 30,\n", 131 | " \"lambda\" -> 1,\n", 132 | " \"scale_pos_weight\" -> 2,\n", 133 | " \"subsample\" -> 1,\n", 134 | " \"nthread\" -> 4,\n", 135 | " \"num_round\" -> 100,\n", 136 | " 
\"num_workers\" -> 2,\n", 137 | " \"tree_method\" -> \"gpu_hist\")\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "var (trainSet, evalSet) = {\n", 147 | " dataReader.option(\"header\", true).schema(schema)\n", 148 | " (dataReader.csv(trainPath), dataReader.csv(evalPath))}\n", 149 | "\n", 150 | "val featureNames = schema.filter(_.name != labelColName).map(_.name)\n", 151 | "\n", 152 | "object Benchmark {\n", 153 | " def time[R](phase: String)(block: => R): (R, Float) = {\n", 154 | " val t0 = System.currentTimeMillis\n", 155 | " val result = block // call-by-name\n", 156 | " val t1 = System.currentTimeMillis\n", 157 | " println(\"==> Benchmark: Elapsed time for [\" + phase + \"]: \" + ((t1 - t0).toFloat / 1000) + \"s\")\n", 158 | " (result, (t1 - t0).toFloat / 1000)\n", 159 | " }\n", 160 | "}\n", 161 | "\n" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "val modelPath = \"/tmp/model\"\n", 171 | "val xgbClassifier = new XGBoostClassifier(commParamMap).setLabelCol(labelColName).setFeaturesCols(featureNames)\n" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "println(\"\\n------ Training ------\")\n", 181 | "val (model, _) = Benchmark.time(\"train\") {\n", 182 | " xgbClassifier.fit(trainSet)\n", 183 | "}\n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "// Save model if modelPath exists\n", 193 | "model.write.overwrite().save(modelPath)\n", 194 | "val xgbClassificationModel = model\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "println(\"\\n------ Transforming ------\")\n", 204 | "val (results, _) = Benchmark.time(\"transform\") {\n", 205 | " xgbClassificationModel.transform(evalSet)\n", 206 | "}\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "println(\"\\n------Accuracy of Evaluation------\")\n", 216 | "val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)\n", 217 | "val accuracy = evaluator.evaluate(results)\n", 218 | "println(accuracy)\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Spark", 232 | "language": "", 233 | "name": "sparkkernel" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": "text/x-scala", 237 | "mimetype": "text/x-scala", 238 | "name": "scala", 239 | "pygments_lexer": "scala" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/adv_full_datasets.md: -------------------------------------------------------------------------------- 1 | # Advanced Topic: Using Large Mortgage Dataset for AWS EMR XGBOOST4J-SPARK 2 | 3 | 4 | This guide adds some additional instructions and tips for running large Mortgage datasets on XGBoost4J-Spark on AWS EMR. 
Please use this quick start guide - [Get Started with XGBoost4J-Spark on AWS EMR](emr.md) for complete step-by-step instructions to run the GPU XGBoost Mortgage Examples. 5 | 6 | ### Use Multi-GPU P3 Instances for AWS EMR Core Nodes 7 | 8 | For large datasets, we recommend using two P3.8xlarge instances as Core nodes, which provide a total of 8 GPUs. 9 | Please also increase the EBS storage on each Core node to 300 GB if using HDFS to store the data. 10 | 11 | Please also SSH into each Core node and issue the following command to set *EXCLUSIVE_PROCESS* for all GPUs on that node. 12 | This step is explained in [Multi-GPU Configuration for XGBoost4J-Spark](/advanced-topics/multi-gpu.md). The public IP address of each Core node can be found in the EMR Cluster Details: go to the Hardware tab and click the ID of the Core node group. 13 | You can use the same private key and the username *hadoop* to log into each node. The bootstrap function in EMR currently doesn't support this script. 14 | 15 | ``` 16 | nvidia-smi -c EXCLUSIVE_PROCESS 17 | ``` 18 | 19 | ### Using Full Mortgage Datasets for the Mortgage Example 20 | 21 | #### Option 1: Using EMR Steps to Copy the Full Mortgage Datasets to HDFS 22 | 23 | You can copy the full [mortgage data](https://rapidsai.github.io/demos/datasets/mortgage-data) to HDFS on the EMR Master Node, 24 | or load the dataset from S3 when launching the AWS EMR cluster using EMR steps. 25 | 26 | In Step 1: Software and Steps, add a step with a Name, the JAR location (command-runner.jar), and the following command in the arguments field. 27 | ``` 28 | s3-dist-cp --src=s3://spark-xgboost-mortgage-dataset/csv --dest=hdfs:///tmp/mortgage 29 | ``` 30 | 31 | ![Step 1: Software and Steps](pics/emr-step-one-s3-copy.png) 32 | 33 | 34 | #### Option 2: Using AWS S3 for the Datasets Directly 35 | You can use datasets on S3 directly when submitting the Spark job. 36 | Please refer to this [AWS document](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-file-systems.html) for detailed information. 37 | In our example, you can use the following format: 38 | ``` 39 | export DATA_PREFIX=s3://spark-xgboost-mortgage-dataset/csv 40 | ``` 41 | 42 | 43 | ### Run the Mortgage Example with Full Datasets, 1000 Rounds, and Max Depth 20 44 | The same jar file built in the [quick start guide](emr.md) will be used here.
45 | Now here is how to run the GPU Mortgage Example with the full Mortgage datasets for 1000 rounds and max depth 20: 46 | 47 | ``` 48 | export SPARK_DEPLOY_MODE=cluster 49 | export JARS_PATH=hdfs:/tmp/xgboost4j_spark/ 50 | export SPARK_DRIVER_MEMORY=10G 51 | export SPARK_EXECUTOR_MEMORY=40G 52 | export SPARK_NUM_EXECUTORS=8 53 | export CORES_PER_EXECUTOR=6 54 | export TOTAL_CORES=$((${CORES_PER_EXECUTOR}*${SPARK_NUM_EXECUTORS})) 55 | export JAR_PREFIX=hdfs:/tmp/xgboost4j_spark/ 56 | export EXAMPLE_CLASS=ai.rapids.spark.examples.mortgage.GPUMain 57 | export JAR_EXAMPLE=${JARS_PATH}/sample_xgboost_apps-0.1.5-jar-with-dependencies.jar 58 | 59 | export DATA_PREFIX=hdfs:/tmp/mortgage 60 | # export DATA_PREFIX=s3://spark-xgboost-mortgage-dataset/csv  # uncomment for S3 storage 61 | export TRAIN_DATA=${DATA_PREFIX}/train/20* 62 | export EVAL_DATA=${DATA_PREFIX}/eval/20* 63 | 64 | export ROUND=1000 65 | export TREE_METHOD=gpu_hist 66 | 67 | spark-submit \ 68 | --master yarn \ 69 | --deploy-mode ${SPARK_DEPLOY_MODE} \ 70 | --driver-memory ${SPARK_DRIVER_MEMORY} \ 71 | --executor-memory ${SPARK_EXECUTOR_MEMORY} \ 72 | --conf spark.executor.cores=${CORES_PER_EXECUTOR} \ 73 | --conf spark.task.cpus=${CORES_PER_EXECUTOR} \ 74 | --conf spark.yarn.maxAppAttempts=1 \ 75 | --conf spark.sql.files.maxPartitionBytes=4294967296 \ 76 | --num-executors ${SPARK_NUM_EXECUTORS} \ 77 | --class ${EXAMPLE_CLASS} \ 78 | ${JAR_EXAMPLE} \ 79 | -trainDataPath=$TRAIN_DATA \ 80 | -evalDataPath=$EVAL_DATA \ 81 | -format=csv \ 82 | -numRound=$ROUND \ 83 | -max_depth=20 \ 84 | -num_workers=${SPARK_NUM_EXECUTORS} \ 85 | -treeMethod=${TREE_METHOD} \ 86 | -nthread=${CORES_PER_EXECUTOR} 87 | ``` 88 | 89 | In the stdout driver log, you should see timings\* (in seconds) and the accuracy metric. To find the stdout, go to the cluster details, select the Application history tab, and click the application you just ran; then open the Executors tab, and in the driver row click "view logs" and then "stdout". The stdout log file will show all the outputs.
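If you have SSH access to the Master Node, the driver log can usually also be pulled from the command line with the standard YARN CLI. This is a general YARN facility rather than something specific to this example; it assumes YARN log aggregation is enabled (the EMR default) and uses the application ID shown in the EMR console:

```
yarn logs -applicationId <application_id> | grep -A 2 "Benchmark"
```

Either way, the log should contain output like the following: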
90 | 91 | ------ Training ------ 92 | 93 | Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=172.31.25.254, DMLC_TRACKER_PORT=9091, DMLC_NUM_WORKER=8} 94 | 95 | ==> Benchmark: Elapsed time for [Mortgage GPU train csv stub Unknown Unknown Unknown]: 785.183s 96 | 97 | ------ Transforming ------ 98 | 99 | ==> Benchmark: Elapsed time for [Mortgage GPU transform csv stub Unknown Unknown Unknown]: 383.537s 100 | 101 | ------Accuracy of Evaluation------ 102 | 103 | ==> Benchmark: Accuracy for [Mortgage GPU Accuracy csv stub Unknown Unknown Unknown]: 0.9909487814701571 104 | -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-cluster-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-details.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-cluster-dns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-dns.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-cluster-ssh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-ssh.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-cluster-waiting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-waiting.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-stdout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-stdout.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-step-four-security.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-four-security.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-step-one-s3-copy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-one-s3-copy.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-step-one-software-and-steps.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-one-software-and-steps.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-step-three-general-cluster-settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-three-general-cluster-settings.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-step-two-hardware.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-two-hardware.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/emr-view-logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-view-logs.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-config-move.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-config-move.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-config-updated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-config-updated.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-curl-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-curl-output.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-info-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-info-output.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-jupyter-new.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-jupyter-new.gif -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-kernel-restart.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-kernel-restart.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-notebook-instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-notebook-instance.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-output.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-permission.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-permission.png -------------------------------------------------------------------------------- /getting-started-guides/csp/aws/pics/sagemaker-tcp-port.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-tcp-port.png -------------------------------------------------------------------------------- /getting-started-guides/csp/databricks/databricks.md: -------------------------------------------------------------------------------- 1 | Get Started with XGBoost4J-Spark on Databricks 2 | ====================================================== 3 | This is a getting started guide to XGBoost4J-Spark on Databricks. At the end of this guide, the reader will be able to run a sample Apache Spark application that runs on NVIDIA GPUs on Databricks. 4 | 5 | Prerequisites 6 | ------------- 7 | * Apache Spark 2.4+ running in Databricks Runtime 5.3 ML with GPU, 5.4 ML with GPU, or 5.5 ML with GPU. Make sure it matches the hardware and software requirements below. 8 | * Hardware Requirements 9 | * NVIDIA Pascal™ GPU architecture or better 10 | * Multi-node clusters with homogeneous GPU configuration 11 | * Software Requirements 12 | * Ubuntu 16.04/CentOS 13 | * CUDA V10.1/10.0/9.2 14 | * NVIDIA driver compatible with your CUDA 15 | * NCCL 2.4.7 16 | 17 | The number of GPUs per node dictates the number of Spark executors that can run on that node. Each executor should only be allowed to run 1 task at any given time. 18 | 19 | Start A Databricks Cluster 20 | -------------------------- 21 | Create a Databricks cluster (`Clusters` -> `+ Create Cluster`) that meets the above prerequisites. 22 | 1. Make sure to use one of the 5.3 ML with GPU, 5.4 ML with GPU, or 5.5 LTS ML with GPU Databricks runtimes. 23 | 2. Use nodes with 1 GPU each, such as p3.xlarge or Standard\_NC6s\_v3. We currently don't support nodes with multiple GPUs. p2 (AWS) and NC12/24 (Azure) nodes do not meet the architecture requirements for the XGBoost worker (although they can be used for the driver node). 24 | 3. Under Autopilot Options, disable autoscaling. 25 | 4.
Choose the number of workers that matches the number of GPUs you want to use. 26 | 5. Select a worker type that has 1 GPU per worker, such as p3.xlarge or Standard_NC6s_v3. 27 | 28 | 29 | * After you start a Databricks cluster, use the initialization notebooks -- [5.3 & 5.4 notebook](/getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.3-5.4.ipynb 30 | ) or [5.5 notebook](/getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.5.ipynb 31 | ) to set up execution.
32 | 33 | The initialization notebooks will perform the following steps:
34 | 1. Downloading the CUDA and RAPIDS XGBoost4j Spark jars
35 | 2. Creating a new directory for the initialization script in the Databricks file system (DBFS)
36 | 3. Creating an initialization script inside the new directory that copies the jars into the Databricks jar directory (a condensed sketch of this script is shown after this list)
37 | 4. Downloading and decompressing the sample Mortgage notebook dataset
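For orientation, the script that step 3 generates is essentially the one below. This is a condensed sketch mirroring the 5.3/5.4 init notebook later in this repo; the `ml.dmlc__xgboost4j__*.jar` target file names are specific to each Databricks runtime release (`0.81` for 5.3/5.4, `0.90` for 5.5, which also uses the `cudf-0.9.2-cuda10.jar` build):

```
#!/bin/bash
# Overwrite the stock XGBoost jars shipped with the ML runtime with the
# RAPIDS-enabled builds previously downloaded to DBFS.
sudo cp /dbfs/FileStore/jars/xgboost4j_2.x-1.0.0-Beta5.jar \
  /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j--ml.dmlc__xgboost4j__0.81.jar
sudo cp /dbfs/FileStore/jars/cudf-0.9.2.jar /databricks/jars/
sudo cp /dbfs/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar \
  /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j-spark--ml.dmlc__xgboost4j-spark__0.81.jar
```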
38 | 39 | After executing the steps in the initialization notebook, please complete the two sections below, "Add cluster initialization script" and "Install the xgboost4j_spark jar in the cluster", to ensure the cluster is ready for XGBoost training. 40 | 41 | Add cluster initialization script 42 | --------------------------- 43 | 1. See [Initialization scripts](https://docs.databricks.com/user-guide/clusters/init-scripts.html) for how to configure cluster initialization scripts. 44 | 2. Edit your cluster, adding an initialization script from dbfs:/databricks/init_scripts/init.sh in the "Advanced Options" under the "Init Scripts" tab 45 | 3. Reboot the cluster 46 | 47 | 48 | Install the xgboost4j_spark jar in the cluster 49 | --------------------------- 50 | 1. See [Libraries](https://docs.databricks.com/user-guide/libraries.html) for how to install jars from DBFS 51 | 2. Go to the "Libraries" tab under your cluster and install dbfs:/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar in your cluster by selecting the "DBFS" option for installing jars 52 | 53 | These steps will ensure you have a GPU cluster ready for importing XGBoost notebooks or creating your own XGBoost application for training. 54 | 55 | 56 | Import the GPU Mortgage Example Notebook 57 | --------------------------- 58 | 1. See [Managing Notebooks](https://docs.databricks.com/user-guide/notebooks/notebook-manage.html) for how to import a notebook. 59 | 2. Import the example notebook: [XGBoost4j-Spark mortgage notebook](/examples/notebooks/python/mortgage-gpu.ipynb) 60 | 3. Inside the mortgage example notebook, update the data paths from 61 | "/data/datasets/mortgage-small/train" to "dbfs:/FileStore/tables/mortgage/csv/train/mortgage_train_merged.csv" 62 | "/data/datasets/mortgage-small/eval" to "dbfs:/FileStore/tables/mortgage/csv/test/mortgage_eval_merged.csv" 63 | 64 | The example notebook comes with the following configuration; you can adjust it according to your setup. 65 | See supported configuration options here: [xgboost parameters](/examples/app-parameters/supported_xgboost_parameters_python.md). A minimal sketch of how these parameters are consumed is shown at the end of this guide. 66 | ``` 67 | params = { 68 | 'eta': 0.1, 69 | 'gamma': 0.1, 70 | 'missing': 0.0, 71 | 'treeMethod': 'gpu_hist', 72 | 'maxDepth': 10, 73 | 'maxLeaves': 256, 74 | 'growPolicy': 'depthwise', 75 | 'minChildWeight': 30.0, 76 | 'lambda_': 1.0, 77 | 'scalePosWeight': 2.0, 78 | 'subsample': 1.0, 79 | 'nthread': 1, 80 | 'numRound': 100, 81 | 'numWorkers': 1, 82 | } 83 | 84 | ``` 85 | 86 | 4. Run all the cells in the notebook. 87 | 88 | 5. View the results 89 | In cells 5 (Training), 7 (Transforming), and 8 (Accuracy of Evaluation) you will see the output. 90 | 91 | ``` 92 | -------------- 93 | ==> Benchmark: 94 | Training takes 6.48 seconds 95 | -------------- 96 | 97 | -------------- 98 | ==> Benchmark: Transformation takes 3.2 seconds 99 | 100 | -------------- 101 | 102 | ------Accuracy of Evaluation------ 103 | Accuracy is 0.9980699597729774 104 | 105 | ``` 106 | 107 | * The timings in this Getting Started guide are only illustrative. Please see our [release announcement](https://medium.com/rapids-ai/nvidia-gpus-and-apache-spark-one-step-closer-2d99e37ac8fd) for official benchmarks.
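For reference, the `params` dictionary above is passed straight to the classifier's constructor inside the notebook. The following is a minimal sketch, not verbatim notebook code; it assumes the Python wrappers shipped in the xgboost4j-spark jar (`XGBoostClassifier`, `GpuDataReader`) plus the `schema` and `params` definitions shown in the notebook:

```
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader

label = 'delinquency_12'
features = [x.name for x in schema if x.name != label]  # all non-label columns

# Read the training data with the GPU reader (path from step 3 above)
train_data = GpuDataReader(spark).schema(schema).option('header', True) \
    .csv('dbfs:/FileStore/tables/mortgage/csv/train/mortgage_train_merged.csv')

classifier = XGBoostClassifier(**params) \
    .setLabelCol(label) \
    .setFeaturesCols(features)
model = classifier.fit(train_data)
```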
108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.3-5.4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Download latest Jars" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/jars/\")" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "%sh\n", 26 | "cd ../../dbfs/FileStore/jars/\n", 27 | "wget -O cudf-0.9.2.jar https://search.maven.org/remotecontent?filepath=ai/rapids/cudf/0.9.2/cudf-0.9.2.jar\n", 28 | "wget -O xgboost4j_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar\n", 29 | "wget -O xgboost4j-spark_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar\n", 30 | "ls -ltr\n", 31 | "\n", 32 | "# Your Jars are downloaded in dbfs:/FileStore/jars directory" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Create a Directory for your init script" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 6, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "dbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n", 58 | "#!/bin/bash\n", 59 | "sudo cp /dbfs/FileStore/jars/xgboost4j_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j--ml.dmlc__xgboost4j__0.81.jar\n", 60 | "sudo cp /dbfs/FileStore/jars/cudf-0.9.2.jar /databricks/jars/\n", 61 | "sudo cp /dbfs/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j-spark--ml.dmlc__xgboost4j-spark__0.81.jar\"\"\", True)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Confirm your init script is in the new directory" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 8, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "%sh\n", 78 | "cd ../../dbfs/databricks/init_scripts\n", 79 | "pwd\n", 80 | "ls -ltr" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### Download the Mortgage Dataset into your local machine and upload Data using import Data" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 10, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/tables/\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 11, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "%sh\n", 106 | "cd /dbfs/FileStore/tables/\n", 107 | "wget -O mortgage.zip https://rapidsai-data.s3.us-east-2.amazonaws.com/spark/mortgage.zip\n", 108 | "ls\n", 109 | "unzip mortgage.zip" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 12, 115 | "metadata": {}, 116 | "outputs": [], 117 | 
"source": [ 118 | "%sh\n", 119 | "pwd\n", 120 | "cd ../../dbfs/FileStore/tables\n", 121 | "ls -ltr mortgage/csv/*" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Next steps\n", 129 | "\n", 130 | "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", 131 | "2. Reboot the cluster\n", 132 | "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", 133 | "4. Import the mortgage example notebook from `https://github.com/rapidsai/spark-examples/blob/master/examples/notebooks/python/mortgage-gpu.ipynb`\n", 134 | "5. Inside the mortgage example notebook, update the data paths\n", 135 | " `train_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/train/mortgage_train_merged.csv')`\n", 136 | " `eval_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/test/mortgage_eval_merged.csv')`" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.7.4" 157 | }, 158 | "name": "Init Scripts_demo", 159 | "notebookId": 2585487876834579 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 1 163 | } 164 | -------------------------------------------------------------------------------- /getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Download latest Jars" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/jars/\")" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "%sh\n", 26 | "cd ../../dbfs/FileStore/jars/\n", 27 | "wget -O cudf-0.9.2-cuda10.jar https://search.maven.org/remotecontent?filepath=ai/rapids/cudf/0.9.2/cudf-0.9.2-cuda10.jar\n", 28 | "wget -O xgboost4j_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar\n", 29 | "wget -O xgboost4j-spark_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar\n", 30 | "ls -ltr\n", 31 | "\n", 32 | "# Your Jars are downloaded in dbfs:/FileStore/jars directory" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Create a Directory for your init script" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 6, 54 | 
"metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "dbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n", 58 | "#!/bin/bash\n", 59 | "sudo cp /dbfs/FileStore/jars/xgboost4j_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j--ml.dmlc__xgboost4j__0.90.jar\n", 60 | "sudo cp /dbfs/FileStore/jars/cudf-0.9.2-cuda10.jar /databricks/jars/\n", 61 | "sudo cp /dbfs/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j-spark--ml.dmlc__xgboost4j-spark__0.90.jar\"\"\", True)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Confirm your init script is in the new directory" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 8, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "%sh\n", 78 | "cd ../../dbfs/databricks/init_scripts\n", 79 | "pwd\n", 80 | "ls -ltr" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### Download the Mortgage Dataset into your local machine and upload Data using import Data" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 10, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/tables/\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 11, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "%sh\n", 106 | "cd /dbfs/FileStore/tables/\n", 107 | "wget -O mortgage.zip https://rapidsai-data.s3.us-east-2.amazonaws.com/spark/mortgage.zip\n", 108 | "ls\n", 109 | "unzip mortgage.zip" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 12, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "%sh\n", 119 | "pwd\n", 120 | "cd ../../dbfs/FileStore/tables\n", 121 | "ls -ltr mortgage/csv/*" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Next steps\n", 129 | "\n", 130 | "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", 131 | "2. Reboot the cluster\n", 132 | "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", 133 | "4. Import the mortgage example notebook from `https://github.com/rapidsai/spark-examples/blob/master/examples/notebooks/python/mortgage-gpu.ipynb`\n", 134 | "5. 
Inside the mortgage example notebook, update the data paths\n", 135 | " `train_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/train/mortgage_train_merged.csv')`\n", 136 | " `eval_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/test/mortgage_eval_merged.csv')`" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.7.4" 157 | }, 158 | "name": "Init Scripts_demo", 159 | "notebookId": 2585487876834616 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 1 163 | } 164 | -------------------------------------------------------------------------------- /getting-started-guides/csp/databricks/xgb_python_gpu_perf_blog.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["## GPU based PySpark XGBoost"],"metadata":{}},{"cell_type":"markdown","source":["##### Importing XGBoost, hyperopt, scikit learn, pandas and other helper function packages"],"metadata":{}},{"cell_type":"code","source":["import xgboost as xgb\n\nfrom hyperopt import hp, fmin, tpe, STATUS_OK, SparkTrials\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\n\nimport numpy as np\nimport pandas as pd\n\nimport os\nimport shutil\nimport tempfile"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"markdown","source":["## Data Loading"],"metadata":{}},{"cell_type":"markdown","source":["##### For a large dataset, broadcasting the dataset would take significant cluster resources. 
We store the data on DBFS and load it back on the workers via DBFS' local file interface.\n\nSee Databricks best practices for HyperOpt: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html"],"metadata":{}},{"cell_type":"code","source":["def load(path):\n  \"\"\"\n  Loads saved data (a tuple of numpy arrays).\n  Reference: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html\n  \"\"\"\n  return list(np.load(path).values())\n  \ndef save_to_dbfs(data):\n  \"\"\"\n  Saves input data (a tuple of numpy arrays) to a temporary file on DBFS and returns its path.\n  Reference: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html\n  \"\"\"\n  # Save data to a local file first.\n  data_filename = \"data.npz\"\n  local_data_dir = tempfile.mkdtemp()\n  local_data_path = os.path.join(local_data_dir, data_filename)\n  np.savez(local_data_path, *data)\n  # Move it to DBFS, which is shared among cluster nodes.\n  dbfs_tmp_dir = \"/dbfs/ml/tmp/hyperopt\"\n  os.makedirs(dbfs_tmp_dir, exist_ok=True)\n  dbfs_data_dir = tempfile.mkdtemp(dir=dbfs_tmp_dir)\n  dbfs_data_path = os.path.join(dbfs_data_dir, data_filename)\n  shutil.move(local_data_path, dbfs_data_path)\n  return dbfs_data_path"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"markdown","source":["##### Preparing XGBoost Data"],"metadata":{}},{"cell_type":"code","source":["def prepare_xgb_data(data, id_col=\"Id\", label_col=\"Label\", test_size=0.2):\n  \"\"\"\n  Prepare data for xgboost training\n  \"\"\"\n  # Move the label to the last column and drop the id column\n  data[label_col+\"Temp\"] = data[label_col]\n  data = data.drop([id_col, label_col], axis=1)\n  data.rename(columns={label_col+\"Temp\": label_col}, inplace=True)\n  \n  # Split into features (all columns but the last) and label (last column)\n  X, y = data.iloc[:,:-1], data.iloc[:,-1]\n  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=21)\n  return X_train, X_test, y_train, y_test\n  \ndef get_raw_data(file_name, sample_size):\n  input_file_location = \"/dbfs/FileStore/tables/\" + file_name\n  pdf = pd.read_csv(input_file_location).dropna().sample(n=sample_size)\n  return pdf"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"markdown","source":["## XGBoost Regression with Hyperopt + Spark Trials"],"metadata":{}},{"cell_type":"code","source":["def objective(space):\n  \"\"\"\n  Train and score one hyperparameter setting from the search space\n  \"\"\"\n  clf = xgb.XGBRegressor(objective='reg:squarederror', \n            n_estimators = int(space['n_estimators']),\n            colsample_bytree = space['colsample_bytree'],\n            learning_rate = space['learning_rate'],\n            max_depth = int(space['max_depth']),\n            alpha = space['alpha'],\n            tree_method = space['tree_method']\n           )\n  \n  # Load data\n  data = load(data_large_path)\n  X_train, X_test, y_train, y_test = data[0], data[1], data[2], data[3]\n  eval_set = [(X_train, y_train), (X_test, y_test)]\n\n  # Train\n  clf.fit(X_train, y_train,\n      eval_set=eval_set, eval_metric=\"rmse\",\n      early_stopping_rounds=10, verbose=False)\n  \n  # Validate\n  pred = clf.predict(X_test)\n  mse_scr = mean_squared_error(y_test, pred)\n\n  return {'loss': mse_scr, 'status': STATUS_OK}\n\ndef run_hyperopt(df, treemethod, parallelism, max_evals):\n  \"\"\"\n  Run hyperopt and return the best params\n  \"\"\"\n  # Hyperopt search space\n  space = {'max_depth': hp.quniform('max_depth', 4, 16, 1),\n          'alpha' : hp.uniform('alpha', 1, 10),\n          'colsample_bytree' : hp.uniform('colsample_bytree', 0.1, 1),\n
          'learning_rate' : hp.uniform('learning_rate', 0.1, 1),\n          'n_estimators': hp.quniform('n_estimators', 25, 500, 25),\n          'tree_method': treemethod\n         }\n  if parallelism is None:\n    trials = SparkTrials()\n  else:\n    trials = SparkTrials(parallelism=parallelism)\n\n  # Hyperopt\n  best_param = fmin(fn=objective,\n            space=space,\n            algo=tpe.suggest,\n            max_evals=max_evals,\n            trials=trials)\n  print(best_param)\n  \n  return best_param"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"markdown","source":["## Train"],"metadata":{}},{"cell_type":"markdown","source":["##### The parallelism parameter is set to \"2\" for 2 GPUs, which effectively runs 2 trials in parallel. Each new hyperparameter setting tested is chosen based on previous results. Setting parallelism between 1 and max_evals lets you trade off scalability (getting results faster) against adaptiveness (sometimes getting better models). For GPU training, it is advised to set parallelism to the number of GPUs available."],"metadata":{}},{"cell_type":"code","source":["# Dataset\nfile_name = \"your_file_name.csv\" # dataset file name\nid_col=\"unique_id_column_name\" # unique id for each row\nlabel_col=\"label_column_name\" # label column name\n\n# Load data\ndf = get_raw_data(file_name=file_name, sample_size=10000)\ndata_large = prepare_xgb_data(df, id_col=id_col, label_col=label_col, test_size=0.2)\ndata_large_path = save_to_dbfs(data_large)\n\n# Run training\nbest_param = run_hyperopt(df, treemethod='gpu_hist', parallelism=2, max_evals=10) # Set parallelism = number of GPUs\n\n# Cleanup: data_large_path points at a file, so remove its temporary parent directory\nshutil.rmtree(os.path.dirname(data_large_path), ignore_errors=True)"],"metadata":{},"outputs":[],"execution_count":13}],"metadata":{"name":"xgb_python_gpu_perf_blog","notebookId":323},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /getting-started-guides/csp/gcp/spark-gpu/README.md: -------------------------------------------------------------------------------- 1 | # RAPIDS Spark GPU 2 | 3 | This initialization action deploys the dependencies of [RAPIDS Spark GPU](https://github.com/rapidsai/spark-examples) on a 4 | [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster. 5 | 6 | Prerequisites 7 | ------------- 8 | * Apache Spark 2.3+ 9 | * Hardware Requirements 10 |   * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later) 11 |   * Multi-node clusters with homogeneous GPU configuration 12 | * Software Requirements 13 |   * NVIDIA driver 410.48+ 14 |   * CUDA V10.1/10.0/9.2 15 |   * NCCL 2.4.7 and later 16 |   * `EXCLUSIVE_PROCESS` must be set for all GPUs in each NodeManager. (The initialization script provided in this guide sets this mode by default.) 17 |   * `spark.dynamicAllocation.enabled` must be set to `false` for Spark 18 | 
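Before running the initialization action, it can save time to confirm the driver-side prerequisites directly on a GPU node. The check below is only a suggested sanity test, not part of the initialization action itself:

```bash
# Suggested sanity check on a GPU node: driver version (expect 410.48+) and
# compute mode (will read EXCLUSIVE_PROCESS once the init script has run)
nvidia-smi --query-gpu=name,driver_version,compute_mode --format=csv
```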
19 | Our initialization action does the following: 20 | 21 | ### Step 1. Initialization steps to download required files for the Spark RAPIDS XGBoost app 22 | 23 | 1. Git clone the [spark-examples directory](https://github.com/rapidsai/spark-examples) to your local machine. 24 | 2. Upload the necessary files into your GCP bucket by executing the following commands. 25 | 26 | ```bash 27 | cd spark-examples/ 28 | export GCS_BUCKET=my-bucket 29 | export RAPIDS_SPARK_VERSION='2.x-1.0.0-Beta5' 30 | export RAPIDS_CUDF_VERSION='0.9.2-cuda10' 31 | pushd datasets/ 32 | tar -xvf mortgage-small.tar.gz 33 | gsutil cp -r mortgage-small/ gs://$GCS_BUCKET/ 34 | popd 35 | wget -O cudf-${RAPIDS_CUDF_VERSION}.jar https://repo1.maven.org/maven2/ai/rapids/cudf/${RAPIDS_CUDF_VERSION%-*}/cudf-${RAPIDS_CUDF_VERSION}.jar 36 | wget -O xgboost4j_${RAPIDS_SPARK_VERSION}.jar https://repo1.maven.org/maven2/ai/rapids/xgboost4j_${RAPIDS_SPARK_VERSION/-/\/}/xgboost4j_${RAPIDS_SPARK_VERSION}.jar 37 | wget -O xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_${RAPIDS_SPARK_VERSION/-/\/}/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar 38 | gsutil cp cudf-${RAPIDS_CUDF_VERSION}.jar xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar xgboost4j_${RAPIDS_SPARK_VERSION}.jar gs://$GCS_BUCKET/ 39 | ``` 40 | 41 | Then, in the Google Cloud Platform console, verify that your storage bucket `my-bucket` contains the following files: 42 | * gs://my-bucket/cudf-${RAPIDS_CUDF_VERSION}.jar 43 | * gs://my-bucket/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar 44 | * gs://my-bucket/xgboost4j_${RAPIDS_SPARK_VERSION}.jar 45 | * gs://my-bucket/mortgage-small/train/mortgage-small.csv 46 | * gs://my-bucket/mortgage-small/eval/mortgage-small.csv 47 | * gs://my-bucket/mortgage-small/trainWithEval/test.csv 48 | 49 | 50 | ### Step 2. Use the `gcloud` command to create a new cluster with this initialization action 51 | 52 | The following command will create a new cluster named `$CLUSTER_NAME`. 53 | 54 | ```bash 55 | export CLUSTER_NAME=my-gpu-cluster 56 | export ZONE=us-central1-b 57 | export REGION=us-central1 58 | export GCS_BUCKET=my-bucket 59 | export INIT_ACTIONS_BUCKET=my-bucket 60 | export NUM_GPUS=2 61 | export NUM_WORKERS=2 62 | export RAPIDS_SPARK_VERSION='2.x-1.0.0-Beta5' 63 | export RAPIDS_CUDF_VERSION='0.9.2-cuda10' 64 | 65 | gcloud beta dataproc clusters create $CLUSTER_NAME \ 66 |     --zone $ZONE \ 67 |     --region $REGION \ 68 |     --master-machine-type n1-standard-32 \ 69 |     --master-boot-disk-size 50 \ 70 |     --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \ 71 |     --worker-machine-type n1-standard-32 \ 72 |     --worker-boot-disk-size 50 \ 73 |     --num-worker-local-ssds 1 \ 74 |     --num-workers $NUM_WORKERS \ 75 |     --image-version 1.4-ubuntu18 \ 76 |     --bucket $GCS_BUCKET \ 77 |     --metadata JUPYTER_PORT=8123,INIT_ACTIONS_REPO="gs://$INIT_ACTIONS_BUCKET",linux-dist="ubuntu",GCS_BUCKET="gs://$GCS_BUCKET" \ 78 |     --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh \ 79 |     --optional-components=ANACONDA,JUPYTER \ 80 |     --subnet=default \ 81 |     --properties "^#^spark:spark.dynamicAllocation.enabled=false#spark:spark.shuffle.service.enabled=false#spark:spark.submit.pyFiles=/usr/lib/spark/python/lib/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar#spark:spark.jars=/usr/lib/spark/jars/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar,/usr/lib/spark/jars/xgboost4j_${RAPIDS_SPARK_VERSION}.jar,/usr/lib/spark/jars/cudf-${RAPIDS_CUDF_VERSION}.jar" \ 82 |     --enable-component-gateway 83 | ``` 84 | 
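While the cluster comes up, you can sanity-check its state and confirm the jars are staged where the Spark properties above expect them. These commands are illustrative and reuse the exports from the previous steps:

```bash
# Wait for the cluster to reach RUNNING, then confirm the staged jars
gcloud dataproc clusters describe $CLUSTER_NAME --region $REGION --format='value(status.state)'
gsutil ls gs://$GCS_BUCKET/ | grep -E 'cudf|xgboost4j'
```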
85 | ### Step 3. Execute the sample app 86 | 87 | Once the cluster has been created, the YARN resource manager can be accessed on port `8088` on the Dataproc master 88 | node. 89 | 90 | To connect to the Dataproc web interfaces, you will need to create an SSH tunnel as 91 | described in the 92 | [Dataproc web interfaces](https://cloud.google.com/dataproc/cluster-web-interfaces) 93 | documentation. 94 | 
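As a rough sketch of what that looks like, a SOCKS proxy to the master node (which Dataproc names `${CLUSTER_NAME}-m`) can be opened as below; prefer the exact invocation from the documentation linked above:

```bash
# Open a SOCKS proxy on local port 1080 through the master node
gcloud compute ssh ${CLUSTER_NAME}-m --zone=${ZONE} -- -D 1080 -N
```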
95 | See 96 | [the Mortgage example](https://github.com/rapidsai/spark-examples/tree/master/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage) 97 | that demonstrates end-to-end XGBoost4J in Spark, including data pre-processing and model 98 | training with the RAPIDS Spark GPU APIs. Additional examples 99 | [are available](https://github.com/rapidsai/spark-examples/tree/master/examples). See the 100 | [RAPIDS Spark GPU API documentation](https://github.com/rapidsai/spark-examples/tree/master/api-docs) for API details. 101 | 102 | To submit such a job, run: 103 | 104 | ```bash 105 | export MAIN_CLASS=ai.rapids.spark.examples.mortgage.GPUMain 106 | export RAPIDS_JARS=gs://$GCS_BUCKET/spark-gpu/sample_xgboost_apps-0.1.5-jar-with-dependencies.jar 107 | export DATA_PATH=gs://$GCS_BUCKET 108 | export TREE_METHOD=gpu_hist 109 | export SPARK_NUM_EXECUTORS=4 110 | export CLUSTER_NAME=my-gpu-cluster 111 | export REGION=us-central1 112 | 113 | gcloud beta dataproc jobs submit spark \ 114 |     --cluster=$CLUSTER_NAME \ 115 |     --region=$REGION \ 116 |     --class=$MAIN_CLASS \ 117 |     --jars=$RAPIDS_JARS \ 118 |     --properties=spark.executor.cores=1,spark.executor.instances=${SPARK_NUM_EXECUTORS},spark.executor.memory=8G,spark.executorEnv.LD_LIBRARY_PATH=/usr/local/lib/x86_64-linux-gnu:/usr/local/cuda-10.0/lib64:${LD_LIBRARY_PATH} \ 119 |     -- \ 120 |     -format=csv \ 121 |     -numRound=100 \ 122 |     -numWorkers=${SPARK_NUM_EXECUTORS} \ 123 |     -treeMethod=${TREE_METHOD} \ 124 |     -trainDataPath=${DATA_PATH}/mortgage-small/train/mortgage_small.csv \ 125 |     -evalDataPath=${DATA_PATH}/mortgage-small/eval/mortgage_small.csv \ 126 |     -maxDepth=8 127 | ``` 128 | 129 | 130 | RAPIDS Spark GPU is a relatively young project with APIs evolving quickly. If you 131 | encounter unexpected errors or have feature requests, please file them at the 132 | relevant [RAPIDS Spark example repo](https://github.com/rapidsai/spark-examples). 133 | 134 | ### Options 135 | 136 | #### GPU Types & Driver Configuration 137 | 138 | By default, these initialization actions install CUDA 10.0 with the NVIDIA 418 driver. If you wish 139 | to install a different driver version, `metadata` needs to be passed to the initialization action. The available options are below: 140 | 141 | ``` 142 | cuda-version='10-0' 143 | nccl-url='https://developer.nvidia.com/compute/machine-learning/nccl/secure/v2.4/prod/nccl-repo-ubuntu1804-2.4.8-ga-cuda10.0_1-1_amd64.deb' 144 | nccl-version='2.4.8' 145 | ``` 146 | 147 | ## Important notes 148 | 149 | * RAPIDS Spark GPU is supported on Pascal or newer GPU architectures (Tesla K80s will 150 |   _not_ work with RAPIDS). See the 151 |   [list](https://cloud.google.com/compute/docs/gpus/) of available GPU types 152 |   by GCP region. 153 | * You must set a GPU accelerator type for worker nodes, else 154 |   the GPU driver install will fail and the cluster will report an error state. 155 | * When running RAPIDS Spark GPU with multiple attached GPUs, we recommend an 156 |   n1-standard-32 worker machine type or better to ensure sufficient 157 |   host memory for buffering data to and from GPUs. When running with a single 158 |   attached GPU, GCP only permits machine types up to 24 vCPUs. 159 | 160 | -------------------------------------------------------------------------------- /getting-started-guides/csp/gcp/spark-gpu/internal/install-gpu-driver-debian.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | readonly DEFAULT_GPU_DRIVER_URL='http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-418.87.00.run' 6 | readonly GPU_DRIVER_URL=$(/usr/share/google/get_metadata_value attributes/gpu-driver-url || 7 |   echo -n "${DEFAULT_GPU_DRIVER_URL}") 8 | 9 | readonly DEFAULT_CUDA_URL='https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux' 10 | readonly CUDA_URL=$(/usr/share/google/get_metadata_value attributes/gpu-cuda-url || 11 |   echo -n "${DEFAULT_CUDA_URL}") 12 | 13 | readonly DEFAULT_CUDA_VERSION='10-0' 14 | readonly CUDA_VERSION=$(/usr/share/google/get_metadata_value attributes/cuda-version || 15 |   echo -n "${DEFAULT_CUDA_VERSION}") 16 | 17 | readonly DEFAULT_NCCL_URL='https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb' 18 | readonly NCCL_URL=$(/usr/share/google/get_metadata_value attributes/nccl-url || 19 |   echo -n "${DEFAULT_NCCL_URL}") 20 | 21 | readonly DEFAULT_NCCL_VERSION='2.4.8' 22 | readonly NCCL_VERSION=$(/usr/share/google/get_metadata_value attributes/nccl-version || 23 |   echo -n "${DEFAULT_NCCL_VERSION}") 24 | 25 | apt-get update 26 | DEBIAN_FRONTEND=noninteractive apt-get install -y pciutils "linux-headers-$(uname -r)" 27 | 28 | wget --progress=dot:mega -O driver.run "${GPU_DRIVER_URL}" 29 | chmod +x "./driver.run" 30 | "./driver.run" --silent 31 | 32 | wget --progress=dot:mega -O cuda.run "${CUDA_URL}" 33 | chmod +x "./cuda.run" 34 | "./cuda.run" --silent --toolkit --no-opengl-libs 35 | 36 | wget --progress=dot:mega -O nccl.deb "${NCCL_URL}" 37 | # A .deb package needs no execute bit; install the NCCL repo package directly. 38 | dpkg -i nccl.deb 39 | apt update 40 | apt install "libnccl2=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" "libnccl-dev=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" -y 41 | 42 | /usr/bin/nvidia-smi -c EXCLUSIVE_PROCESS 43 | -------------------------------------------------------------------------------- /getting-started-guides/csp/gcp/spark-gpu/internal/install-gpu-driver-ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | readonly DEFAULT_CUDA_VERSION='10-0' 6 | readonly CUDA_VERSION=$(/usr/share/google/get_metadata_value attributes/cuda-version || 7 |   echo -n "${DEFAULT_CUDA_VERSION}") 8 | 9 | readonly DEFAULT_NCCL_URL='https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb' 10 | readonly NCCL_URL=$(/usr/share/google/get_metadata_value attributes/nccl-url || 11 |   echo -n "${DEFAULT_NCCL_URL}") 12 | 13 | readonly DEFAULT_NCCL_VERSION='2.4.8' 14 | readonly NCCL_VERSION=$(/usr/share/google/get_metadata_value attributes/nccl-version || 15 |   echo -n "${DEFAULT_NCCL_VERSION}") 16 | 17 | apt-get update 18 | apt-get install -y build-essential 19 | 20 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin 21 | mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
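# Note: the pin file above assigns the NVIDIA CUDA repository a high APT
# priority (600), so its packages take precedence over any same-named
# packages from the distro archives.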
22 | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub 23 | add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" 24 | apt-get update 25 | 26 | if [[ "${CUDA_VERSION}" != '10-0' ]]; then 27 |   # Install the requested CUDA release rather than whatever 'cuda' currently points to. 28 |   apt-get -y install "cuda-${CUDA_VERSION}" 29 | else 30 |   apt-get -y install cuda-10-0 31 | fi 32 | 33 | wget --progress=dot:mega -O nccl.deb "${NCCL_URL}" 34 | dpkg -i nccl.deb 35 | apt update 36 | apt install "libnccl2=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" "libnccl-dev=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" -y 37 | 38 | /usr/bin/nvidia-smi -c EXCLUSIVE_PROCESS 39 | 40 | 41 | -------------------------------------------------------------------------------- /getting-started-guides/csp/gcp/spark-gpu/rapids.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role) 6 | readonly LINUX_DIST=$(/usr/share/google/get_metadata_value attributes/linux-dist) 7 | 8 | readonly DEFAULT_INIT_ACTIONS_REPO=gs://dataproc-initialization-actions 9 | readonly INIT_ACTIONS_REPO="$(/usr/share/google/get_metadata_value attributes/INIT_ACTIONS_REPO || 10 |   echo ${DEFAULT_INIT_ACTIONS_REPO})" 11 | 12 | readonly DEFAULT_GCS_BUCKET=gs://my-bucket 13 | readonly GCS_BUCKET="$(/usr/share/google/get_metadata_value attributes/GCS_BUCKET || 14 |   echo ${DEFAULT_GCS_BUCKET})" 15 | 16 | readonly DEFAULT_RAPIDS_SPARK_VERSION=2.x-1.0.0-Beta5 17 | readonly RAPIDS_SPARK_VERSION="$(/usr/share/google/get_metadata_value attributes/RAPIDS_SPARK_VERSION || 18 |   echo ${DEFAULT_RAPIDS_SPARK_VERSION})" 19 | 20 | readonly DEFAULT_RAPIDS_CUDF_VERSION=0.9.2-cuda10 21 | readonly RAPIDS_CUDF_VERSION="$(/usr/share/google/get_metadata_value attributes/RAPIDS_CUDF_VERSION || 22 |   echo ${DEFAULT_RAPIDS_CUDF_VERSION})" 23 | 24 | echo "Cloning RAPIDS initialization action from '${INIT_ACTIONS_REPO}' ..." 25 | RAPIDS_INIT_ACTION_DIR=$(mktemp -d -t rapids-init-action-XXXX) 26 | readonly RAPIDS_INIT_ACTION_DIR 27 | gsutil -m rsync -r "${INIT_ACTIONS_REPO}/spark-gpu" "${RAPIDS_INIT_ACTION_DIR}" 28 | 29 | if [[ "${LINUX_DIST}" == 'ubuntu' ]]; then 30 |   mv "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver-ubuntu.sh" "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver.sh" 31 | else 32 |   mv "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver-debian.sh" "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver.sh" 33 | fi 34 | find "${RAPIDS_INIT_ACTION_DIR}" -name '*.sh' -exec chmod +x {} \; 35 | 36 | if [[ "${ROLE}" != 'Master' ]]; then 37 |   # Ensure we have GPU drivers installed. 38 |   "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver.sh" 39 | else 40 |   gsutil cp ${GCS_BUCKET}/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar /usr/lib/spark/python/lib/ 41 |   gsutil cp ${GCS_BUCKET}/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar /usr/lib/spark/jars/ 42 |   gsutil cp ${GCS_BUCKET}/xgboost4j_${RAPIDS_SPARK_VERSION}.jar /usr/lib/spark/jars/ 43 |   gsutil cp ${GCS_BUCKET}/cudf-${RAPIDS_CUDF_VERSION}.jar /usr/lib/spark/jars/ 44 | fi 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /getting-started-guides/notebook/python-notebook.md: -------------------------------------------------------------------------------- 1 | Get Started with XGBoost4J-Spark with Jupyter Notebook 2 | =================================================================== 3 | This is a getting started guide to XGBoost4J-Spark using a [Jupyter notebook](https://jupyter.org/). At the end of this guide, the reader will be able to run a sample notebook that runs on NVIDIA GPUs. 4 | 5 | Before you begin, please ensure that you have setup a [Spark Standalone Cluster](/getting-started-guides/on-prem-cluster/standalone-python.md). 6 | 7 | It is assumed that the `SPARK_MASTER` and `SPARK_HOME` environment variables are defined and point to the Spark master URL (e.g. `spark://localhost:7077`) and the home directory for Apache Spark, respectively. 8 | 9 | 1. Make sure you have [Jupyter notebook installed](https://jupyter.org/install.html). If you install it with conda, please make sure your Python version is consistent. 10 | 11 | 2. Make sure you have `SPARK_JARS` and `SPARK_PY_FILES` set properly. Note that *cudf-0.9.2-cuda10.jar* is used here as an example; choose the *cudf-0.9.2* jar that matches your environment. You may need to make these paths absolute, since the working directory will change: 12 | ``` 13 | export LIBS_PATH=[full path to xgboost4j_spark/libs] 14 | export SPARK_JARS=${LIBS_PATH}/cudf-0.9.2-cuda10.jar,${LIBS_PATH}/xgboost4j_2.x-1.0.0-Beta5.jar,${LIBS_PATH}/xgboost4j-spark_2.x-1.0.0-Beta5.jar 15 | export SPARK_PY_FILES=${LIBS_PATH}/xgboost4j-spark_2.x-1.0.0-Beta5.jar 16 | ``` 17 | 18 | 3. Go to the project root directory and launch the notebook: 19 | ``` 20 | PYSPARK_DRIVER_PYTHON=jupyter \ 21 | PYSPARK_DRIVER_PYTHON_OPTS=notebook \ 22 | pyspark \ 23 | --master ${SPARK_MASTER} \ 24 | --jars ${SPARK_JARS} \ 25 | --py-files ${SPARK_PY_FILES} 26 | ```
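A quick way to confirm that the XGBoost4J-Spark jars were actually picked up is to run, in a fresh cell, the imports used by the sample notebooks. The module paths below match this repository's examples; adjust them if your jar versions differ:

```python
# Smoke test: these imports only succeed if the jars and py-files were loaded
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader

spark  # the SparkSession created by pyspark should already be defined
```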
27 | 28 | Then open [`mortgage-gpu.ipynb`](/examples/notebooks/python/mortgage-gpu.ipynb) in the notebook UI to explore. 29 | -------------------------------------------------------------------------------- /getting-started-guides/notebook/toree.md: -------------------------------------------------------------------------------- 1 | Get Started with XGBoost4J-Spark with Apache Toree Jupyter Notebook 2 | =================================================================== 3 | This is a getting started guide to XGBoost4J-Spark using an [Apache Toree](https://toree.apache.org/) Jupyter notebook. At the end of this guide, the reader will be able to run a sample notebook that runs on NVIDIA GPUs. 4 | 5 | Before you begin, please ensure that you have setup a [Spark Standalone Cluster](/getting-started-guides/on-prem-cluster/standalone-scala.md). 6 | 7 | It is assumed that the `SPARK_MASTER` and `SPARK_HOME` environment variables are defined and point to the Spark master URL (e.g. `spark://localhost:7077`) and the home directory for Apache Spark, respectively. 8 | 9 | 1. Make sure you have Jupyter notebook installed. 10 |    Then install Toree: 11 | ``` 12 | pip install toree 13 | ``` 14 | 15 | 2. Install a kernel configured for our example: 16 | ``` 17 | export SPARK_EXAMPLES=[full path to spark-examples repo] 18 | export SPARK_JARS=${SPARK_EXAMPLES}/sample_xgboost_apps-0.1.5-jar-with-dependencies.jar 19 | 20 | jupyter toree install \ 21 | --spark_home=${SPARK_HOME} \ 22 | --user \ 23 | --kernel_name="XGBoost4j-Spark" \ 24 | --spark_opts="--master ${SPARK_MASTER} --jars ${SPARK_JARS}" 25 | ``` 26 | 27 | 3. Launch the notebook: 28 | ``` 29 | jupyter notebook 30 | ``` 31 | 32 | Then open [`mortgage-gpu.ipynb`](/examples/notebooks/scala/mortgage-gpu.ipynb) to explore. 33 | 34 | Please ensure that the *XGBoost4j-Spark* kernel is running.
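If the kernel does not appear, you can list the kernels Jupyter knows about with the standard command:

```
jupyter kernelspec list
```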
35 | -------------------------------------------------------------------------------- /gpu_executor_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | spec: 4 |   containers: 5 |     - name: executor 6 |       resources: 7 |         limits: 8 |           nvidia.com/gpu: 1 9 | 10 | -------------------------------------------------------------------------------- /tools/jupyter_gpu_count_estimation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "markdown", 5 |    "metadata": {}, 6 |    "source": [ 7 |     "# Script to estimate the minimum GPU count\n", 8 |     "\n", 9 |     "This script simulates the GPU memory consumption flow to estimate the minimum number of GPUs required, given parameters such as row count, column count, and sparsity. The more precise the parameters are, the more accurate the output is.\n", 10 |     "\n", 11 |     "| parameters | |\n", 12 |     "|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", 13 |     "| SINGLE_GPU_MEMORY_SIZE | The memory size of one GPU on the device; you can get it from `nvidia-smi`. |\n", 14 |     "| NUM_OF_FEATURE_COLUMNS | The total feature columns of the input dataset. |\n", 15 |     "| NUM_OF_WEIGHT_COLUMNS | The total weight columns of the input dataset. If there is no weight column, it should be set to 0. |\n", 16 |     "| NUM_OF_GROUPS | The size of the prediction per instance. This value is set to 1 for all tasks except multi-class classification, where NUM_OF_GROUPS must be set to the number of classes. |\n", 17 |     "| SPARSITY | Sparsity of the input dataset: 1 - NON_ZERO_COUNT(A)/TOTAL_COUNT(A). |\n", 18 |     "| MAX_BIN | Maximum number of discrete bins to bucket continuous features. Default is 16. |\n", 19 |     "\n", 20 |     "---\n", 21 |     "\n", 22 |     "- ROW_STRIDE\n", 23 |     "\n", 24 |     "ROW_STRIDE is the largest number of non-empty features/items across all rows of the input dataset.\n", 25 |     "You can calculate it with:\n", 26 |     "```shell\n", 27 |     "cat xxxx | awk -F, '{$NF=\"\"; print $0}' | sort -n -r | head -1 | awk '{for(i=1;i<=NF;i++) if ($i != \"\") k++; print k}'\n", 28 |     "```" 29 |    ] 30 |   }, 31 |   { 32 |    "cell_type": "code", 33 |    "execution_count": 1, 34 |    "metadata": {}, 35 |    "outputs": [], 36 |    "source": [ 37 |     "# Required parameters: replace the example values below with the ones for your dataset and GPU\n", 38 |     "\n", 39 |     "SINGLE_GPU_MEMORY_SIZE = 16 * 1024 * 1024 * 1024 # bytes; get the real size from `nvidia-smi`\n", 40 |     "\n", 41 |     "TOTAL_ROWS = 100000000 # total rows of the input dataset\n", 42 |     "\n", 43 |     "NUM_OF_FEATURE_COLUMNS = 100\n", 44 |     "\n", 45 |     "NUM_OF_WEIGHT_COLUMNS = 0 # set to 0 if there is no weight column\n", 46 |     "\n", 47 |     "# NUM_OF_GROUPS: 1 unless multi-class classification (then: number of classes)\n", 48 |     "NUM_OF_GROUPS = 1 # it must be > 0\n", 49 |     "\n", 50 |     "SPARSITY = 0.5 #(1 - NON_ZERO_COUNT(A)/TOTAL_COUNT(A))" 51 |    ] 52 |   }, 53 |   { 54 |    "cell_type": "code", 55 |    "execution_count": 2, 56 |    "metadata": {}, 57 |    "outputs": [], 58 |    "source": [ 59 |     "# The parameters below can also affect the result; all of them are default values\n", 60 |     "\n", 61 |     "MAX_BIN = 16 # max_bin default value. 
It is 256 in native XGBoost, while it is 16 in XGBoost4J-Spark\n", 62 |     "\n", 63 |     "# ROW_STRIDE: it should be <= NUM_OF_FEATURE_COLUMNS\n", 64 |     "# assuming the last column is the label, ROW_STRIDE can be calculated with the script below:\n", 65 |     "# cat xxxx | awk -F, '{$NF=\"\"; print $0}' | sort -n -r | head -1 | awk '{for(i=1;i<=NF;i++) if ($i != \"\") k++; print k}'\n", -------------------------------------------------------------------------------- /tools/jupyter_gpu_max_loadable_row.ipynb: -------------------------------------------------------------------------------- 47 |     "NUM_OF_GROUPS = 1 # it must be > 0\n", 48 |     "\n", 49 |     "SPARSITY = 1 #(1 - NON_ZERO_COUNT(A)/TOTAL_COUNT(A))" 50 |    ] 51 |   }, 52 |   { 53 |    "cell_type": "code", 54 |    "execution_count": 2, 55 |    "metadata": {}, 56 |    "outputs": [], 57 |    "source": [ 58 |     "MAX_BIN = 16 # max_bin default value. It is 256 in native XGBoost, while it is 16 in XGBoost4J-Spark\n", 59 |     "\n", 60 |     "# ROW_STRIDE: it should be <= NUM_OF_FEATURE_COLUMNS\n", 61 |     "# assuming the last column is the label, ROW_STRIDE can be calculated with the script below:\n", 62 |     "# cat xxxx | awk -F, '{$NF=\"\"; print $0}' | sort -n -r | head -1 | awk '{for(i=1;i<=NF;i++) if ($i != \"\") k++; print k}'\n",