├── Dockerfile
├── LICENSE
├── README.md
├── advanced-topics
│   ├── multi-gpu.md
│   ├── performance_tuning.md
│   └── pics
│       ├── multi-gpu-advanced-options.png
│       ├── multi-gpu-bootstrap.png
│       ├── multi-gpu-s3.png
│       └── performance_benchmark.png
├── api-docs
│   ├── python.md
│   └── scala.md
├── datasets
│   ├── ETL
│   │   ├── MortgageETL.ipynb
│   │   └── Taxi_ETL.ipynb
│   ├── agaricus.tar.gz
│   ├── mortgage-small.tar.gz
│   ├── preparing_datasets.md
│   └── taxi-small.tar.gz
├── examples
│   ├── app-parameters
│   │   ├── supported_xgboost_parameters_python.md
│   │   └── supported_xgboost_parameters_scala.md
│   ├── apps
│   │   ├── python
│   │   │   ├── .gitignore
│   │   │   ├── ai
│   │   │   │   ├── __init__.py
│   │   │   │   └── rapids
│   │   │   │       ├── __init__.py
│   │   │   │       └── spark
│   │   │   │           ├── __init__.py
│   │   │   │           └── examples
│   │   │   │               ├── __init__.py
│   │   │   │               ├── agaricus
│   │   │   │               │   ├── __init__.py
│   │   │   │               │   ├── consts.py
│   │   │   │               │   ├── cpu_main.py
│   │   │   │               │   └── gpu_main.py
│   │   │   │               ├── main.py
│   │   │   │               ├── mortgage
│   │   │   │               │   ├── __init__.py
│   │   │   │               │   ├── consts.py
│   │   │   │               │   ├── cpu_cross_validator_main.py
│   │   │   │               │   ├── cpu_main.py
│   │   │   │               │   ├── gpu_cross_validator_main.py
│   │   │   │               │   └── gpu_main.py
│   │   │   │               ├── taxi
│   │   │   │               │   ├── __init__.py
│   │   │   │               │   ├── consts.py
│   │   │   │               │   ├── cpu_main.py
│   │   │   │               │   └── gpu_main.py
│   │   │   │               └── utility
│   │   │   │                   ├── __init__.py
│   │   │   │                   ├── args.py
│   │   │   │                   └── utils.py
│   │   │   └── main.py
│   │   └── scala
│   │       ├── .gitignore
│   │       ├── assembly
│   │       │   └── assembly-no-scala.xml
│   │       ├── pom.xml
│   │       └── src
│   │           └── main
│   │               └── scala
│   │                   └── ai
│   │                       └── rapids
│   │                           └── spark
│   │                               └── examples
│   │                                   ├── agaricus
│   │                                   │   ├── CPUMain.scala
│   │                                   │   └── GPUMain.scala
│   │                                   ├── mortgage
│   │                                   │   ├── CPUCrossValidatorMain.scala
│   │                                   │   ├── CPUMain.scala
│   │                                   │   ├── GPUCrossValidatorMain.scala
│   │                                   │   ├── GPUMain.scala
│   │                                   │   └── Mortgage.scala
│   │                                   ├── taxi
│   │                                   │   ├── CPUCrossValidatorMain.scala
│   │                                   │   ├── CPUMain.scala
│   │                                   │   ├── GPUCrossValidatorMain.scala
│   │                                   │   ├── GPUMain.scala
│   │                                   │   └── Taxi.scala
│   │                                   └── utility
│   │                                       ├── Benchmark.scala
│   │                                       ├── SparkSetup.scala
│   │                                       ├── Vectorize.scala
│   │                                       └── XGBoostArgs.scala
│   └── notebooks
│       ├── .gitignore
│       ├── python
│       │   ├── agaricus-gpu.ipynb
│       │   ├── cv-mortgage-gpu.ipynb
│       │   ├── mortgage-gpu.ipynb
│       │   └── taxi-gpu.ipynb
│       └── scala
│           ├── agaricus-gpu.ipynb
│           ├── mortgage-gpu-databricks.scala
│           ├── mortgage-gpu.ipynb
│           ├── mortgage_gpu_crossvalidation.ipynb
│           ├── taxi-gpu.ipynb
│           └── taxi_gpu_crossvalidation.ipynb
├── getting-started-guides
│   ├── building-sample-apps
│   │   ├── python.md
│   │   └── scala.md
│   ├── csp
│   │   ├── aws
│   │   │   ├── EMR_Mortgage_Example_G4dn.ipynb
│   │   │   ├── adv_full_datasets.md
│   │   │   ├── emr.md
│   │   │   ├── pics
│   │   │   │   ├── emr-cluster-details.png
│   │   │   │   ├── emr-cluster-dns.png
│   │   │   │   ├── emr-cluster-ssh.png
│   │   │   │   ├── emr-cluster-waiting.png
│   │   │   │   ├── emr-stdout.png
│   │   │   │   ├── emr-step-four-security.png
│   │   │   │   ├── emr-step-one-s3-copy.png
│   │   │   │   ├── emr-step-one-software-and-steps.png
│   │   │   │   ├── emr-step-three-general-cluster-settings.png
│   │   │   │   ├── emr-step-two-hardware.png
│   │   │   │   ├── emr-view-logs.png
│   │   │   │   ├── sagemaker-config-move.png
│   │   │   │   ├── sagemaker-config-updated.png
│   │   │   │   ├── sagemaker-curl-output.png
│   │   │   │   ├── sagemaker-info-output.png
│   │   │   │   ├── sagemaker-jupyter-new.gif
│   │   │   │   ├── sagemaker-kernel-restart.png
│   │   │   │   ├── sagemaker-notebook-instance.png
│   │   │   │   ├── sagemaker-output.png
│   │   │   │   ├── sagemaker-permission.png
│   │   │   │   └── sagemaker-tcp-port.png
│   │   │   └── sagemaker.md
│   │   ├── databricks
│   │   │   ├── databricks.md
│   │   │   ├── init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.3-5.4.ipynb
│   │   │   ├── init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.5.ipynb
│   │   │   └── xgb_python_gpu_perf_blog.ipynb
│   │   └── gcp
│   │       ├── gcp.md
│   │       └── spark-gpu
│   │           ├── README.md
│   │           ├── internal
│   │           │   ├── install-gpu-driver-debian.sh
│   │           │   └── install-gpu-driver-ubuntu.sh
│   │           └── rapids.sh
│   ├── notebook
│   │   ├── python-notebook.md
│   │   └── toree.md
│   └── on-prem-cluster
│       ├── kubernetes.md
│       ├── standalone-python.md
│       ├── standalone-scala.md
│       ├── yarn-python.md
│       └── yarn-scala.md
├── gpu_executor_template.yaml
└── tools
    ├── jupyter_gpu_count_estimation.ipynb
    └── jupyter_gpu_max_loadable_row.ipynb
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | FROM nvidia/cuda:10.0-devel-ubuntu18.04
19 | ARG spark_uid=185
20 |
21 | # Install java dependencies
22 | RUN apt-get update && apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre
23 | ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
24 | ENV PATH $PATH:/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre/bin:/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin
25 |
26 | # Before building the docker image, first build and make a Spark distribution following
27 | # the instructions in http://spark.apache.org/docs/latest/building-spark.html.
28 | # If this docker file is being used in the context of building your images from a Spark
29 | # distribution, the docker build command should be invoked from the top level directory
30 | # of the Spark distribution. E.g.:
31 | # docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile .
32 |
33 | RUN set -ex && \
34 | ln -s /lib /lib64 && \
35 | mkdir -p /opt/spark && \
36 | mkdir -p /opt/spark/examples && \
37 | mkdir -p /opt/spark/work-dir && \
38 | touch /opt/spark/RELEASE && \
39 | rm /bin/sh && \
40 | ln -sv /bin/bash /bin/sh && \
41 | echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
42 | chgrp root /etc/passwd && chmod ug+rw /etc/passwd
43 |
44 | ENV DEBIAN_FRONTEND noninteractive
45 | RUN apt-get update && apt-get install -y --no-install-recommends apt-utils \
46 | && apt-get install -y --no-install-recommends python libgomp1 \
47 | && rm -rf /var/lib/apt/lists/*
48 |
49 | COPY jars /opt/spark/jars
50 | COPY bin /opt/spark/bin
51 | COPY sbin /opt/spark/sbin
52 | COPY kubernetes/dockerfiles/spark/entrypoint.sh /opt/
53 | COPY examples /opt/spark/examples
54 | COPY kubernetes/tests /opt/spark/tests
55 | COPY data /opt/spark/data
56 |
57 | ENV SPARK_HOME /opt/spark
58 |
59 | WORKDIR /opt/spark/work-dir
60 | RUN chmod g+w /opt/spark/work-dir
61 |
62 | ENV TINI_VERSION v0.18.0
63 | ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /sbin/tini
64 | RUN chmod +rx /sbin/tini
65 |
66 | ENTRYPOINT [ "/opt/entrypoint.sh" ]
67 |
68 | # Specify the User that the actual main process will run as
69 | USER ${spark_uid}
70 |
71 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Please note that this repo has been moved to the new repo [spark-xgboost-examples](https://github.com/NVIDIA/spark-xgboost-examples).
2 |
3 | This repo provides docs and example applications that demonstrate the RAPIDS.ai GPU-accelerated XGBoost-Spark project.
4 |
5 | ### Examples
6 |
7 | - Mortgage: [Scala](/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage), [Python](/examples/apps/python/ai/rapids/spark/examples/mortgage)
8 | - Taxi: [Scala](/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi), [Python](/examples/apps/python/ai/rapids/spark/examples/taxi)
9 | - Agaricus: [Scala](/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/agaricus), [Python](/examples/apps/python/ai/rapids/spark/examples/agaricus)
10 |
11 | ### Getting Started Guides
12 |
13 | Try one of the Getting Started guides below. Please note that they target the Mortgage dataset as written, but with a few changes to `EXAMPLE_CLASS`, `trainDataPath`, and `evalDataPath`, they can be easily adapted to the Taxi or Agaricus datasets.
14 |
15 | You can find a small dataset for each example in the [datasets](/datasets) folder. These datasets are provided only for convenience. To test performance, please prepare a larger dataset by following [Preparing Datasets](/datasets/preparing_datasets.md). We also provide a larger dataset: [Mortgage Dataset (1 GB uncompressed)](https://rapidsai-data.s3.us-east-2.amazonaws.com/spark/mortgage.zip), which is used in the guides below.
16 |
17 | - Building applications
18 | - [Scala](/getting-started-guides/building-sample-apps/scala.md)
19 | - [Python](/getting-started-guides/building-sample-apps/python.md)
20 | - Getting started on on-prem clusters
21 | - [Standalone cluster for Scala](/getting-started-guides/on-prem-cluster/standalone-scala.md)
22 | - [Standalone cluster for Python](/getting-started-guides/on-prem-cluster/standalone-python.md)
23 | - [YARN for Scala](/getting-started-guides/on-prem-cluster/yarn-scala.md)
24 | - [YARN for Python](/getting-started-guides/on-prem-cluster/yarn-python.md)
25 | - [Kubernetes](/getting-started-guides/on-prem-cluster/kubernetes.md)
26 | - Getting started on cloud service providers
27 | - Amazon AWS
28 | - [EMR](/getting-started-guides/csp/aws/emr.md)
29 | - [SageMaker](/getting-started-guides/csp/aws/sagemaker.md)
30 | - [Databricks](/getting-started-guides/csp/databricks/databricks.md)
31 | - [Google Cloud Platform](/getting-started-guides/csp/gcp/gcp.md)
32 | - Getting started for Jupyter Notebook applications
33 | - [Apache Toree Notebook for Scala](/getting-started-guides/notebook/toree.md)
34 | - [Jupyter Notebook for Python](/getting-started-guides/notebook/python-notebook.md)
35 |
36 | These examples use default parameters for demo purposes. For a full list, please see the Supported XGBoost Parameters for [Scala](/examples/app-parameters/supported_xgboost_parameters_scala.md) or [Python](/examples/app-parameters/supported_xgboost_parameters_python.md).
37 |
38 | ### XGBoost-Spark API
39 |
40 | - [Scala API](/api-docs/scala.md)
41 | - [Python API](/api-docs/python.md)
42 |
43 | ### Advanced Topics
44 |
45 | - [Multi-GPU configuration](/advanced-topics/multi-gpu.md)
46 | - [Performance tuning](/advanced-topics/performance_tuning.md)
47 |
48 | ### Contact Us
49 |
50 | Please see the [RAPIDS](https://rapids.ai/community.html) website for contact information.
51 |
52 | ### License
53 |
54 | This content is licensed under the [Apache License 2.0](/LICENSE)
55 |
--------------------------------------------------------------------------------
/advanced-topics/multi-gpu.md:
--------------------------------------------------------------------------------
1 |
2 | # Multi-GPU Configuration for XGBoost4J-Spark
3 |
4 | This is an advanced guide on how to configure multiple GPUs per Spark worker to run XGBoost4J-Spark applications.
5 |
6 | ### General Rules
7 |
8 | As a general rule, the number of Spark executors on a host must be less than or equal to the number of GPUs on that host. This ensures that each XGBoost Spark task can run on one GPU exclusively.
9 |
10 | To enable this feature, *EXCLUSIVE_PROCESS* must be set for all GPUs on each host:
11 |
12 | ```
13 | nvidia-smi -i [gpu index] -c EXCLUSIVE_PROCESS
14 | ```
15 |
16 | For example:
17 |
18 | ```
19 | nvidia-smi -i 0 -c EXCLUSIVE_PROCESS
20 | ```
21 |
22 | sets *EXCLUSIVE_PROCESS* for GPU *0*.
23 |
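If you prefer to script this per host, below is a minimal Python sketch (not part of the original examples) that applies the same command to every GPU; it assumes *nvidia-smi* is on the PATH:

```python
import subprocess

# Query the index of every GPU on this host, then set EXCLUSIVE_PROCESS on each
indices = subprocess.check_output(
    ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'],
    text=True).split()
for i in indices:
    subprocess.run(['nvidia-smi', '-i', i, '-c', 'EXCLUSIVE_PROCESS'], check=True)
```
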
24 | ### Configuration for On-Prem Clusters
25 |
26 | To set *EXCLUSIVE_PROCESS* for an on-prem cluster, please run the above *nvidia-smi* command for each GPU on each host before setting up the cluster. These commands can be combined into a bootstrap script, as in the sketch above.
27 |
28 | ### Configuration for Cloud Service Providers (CSP)
29 |
30 | Each CSP has its own way to run a bootstrap script.
31 |
32 | Below is an example of how to set *EXCLUSIVE_PROCESS* for AWS EMR:
33 |
34 | - Create a cluster and select "go to advanced options".
35 | - At "Step 3: General Cluster Settings", add a bootstrap action to set up GPU exclusive mode.
36 | 
37 | - Create a shell script (emr_gpu_set.sh) and upload it to S3.
38 | ```
39 | #!/bin/bash
40 | # set exclusive mode on every GPU on this instance
41 | for i in $(nvidia-smi --query-gpu=index --format=csv,noheader); do sudo nvidia-smi -i "$i" -c EXCLUSIVE_PROCESS; done
42 | ```
43 | 
44 | - In "Additional Options", select "Custom Action" to add a bootstrap action, then select the script file (emr_gpu_set.sh) from its S3 location. The script will be executed when EMR launches each instance.
45 | 
46 |
--------------------------------------------------------------------------------
/advanced-topics/performance_tuning.md:
--------------------------------------------------------------------------------
1 | # Performance tuning for XGBoost4J-Spark
2 |
3 | This is an advanced guide on how to tune the chunk size to achieve the best performance.
4 |
5 | ## Chunk size
6 |
7 | Since the 0.2 release, data can be read in chunks and the DMatrix built incrementally, and the DMatrix can also be moved from GPU to CPU memory. This means XGBoost can load a dataset of **any size** that fits in CPU memory, provided the chunk size is tuned appropriately.
8 |
9 | Currently, the chunk size is controlled by three values: file size, max partition size, and maxRowsPerChunk.
10 |
11 | - maxRowsPerChunk
12 |
13 | maxRowsPerChunk is measured in rows and defaults to Integer.MAX_VALUE. Because it is hard to estimate the total in-memory size of a given number of rows, we suggest **not touching this value**; just keep its default.
14 |
15 | - max partition size
16 |
17 | The max partition size is controlled by `spark.sql.files.maxPartitionBytes`. Please refer to [this page](https://spark.apache.org/docs/latest/sql-performance-tuning.html) for details.
18 |
19 | With maxRowsPerChunk left at its default, the chunk size is effectively min(file size, max partition size).
20 |
21 | ### Benchmark
22 |
23 | We ran several rounds of benchmarks against different max partition sizes.
24 | 
25 | Given the files below:
26 | ```
27 | 1.6G 2010_1.csv
28 | 1.6G 2010_2.csv
29 | 1.5G 2010_3.csv
30 | 1.3G 2010_4.csv
31 | 2.6G 2011_1.csv
32 | 1.3G 2011_2.csv
33 | 1.5G 2011_3.csv
34 | 2.8G 2011_4.csv
35 | 970M 2014_1.csv
36 | 822M 2016_1.csv
37 | 963M 2016_2.csv
38 | 978M 2016_3.csv
39 | 887M 2016_4.csv
40 | 19G total
41 | ```
42 |
43 | 
44 |
45 | From the results, we can see that performance is best when the max partition size is greater than the largest file size.
46 |
47 | However, there is a peak memory limitation when building the DMatrix: a single chunk should be smaller than TOTAL_GPU_MEMORY/3.
48 | 
49 | > In summary, XGBoost achieves the best performance when each file's size equals the max partition size, and the max partition size is configured to TOTAL_GPU_MEMORY/3.
50 |
51 | ### How to use
52 |
53 | You can configure the max partition size when you submit your application.
54 |
55 | ```
56 | spark-submit --conf spark.sql.files.maxPartitionBytes=XXXXXXX
57 | ```
58 |
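The same setting can also be applied in code when the Spark session is created. Below is a minimal sketch assuming a hypothetical GPU with 12 GB of memory, so the target partition size is 12 GB / 3 = 4 GB:

```python
from pyspark.sql import SparkSession

# Hypothetical numbers: a 12 GB GPU, targeting chunks of TOTAL_GPU_MEMORY/3
gpu_memory_bytes = 12 * 1024 ** 3
max_partition_bytes = gpu_memory_bytes // 3   # ~4 GB

spark = (SparkSession.builder
    .appName('chunk-size-tuning')
    .config('spark.sql.files.maxPartitionBytes', str(max_partition_bytes))
    .getOrCreate())
```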
--------------------------------------------------------------------------------
/advanced-topics/pics/multi-gpu-advanced-options.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/multi-gpu-advanced-options.png
--------------------------------------------------------------------------------
/advanced-topics/pics/multi-gpu-bootstrap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/multi-gpu-bootstrap.png
--------------------------------------------------------------------------------
/advanced-topics/pics/multi-gpu-s3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/multi-gpu-s3.png
--------------------------------------------------------------------------------
/advanced-topics/pics/performance_benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/advanced-topics/pics/performance_benchmark.png
--------------------------------------------------------------------------------
/api-docs/python.md:
--------------------------------------------------------------------------------
1 | # Python API for XGBoost-Spark
2 |
3 | This doc focuses on the GPU-related Python API. Seven new classes are introduced:
4 |
5 | - [CrossValidator](#crossvalidator)
6 | - [GpuDataset](#gpudataset)
7 | - [GpuDataReader](#gpudatareader)
8 | - [XGBoostClassifier](#xgboostclassifier)
9 | - [XGBoostClassificationModel](#xgboostclassificationmodel)
10 | - [XGBoostRegressor](#xgboostregressor)
11 | - [XGBoostRegressionModel](#xgboostregressionmodel)
12 |
13 | ### CrossValidator
14 |
15 | The full name is `ml.dmlc.xgboost4j.scala.spark.rapids.CrossValidator`, and it is a wrapper around [Scala CrossValidator](scala.md#crossvalidator).
16 |
17 | ##### Constructors
18 |
19 | + CrossValidator()
20 |
21 | ##### Methods
22 |
23 | *Note: Only GPU related methods are listed below.*
24 |
25 | + fit(dataset): This method triggers the cross validation for hyperparameter tuning.
26 | + dataset: a [GpuDataset](#gpudataset) used for cross validation
27 | + returns the best [Model](https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.Model)[\_] for the given hyperparameters.
28 | + Note: For the CPU version, you can still call `fit` by passing a [Dataset](https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.sql.Dataset).
29 |
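Below is a minimal usage sketch, mirroring the Mortgage cross-validation example in this repo (the label column, parameter grid, and data path are placeholders):

```python
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator, GpuDataReader
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('cv-example').getOrCreate()
train_data = GpuDataReader(spark).parquet('/path/to/train')  # a GpuDataset

classifier = XGBoostClassifier(objective='binary:logistic').setLabelCol('label')
evaluator = MulticlassClassificationEvaluator().setLabelCol('label')
param_grid = (ParamGridBuilder()
    .addGrid(classifier.maxDepth, [5, 10])
    .addGrid(classifier.numRound, [100, 200])
    .build())

model = (CrossValidator()
    .setEstimator(classifier)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(param_grid)
    .setNumFolds(3)
    .fit(train_data))   # returns the best Model
```
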
30 | ### GpuDataset
31 |
32 | The full name is `ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset`. A GpuDataset is an object that is produced by [GpuDataReader](#gpudatareader)s and consumed by [XGBoostClassifier](#xgboostclassifier)s and [XGBoostRegressor](#xgboostregressor)s. No constructors or methods are exposed for this class.
33 |
34 | ### GpuDataReader
35 |
36 | The full name is `ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader`. A GpuDataReader sets options and builds a [GpuDataset](#gpudataset) from data sources. Data loading is lazy; it occurs when the data is processed later.
37 |
38 | ##### Constructors
39 |
40 | + GpuDataReader(spark_session)
41 | + spark_session: a [SparkSession](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=sparksession#pyspark.sql.SparkSession) for data loading
42 |
43 | ##### Methods
44 |
45 | + format(source): This method sets data format. Valid values include *csv*, *parquet* and *orc*.
46 | + source: a String representing the data format to set
47 | + returns the data reader itself
48 | + schema(schema): This method sets data schema.
49 | + schema: data schema either in [StructType](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=sparksession#pyspark.sql.types.StructType) format or a DDL-formatted String (e.g., *a INT, b STRING, c DOUBLE*)
50 | + returns the data reader itself
51 | + option(key, value): This method sets an option.
52 | + key: a String representing the option key
53 | + value: the option value, valid types include *Boolean*, *Integer*, *Float* and *String*
54 | + returns the data reader itself
55 | + options(options): This method sets multiple options.
56 | + options: an option Dictionary[String, String]
57 | + returns the data reader itself
58 | + load(\*paths): This method builds a [GpuDataset](#gpudataset).
59 | + paths: the data source paths, might be empty, one path, or a list of paths
60 | + returns a [GpuDataset](#gpudataset) as the result
61 | + csv(\*paths): This method builds a [GpuDataset](#gpudataset).
62 | + paths: the CSV data paths, might be one path or a list of paths
63 | + returns a [GpuDataset](#gpudataset) as the result
64 | + parquet(\*paths): This method builds a [GpuDataset](#gpudataset).
65 | + paths: the Parquet data paths, might be one path or a list of paths
66 | + returns a [GpuDataset](#gpudataset) as the result
67 | + orc(\*paths): This method builds a [GpuDataset](#gpudataset).
68 | + paths: the ORC data paths, might be one path or a list of paths
69 | + returns a [GpuDataset](#gpudataset) as the result
70 |
71 | ##### Options
72 |
73 | + Common options
74 | + asFloats: A Boolean flag indicating whether to cast all numeric values to floats. Default is True.
75 | + maxRowsPerChunk: An Integer specifying the max rows per chunk. Default is 2147483647 (2^31-1).
76 | + Options for CSV
77 | + comment: A single character used to skip lines beginning with that character. Default is empty string, which disables this behavior.
78 | + header: A Boolean flag indicating whether the first line should be used as column names. Default is False.
79 | + nullValue: The string representation of a null (None) value. Default is empty string.
80 | + quote: A single character used for escaping quoted values where the separator can be part of the value. Default is `"`.
81 | + sep: A single character as a separator between adjacent values. Default is `,`.
82 |
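Putting the pieces together, here is a minimal reading sketch (the schema and path are placeholders):

```python
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('gpu-data-reader').getOrCreate()

# Build a GpuDataset from CSV files; actual loading happens lazily
train_data = (GpuDataReader(spark)
    .format('csv')
    .schema('label FLOAT, f1 FLOAT, f2 FLOAT')  # or a StructType
    .option('header', False)
    .option('asFloats', True)
    .load('/path/to/train'))
```
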
83 | ### XGBoostClassifier
84 |
85 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier`. It is a wrapper around [Scala XGBoostClassifier](scala.md#xgboostclassifier).
86 |
87 | ##### Constructors
88 |
89 | + XGBoostClassifier(\*\*params)
90 | + all [standard xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported, but please note a few differences:
91 | + only camelCase is supported when specifying parameter names, e.g., *maxDepth*
92 | + parameter *lambda* is renamed to *lambda_*, because *lambda* is a keyword in Python
93 |
94 | ##### Methods
95 |
96 | *Note: Only GPU related methods are listed below.*
97 |
98 | + setFeaturesCols(features_cols): This method sets the feature columns for training.
99 | + features_cols: a list of feature column names in String format to set
100 | + returns the classifier itself
101 | + setEvalSets(eval_sets): This method sets eval sets for training.
102 | + eval_sets: eval sets of type Dictionary[String, [GpuDataset](#gpudataset)] for training (For CPU training, the type is Dictionary[String, [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)])
103 | + returns the classifier itself
104 | + fit(dataset): This method triggers the training.
105 | + dataset: a [GpuDataset](#gpudataset) to train
106 | + returns the training result as a [XGBoostClassificationModel](#xgboostclassificationmodel)
107 | + Note: For CPU training, you can still call fit to train a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)
108 |
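Below is a minimal GPU training sketch, following the examples in this repo (the parameter values and column names are placeholders):

```python
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassifier
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('gpu-train').getOrCreate()
train_data = GpuDataReader(spark).parquet('/path/to/train')  # a GpuDataset

params = {'treeMethod': 'gpu_hist', 'objective': 'binary:logistic', 'numRound': 100}
model = (XGBoostClassifier(**params)
    .setLabelCol('label')
    .setFeaturesCols(['f1', 'f2'])   # raw feature columns, no VectorAssembler needed
    .fit(train_data))
```
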
109 | ### XGBoostClassificationModel
110 |
111 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel`. It is a wrapper around [Scala XGBoostClassificationModel](scala.md#xgboostclassificationmodel).
112 |
113 | ##### Methods
114 |
115 | *Note: Only GPU related methods are listed below.*
116 |
117 | + transform(dataset): This method predicts results based on the model.
118 | + dataset: a [GpuDataset](#gpudataset) to predict on
119 | + returns a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) with the prediction
120 |
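A minimal scoring sketch, following the examples in this repo (the paths are placeholders):

```python
from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel
from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('gpu-score').getOrCreate()
eval_data = GpuDataReader(spark).parquet('/path/to/eval')  # a GpuDataset

# Load a previously saved model and predict; transform returns a DataFrame
model = XGBoostClassificationModel().load('/path/to/model')
predictions = model.transform(eval_data)
predictions.select('label', 'prediction').show(5)
```
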
121 | ### XGBoostRegressor
122 |
123 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor`. It is a wrapper around [Scala XGBoostRegressor](scala.md#xgboostregressor).
124 |
125 | ##### Constructors
126 |
127 | + XGBoostRegressor(\*\*params)
128 | + all [standard xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported, but please note a few differences:
129 | + only camelCase is supported when specifying parameter names, e.g., *maxDepth*
130 | + parameter *lambda* is renamed to *lambda_*, because *lambda* is a keyword in Python
131 |
132 | ##### Methods
133 |
134 | *Note: Only GPU related methods are listed below.*
135 |
136 | + setFeaturesCols(features_cols): This method sets the feature columns for training.
137 | + features_cols: a list of feature column names in String format to set
138 | + returns the regressor itself
139 | + setEvalSets(eval_sets): This method sets eval sets for training.
140 | + eval_sets: eval sets of type Dictionary[String, [GpuDataset](#gpudataset)] for training (For CPU training, the type is Dictionary[String, [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)])
141 | + returns the regressor itself
142 | + fit(dataset): This method triggers the training.
143 | + dataset: a [GpuDataset](#gpudataset) to train
144 | + returns the training result as a [XGBoostRegressionModel](#xgboostregressionmodel)
145 | + Note: For CPU training, you can still call fit to train a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame)
146 |
147 | ### XGBoostRegressionModel
148 |
149 | The full name is `ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel`. It is a wrapper around [Scala XGBoostRegressionModel](scala.md#xgboostregressionmodel).
150 |
151 | ##### Methods
152 |
153 | *Note: Only GPU related methods are listed below.*
154 |
155 | + transform(dataset): This method predicts results based on the model.
156 | + dataset: a [GpuDataset](#gpudataset) to predict on
157 | + returns a [DataFrame](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) with the prediction
158 |
--------------------------------------------------------------------------------
/datasets/ETL/Taxi_ETL.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import cudf\n",
10 | "import numpy as np\n",
11 | "from numba import cuda\n",
12 | "import math"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "PERCENT_TRAIN = 0.8"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#Numba Kernel to calculate Haversine distance\n",
31 | "@cuda.jit\n",
32 | "def haversine_kernel(lat1, lon1, lat2, lon2, outputCol):\n",
33 | " iRow = cuda.grid(1)\n",
34 | " p = 0.017453292519943295 # Pi/180\n",
35 | " if iRow < outputCol.size:\n",
36 | " a = 0.5 - math.cos((lat2[iRow] - lat1[iRow]) * p)/2 + math.cos(lat1[iRow] * p) * \\\n",
37 | " math.cos(lat2[iRow] * p) * (1 - math.cos((lon2[iRow] - lon1[iRow]) * p)) / 2 \n",
38 | " outputCol[iRow] = 12734 * math.asin(math.sqrt(a))\n",
39 | " \n",
40 | "def haversine_distance(gdf):\n",
41 | " nRows = gdf.shape[0]\n",
42 | " blockSize = 128\n",
43 | " blockCount = nRows // blockSize + 1\n",
44 | " lat1_arr = gdf['pickup_latitude'].to_gpu_array()\n",
45 | " lon1_arr = gdf['pickup_longitude'].to_gpu_array()\n",
46 | " lat2_arr = gdf['dropoff_latitude'].to_gpu_array()\n",
47 | " lon2_arr = gdf['dropoff_longitude'].to_gpu_array()\n",
48 | " \n",
49 | " outputCol = cuda.device_array ( shape=(nRows), dtype=lat1_arr.dtype.name)\n",
50 | " \n",
51 | " haversine_kernel[(blockCount),(blockSize)](lat1_arr, lon1_arr, lat2_arr, lon2_arr, outputCol)\n",
52 | " gdf.add_column(name='h_distance', data = outputCol)\n",
53 | " return gdf\n",
54 | "\n",
55 | "#Numba Kernel to calculate day of the week from Date\n",
56 | "@cuda.jit\n",
57 | "def day_of_the_week_kernel(output ,year, month, day):\n",
58 | " iRow = cuda.grid(1)\n",
59 | " if iRow < output.size:\n",
60 | " year[iRow] -= month[iRow] < 3\n",
61 | " month[iRow] = (month[iRow] + 9)%12 + 1\n",
62 | " output[iRow] = (year[iRow] + int(year[iRow]/4) - int(year[iRow]/100) + int(year[iRow]/400) + math.floor(2.6*month[iRow] - 0.2) + day[iRow] -1) % 7\n",
63 | " \n",
64 | "def day_of_week(gdf):\n",
65 | " nRows = gdf.shape[0]\n",
66 | " blockSize = 128\n",
67 | " blockCount = nRows // blockSize + 1\n",
68 | " year_arr = gdf['year'].to_gpu_array()\n",
69 | " month_arr = gdf['month'].to_gpu_array()\n",
70 | " day_arr = gdf['day'].to_gpu_array()\n",
71 | " outputCol = cuda.device_array ( shape=(nRows), dtype=day_arr.dtype.name)\n",
72 | " \n",
73 | " day_of_the_week_kernel[(blockCount),(blockSize)](outputCol, year_arr, month_arr, day_arr)\n",
74 | " gdf.add_column(name='day_of_week', data = outputCol)\n",
75 | " gdf['day_of_week'] = gdf['day_of_week'].astype('float32')\n",
76 | " return gdf\n",
77 | " \n",
78 | "import pandas as pd\n",
79 | "def gpu_read_csv(file_path):\n",
80 | " names = ['vendor_id','pickup_datetime','dropoff_datetime','passenger_count','trip_distance','pickup_longitude',\n",
81 | " 'pickup_latitude','rate_code','store_and_fwd','dropoff_longitude','dropoff_latitude','payment_type',\n",
82 | " 'fare_amount','surcharge','mta_tax','tip_amount','tolls_amount','total_amount']\n",
83 | " \n",
84 | " dtypes = ['category','date','date','int','float64','float64','float64','category','category','float64','float64',\n",
85 | " 'category','float64','float64','float64','float64','float64','float64']\n",
86 | "\n",
87 | " df = cudf.read_csv(file_path, dtype=dtypes, names=names,skiprows=1)\n",
88 | " return df\n",
89 | "\n",
90 | "def null_workaround(df, **kwargs):\n",
91 | " for column, data_type in df.dtypes.items():\n",
92 | " if str(data_type) in ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']:\n",
93 | " df[column] = df[column].fillna(-1)\n",
94 | " return df\n",
95 | "\n",
96 | "def clean_data(df):\n",
97 | " drop_list = [\n",
98 | " 'dropoff_datetime', 'payment_type', 'surcharge', 'mta_tax',\n",
99 | " 'tip_amount', 'tolls_amount', 'total_amount'\n",
100 | " ]\n",
101 | "\n",
102 | " for column in drop_list:\n",
103 | " df.drop_column(column)\n",
104 | " \n",
105 | " df = null_workaround(df)\n",
106 | " \n",
107 | " df_fare = df.query('fare_amount > 0 and fare_amount < 500')\n",
108 | " del(df)\n",
109 | " \n",
110 | " df_pass = df_fare.query('passenger_count > 0 and passenger_count < 6')\n",
111 | " del(df_fare)\n",
112 | " \n",
113 | " df_picklong = df_pass.query('pickup_longitude > -75 and pickup_longitude < -73')\n",
114 | " del(df_pass)\n",
115 | " \n",
116 | " df_droplong = df_picklong.query('dropoff_longitude > -75 and dropoff_longitude < -73')\n",
117 | " del(df_picklong)\n",
118 | " \n",
119 | " df_picklat = df_droplong.query('pickup_latitude > 40 and pickup_latitude < 42')\n",
120 | " del(df_droplong)\n",
121 | " \n",
122 | " df_droplat = df_picklat.query('dropoff_latitude > 40 and dropoff_latitude < 42')\n",
123 | " del(df_picklat)\n",
124 | " \n",
125 | " return df_droplat\n",
126 | " \n",
127 | "def add_features(df):\n",
128 | " df['hour'] = df['pickup_datetime'].dt.hour\n",
129 | " df['year'] = df['pickup_datetime'].dt.year\n",
130 | " df['month'] = df['pickup_datetime'].dt.month\n",
131 | " df['day'] = df['pickup_datetime'].dt.day\n",
132 | " \n",
133 | " df.drop_column('pickup_datetime')\n",
134 | " \n",
135 | " df = day_of_week(df)\n",
136 | " df['is_weekend'] = (df['day_of_week']/4).floor()\n",
137 | " df = haversine_distance(df)\n",
138 | " return df\n",
139 | " \n",
140 | "\n",
141 | "def process_data(train_path):\n",
142 | " df = gpu_read_csv(train_path)\n",
143 | " df = clean_data(df)\n",
144 | " df = add_features(df)\n",
145 | " return df"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "month = 1\n",
155 | "start = 2016\n",
156 | "end = 2016\n",
157 | "year = start\n",
158 | "\n",
159 | "# INPUT DIRECTORY (change this to point to where you downloaded the dataset)\n",
160 | "INPUT_DIRECTORY = \"/datasets/test/raw/taxi\"\n",
161 | "\n",
162 | "# OUTPUT DIRECTORY (change this to point to where you want the processed dataset output)\n",
163 | "OUTPUT_DIRECTORY = \"/datasets/test/taxi\"\n",
164 | "\n",
165 | "while year <= end:\n",
166 | " current_part_path = INPUT_DIRECTORY + \"/yellow_tripdata_\" + str(year) + \"-\" + f\"{month:02d}\" + \".csv\"\n",
167 | " \n",
168 | " train_part_path_pq = OUTPUT_DIRECTORY + \"/parquet/train/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".parquet\" \n",
169 | " test_part_path_pq = OUTPUT_DIRECTORY + \"/parquet/test/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".parquet\"\n",
170 | " \n",
171 | " train_part_path_csv = OUTPUT_DIRECTORY + \"/csv/train/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".csv\" \n",
172 | " test_part_path_csv = OUTPUT_DIRECTORY + \"/csv/test/yellow_tripdata_\" + str(year) + \"-\" + str(month) + \".csv\"\n",
173 | " \n",
174 | " print(current_part_path)\n",
175 | " df = process_data(current_part_path)\n",
176 | " month += 1\n",
177 | " \n",
178 | " msk = np.random.rand(len(df)) < PERCENT_TRAIN\n",
179 | " \n",
180 | " train = df[msk]\n",
181 | " test = df[~msk]\n",
182 | " \n",
183 | " print(train.shape)\n",
184 | " print(test.shape)\n",
185 | " \n",
186 | " train.to_parquet(train_part_path_pq)\n",
187 | " test.to_parquet(test_part_path_pq)\n",
188 | " \n",
189 | " train.to_pandas().to_csv(train_part_path_csv, header=False)\n",
190 | " test.to_pandas().to_csv(train_part_path_csv, header=False)\n",
191 | " \n",
192 | " del train\n",
193 | " del test\n",
194 | " del df\n",
195 | " \n",
196 | " if month > 12:\n",
197 | " month = 1\n",
198 | " year += 1\n",
199 | " "
200 | ]
201 | }
202 | ],
203 | "metadata": {
204 | "kernelspec": {
205 | "display_name": "Python 3",
206 | "language": "python",
207 | "name": "python3"
208 | },
209 | "language_info": {
210 | "codemirror_mode": {
211 | "name": "ipython",
212 | "version": 3
213 | },
214 | "file_extension": ".py",
215 | "mimetype": "text/x-python",
216 | "name": "python",
217 | "nbconvert_exporter": "python",
218 | "pygments_lexer": "ipython3",
219 | "version": "3.6.7"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 2
224 | }
225 |
--------------------------------------------------------------------------------
/datasets/agaricus.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/datasets/agaricus.tar.gz
--------------------------------------------------------------------------------
/datasets/mortgage-small.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/datasets/mortgage-small.tar.gz
--------------------------------------------------------------------------------
/datasets/preparing_datasets.md:
--------------------------------------------------------------------------------
1 | Preparing Datasets
2 | ==================
3 |
4 | ## Mortgage Example
5 | 1. Set up the [Apache Toree Jupyter notebook](/getting-started-guides/notebook/toree.md).
6 | 2. Download raw data from: https://rapidsai.github.io/demos/datasets/mortgage-data
7 | 3. Run [Mortgage ETL job](/datasets/ETL/MortgageETL.ipynb).
8 |
9 | ## Taxi Example
10 | 1. Set up the [Apache Toree Jupyter notebook](/getting-started-guides/notebook/toree.md).
11 | 2. Install `cudatoolkit` and `numba` (`conda` example provided, but you can also use `pip`):
12 | ```
13 | conda install numba
14 | conda install cudatoolkit
15 | ```
16 | 3. Download raw data:
17 | ```
18 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_20{09..16}-{01..12}.csv
19 | ```
20 | 4. Run [Taxi ETL job](/datasets/ETL/Taxi_ETL.ipynb).
21 |
--------------------------------------------------------------------------------
/datasets/taxi-small.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/datasets/taxi-small.tar.gz
--------------------------------------------------------------------------------
/examples/app-parameters/supported_xgboost_parameters_python.md:
--------------------------------------------------------------------------------
1 | Supported XGBoost Parameters
2 | ============================
3 |
4 | This is a description of all the parameters available when you are running examples in this repo:
5 |
6 | 1. All [xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported.
7 | * Please use camelCase, e.g., `--treeMethod=gpu_hist`.
8 | * `lambda` is replaced with `lambda_`, because `lambda` is a keyword in Python.
9 | 2. `--format=[csv|parquet|orc]`: The format of the data for training/transforming; 'csv', 'parquet', and 'orc' are currently supported. *Required*.
10 | 3. `--mode=[all|train|transform]`: Controls the behavior of the sample app. Default is 'all' if not specified.
11 | * all: Do both training and transforming; saves the model to 'modelPath' if specified.
12 | * train: Do training only; saves the model to 'modelPath' if specified.
13 | * transform: Do transforming only; 'modelPath' is required to locate the model data to be loaded.
14 | 4. `--trainDataPath=[path]`: Path to your training data file(s). Required when mode is NOT 'transform'.
15 | 5. `--trainEvalDataPath=[path]`: Path to your data file(s) for training with evaluation. Optional.
16 | 6. `--evalDataPath=[path]`: Path to your test (evaluation) data file(s). Required when mode is NOT 'train'.
17 | 7. `--modelPath=[path]`: Path to save the model after training, or to load the model from when transforming only. Required only when mode is 'transform'.
18 | 8. `--overwrite=[true|false]`: Whether to overwrite the current model data under 'modelPath'. Default is false. You may need to set this to true to avoid an IOException when saving the model to a path that already exists.
19 | 9. `--hasHeader=[true|false]`: Indicates whether your CSV files have a header.
20 | 10. `--asFloats=[true|false]`: Whether to cast numeric columns to floats. Default is true.
21 | 11. `--maxRowsPerChunk=[value]`: Maximum number of rows to read per chunk. Default is 2147483647.
22 |
--------------------------------------------------------------------------------
/examples/app-parameters/supported_xgboost_parameters_scala.md:
--------------------------------------------------------------------------------
1 | Supported XGBoost Parameters
2 | ============================
3 |
4 | This is a description of all the parameters available when you are running examples in this repo:
5 |
6 | 1. All [xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) are supported.
7 | 2. `-format=[csv|parquet|orc]`: The format of the data for training/transforming; 'csv', 'parquet', and 'orc' are currently supported. *Required*.
8 | 3. `-mode=[all|train|transform]`: Controls the behavior of the sample app. Default is 'all' if not specified.
9 | * all: Do both training and transforming; saves the model to 'modelPath' if specified.
10 | * train: Do training only; saves the model to 'modelPath' if specified.
11 | * transform: Do transforming only; 'modelPath' is required to locate the model data to be loaded.
12 | 4. `-trainDataPath=[path]`: Path to your training data file(s). Required when mode is NOT 'transform'.
13 | 5. `-trainEvalDataPath=[path]`: Path to your data file(s) for training with evaluation. Optional.
14 | 6. `-evalDataPath=[path]`: Path to your test (evaluation) data file(s). Required when mode is NOT 'train'.
15 | 7. `-modelPath=[path]`: Path to save the model after training, or to load the model from when transforming only. Required only when mode is 'transform'.
16 | 8. `-overwrite=[true|false]`: Whether to overwrite the current model data under 'modelPath'. Default is false. You may need to set this to true to avoid an IOException when saving the model to a path that already exists.
17 | 9. `-hasHeader=[true|false]`: Indicates whether your CSV files have a header.
18 | 10. `-asFloats=[true|false]`: Whether to cast numeric columns to floats. Default is true.
19 | 11. `-maxRowsPerChunk=[value]`: Maximum number of rows to read per chunk. Default is Integer.MAX_VALUE.
20 |
--------------------------------------------------------------------------------
/examples/apps/python/.gitignore:
--------------------------------------------------------------------------------
1 | samples.zip
2 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/agaricus/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/agaricus/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/agaricus/consts.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from pyspark.sql.types import *
18 |
19 | label = 'label'
20 | features = [ 'feature_' + str(i) for i in range(0, 126) ]
21 | schema = StructType([ StructField(x, FloatType()) for x in [label] + features ])
22 |
23 | default_params = {
24 | 'eta': 0.1,
25 | 'missing': 0.0,
26 | 'maxDepth': 2,
27 | 'numWorkers': 1,
28 | }
29 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/agaricus/cpu_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.agaricus.consts import *
17 | from ai.rapids.spark.examples.utility.args import parse_arguments
18 | from ai.rapids.spark.examples.utility.utils import *
19 | from ml.dmlc.xgboost4j.scala.spark import *
20 | from pyspark.sql import SparkSession
21 |
22 | def main(args, xgboost_args):
23 | spark = (SparkSession
24 | .builder
25 | .appName(args.mainClass)
26 | .getOrCreate())
27 |
28 | def prepare_data(path):
29 | reader = spark.read.format(args.format)
30 | if args.format == 'csv':
31 | reader.schema(schema).option('header', args.hasHeader)
32 | return vectorize(reader.load(path), label)
33 |
34 | if args.mode in [ 'all', 'train' ]:
35 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args))
36 | .setLabelCol(label)
37 | .setFeaturesCol('features'))
38 |
39 | if args.trainEvalDataPath:
40 | train_eval_data = prepare_data(args.trainEvalDataPath)
41 | classifier.setEvalSets({ 'test': train_eval_data })
42 |
43 | train_data = prepare_data(args.trainDataPath)
44 | model = with_benchmark('Training', lambda: classifier.fit(train_data))
45 |
46 | if args.modelPath:
47 | writer = model.write().overwrite() if args.overwrite else model
48 | writer.save(args.modelPath)
49 | else:
50 | model = XGBoostClassificationModel().load(args.modelPath)
51 |
52 | if args.mode in [ 'all', 'transform' ]:
53 | eval_data = prepare_data(args.evalDataPath)
54 |
55 | def transform():
56 | result = model.transform(eval_data).cache()
57 | result.foreachPartition(lambda _: None)
58 | return result
59 |
60 | result = with_benchmark('Transformation', transform)
61 | show_sample(args, result, label)
62 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label))
63 |
64 | spark.stop()
65 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/agaricus/gpu_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.agaricus.consts import *
17 | from ai.rapids.spark.examples.utility.args import parse_arguments
18 | from ai.rapids.spark.examples.utility.utils import *
19 | from ml.dmlc.xgboost4j.scala.spark import *
20 | from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
21 | from pyspark.sql import SparkSession
22 |
23 | def main(args, xgboost_args):
24 | spark = (SparkSession
25 | .builder
26 | .appName(args.mainClass)
27 | .getOrCreate())
28 |
29 | def prepare_data(path):
30 | reader = (GpuDataReader(spark)
31 | .format(args.format)
32 | .option('asFloats', args.asFloats)
33 | .option('maxRowsPerChunk', args.maxRowsPerChunk))
34 | if args.format == 'csv':
35 | reader.schema(schema).option('header', args.hasHeader)
36 | return reader.load(path)
37 |
38 | if args.mode in [ 'all', 'train' ]:
39 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args))
40 | .setLabelCol(label)
41 | .setFeaturesCols(features))
42 |
43 | if args.trainEvalDataPath:
44 | train_eval_data = prepare_data(args.trainEvalDataPath)
45 | classifier.setEvalSets({ 'test': train_eval_data })
46 |
47 | train_data = prepare_data(args.trainDataPath)
48 | model = with_benchmark('Training', lambda: classifier.fit(train_data))
49 |
50 | if args.modelPath:
51 | writer = model.write().overwrite() if args.overwrite else model
52 | writer.save(args.modelPath)
53 | else:
54 | model = XGBoostClassificationModel().load(args.modelPath)
55 |
56 | if args.mode in [ 'all', 'transform' ]:
57 | eval_data = prepare_data(args.evalDataPath)
58 |
59 | def transform():
60 | result = model.transform(eval_data).cache()
61 | result.foreachPartition(lambda _: None)
62 | return result
63 |
64 | result = with_benchmark('Transformation', transform)
65 | show_sample(args, result, label)
66 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label))
67 |
68 | spark.stop()
69 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.utility.args import parse_arguments
17 | from importlib import import_module
18 |
19 | def main():
20 | args, xgboost_args = parse_arguments()
21 | getattr(import_module(args.mainClass), 'main')(args, xgboost_args)
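22 | 
23 | # Optional guard so this module can also be run directly, e.g. via `spark-submit`
24 | if __name__ == '__main__':
25 |     main()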
22 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/mortgage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/mortgage/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/mortgage/consts.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from pyspark.sql.types import *
18 |
19 | label = 'delinquency_12'
20 |
21 | schema = StructType([
22 | StructField('orig_channel', FloatType()),
23 | StructField('first_home_buyer', FloatType()),
24 | StructField('loan_purpose', FloatType()),
25 | StructField('property_type', FloatType()),
26 | StructField('occupancy_status', FloatType()),
27 | StructField('property_state', FloatType()),
28 | StructField('product_type', FloatType()),
29 | StructField('relocation_mortgage_indicator', FloatType()),
30 | StructField('seller_name', FloatType()),
31 | StructField('mod_flag', FloatType()),
32 | StructField('orig_interest_rate', FloatType()),
33 | StructField('orig_upb', IntegerType()),
34 | StructField('orig_loan_term', IntegerType()),
35 | StructField('orig_ltv', FloatType()),
36 | StructField('orig_cltv', FloatType()),
37 | StructField('num_borrowers', FloatType()),
38 | StructField('dti', FloatType()),
39 | StructField('borrower_credit_score', FloatType()),
40 | StructField('num_units', IntegerType()),
41 | StructField('zip', IntegerType()),
42 | StructField('mortgage_insurance_percent', FloatType()),
43 | StructField('current_loan_delinquency_status', IntegerType()),
44 | StructField('current_actual_upb', FloatType()),
45 | StructField('interest_rate', FloatType()),
46 | StructField('loan_age', FloatType()),
47 | StructField('msa', FloatType()),
48 | StructField('non_interest_bearing_upb', FloatType()),
49 | StructField(label, IntegerType()),
50 | ])
51 |
52 | features = [ x.name for x in schema if x.name != label ]
53 |
54 | default_params = {
55 | 'eta': 0.1,
56 | 'gamma': 0.1,
57 | 'missing': 0.0,
58 | 'maxDepth': 10,
59 | 'maxLeaves': 256,
60 | 'growPolicy': 'depthwise',
61 | 'objective': 'binary:logistic',
62 | 'minChildWeight': 30.0,
63 | 'lambda_': 1.0,
64 | 'scalePosWeight': 2.0,
65 | 'subsample': 1.0,
66 | 'nthread': 1,
67 | 'numRound': 100,
68 | 'numWorkers': 1,
69 | }
70 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/mortgage/cpu_cross_validator_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.mortgage.consts import *
17 | from ai.rapids.spark.examples.utility.utils import *
18 | from ml.dmlc.xgboost4j.scala.spark import *
19 | from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator
20 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
21 | from pyspark.ml.tuning import ParamGridBuilder
22 | from pyspark.sql import SparkSession
23 |
24 | def main(args, xgboost_args):
25 | spark = (SparkSession
26 | .builder
27 | .appName(args.mainClass)
28 | .getOrCreate())
29 |
30 | def prepare_data(path):
31 | reader = spark.read.format(args.format)
32 | if args.format == 'csv':
33 | reader.schema(schema).option('header', args.hasHeader)
34 | return vectorize(reader.load(path), label)
35 |
36 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args))
37 | .setLabelCol(label)
38 | .setFeaturesCol('features'))
39 | evaluator = (MulticlassClassificationEvaluator()
40 | .setLabelCol(label))
41 | param_grid = (ParamGridBuilder()
42 | .addGrid(classifier.maxDepth, [5, 10])
43 | .addGrid(classifier.numRound, [100, 200])
44 | .build())
45 | cross_validator = (CrossValidator()
46 | .setEstimator(classifier)
47 | .setEvaluator(evaluator)
48 | .setEstimatorParamMaps(param_grid)
49 | .setNumFolds(3))
50 |
51 | train_data = prepare_data(args.trainDataPath)
52 | model = cross_validator.fit(train_data)
53 |
54 | spark.stop()
55 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/mortgage/cpu_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.mortgage.consts import *
17 | from ai.rapids.spark.examples.utility.args import parse_arguments
18 | from ai.rapids.spark.examples.utility.utils import *
19 | from ml.dmlc.xgboost4j.scala.spark import *
20 | from pyspark.sql import SparkSession
21 |
22 | def main(args, xgboost_args):
23 | spark = (SparkSession
24 | .builder
25 | .appName(args.mainClass)
26 | .getOrCreate())
27 |
28 | def prepare_data(path):
29 | reader = spark.read.format(args.format)
30 | if args.format == 'csv':
31 | reader.schema(schema).option('header', args.hasHeader)
32 | return vectorize(reader.load(path), label)
33 |
34 | if args.mode in [ 'all', 'train' ]:
35 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args))
36 | .setLabelCol(label)
37 | .setFeaturesCol('features'))
38 |
39 | if args.trainEvalDataPath:
40 | train_eval_data = prepare_data(args.trainEvalDataPath)
41 | classifier.setEvalSets({ 'test': train_eval_data })
42 |
43 | train_data = prepare_data(args.trainDataPath)
44 | model = with_benchmark('Training', lambda: classifier.fit(train_data))
45 |
46 | if args.modelPath:
47 | writer = model.write().overwrite() if args.overwrite else model
48 | writer.save(args.modelPath)
49 | else:
50 | model = XGBoostClassificationModel().load(args.modelPath)
51 |
52 | if args.mode in [ 'all', 'transform' ]:
53 | eval_data = prepare_data(args.evalDataPath)
54 |
55 | def transform():
56 | result = model.transform(eval_data).cache()
57 | result.foreachPartition(lambda _: None)
58 | return result
59 |
60 | result = with_benchmark('Transformation', transform)
61 | show_sample(args, result, label)
62 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label))
63 |
64 | spark.stop()
65 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/mortgage/gpu_cross_validator_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.mortgage.consts import *
17 | from ai.rapids.spark.examples.utility.utils import *
18 | from ml.dmlc.xgboost4j.scala.spark import *
19 | from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator, GpuDataReader
20 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
21 | from pyspark.ml.tuning import ParamGridBuilder
22 | from pyspark.sql import SparkSession
23 |
24 | def main(args, xgboost_args):
25 | spark = (SparkSession
26 | .builder
27 | .appName(args.mainClass)
28 | .getOrCreate())
29 |
30 | def prepare_data(path):
31 | reader = (GpuDataReader(spark)
32 | .format(args.format)
33 | .option('asFloats', args.asFloats)
34 | .option('maxRowsPerChunk', args.maxRowsPerChunk))
35 | if args.format == 'csv':
36 | reader.schema(schema).option('header', args.hasHeader)
37 | return reader.load(path)
38 |
39 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args))
40 | .setLabelCol(label)
41 | .setFeaturesCols(features))
42 | evaluator = (MulticlassClassificationEvaluator()
43 | .setLabelCol(label))
44 | param_grid = (ParamGridBuilder()
45 | .addGrid(classifier.maxDepth, [5, 10])
46 | .addGrid(classifier.numRound, [100, 200])
47 | .build())
48 | cross_validator = (CrossValidator()
49 | .setEstimator(classifier)
50 | .setEvaluator(evaluator)
51 | .setEstimatorParamMaps(param_grid)
52 | .setNumFolds(3))
53 |
54 | train_data = prepare_data(args.trainDataPath)
55 | model = cross_validator.fit(train_data)
56 |
57 | spark.stop()
58 |
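59 | # Note: the grid above covers 2 x 2 = 4 parameter combinations, so the 3-fold
60 | # search performs at least 4 * 3 = 12 GPU training runs before selecting a model.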
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/mortgage/gpu_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.mortgage.consts import *
17 | from ai.rapids.spark.examples.utility.args import parse_arguments
18 | from ai.rapids.spark.examples.utility.utils import *
19 | from ml.dmlc.xgboost4j.scala.spark import *
20 | from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
21 | from pyspark.sql import SparkSession
22 |
23 | def main(args, xgboost_args):
24 | spark = (SparkSession
25 | .builder
26 | .appName(args.mainClass)
27 | .getOrCreate())
28 |
29 | def prepare_data(path):
30 | reader = (GpuDataReader(spark)
31 | .format(args.format)
32 | .option('asFloats', args.asFloats)
33 | .option('maxRowsPerChunk', args.maxRowsPerChunk))
34 | if args.format == 'csv':
35 | reader.schema(schema).option('header', args.hasHeader)
36 | return reader.load(path)
37 |
38 | if args.mode in [ 'all', 'train' ]:
39 | classifier = (XGBoostClassifier(**merge_dicts(default_params, xgboost_args))
40 | .setLabelCol(label)
41 | .setFeaturesCols(features))
42 |
43 | if args.trainEvalDataPath:
44 | train_eval_data = prepare_data(args.trainEvalDataPath)
45 | classifier.setEvalSets({ 'test': train_eval_data })
46 |
47 | train_data = prepare_data(args.trainDataPath)
48 | model = with_benchmark('Training', lambda: classifier.fit(train_data))
49 |
50 | if args.modelPath:
51 | writer = model.write().overwrite() if args.overwrite else model
52 | writer.save(args.modelPath)
53 | else:
54 | model = XGBoostClassificationModel().load(args.modelPath)
55 |
56 | if args.mode in [ 'all', 'transform' ]:
57 | eval_data = prepare_data(args.evalDataPath)
58 |
59 | def transform():
60 | result = model.transform(eval_data).cache()
61 | result.foreachPartition(lambda _: None)
62 | return result
63 |
64 | result = with_benchmark('Transformation', transform)
65 | show_sample(args, result, label)
66 | with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label))
67 |
68 | spark.stop()
69 |
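70 | # Note: unlike cpu_main.py, this GPU version never calls vectorize():
71 | # GpuDataReader keeps the feature columns separate and
72 | # setFeaturesCols(features) consumes them directly; apart from the reader
73 | # itself, that is the whole CPU/GPU difference.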
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/taxi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/taxi/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/taxi/consts.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from pyspark.sql.types import *
18 |
19 | label = 'fare_amount'
20 |
21 | schema = StructType([
22 | StructField('vendor_id', FloatType()),
23 | StructField('passenger_count', FloatType()),
24 | StructField('trip_distance', FloatType()),
25 | StructField('pickup_longitude', FloatType()),
26 | StructField('pickup_latitude', FloatType()),
27 | StructField('rate_code', FloatType()),
28 | StructField('store_and_fwd', FloatType()),
29 | StructField('dropoff_longitude', FloatType()),
30 | StructField('dropoff_latitude', FloatType()),
31 | StructField(label, FloatType()),
32 | StructField('hour', FloatType()),
33 | StructField('year', IntegerType()),
34 | StructField('month', IntegerType()),
35 | StructField('day', FloatType()),
36 | StructField('day_of_week', FloatType()),
37 | StructField('is_weekend', FloatType()),
38 | ])
39 |
40 | features = [ x.name for x in schema if x.name != label ]
41 |
42 | default_params = {
43 | 'eta': 0.05,
44 | 'maxDepth': 8,
45 | 'subsample': 0.8,
46 | 'gamma': 1.0,
47 | 'numRound': 100,
48 | 'numWorkers': 1,
49 | }
50 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/taxi/cpu_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.taxi.consts import *
17 | from ai.rapids.spark.examples.utility.args import parse_arguments
18 | from ai.rapids.spark.examples.utility.utils import *
19 | from ml.dmlc.xgboost4j.scala.spark import *
20 | from pyspark.sql import SparkSession
21 |
22 | def main(args, xgboost_args):
23 | spark = (SparkSession
24 | .builder
25 | .appName(args.mainClass)
26 | .getOrCreate())
27 |
28 | def prepare_data(path):
29 | reader = spark.read.format(args.format)
30 | if args.format == 'csv':
31 | reader.schema(schema).option('header', args.hasHeader)
32 | return vectorize(reader.load(path), label)
33 |
34 | if args.mode in [ 'all', 'train' ]:
35 | regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args))
36 | .setLabelCol(label)
37 | .setFeaturesCol('features'))
38 |
39 | if args.trainEvalDataPath:
40 | train_eval_data = prepare_data(args.trainEvalDataPath)
41 | regressor.setEvalSets({ 'test': train_eval_data })
42 |
43 | train_data = prepare_data(args.trainDataPath)
44 | model = with_benchmark('Training', lambda: regressor.fit(train_data))
45 |
46 | if args.modelPath:
47 | writer = model.write().overwrite() if args.overwrite else model
48 | writer.save(args.modelPath)
49 | else:
50 | model = XGBoostRegressionModel().load(args.modelPath)
51 |
52 | if args.mode in [ 'all', 'transform' ]:
53 | eval_data = prepare_data(args.evalDataPath)
54 |
55 | def transform():
56 | result = model.transform(eval_data).cache()
57 | result.foreachPartition(lambda _: None)
58 | return result
59 |
60 | result = with_benchmark('Transformation', transform)
61 | show_sample(args, result, label)
62 | with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label))
63 |
64 | spark.stop()
65 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/taxi/gpu_main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.taxi.consts import *
17 | from ai.rapids.spark.examples.utility.args import parse_arguments
18 | from ai.rapids.spark.examples.utility.utils import *
19 | from ml.dmlc.xgboost4j.scala.spark import *
20 | from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
21 | from pyspark.sql import SparkSession
22 |
23 | def main(args, xgboost_args):
24 | spark = (SparkSession
25 | .builder
26 | .appName(args.mainClass)
27 | .getOrCreate())
28 |
29 | def prepare_data(path):
30 | reader = (GpuDataReader(spark)
31 | .format(args.format)
32 | .option('asFloats', args.asFloats)
33 | .option('maxRowsPerChunk', args.maxRowsPerChunk))
34 | if args.format == 'csv':
35 | reader.schema(schema).option('header', args.hasHeader)
36 | return reader.load(path)
37 |
38 | if args.mode in [ 'all', 'train' ]:
39 | regressor = (XGBoostRegressor(**merge_dicts(default_params, xgboost_args))
40 | .setLabelCol(label)
41 | .setFeaturesCols(features))
42 |
43 | if args.trainEvalDataPath:
44 | train_eval_data = prepare_data(args.trainEvalDataPath)
45 | regressor.setEvalSets({ 'test': train_eval_data })
46 |
47 | train_data = prepare_data(args.trainDataPath)
48 | model = with_benchmark('Training', lambda: regressor.fit(train_data))
49 |
50 | if args.modelPath:
51 | writer = model.write().overwrite() if args.overwrite else model
52 | writer.save(args.modelPath)
53 | else:
54 | model = XGBoostRegressionModel().load(args.modelPath)
55 |
56 | if args.mode in [ 'all', 'transform' ]:
57 | eval_data = prepare_data(args.evalDataPath)
58 |
59 | def transform():
60 | result = model.transform(eval_data).cache()
61 | result.foreachPartition(lambda _: None)
62 | return result
63 |
64 | result = with_benchmark('Transformation', transform)
65 | show_sample(args, result, label)
66 | with_benchmark('Evaluation', lambda: check_regression_accuracy(result, label))
67 |
68 | spark.stop()
69 |
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/utility/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/examples/apps/python/ai/rapids/spark/examples/utility/__init__.py
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/utility/args.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from argparse import ArgumentParser
17 | from distutils.util import strtobool
18 | from sys import exit
19 |
20 | def _to_bool(literal):
21 | return bool(strtobool(literal))
22 |
23 | MAX_CHUNK_SIZE = 2 ** 31 - 1
24 |
25 | _examples = [
26 | 'ai.rapids.spark.examples.agaricus.cpu_main',
27 | 'ai.rapids.spark.examples.agaricus.gpu_main',
28 | 'ai.rapids.spark.examples.mortgage.cpu_main',
29 | 'ai.rapids.spark.examples.mortgage.gpu_main',
30 | 'ai.rapids.spark.examples.mortgage.cpu_cross_validator_main',
31 | 'ai.rapids.spark.examples.mortgage.gpu_cross_validator_main',
32 | 'ai.rapids.spark.examples.taxi.cpu_main',
33 | 'ai.rapids.spark.examples.taxi.gpu_main',
34 | ]
35 |
36 | _xgboost_simple_args = [
37 | ('cacheTrainingSet', _to_bool),
38 | ('maximizeEvaluationMetrics', _to_bool),
39 | ('useExternalMemory', _to_bool),
40 | ('checkpointInterval', int),
41 | ('maxBins', int),
42 | ('maxDepth', int),
43 | ('maxLeaves', int),
44 | ('nthread', int),
45 | ('numClass', int),
46 | ('numEarlyStoppingRounds', int),
47 | ('numRound', int),
48 | ('numWorkers', int),
49 | ('seed', int),
50 | ('silent', int),
51 | ('timeoutRequestWorkers', int),
52 | ('treeLimit', int),
53 | ('verbosity', int),
54 | ('alpha', float),
55 | ('baseScore', float),
56 | ('colsampleBylevel', float),
57 | ('colsampleBytree', float),
58 | ('eta', float),
59 | ('gamma', float),
60 | ('lambda_', float),
61 | ('lambdaBias', float),
62 | ('maxDeltaStep', float),
63 | ('minChildWeight', float),
64 | ('missing', float),
65 | ('rateDrop', float),
66 | ('scalePosWeight', float),
67 | ('sketchEps', float),
68 | ('skipDrop', float),
69 | ('subsample', float),
70 | ('trainTestRatio', float),
71 | ('baseMarginCol', str),
72 | ('checkpointPath', str),
73 | ('contribPredictionCol', str),
74 | ('evalMetric', str),
75 | ('featuresCol', str),
76 | ('groupCol', str),
77 | ('growPolicy', str),
78 | ('interactionConstraints', str),
79 | ('labelCol', str),
80 | ('leafPredictionCol', str),
81 | ('monotoneConstraints', str),
82 | ('normalizeType', str),
83 | ('objective', str),
84 | ('objectiveType', str),
85 | ('predictionCol', str),
86 | ('probabilityCol', str),
87 | ('rawPredictionCol', str),
88 | ('sampleType', str),
89 | ('treeMethod', str),
90 | ('weightCol', str),
91 | ]
92 |
93 | _xgboost_array_args = [
94 | ('thresholds', float),
95 | ]
96 |
97 | def _validate_args(args):
98 | usage = ''
99 | if args.mode in [ 'all', 'train' ] and not args.trainDataPath:
100 | usage += ' --trainDataPath is required for training\n'
101 | if args.mode in [ 'all', 'transform' ] and not args.evalDataPath:
102 | usage += ' --evalDataPath is required for transformation\n'
103 | if args.mode == 'transform' and not args.modelPath:
104 | usage += ' --modelPath is required for transformation\n'
105 | if not (1 <= args.maxRowsPerChunk <= MAX_CHUNK_SIZE):
106 | usage += ' --maxRowsPerChunk should be in range [1, {}]\n'.format(MAX_CHUNK_SIZE)
107 | if usage:
108 | print('-' * 80)
109 | print('Usage:\n' + usage)
110 | exit(1)
111 |
112 | def parse_arguments():
113 | parser = ArgumentParser()
114 |
115 | # application arguments
116 | parser.add_argument('--mainClass', required=True, choices=_examples)
117 | parser.add_argument('--mode', choices=['all', 'train', 'transform'], default='all')
118 | parser.add_argument('--format', required=True, choices=['csv', 'parquet', 'orc'])
119 | parser.add_argument('--hasHeader', type=_to_bool, default=True)
120 | parser.add_argument('--asFloats', type=_to_bool, default=True)
121 | parser.add_argument('--maxRowsPerChunk', type=int, default=MAX_CHUNK_SIZE)
122 | parser.add_argument('--modelPath')
123 | parser.add_argument('--overwrite', type=_to_bool, default=False)
124 | parser.add_argument('--trainDataPath')
125 | parser.add_argument('--trainEvalDataPath')
126 | parser.add_argument('--evalDataPath')
127 | parser.add_argument('--numRows', type=int, default=5)
128 | parser.add_argument('--showFeatures', type=_to_bool, default=True)  # bool('false') is truthy, so use _to_bool
129 |
130 | # xgboost simple args
131 | for arg, arg_type in _xgboost_simple_args:
132 | parser.add_argument('--' + arg, type=arg_type)
133 |
134 | # xgboost array args
135 | for arg, arg_type in _xgboost_array_args:
136 | parser.add_argument('--' + arg, type=arg_type, action='append')
137 |
138 | parsed_all = parser.parse_args()
139 | _validate_args(parsed_all)
140 |
141 | xgboost_args = [ arg for (arg, _) in _xgboost_simple_args + _xgboost_array_args ]
142 | parsed_xgboost = {
143 | k: v
144 | for k, v in vars(parsed_all).items()
145 | if k in xgboost_args and v is not None
146 | }
147 |
148 | return parsed_all, parsed_xgboost
149 |
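150 | # Example (paths are illustrative): a command line such as
151 | #   --mainClass ai.rapids.spark.examples.taxi.gpu_main --mode train \
152 | #   --format parquet --trainDataPath /data/taxi/train --eta 0.1 --maxDepth 8
153 | # returns (parsed_all, {'eta': 0.1, 'maxDepth': 8}); only the XGBoost
154 | # arguments that were actually supplied appear in the second dict.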
--------------------------------------------------------------------------------
/examples/apps/python/ai/rapids/spark/examples/utility/utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from pyspark.ml.evaluation import *
17 | from pyspark.ml.feature import VectorAssembler
18 | from pyspark.sql.functions import col
19 | from pyspark.sql.types import FloatType
20 | from time import time
21 |
22 | def merge_dicts(dict_x, dict_y):
23 | result = dict_x.copy()
24 | result.update(dict_y)
25 | return result
26 |
27 | def show_sample(args, data_frame, label):
28 | data_frame = data_frame if args.showFeatures else data_frame.select(label, 'prediction')
29 | data_frame.show(args.numRows)
30 |
31 | def vectorize(data_frame, label):
32 | features = [ x.name for x in data_frame.schema if x.name != label ]
33 | to_floats = [ col(x.name).cast(FloatType()) for x in data_frame.schema ]
34 | return (VectorAssembler()
35 | .setInputCols(features)
36 | .setOutputCol('features')
37 | .transform(data_frame.select(to_floats))
38 | .select(col('features'), col(label)))
39 |
40 | def with_benchmark(phrase, action):
41 | start = time()
42 | result = action()
43 | end = time()
44 | print('-' * 100)
45 | print('{} takes {} seconds'.format(phrase, round(end - start, 2)))
46 | return result
47 |
48 | def check_classification_accuracy(data_frame, label):
49 | accuracy = (MulticlassClassificationEvaluator()
50 | .setLabelCol(label)
51 | .evaluate(data_frame))
52 | print('-' * 100)
53 | print('Accuracy is ' + str(accuracy))
54 |
55 | def check_regression_accuracy(data_frame, label):
56 | accuracy = (RegressionEvaluator()
57 | .setLabelCol(label)
58 | .evaluate(data_frame))
59 | print('-' * 100)
60 | print('RMSE is ' + str(accuracy))
61 |
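62 | # Usage sketch: with_benchmark wraps any zero-argument callable and prints its
63 | # wall-clock time, e.g.
64 | #   model = with_benchmark('Training', lambda: classifier.fit(train_data))
65 | # which is exactly how the example mains use it.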
--------------------------------------------------------------------------------
/examples/apps/python/main.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | from ai.rapids.spark.examples.main import main
17 |
18 | main()
19 |
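20 | # A typical launch (archive name and paths are illustrative):
21 | #   spark-submit --py-files samples.zip main.py \
22 | #     --mainClass=ai.rapids.spark.examples.mortgage.gpu_main \
23 | #     --format=csv --trainDataPath=/data/mortgage/train \
24 | #     --evalDataPath=/data/mortgage/eval
25 | # main() dispatches on --mainClass to the selected example module.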
--------------------------------------------------------------------------------
/examples/apps/scala/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | target
3 | *.iml
4 |
--------------------------------------------------------------------------------
/examples/apps/scala/assembly/assembly-no-scala.xml:
--------------------------------------------------------------------------------
1 | <assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
2 |           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |           xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
4 |   <id>jar-with-dependencies</id>
5 |   <formats>
6 |     <format>jar</format>
7 |   </formats>
8 |   <includeBaseDirectory>false</includeBaseDirectory>
9 |   <dependencySets>
10 |     <dependencySet>
11 |       <excludes>
12 |         <exclude>org.scala-lang*:scala-*</exclude>
13 |       </excludes>
14 |       <outputDirectory>/</outputDirectory>
15 |       <useProjectArtifact>true</useProjectArtifact>
16 |       <unpack>true</unpack>
17 |       <scope>runtime</scope>
18 |     </dependencySet>
19 |   </dependencySets>
20 | </assembly>
21 |
--------------------------------------------------------------------------------
/examples/apps/scala/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>ai.rapids</groupId>
7 |   <artifactId>sample_xgboost_apps</artifactId>
8 |   <packaging>jar</packaging>
9 |   <description>Sample XGBoost4J-Spark applications</description>
10 |   <version>0.1.5</version>
11 |   <name>sample_xgboost_apps</name>
12 |
13 |   <properties>
14 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 |     <xgboost.version>1.0.0-Beta5</xgboost.version>
16 |     <cudf.version>0.9.2</cudf.version>
17 |     <spark.version>2.4.0</spark.version>
18 |     <scala.version>2.11.6</scala.version>
19 |     <scala.binary.version>2.11</scala.binary.version>
20 |   </properties>
21 |
22 |   <dependencies>
23 |     <dependency>
24 |       <groupId>ai.rapids</groupId>
25 |       <artifactId>xgboost4j_2.x</artifactId>
26 |       <version>${xgboost.version}</version>
27 |     </dependency>
28 |     <dependency>
29 |       <groupId>ai.rapids</groupId>
30 |       <artifactId>cudf</artifactId>
31 |       <version>${cudf.version}</version>
32 |       <classifier>${cuda.classifier}</classifier>
33 |     </dependency>
34 |     <dependency>
35 |       <groupId>ai.rapids</groupId>
36 |       <artifactId>xgboost4j-spark_2.x</artifactId>
37 |       <version>${xgboost.version}</version>
38 |     </dependency>
39 |     <dependency>
40 |       <groupId>org.scala-lang</groupId>
41 |       <artifactId>scala-library</artifactId>
42 |       <version>${scala.version}</version>
43 |       <scope>provided</scope>
44 |     </dependency>
45 |     <dependency>
46 |       <groupId>org.apache.spark</groupId>
47 |       <artifactId>spark-sql_${scala.binary.version}</artifactId>
48 |       <version>${spark.version}</version>
49 |       <scope>provided</scope>
50 |     </dependency>
51 |     <dependency>
52 |       <groupId>org.apache.spark</groupId>
53 |       <artifactId>spark-mllib_${scala.binary.version}</artifactId>
54 |       <version>${spark.version}</version>
55 |       <scope>provided</scope>
56 |     </dependency>
57 |     <dependency>
58 |       <groupId>org.scalatest</groupId>
59 |       <artifactId>scalatest_${scala.binary.version}</artifactId>
60 |       <version>3.0.5</version>
61 |       <scope>test</scope>
62 |     </dependency>
63 |   </dependencies>
64 |
65 |   <build>
66 |     <plugins>
67 |       <plugin>
68 |         <groupId>org.scala-tools</groupId>
69 |         <artifactId>maven-scala-plugin</artifactId>
70 |         <version>2.15.2</version>
71 |         <executions>
72 |           <execution>
73 |             <goals>
74 |               <goal>compile</goal>
75 |               <goal>testCompile</goal>
76 |             </goals>
77 |           </execution>
78 |         </executions>
79 |       </plugin>
80 |       <plugin>
81 |         <groupId>org.scalatest</groupId>
82 |         <artifactId>scalatest-maven-plugin</artifactId>
83 |         <version>1.0</version>
84 |         <executions>
85 |           <execution>
86 |             <id>test</id>
87 |             <goals>
88 |               <goal>test</goal>
89 |             </goals>
90 |           </execution>
91 |         </executions>
92 |       </plugin>
93 |       <plugin>
94 |         <groupId>org.apache.maven.plugins</groupId>
95 |         <artifactId>maven-assembly-plugin</artifactId>
96 |         <version>2.6</version>
97 |         <configuration>
98 |           <descriptors>
99 |             <descriptor>assembly/assembly-no-scala.xml</descriptor>
100 |           </descriptors>
101 |         </configuration>
102 |         <executions>
103 |           <execution>
104 |             <id>assembly</id>
105 |             <phase>package</phase>
106 |             <goals>
107 |               <goal>single</goal>
108 |             </goals>
109 |           </execution>
110 |         </executions>
111 |       </plugin>
112 |     </plugins>
113 |   </build>
114 |
115 |   <profiles>
116 |     <profile>
117 |       <id>sonatype-repo</id>
118 |       <repositories>
119 |         <repository>
120 |           <id>sonatype-staging-repo</id>
121 |           <name>Sonatype staging repo</name>
122 |           <url>https://oss.sonatype.org/content/repositories/staging</url>
123 |         </repository>
124 |       </repositories>
125 |     </profile>
126 |   </profiles>
127 | </project>
128 |
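129 | <!-- Note: cuda.classifier is referenced above but not defined in the
130 |      properties; the build docs pass it on the command line, e.g.
131 |      mvn package -Dcuda.classifier=cuda10 (the value is illustrative). -->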
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/agaricus/CPUMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.agaricus
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, SparkSetup, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader
20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
21 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
22 | import org.apache.spark.sql.types.{FloatType, StructField, StructType}
23 |
24 | // There are only 3 differences between the CPU and GPU versions; look for the '=== diff ===' markers.
25 | object CPUMain {
26 | def main(args: Array[String]): Unit = {
27 |
28 | val labelName = "label"
29 | def featureNames(length: Int): List[String] =
30 | 0.until(length).map(i => s"feature_$i").toList.+:(labelName)
31 |
32 | def schema(length: Int): StructType =
33 | StructType(featureNames(length).map(n => StructField(n, FloatType)))
34 |
35 | val dataSchema = schema(126)
36 | val xgboostArgs = XGBoostArgs.parse(args)
37 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
38 | val appInfo = Seq("Agaricus", processor, xgboostArgs.format)
39 |
40 | // build spark session
41 | val spark = SparkSetup(args, appInfo.mkString("-"))
42 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
43 | // === diff ===
44 | // build data reader
45 | val dataReader = spark.read
46 |
47 | // load datasets, the order is (train, train-eval, eval)
48 | var datasets = xgboostArgs.dataPaths.map(_.map{
49 | path =>
50 | xgboostArgs.format match {
51 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(dataSchema).csv(path)
52 | case "parquet" => dataReader.parquet(path)
53 | case "orc" => dataReader.orc(path)
54 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
55 | }
56 | })
57 |
58 | val featureCols = dataSchema.filter(_.name != labelName).map(_.name)
59 |
60 | // === diff ===
61 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureCols, labelName)))
62 |
63 | val xgbClassificationModel = if (xgboostArgs.isToTrain) {
64 | // build XGBoost classifier
65 | val paramMap = xgboostArgs.xgboostParams(Map(
66 | "eta" -> 0.1,
67 | "missing" -> 0.0,
68 | "max_depth" -> 2,
69 | "eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)
70 | ))
71 | val xgbClassifier = new XGBoostClassifier(paramMap)
72 | .setLabelCol(labelName)
73 | // === diff ===
74 | .setFeaturesCol("features")
75 |
76 | println("\n------ Training ------")
77 | val (model, _) = benchmark.time("train") {
78 | xgbClassifier.fit(datasets(0).get)
79 | }
80 | // Save model if modelPath exists
81 | xgboostArgs.modelPath.foreach(path =>
82 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
83 | model
84 | } else {
85 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get)
86 | }
87 |
88 | if (xgboostArgs.isToTransform) {
89 | // start transform
90 | println("\n------ Transforming ------")
91 | var (results, _) = benchmark.time("transform") {
92 | val ret = xgbClassificationModel.transform(datasets(2).get).cache()
93 | ret.foreachPartition(_ => ())
94 | ret
95 | }
96 | results = if (xgboostArgs.isShowFeatures) {
97 | results
98 | } else {
99 | results.select(labelName, "rawPrediction", "probability", "prediction")
100 | }
101 | results.show(xgboostArgs.numRows)
102 |
103 | println("\n------Accuracy of Evaluation------")
104 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelName)
105 | evaluator.evaluate(results) match {
106 | case accuracy if !accuracy.isNaN =>
107 | benchmark.value(accuracy, "Accuracy", "Accuracy for")
108 | // TODO: handle NaN explicitly -- a NaN accuracy currently fails this match with a MatchError
109 | }
110 | }
111 |
112 | spark.close()
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/agaricus/GPUMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.agaricus
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, SparkSetup, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader
20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
21 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
22 | import org.apache.spark.sql.types.{FloatType, StructField, StructType}
23 |
24 | // There are only 3 differences between the CPU and GPU versions; look for the '=== diff ===' markers.
25 | object GPUMain {
26 | def main(args: Array[String]): Unit = {
27 |
28 | val labelName = "label"
29 | def featureNames(length: Int): List[String] =
30 | 0.until(length).map(i => s"feature_$i").toList.+:(labelName)
31 |
32 | def schema(length: Int): StructType =
33 | StructType(featureNames(length).map(n => StructField(n, FloatType)))
34 |
35 | val dataSchema = schema(126)
36 | val xgboostArgs = XGBoostArgs.parse(args)
37 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
38 | val appInfo = Seq("Agaricus", processor, xgboostArgs.format)
39 |
40 | // build spark session
41 | val spark = SparkSetup(args, appInfo.mkString("-"))
42 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
43 | // === diff ===
44 | // build data reader
45 | val dataReader = new GpuDataReader(spark)
46 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
47 |
48 | // load datasets, the order is (train, train-eval, eval)
49 | var datasets = xgboostArgs.dataPaths.map(_.map{
50 | path =>
51 | xgboostArgs.format match {
52 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(dataSchema).csv(path)
53 | case "parquet" => dataReader.parquet(path)
54 | case "orc" => dataReader.orc(path)
55 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
56 | }
57 | })
58 |
59 | val featureCols = dataSchema.filter(_.name != labelName).map(_.name)
60 |
61 | // === diff ===
62 | // No need to vectorize the data, since the GPU pipeline supports multiple feature columns via the 'setFeaturesCols' API
63 |
64 | val xgbClassificationModel = if (xgboostArgs.isToTrain) {
65 | // build XGBoost classifier
66 | val paramMap = xgboostArgs.xgboostParams(Map(
67 | "eta" -> 0.1,
68 | "missing" -> 0.0,
69 | "max_depth" -> 2,
70 | "eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)
71 | ))
72 | val xgbClassifier = new XGBoostClassifier(paramMap)
73 | .setLabelCol(labelName)
74 | // === diff ===
75 | .setFeaturesCols(featureCols)
76 |
77 | println("\n------ Training ------")
78 | val (model, _) = benchmark.time("train") {
79 | xgbClassifier.fit(datasets(0).get)
80 | }
81 | // Save model if modelPath exists
82 | xgboostArgs.modelPath.foreach(path =>
83 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
84 | model
85 | } else {
86 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get)
87 | }
88 |
89 | if (xgboostArgs.isToTransform) {
90 | // start transform
91 | println("\n------ Transforming ------")
92 | var (results, _) = benchmark.time("transform") {
93 | val ret = xgbClassificationModel.transform(datasets(2).get).cache()
94 | ret.foreachPartition(_ => ())
95 | ret
96 | }
97 | results = if (xgboostArgs.isShowFeatures) {
98 | results
99 | } else {
100 | results.select(labelName, "rawPrediction", "probability", "prediction")
101 | }
102 | results.show(xgboostArgs.numRows)
103 |
104 | println("\n------Accuracy of Evaluation------")
105 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelName)
106 | evaluator.evaluate(results) match {
107 | case accuracy if !accuracy.isNaN =>
108 | benchmark.value(accuracy, "Accuracy", "Accuracy for")
109 | // TODO: handle NaN explicitly -- a NaN accuracy currently fails this match with a MatchError
110 | }
111 | }
112 |
113 | spark.close()
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/CPUCrossValidatorMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.mortgage
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
20 | import ml.dmlc.xgboost4j.scala.spark.rapids.CrossValidator
21 | import org.apache.spark.sql.SparkSession
22 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
23 | import org.apache.spark.ml.tuning.ParamGridBuilder
24 |
25 | // There are only 3 differences between the CPU and GPU versions; look for the '=== diff ===' markers.
26 | object CPUCrossValidatorMain extends Mortgage {
27 |
28 | def main(args: Array[String]): Unit = {
29 | val xgboostArgs = XGBoostArgs.parse(args)
30 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
31 | val appInfo = Seq(appName, processor, xgboostArgs.format)
32 |
33 | // build spark session
34 | val spark = SparkSession.builder()
35 | .appName(appInfo.mkString("-"))
36 | .getOrCreate()
37 |
38 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
39 |
40 | // === diff ===
41 | // build data reader
42 | val dataReader = spark.read
43 |
44 | // load datasets, the order is (train, train-eval, eval)
45 | var datasets = xgboostArgs.dataPaths.map(_.map{
46 | path =>
47 | xgboostArgs.format match {
48 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
49 | case "parquet" => dataReader.parquet(path)
50 | case "orc" => dataReader.orc(path)
51 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
52 | }
53 | })
54 |
55 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
56 |
57 | // === diff ===
58 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName)))
59 |
60 | val xgbClassificationModel = if (xgboostArgs.isToTrain) {
61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap)
62 | val classifier = new XGBoostClassifier(xgbParamFinal)
63 | .setLabelCol(labelColName)
64 | // === diff ===
65 | .setFeaturesCol("features")
66 |
67 | // Tune model using cross validation
68 | val paramGrid = new ParamGridBuilder()
69 | .addGrid(classifier.maxDepth, Array(3, 8))
70 | .addGrid(classifier.eta, Array(0.2, 0.6))
71 | .build()
72 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
73 |
74 | val cv = new CrossValidator()
75 | .setEstimator(classifier)
76 | .setEvaluator(evaluator)
77 | .setEstimatorParamMaps(paramGrid)
78 | .setNumFolds(xgboostArgs.numFold)
79 |
80 | val (model, _) = benchmark.time("CrossValidation") {
81 | cv.fit(datasets.head.get).bestModel.asInstanceOf[XGBoostClassificationModel]
82 | }
83 | // Save model if modelPath exists
84 | xgboostArgs.modelPath.foreach(path =>
85 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
86 | model
87 | } else {
88 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get)
89 | }
90 |
91 | if (xgboostArgs.isToTransform) {
92 | println("\n------ Transforming ------")
93 | var (results, _) = benchmark.time("transform") {
94 | val ret = xgbClassificationModel.transform(datasets(2).get).cache()
95 | // Trigger the transformation
96 | ret.foreachPartition(_ => ())
97 | ret
98 | }
99 | results = if (xgboostArgs.isShowFeatures) {
100 | results
101 | } else {
102 | results.select(labelColName, "rawPrediction", "probability", "prediction")
103 | }
104 | results.show(xgboostArgs.numRows)
105 |
106 | println("\n------Accuracy of Evaluation------")
107 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
108 | evaluator.evaluate(results) match {
109 | case accuracy if !accuracy.isNaN =>
110 | benchmark.value(accuracy, "Accuracy", "Accuracy for")
111 | // TODO: handle NaN explicitly -- a NaN accuracy currently fails this match with a MatchError
112 | }
113 | }
114 | spark.close()
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/CPUMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.mortgage
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader
21 | import org.apache.spark.sql.SparkSession
22 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
23 |
24 | // There are only 3 differences between the CPU and GPU versions; look for the '=== diff ===' markers.
25 | object CPUMain extends Mortgage {
26 |
27 | def main(args: Array[String]): Unit = {
28 | val xgboostArgs = XGBoostArgs.parse(args)
29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
30 | val appInfo = Seq(appName, processor, xgboostArgs.format)
31 |
32 | // build spark session
33 | val spark = SparkSession.builder()
34 | .appName(appInfo.mkString("-"))
35 | .getOrCreate()
36 |
37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
38 | // === diff ===
39 | // build data reader
40 | val dataReader = spark.read
41 |
42 | // load datasets, the order is (train, train-eval, eval)
43 | var datasets = xgboostArgs.dataPaths.map(_.map{
44 | path =>
45 | xgboostArgs.format match {
46 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
47 | case "parquet" => dataReader.parquet(path)
48 | case "orc" => dataReader.orc(path)
49 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
50 | }
51 | })
52 |
53 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
54 |
55 | // === diff ===
56 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName)))
57 |
58 | val xgbClassificationModel = if (xgboostArgs.isToTrain) {
59 | // build XGBoost classifier
60 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap +
61 | // Add train-eval dataset if specified
62 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty))
63 | )
64 | val xgbClassifier = new XGBoostClassifier(xgbParamFinal)
65 | .setLabelCol(labelColName)
66 | // === diff ===
67 | .setFeaturesCol("features")
68 |
69 | // Start training
70 | println("\n------ Training ------")
71 | // TODO: consider not logging the time when it is abnormal, which usually indicates a training failure
72 | val (model, _) = benchmark.time("train") {
73 | xgbClassifier.fit(datasets(0).get)
74 | }
75 | // Save model if modelPath exists
76 | xgboostArgs.modelPath.foreach(path =>
77 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
78 | model
79 | } else {
80 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get)
81 | }
82 |
83 | if (xgboostArgs.isToTransform) {
84 | println("\n------ Transforming ------")
85 | var (results, _) = benchmark.time("transform") {
86 | val ret = xgbClassificationModel.transform(datasets(2).get).cache()
87 | // Trigger the transformation
88 | ret.foreachPartition(_ => ())
89 | ret
90 | }
91 | results = if (xgboostArgs.isShowFeatures) {
92 | results
93 | } else {
94 | results.select(labelColName, "rawPrediction", "probability", "prediction")
95 | }
96 | results.show(xgboostArgs.numRows)
97 |
98 | println("\n------Accuracy of Evaluation------")
99 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
100 | evaluator.evaluate(results) match {
101 | case accuracy if !accuracy.isNaN =>
102 | benchmark.value(accuracy, "Accuracy", "Accuracy for")
103 | // TODO: handle NaN explicitly -- a NaN accuracy currently fails this match with a MatchError
104 | }
105 | }
106 |
107 | spark.close()
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/GPUCrossValidatorMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.mortgage
17 |
18 | import ai.rapids.spark.examples.mortgage.GPUMain.labelColName
19 | import ai.rapids.spark.examples.utility.{Benchmark, XGBoostArgs}
20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
21 | import ml.dmlc.xgboost4j.scala.spark.rapids.{CrossValidator, GpuDataReader}
22 | import org.apache.spark.sql.SparkSession
23 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
24 | import org.apache.spark.ml.tuning.ParamGridBuilder
25 |
26 |
27 | // There are only 3 differences between the CPU and GPU versions; look for the '=== diff ===' markers.
28 | object GPUCrossValidatorMain extends Mortgage {
29 |
30 | def main(args: Array[String]): Unit = {
31 | val xgboostArgs = XGBoostArgs.parse(args)
32 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
33 | val appInfo = Seq(appName, processor, xgboostArgs.format)
34 |
35 | // build spark session
36 | val spark = SparkSession.builder()
37 | .appName(appInfo.mkString("-"))
38 | .getOrCreate()
39 |
40 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
41 | // === diff ===
42 | // build data reader
43 | val dataReader = new GpuDataReader(spark)
44 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
45 |
46 | // load datasets, the order is (train, train-eval, eval)
47 | val datasets = xgboostArgs.dataPaths.map(_.map{
48 | path =>
49 | xgboostArgs.format match {
50 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
51 | case "parquet" => dataReader.parquet(path)
52 | case "orc" => dataReader.orc(path)
53 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
54 | }
55 | })
56 |
57 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
58 |
59 | val xgbClassificationModel = if (xgboostArgs.isToTrain) {
60 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap)
61 | val classifier = new XGBoostClassifier(xgbParamFinal)
62 | .setLabelCol(labelColName)
63 | // === diff ===
64 | .setFeaturesCols(featureNames)
65 |
66 | // Tune model using cross validation
67 | val paramGrid = new ParamGridBuilder()
68 | .addGrid(classifier.maxDepth, Array(3, 10))
69 | .addGrid(classifier.eta, Array(0.2, 0.6))
70 | .build()
71 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
72 |
73 | val cv = new CrossValidator()
74 | .setEstimator(classifier)
75 | .setEvaluator(evaluator)
76 | .setEstimatorParamMaps(paramGrid)
77 | .setNumFolds(xgboostArgs.numFold)
78 |
79 | val (model, _) = benchmark.time("CrossValidation") {
80 | cv.fit(datasets.head.get).asInstanceOf[XGBoostClassificationModel]
81 | }
82 | // Save model if modelPath exists
83 | xgboostArgs.modelPath.foreach(path =>
84 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
85 | model
86 | } else {
87 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get)
88 | }
89 |
90 | if (xgboostArgs.isToTransform) {
91 | println("\n------ Transforming ------")
92 | var (results, _) = benchmark.time("transform") {
93 | val ret = xgbClassificationModel.transform(datasets(2).get).cache()
94 | // Trigger the transformation
95 | ret.foreachPartition(_ => ())
96 | ret
97 | }
98 | results = if (xgboostArgs.isShowFeatures) {
99 | results
100 | } else {
101 | results.select(labelColName, "rawPrediction", "probability", "prediction")
102 | }
103 | results.show(xgboostArgs.numRows)
104 |
105 | println("\n------Accuracy of Evaluation------")
106 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
107 | evaluator.evaluate(results) match {
108 | case accuracy if !accuracy.isNaN =>
109 | benchmark.value(accuracy, "Accuracy", "Accuracy for")
110 | // TODO: handle NaN explicitly -- a NaN accuracy currently fails this match with a MatchError
111 | }
112 | }
113 |
114 | spark.close()
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/GPUMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.mortgage
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}
20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader
21 | import org.apache.spark.sql.SparkSession
22 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
23 |
24 | // There are only 3 differences between the CPU and GPU versions; look for the '=== diff ===' markers.
25 | object GPUMain extends Mortgage {
26 |
27 | def main(args: Array[String]): Unit = {
28 | val xgboostArgs = XGBoostArgs.parse(args)
29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
30 | val appInfo = Seq(appName, processor, xgboostArgs.format)
31 |
32 | // build spark session
33 | val spark = SparkSession.builder()
34 | .appName(appInfo.mkString("-"))
35 | .getOrCreate()
36 |
37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
38 | // === diff ===
39 | // build data reader
40 | val dataReader = new GpuDataReader(spark)
41 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
42 |
43 | // load datasets, the order is (train, train-eval, eval)
44 | var datasets = xgboostArgs.dataPaths.map(_.map{
45 | path =>
46 | xgboostArgs.format match {
47 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
48 | case "parquet" => dataReader.parquet(path)
49 | case "orc" => dataReader.orc(path)
50 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
51 | }
52 | })
53 |
54 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
55 |
56 | // === diff ===
57 | // No need to vectorize the data, since the GPU pipeline supports multiple feature columns via the 'setFeaturesCols' API
58 |
59 | val xgbClassificationModel = if (xgboostArgs.isToTrain) {
60 | // build XGBoost classifier
61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap +
62 | // Add train-eval dataset if specified
63 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty))
64 | )
65 | val xgbClassifier = new XGBoostClassifier(xgbParamFinal)
66 | .setLabelCol(labelColName)
67 | // === diff ===
68 | .setFeaturesCols(featureNames)
69 |
70 | // Start training
71 | println("\n------ Training ------")
72 | // TODO: consider not logging the time when it is abnormal, which usually indicates a training failure
73 | val (model, _) = benchmark.time("train") {
74 | xgbClassifier.fit(datasets(0).get)
75 | }
76 | // Save model if modelPath exists
77 | xgboostArgs.modelPath.foreach(path =>
78 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
79 | model
80 | } else {
81 | XGBoostClassificationModel.load(xgboostArgs.modelPath.get)
82 | }
83 |
84 | if (xgboostArgs.isToTransform) {
85 | println("\n------ Transforming ------")
86 | var (results, _) = benchmark.time("transform") {
87 | val ret = xgbClassificationModel.transform(datasets(2).get).cache()
88 | // Trigger the transformation
89 | ret.foreachPartition(_ => ())
90 | ret
91 | }
92 | results = if (xgboostArgs.isShowFeatures) {
93 | results
94 | } else {
95 | results.select(labelColName, "rawPrediction", "probability", "prediction")
96 | }
97 | results.show(xgboostArgs.numRows)
98 |
99 | println("\n------Accuracy of Evaluation------")
100 | val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
101 | evaluator.evaluate(results) match {
102 | case accuracy if !accuracy.isNaN =>
103 | benchmark.value(accuracy, "Accuracy", "Accuracy for")
104 | // TODO: handle NaN explicitly -- a NaN accuracy currently fails this match with a MatchError
105 | }
106 | }
107 |
108 | spark.close()
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage/Mortgage.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.mortgage
17 |
18 | import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}
19 |
20 | private[mortgage] trait Mortgage {
21 | val appName = "Mortgage"
22 | val labelColName = "delinquency_12"
23 |
24 | val schema = StructType(List(
25 | StructField("orig_channel", FloatType),
26 | StructField("first_home_buyer", FloatType),
27 | StructField("loan_purpose", FloatType),
28 | StructField("property_type", FloatType),
29 | StructField("occupancy_status", FloatType),
30 | StructField("property_state", FloatType),
31 | StructField("product_type", FloatType),
32 | StructField("relocation_mortgage_indicator", FloatType),
33 | StructField("seller_name", FloatType),
34 | StructField("mod_flag", FloatType),
35 | StructField("orig_interest_rate", FloatType),
36 | StructField("orig_upb", IntegerType),
37 | StructField("orig_loan_term", IntegerType),
38 | StructField("orig_ltv", FloatType),
39 | StructField("orig_cltv", FloatType),
40 | StructField("num_borrowers", FloatType),
41 | StructField("dti", FloatType),
42 | StructField("borrower_credit_score", FloatType),
43 | StructField("num_units", IntegerType),
44 | StructField("zip", IntegerType),
45 | StructField("mortgage_insurance_percent", FloatType),
46 | StructField("current_loan_delinquency_status", IntegerType),
47 | StructField("current_actual_upb", FloatType),
48 | StructField("interest_rate", FloatType),
49 | StructField("loan_age", FloatType),
50 | StructField("msa", FloatType),
51 | StructField("non_interest_bearing_upb", FloatType),
52 | StructField(labelColName, IntegerType)))
53 |
54 | val commParamMap = Map(
55 | "eta" -> 0.1,
56 | "gamma" -> 0.1,
57 | "missing" -> 0.0,
58 | "max_depth" -> 10,
59 | "max_leaves" -> 256,
60 | "grow_policy" -> "depthwise",
61 | "objective" -> "binary:logistic",
62 | "min_child_weight" -> 30,
63 | "lambda" -> 1,
64 | "scale_pos_weight" -> 2,
65 | "subsample" -> 1,
66 | "nthread" -> 1,
67 | "num_round" -> 100)
68 | }
69 |
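The trait above only centralizes the CSV schema and the shared XGBoost defaults; each main object layers run-specific settings on top. A minimal usage sketch (the input path and the added parameters are illustrative assumptions, not code from this repo):

```scala
import ai.rapids.spark.examples.utility.Vectorize
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.sql.SparkSession

// Must live in package ai.rapids.spark.examples.mortgage: the trait is private[mortgage].
object MortgageSketch extends Mortgage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName(appName).getOrCreate()
    // The shared schema keeps CSV loading identical across all main objects.
    val raw = spark.read.schema(schema).csv("/data/mortgage/train") // hypothetical path
    val featureNames = schema.filter(_.name != labelColName).map(_.name)
    val train = Vectorize(raw, featureNames, labelColName)
    // Run-specific parameters are layered on top of the shared defaults.
    val params = commParamMap + ("tree_method" -> "hist", "num_workers" -> 1)
    val model = new XGBoostClassifier(params)
      .setLabelCol(labelColName)
      .setFeaturesCol("features")
      .fit(train)
    spark.close()
  }
}
```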
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/CPUCrossValidatorMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.taxi
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.rapids.CrossValidator
20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
21 | import org.apache.spark.ml.evaluation.RegressionEvaluator
22 | import org.apache.spark.ml.tuning.ParamGridBuilder
23 | import org.apache.spark.sql.SparkSession
24 |
25 | // There are only 3 differences between the CPU and GPU versions; see the '=== diff ===' markers
26 | object CPUCrossValidatorMain extends Taxi {
27 |
28 | def main(args: Array[String]): Unit = {
29 | val xgboostArgs = XGBoostArgs.parse(args)
30 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
31 | val appInfo = Seq(appName, processor, xgboostArgs.format)
32 |
33 | // build spark session
34 | val spark = SparkSession.builder()
35 | .appName(appInfo.mkString("-"))
36 | .getOrCreate()
37 |
38 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
39 |
40 | // === diff ===
41 | // build data reader
42 | val dataReader = spark.read
43 |
44 | // load datasets, the order is (train, train-eval, eval)
45 | var datasets = xgboostArgs.dataPaths.map(_.map{
46 | path =>
47 | xgboostArgs.format match {
48 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
49 | case "parquet" => dataReader.parquet(path)
50 | case "orc" => dataReader.orc(path)
51 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
52 | }
53 | })
54 |
55 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
56 |
57 | // === diff ===
58 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName)))
59 |
60 | val xgbRegressionModel = if (xgboostArgs.isToTrain) {
61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap)
62 | val regressor = new XGBoostRegressor(xgbParamFinal)
63 | .setLabelCol(labelColName)
64 | // === diff ===
65 | .setFeaturesCol("features")
66 |
67 | // Tune model using cross validation
68 | val paramGrid = new ParamGridBuilder()
69 | .addGrid(regressor.maxDepth, Array(3, 8))
70 | .addGrid(regressor.eta, Array(0.2, 0.6))
71 | .build()
72 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName)
73 |
74 | val cv = new CrossValidator()
75 | .setEstimator(regressor)
76 | .setEvaluator(evaluator)
77 | .setEstimatorParamMaps(paramGrid)
78 | .setNumFolds(xgboostArgs.numFold)
79 |
80 | val (model, _) = benchmark.time("CrossValidation") {
81 | cv.fit(datasets.head.get).bestModel.asInstanceOf[XGBoostRegressionModel]
82 | }
83 | // Save model if modelPath exists
84 | xgboostArgs.modelPath.foreach(path =>
85 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
86 | model
87 | } else {
88 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get)
89 | }
90 |
91 | if (xgboostArgs.isToTransform) {
92 | println("\n------ Transforming ------")
93 | var (prediction, _) = benchmark.time("transform") {
94 | val ret = xgbRegressionModel.transform(datasets(2).get).cache()
95 | ret.foreachPartition(_ => ())
96 | ret
97 | }
98 | prediction = if (xgboostArgs.isShowFeatures) {
99 | prediction
100 | } else {
101 | prediction.select(labelColName, "prediction")
102 | }
103 | prediction.show(xgboostArgs.numRows)
104 |
105 | println("\n------Accuracy of Evaluation------")
106 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName)
107 | evaluator.evaluate(prediction) match {
108 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for")
109 | // TODO: consider throwing an exception when the result is NaN
110 | }
111 | }
112 |
113 | spark.close()
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/CPUMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.taxi
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader
21 | import org.apache.spark.ml.evaluation.RegressionEvaluator
22 | import org.apache.spark.sql.SparkSession
23 |
24 | // There are only 3 differences between the CPU and GPU versions; see the '=== diff ===' markers
25 | object CPUMain extends Taxi {
26 |
27 | def main(args: Array[String]): Unit = {
28 | val xgboostArgs = XGBoostArgs.parse(args)
29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
30 | val appInfo = Seq(appName, processor, xgboostArgs.format)
31 |
32 | // build spark session
33 | val spark = SparkSession.builder()
34 | .appName(appInfo.mkString("-"))
35 | .getOrCreate()
36 |
37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
38 | // === diff ===
39 | // build data reader
40 | val dataReader = spark.read
41 |
42 | // load datasets, the order is (train, train-eval, eval)
43 | var datasets = xgboostArgs.dataPaths.map(_.map{
44 | path =>
45 | xgboostArgs.format match {
46 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
47 | case "parquet" => dataReader.parquet(path)
48 | case "orc" => dataReader.orc(path)
49 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
50 | }
51 | })
52 |
53 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
54 |
55 | // === diff ===
56 | datasets = datasets.map(_.map(ds => Vectorize(ds, featureNames, labelColName)))
57 |
58 | val xgbRegressionModel = if (xgboostArgs.isToTrain) {
59 | // build the XGBoostRegressor
60 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap +
61 | // Add train-eval dataset if specified
62 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty))
63 | )
64 | val xgbRegressor = new XGBoostRegressor(xgbParamFinal)
65 | .setLabelCol(labelColName)
66 | // === diff ===
67 | .setFeaturesCol("features")
68 |
69 | println("\n------ Training ------")
70 | // TODO: consider skipping the time log when it is abnormal, which usually indicates a training failure
71 | val (model, _) = benchmark.time("train") {
72 | xgbRegressor.fit(datasets(0).get)
73 | }
74 | // Save model if modelPath exists
75 | xgboostArgs.modelPath.foreach(path =>
76 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
77 | model
78 | } else {
79 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get)
80 | }
81 |
82 | if (xgboostArgs.isToTransform) {
83 | println("\n------ Transforming ------")
84 | var (prediction, _) = benchmark.time("transform") {
85 | val ret = xgbRegressionModel.transform(datasets(2).get).cache()
86 | ret.foreachPartition(_ => ())
87 | ret
88 | }
89 | prediction = if (xgboostArgs.isShowFeatures) {
90 | prediction
91 | } else {
92 | prediction.select(labelColName, "prediction")
93 | }
94 | prediction.show(xgboostArgs.numRows)
95 |
96 | println("\n------Accuracy of Evaluation------")
97 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName)
98 | evaluator.evaluate(prediction) match {
99 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for")
100 | // TODO: consider throwing an exception when the result is NaN
101 | }
102 | }
103 |
104 | spark.close()
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/GPUCrossValidatorMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.taxi
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.rapids.{CrossValidator, GpuDataReader}
20 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
21 | import org.apache.spark.ml.evaluation.RegressionEvaluator
22 | import org.apache.spark.ml.tuning.ParamGridBuilder
23 | import org.apache.spark.sql.SparkSession
24 |
25 |
26 | // There are only 3 differences between the CPU and GPU versions; see the '=== diff ===' markers
27 | object GPUCrossValidatorMain extends Taxi {
28 |
29 | def main(args: Array[String]): Unit = {
30 | val xgboostArgs = XGBoostArgs.parse(args)
31 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
32 | val appInfo = Seq(appName, processor, xgboostArgs.format)
33 |
34 | // build spark session
35 | val spark = SparkSession.builder()
36 | .appName(appInfo.mkString("-"))
37 | .getOrCreate()
38 |
39 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
40 | // === diff ===
41 | // build data reader
42 | val dataReader = new GpuDataReader(spark)
43 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
44 |
45 | // load datasets, the order is (train, train-eval, eval)
46 | val datasets = xgboostArgs.dataPaths.map(_.map{
47 | path =>
48 | xgboostArgs.format match {
49 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
50 | case "parquet" => dataReader.parquet(path)
51 | case "orc" => dataReader.orc(path)
52 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
53 | }
54 | })
55 |
56 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
57 |
58 | val xgbRegressionModel = if (xgboostArgs.isToTrain) {
59 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap)
60 | val regressor = new XGBoostRegressor(xgbParamFinal)
61 | .setLabelCol(labelColName)
62 | // === diff ===
63 | .setFeaturesCols(featureNames)
64 |
65 | // Tune model using cross validation
66 | val paramGrid = new ParamGridBuilder()
67 | .addGrid(regressor.maxDepth, Array(3, 10))
68 | .addGrid(regressor.eta, Array(0.2, 0.6))
69 | .build()
70 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName)
71 |
72 | val cv = new CrossValidator()
73 | .setEstimator(regressor)
74 | .setEvaluator(evaluator)
75 | .setEstimatorParamMaps(paramGrid)
76 | .setNumFolds(xgboostArgs.numFold)
77 |
78 | val (model, _) = benchmark.time("CrossValidation") {
79 | cv.fit(datasets.head.get).asInstanceOf[XGBoostRegressionModel]
80 | }
81 | // Save model if modelPath exists
82 | xgboostArgs.modelPath.foreach(path =>
83 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
84 | model
85 | } else {
86 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get)
87 | }
88 |
89 | if (xgboostArgs.isToTransform) {
90 | println("\n------ Transforming ------")
91 | var (prediction, _) = benchmark.time("transform") {
92 | val ret = xgbRegressionModel.transform(datasets(2).get).cache()
93 | ret.foreachPartition(_ => ())
94 | ret
95 | }
96 | prediction = if (xgboostArgs.isShowFeatures) {
97 | prediction
98 | } else {
99 | prediction.select(labelColName, "prediction")
100 | }
101 | prediction.show(xgboostArgs.numRows)
102 |
103 | println("\n------Accuracy of Evaluation------")
104 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName)
105 | evaluator.evaluate(prediction) match {
106 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for")
107 | // TODO: consider throwing an exception when the result is NaN
108 | }
109 | }
110 |
111 | spark.close()
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/GPUMain.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.taxi
17 |
18 | import ai.rapids.spark.examples.utility.{Benchmark, Vectorize, XGBoostArgs}
19 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
20 | import ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataReader
21 | import org.apache.spark.ml.evaluation.RegressionEvaluator
22 | import org.apache.spark.sql.SparkSession
23 |
24 | // There are only 3 differences between the CPU and GPU versions; see the '=== diff ===' markers
25 | object GPUMain extends Taxi {
26 |
27 | def main(args: Array[String]): Unit = {
28 | val xgboostArgs = XGBoostArgs.parse(args)
29 | val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
30 | val appInfo = Seq(appName, processor, xgboostArgs.format)
31 |
32 | // build spark session
33 | val spark = SparkSession.builder()
34 | .appName(appInfo.mkString("-"))
35 | .getOrCreate()
36 |
37 | val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
38 | // === diff ===
39 | // build data reader
40 | val dataReader = new GpuDataReader(spark)
41 | .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
42 |
43 | // load datasets, the order is (train, train-eval, eval)
44 | var datasets = xgboostArgs.dataPaths.map(_.map{
45 | path =>
46 | xgboostArgs.format match {
47 | case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
48 | case "parquet" => dataReader.parquet(path)
49 | case "orc" => dataReader.orc(path)
50 | case _ => throw new IllegalArgumentException("Unsupported data file format!")
51 | }
52 | })
53 |
54 | val featureNames = schema.filter(_.name != labelColName).map(_.name)
55 |
56 | // === diff ===
57 | // No need to vectorize the data, since GPU training supports multiple feature columns via the 'setFeaturesCols' API
58 |
59 | val xgbRegressionModel = if (xgboostArgs.isToTrain) {
60 | // build the XGBoostRegressor
61 | val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap +
62 | // Add train-eval dataset if specified
63 | ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty))
64 | )
65 | val xgbRegressor = new XGBoostRegressor(xgbParamFinal)
66 | .setLabelCol(labelColName)
67 | // === diff ===
68 | .setFeaturesCols(featureNames)
69 |
70 | println("\n------ Training ------")
71 | // TODO: consider skipping the time log when it is abnormal, which usually indicates a training failure
72 | val (model, _) = benchmark.time("train") {
73 | xgbRegressor.fit(datasets(0).get)
74 | }
75 | // Save model if modelPath exists
76 | xgboostArgs.modelPath.foreach(path =>
77 | if(xgboostArgs.isOverwrite) model.write.overwrite().save(path) else model.save(path))
78 | model
79 | } else {
80 | XGBoostRegressionModel.load(xgboostArgs.modelPath.get)
81 | }
82 |
83 | if (xgboostArgs.isToTransform) {
84 | println("\n------ Transforming ------")
85 | var (prediction, _) = benchmark.time("transform") {
86 | val ret = xgbRegressionModel.transform(datasets(2).get).cache()
87 | ret.foreachPartition(_ => ())
88 | ret
89 | }
90 | prediction = if (xgboostArgs.isShowFeatures) {
91 | prediction
92 | } else {
93 | prediction.select(labelColName, "prediction")
94 | }
95 | prediction.show(xgboostArgs.numRows)
96 |
97 | println("\n------Accuracy of Evaluation------")
98 | val evaluator = new RegressionEvaluator().setLabelCol(labelColName)
99 | evaluator.evaluate(prediction) match {
100 | case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for")
101 | // TODO: consider throwing an exception when the result is NaN
102 | }
103 | }
104 |
105 | spark.close()
106 | }
107 | }
108 |
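Across these taxi examples, the '=== diff ===' markers reduce to a single idea: the CPU pipeline must assemble the features into one vector column, while the GPU pipeline consumes the raw columns directly. A side-by-side sketch (assuming `rawDf`, `featureNames`, `labelColName`, and `params` are defined as in the mains above):

```scala
// CPU path: assemble feature columns into a single "features" vector first.
val cpuDf = Vectorize(rawDf, featureNames, labelColName)
val cpuRegressor = new XGBoostRegressor(params)
  .setLabelCol(labelColName)
  .setFeaturesCol("features")        // standard Spark ML single-column API

// GPU path: hand the raw columns to the estimator; no VectorAssembler step.
val gpuRegressor = new XGBoostRegressor(params)
  .setLabelCol(labelColName)
  .setFeaturesCols(featureNames)     // rapids multi-column API
```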
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/taxi/Taxi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.taxi
17 |
18 | import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}
19 |
20 | private[taxi] trait Taxi {
21 | val appName = "Taxi"
22 | val labelColName = "fare_amount"
23 |
24 | lazy val commParamMap = Map(
25 | "learning_rate" -> 0.05,
26 | "max_depth" -> 8,
27 | "subsample" -> 0.8,
28 | "gamma" -> 1
29 | )
30 |
31 | lazy val schema =
32 | StructType(Array(
33 | StructField("vendor_id", FloatType),
34 | StructField("passenger_count", FloatType),
35 | StructField("trip_distance", FloatType),
36 | StructField("pickup_longitude", FloatType),
37 | StructField("pickup_latitude", FloatType),
38 | StructField("rate_code", FloatType),
39 | StructField("store_and_fwd", FloatType),
40 | StructField("dropoff_longitude", FloatType),
41 | StructField("dropoff_latitude", FloatType),
42 | StructField(labelColName, FloatType),
43 | StructField("hour", FloatType),
44 | StructField("year", IntegerType),
45 | StructField("month", IntegerType),
46 | StructField("day", FloatType),
47 | StructField("day_of_week", FloatType),
48 | StructField("is_weekend", FloatType)
49 | ))
50 | }
51 |
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/utility/Benchmark.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.utility
17 |
18 | import scala.util.Properties
19 |
20 | class Benchmark(
21 | appName: String,
22 | processor: String,
23 | dataFormat: String) {
24 |
25 | def time[R](phase: String, silent: (Any, Float) => Boolean = (_,_) => false)
26 | (block: => R): (R, Float) = {
27 | val t0 = System.currentTimeMillis
28 | val result = block // call-by-name
29 | val elapsedTimeSec = (System.currentTimeMillis - t0).toFloat / 1000
30 | logging(elapsedTimeSec, phase, "Elapsed time for", "s", silent(result, elapsedTimeSec))
31 | (result, elapsedTimeSec)
32 | }
33 |
34 | def value(value: Any, name: String = "value", prefix: String="", suffix: String = "") = {
35 | logging(value, name, prefix, suffix, false)
36 | }
37 |
38 | private def logging(value: Any, name: String, prefix: String, suffix: String, silent: Boolean) = {
39 | if (!silent) {
40 | val logString = buildLogSimple(value, prefix, suffix, buildRuntimeInfo(name))
41 | println("\n--------------")
42 | println("==> Benchmark: " + logString)
43 | println("--------------\n")
44 | }
45 | }
46 |
47 | private def buildRuntimeInfo(name: String): String = {
48 | // Get runtime information from Environment
49 | val osType = Properties.envOrElse("RAPIDS_XGB_EXAMPLE_OS_TYPE", "Unknown")
50 | val cudaVersion = Properties.envOrElse("RAPIDS_XGB_EXAMPLE_CUDA_VERSION", "Unknown")
51 | val sparkVersion = Properties.envOrElse("RAPIDS_XGB_EXAMPLE_SPARK_VERSION", "Unknown")
52 | Seq(appName, processor, name, dataFormat, "stub", cudaVersion, osType, sparkVersion)
53 | .mkString(" ")
54 | }
55 |
56 | private def buildLogSimple(value: Any, prefix: String, suffix: String, runtimeInfo: String): String =
57 | prefix + " [" + runtimeInfo + "]: " + value + suffix
58 | }
59 |
60 | object Benchmark {
61 | def apply(appName: String, processor: String, dataFormat: String) =
62 | new Benchmark(appName, processor, dataFormat)
63 | }
64 |
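A usage sketch for the helpers above: `time` takes its block by name, so any code can be timed, and the optional `silent` predicate, which receives the block's result and the elapsed seconds, is one way to act on the TODO about suppressing abnormal timings (the `df` below is an assumed DataFrame):

```scala
val benchmark = Benchmark("Taxi", "GPU", "csv")

// Suppress the log line when the block finishes suspiciously fast,
// which usually means the work failed or was short-circuited.
val (rows, elapsedSec) = benchmark.time("count", silent = (_, sec) => sec < 0.01f) {
  df.count()
}
benchmark.value(rows, "RowCount", "Row count for")
```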
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/utility/SparkSetup.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.utility
17 |
18 | import org.apache.spark.sql.SparkSession
19 |
20 | object SparkSetup {
21 | def apply(args: Array[String], appName: String) = {
22 | val builder = SparkSession.builder()
23 | val masterBuilder = Option(System.getenv("SPARK_MASTER")).map{master =>
24 | builder.master(master)
25 | }.getOrElse(builder)
26 |
27 | masterBuilder.appName(appName).getOrCreate()
28 | }
29 |
30 | def apply(args: Array[String]): SparkSession = SparkSetup(args, "default")
31 |
32 | }
33 |
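A usage sketch: `SparkSetup` overrides the master only when the `SPARK_MASTER` environment variable is set, so the same jar can run locally (e.g. `SPARK_MASTER=local[*]`) or defer the master to `spark-submit` without code changes (the app name below is an assumption):

```scala
// Local run:   SPARK_MASTER=local[*] java -cp ... MyApp
// Cluster run: spark-submit --master yarn ... (SPARK_MASTER unset)
val spark = SparkSetup(args, "Taxi-GPU-csv")
```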
--------------------------------------------------------------------------------
/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/utility/Vectorize.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package ai.rapids.spark.examples.utility
17 |
18 | import org.apache.spark.ml.feature.VectorAssembler
19 | import org.apache.spark.sql.DataFrame
20 | import org.apache.spark.sql.functions._
21 | import org.apache.spark.sql.types.FloatType
22 |
23 | object Vectorize {
24 | def apply(df: DataFrame, labelName: String, changeLabelName: Boolean = true): DataFrame = {
25 | val features = df.schema.collect{case f if f.name != labelName => f.name}
26 | val toFloat = df.schema.map(f => col(f.name).cast(FloatType))
27 | val labelCol = if (changeLabelName) col(labelName).alias("label") else col(labelName)
28 | new VectorAssembler()
29 | .setInputCols(features.toArray)
30 | .setOutputCol("features")
31 | .transform(df.select(toFloat:_*))
32 | .select(col("features"), labelCol)
33 | }
34 |
35 | def apply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = {
36 | val toFloat = df.schema.map(f => col(f.name).cast(FloatType))
37 | new VectorAssembler()
38 | .setInputCols(featureNames.toArray)
39 | .setOutputCol("features")
40 | .transform(df.select(toFloat:_*))
41 | .select(col("features"), col(labelName))
42 | }
43 |
44 | def criteoApply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = {
45 | val toFloat = df.schema.map(f => col(f.name).cast(FloatType))
46 | new VectorAssembler()
47 | .setHandleInvalid("keep")
48 | .setInputCols(featureNames.toArray)
49 | .setOutputCol("features")
50 | .transform(df.select(toFloat:_*))
51 | .select(col("features"), col(labelName))
52 | }
53 |
54 | def apply(featureNames: Seq[String], df: DataFrame, otherNames: String*): DataFrame = {
55 | val resultCols = (otherNames :+ "features").map(col(_))
56 | new VectorAssembler()
57 | .setInputCols(featureNames.toArray)
58 | .setOutputCol("features")
59 | .transform(df)
60 | .select(resultCols: _*)
61 | }
62 | }
63 |
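A usage sketch for the overloads above, with toy columns as assumptions: the single-label overload derives the feature list from the schema and renames the label column to "label", while the explicit-names overload keeps the label column's original name. Both cast every column to Float before assembling:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("vectorize-demo").getOrCreate()
import spark.implicits._

val df = Seq((1.0, 2.0, 7.5), (3.0, 4.0, 12.0)).toDF("f1", "f2", "fare_amount")

val auto     = Vectorize(df, "fare_amount")                  // columns: features, label
val explicit = Vectorize(df, Seq("f1", "f2"), "fare_amount") // columns: features, fare_amount
explicit.show()
```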
--------------------------------------------------------------------------------
/examples/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
--------------------------------------------------------------------------------
/getting-started-guides/building-sample-apps/python.md:
--------------------------------------------------------------------------------
1 | # Build XGBoost Python Examples
2 |
3 | ##### Build Process
4 |
5 | Follow these steps to package the Python zip file:
6 |
7 | ```
8 | git clone https://github.com/rapidsai/spark-examples.git
9 | cd spark-examples/examples/apps/python
10 | zip -r samples.zip ai
11 | ```
12 |
13 | ##### Files Required by PySpark
14 |
15 | Two files are required by PySpark:
16 |
17 | + *samples.zip*: the package containing all the example code
18 | + *main.py*: the entry point for PySpark; you can copy it from the *spark-examples/examples/apps/python* folder
19 |
--------------------------------------------------------------------------------
/getting-started-guides/building-sample-apps/scala.md:
--------------------------------------------------------------------------------
1 | # Build XGBoost Scala Examples
2 |
3 | Our examples rely on [cuDF](https://github.com/rapidsai/cudf) and [XGBoost](https://github.com/rapidsai/xgboost/tree/rapids-spark)
4 |
5 | ##### Build Process
6 |
7 | Follow these steps to build the Scala jars (using CUDA 10.0 as an example):
8 |
9 | ```
10 | git clone https://github.com/rapidsai/spark-examples.git
11 | cd spark-examples/examples/apps/scala
12 | mvn package -Dcuda.classifier=cuda10
13 | ```
14 |
15 | ##### Generated Jars
16 |
17 | The build process generates two jars:
18 |
19 | + *sample_xgboost_apps-0.1.5.jar*: contains only the example classes, so it must be submitted to Spark together with its dependency jars
20 | + *sample_xgboost_apps-0.1.5-jar-with-dependencies.jar*: contains both the example classes and the classes from all dependency jars
21 |
22 | ##### Build Options
23 |
24 | Classifiers:
25 |
26 | + *cuda.classifier*
27 | + For a CUDA 9.2 build, omit this classifier
28 | + For a CUDA 10.0 build, specify *cuda10*
29 | + For a CUDA 10.1 build, specify *cuda10-1*
30 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/EMR_Mortgage_Example_G4dn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%%configure -f\n",
10 | "{\n",
11 | " \"driverMemory\": \"8000M\",\n",
12 | " \"driverCores\": 2,\n",
13 | " \"executorMemory\": \"8000M\",\n",
14 | " \"conf\" : {\"spark.executor.instances\":2, \"spark.executor.cores\":4, \"spark.task.cpus\": 4, \"spark.yarn.maxAppAttempts\": 1, \"spark.dynamicAllocation.enabled\": false},\n",
15 | " \"jars\" : [\"https://repo1.maven.org/maven2/ai/rapids/cudf/0.9.2/cudf-0.9.2.jar\",\n",
16 | " \"https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar\",\n",
17 | " \"https://repo1.maven.org/maven2/ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar\"]\n",
18 | "}"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "sc.listJars.foreach(println)"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "%%info"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "// import notebook source\n",
46 | "import org.apache.spark.sql.SparkSession\n",
47 | "import org.apache.spark.ml.evaluation.RegressionEvaluator\n",
48 | "import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\n",
49 | "import org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}\n",
50 | "import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}\n",
51 | "import ml.dmlc.xgboost4j.scala.spark.rapids.{GpuDataReader, GpuDataset}\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "val trainPath = \"s3://sagemaker-gpu-xgboost/mortgage/csv/train/\"\n",
61 | "val evalPath = \"s3://sagemaker-gpu-xgboost/mortgage/csv/test/\"\n"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "sc.listJars.foreach(println)\n",
71 | "\n",
72 | "val spark = SparkSession.builder.appName(\"mortgage-gpu\").getOrCreate\n",
73 | "\n",
74 | "val dataReader = new GpuDataReader(spark)\n",
75 | "\n",
76 | "val labelColName = \"delinquency_12\"\n"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "val schema = StructType(List(\n",
86 | " StructField(\"orig_channel\", DoubleType),\n",
87 | " StructField(\"first_home_buyer\", DoubleType),\n",
88 | " StructField(\"loan_purpose\", DoubleType),\n",
89 | " StructField(\"property_type\", DoubleType),\n",
90 | " StructField(\"occupancy_status\", DoubleType),\n",
91 | " StructField(\"property_state\", DoubleType),\n",
92 | " StructField(\"product_type\", DoubleType),\n",
93 | " StructField(\"relocation_mortgage_indicator\", DoubleType),\n",
94 | " StructField(\"seller_name\", DoubleType),\n",
95 | " StructField(\"mod_flag\", DoubleType),\n",
96 | " StructField(\"orig_interest_rate\", DoubleType),\n",
97 | " StructField(\"orig_upb\", IntegerType),\n",
98 | " StructField(\"orig_loan_term\", IntegerType),\n",
99 | " StructField(\"orig_ltv\", DoubleType),\n",
100 | " StructField(\"orig_cltv\", DoubleType),\n",
101 | " StructField(\"num_borrowers\", DoubleType),\n",
102 | " StructField(\"dti\", DoubleType),\n",
103 | " StructField(\"borrower_credit_score\", DoubleType),\n",
104 | " StructField(\"num_units\", IntegerType),\n",
105 | " StructField(\"zip\", IntegerType),\n",
106 | " StructField(\"mortgage_insurance_percent\", DoubleType),\n",
107 | " StructField(\"current_loan_delinquency_status\", IntegerType),\n",
108 | " StructField(\"current_actual_upb\", DoubleType),\n",
109 | " StructField(\"interest_rate\", DoubleType),\n",
110 | " StructField(\"loan_age\", DoubleType),\n",
111 | " StructField(\"msa\", DoubleType),\n",
112 | " StructField(\"non_interest_bearing_upb\", DoubleType),\n",
113 | " StructField(labelColName, IntegerType)))\n",
114 | "\n"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "val commParamMap = Map(\n",
124 | " \"eta\" -> 0.1,\n",
125 | " \"gamma\" -> 0.1,\n",
126 | " \"missing\" -> 0.0,\n",
127 | " \"max_depth\" -> 10,\n",
128 | " \"max_leaves\" -> 256,\n",
129 | " \"grow_policy\" -> \"depthwise\",\n",
130 | " \"min_child_weight\" -> 30,\n",
131 | " \"lambda\" -> 1,\n",
132 | " \"scale_pos_weight\" -> 2,\n",
133 | " \"subsample\" -> 1,\n",
134 | " \"nthread\" -> 4,\n",
135 | " \"num_round\" -> 100,\n",
136 | " \"num_workers\" -> 2,\n",
137 | " \"tree_method\" -> \"gpu_hist\")\n"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "var (trainSet, evalSet) = {\n",
147 | " dataReader.option(\"header\", true).schema(schema)\n",
148 | " (dataReader.csv(trainPath), dataReader.csv(evalPath))}\n",
149 | "\n",
150 | "val featureNames = schema.filter(_.name != labelColName).map(_.name)\n",
151 | "\n",
152 | "object Benchmark {\n",
153 | " def time[R](phase: String)(block: => R): (R, Float) = {\n",
154 | " val t0 = System.currentTimeMillis\n",
155 | " val result = block // call-by-name\n",
156 | " val t1 = System.currentTimeMillis\n",
157 | " println(\"==> Benchmark: Elapsed time for [\" + phase + \"]: \" + ((t1 - t0).toFloat / 1000) + \"s\")\n",
158 | " (result, (t1 - t0).toFloat / 1000)\n",
159 | " }\n",
160 | "}\n",
161 | "\n"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "val modelPath = \"/tmp/model\"\n",
171 | "val xgbClassifier = new XGBoostClassifier(commParamMap).setLabelCol(labelColName).setFeaturesCols(featureNames)\n"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "println(\"\\n------ Training ------\")\n",
181 | "val (model, _) = Benchmark.time(\"train\") {\n",
182 | " xgbClassifier.fit(trainSet)\n",
183 | "}\n"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "// Save model if modelPath exists\n",
193 | "model.write.overwrite().save(modelPath)\n",
194 | "val xgbClassificationModel = model\n"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "println(\"\\n------ Transforming ------\")\n",
204 | "val (results, _) = Benchmark.time(\"transform\") {\n",
205 | " xgbClassificationModel.transform(evalSet)\n",
206 | "}\n"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "println(\"\\n------Accuracy of Evaluation------\")\n",
216 | "val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)\n",
217 | "val accuracy = evaluator.evaluate(results)\n",
218 | "println(accuracy)\n"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": []
227 | }
228 | ],
229 | "metadata": {
230 | "kernelspec": {
231 | "display_name": "Spark",
232 | "language": "",
233 | "name": "sparkkernel"
234 | },
235 | "language_info": {
236 | "codemirror_mode": "text/x-scala",
237 | "mimetype": "text/x-scala",
238 | "name": "scala",
239 | "pygments_lexer": "scala"
240 | }
241 | },
242 | "nbformat": 4,
243 | "nbformat_minor": 4
244 | }
245 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/adv_full_datasets.md:
--------------------------------------------------------------------------------
1 | # Advanced Topic: Using Large Mortgage Dataset for AWS EMR XGBOOST4J-SPARK
2 |
3 |
4 | This guide adds some additional instructions and tips for running large Mortgage datasets on XGBoost4J-Spark on AWS EMR. Please use this quick start guide - [Get Started with XGBoost4J-Spark on AWS EMR](emr.md) for complete step-by-step instructions to run GPU XGBoost Mortgage Examples.
5 |
6 | ### Use Multi-GPU P3 instance for AWS EMR Core Nodes
7 |
8 | For large datasets, we recommend using two p3.8xlarge instances as Core nodes, which provide a total of 8 GPUs.
9 | Please also increase the EBS storage on each Core node to 300 GB if you use HDFS to store the data.
10 |
11 | Please also SSH into each Core node and issue the following command to set *EXCLUSIVE_PROCESS* for all GPUs on that node.
12 | This step is explained in [Multi-GPU Configuration for XGBoost4J-Spark](/advanced-topics/multi-gpu.md). The public IP address of each Core node can be found in the EMR Cluster Details: go to the Hardware tab and click the ID of the Core instance group.
13 | You can log into each node with the same private key and the username *hadoop*. The bootstrap mechanism in EMR currently doesn't support running this script.
14 |
15 | ```
16 | nvidia-smi -c EXCLUSIVE_PROCESS
17 | ```
18 |
19 | ### Using Full Mortgage Datasets for Mortgage Example
20 |
21 | #### Option 1: Using EMR steps to copy full Mortgage datasets to HDFS
22 |
23 | You can copy the full [mortgage data](https://rapidsai.github.io/demos/datasets/mortgage-data) to HDFS on the EMR Master Node,
24 | or load the dataset from S3 when launching the AWS EMR cluster by using EMR steps.
25 |
26 | In Step 1: Software and Steps, add a step with a Name, the JAR location (command-runner.jar), and the following command as the arguments.
27 | ```
28 | s3-dist-cp --src=s3://spark-xgboost-mortgage-dataset/csv --dest=hdfs:///tmp/mortgage
29 | ```
30 |
31 | 
32 |
33 |
34 | #### Option 2: Using AWS S3 for Datasets Directly
35 | You can use the datasets on S3 directly when submitting the Spark job.
36 | Please refer to this [AWS document](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-file-systems.html) for detailed information.
37 | In our example, you can use the following format:
38 | ```
39 | export DATA_PREFIX=s3://spark-xgboost-mortgage-dataset/csv
40 | ```
41 |
42 |
43 | ### Run the Mortgage Example on the Full Dataset with 1000 Rounds and a Max Depth of 20
44 | The same jar file built in the [quick start guide](emr.md) is used here.
45 | Here is how to run the GPU Mortgage Example on the full Mortgage dataset for 1000 rounds and a max depth of 20:
46 |
47 | ```
48 | export SPARK_DEPLOY_MODE=cluster
49 | export JARS_PATH=hdfs:/tmp/xgboost4j_spark/
50 | export SPARK_DRIVER_MEMORY=10G
51 | export SPARK_EXECUTOR_MEMORY=40G
52 | export SPARK_NUM_EXECUTORS=8
53 | export CORES_PER_EXECUTOR=6
54 | export TOTAL_CORES=$((${CORES_PER_EXECUTOR}*${SPARK_NUM_EXECUTORS}))
55 | export JAR_PREFIX=hdfs:/tmp/xgboost4j_spark/
56 | export EXAMPLE_CLASS=ai.rapids.spark.examples.mortgage.GPUMain
57 | export JAR_EXAMPLE=${JARS_PATH}/sample_xgboost_apps-0.1.5-jar-with-dependencies.jar
58 |
59 | export DATA_PREFIX=hdfs:/tmp/mortgage
60 | # export DATA_PREFIX=s3://spark-xgboost-mortgage-dataset/csv  # uncomment to use S3 storage instead of HDFS
61 | export TRAIN_DATA=${DATA_PREFIX}/train/20*
62 | export EVAL_DATA=${DATA_PREFIX}/eval/20*
63 |
64 | export ROUND=1000
65 | export TREE_METHOD=gpu_hist
66 |
67 | spark-submit \
68 | --master yarn \
69 | --deploy-mode ${SPARK_DEPLOY_MODE} \
70 | --driver-memory ${SPARK_DRIVER_MEMORY} \
71 | --executor-memory ${SPARK_EXECUTOR_MEMORY} \
72 | --conf spark.executor.cores=${CORES_PER_EXECUTOR} \
73 | --conf spark.task.cpus=${CORES_PER_EXECUTOR} \
74 | --conf spark.yarn.maxAppAttempts=1 \
75 | --conf spark.sql.files.maxPartitionBytes=4294967296 \
76 | --num-executors ${SPARK_NUM_EXECUTORS} \
77 | --class ${EXAMPLE_CLASS} \
78 | ${JAR_EXAMPLE} \
79 | -trainDataPath=$TRAIN_DATA \
80 | -evalDataPath=$EVAL_DATA \
81 | -format=csv \
82 | -numRound=$ROUND \
83 | -max_depth=20 \
84 | -num_workers=${SPARK_NUM_EXECUTORS} \
85 | -treeMethod=${TREE_METHOD} \
86 | -nthread=${CORES_PER_EXECUTOR}
87 | ```
88 |
89 | In the stdout driver log, you should see the timings (in seconds) and the accuracy metric. To find the stdout, open the cluster details, select the Application history tab, click the application you just ran, open the Executors tab, and in the driver row click "View logs" and then "stdout". The stdout log file shows all of the output.
90 |
91 | ------ Training ------
92 |
93 | Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=172.31.25.254, DMLC_TRACKER_PORT=9091, DMLC_NUM_WORKER=8}
94 |
95 | ==> Benchmark: Elapsed time for [Mortgage GPU train csv stub Unknown Unknown Unknown]: 785.183s
96 |
97 | ------ Transforming ------
98 |
99 | ==> Benchmark: Elapsed time for [Mortgage GPU transform csv stub Unknown Unknown Unknown]: 383.537s
100 |
101 | ------Accuracy of Evaluation------
102 |
103 | ==> Benchmark: Accuracy for [Mortgage GPU Accuracy csv stub Unknown Unknown Unknown]: 0.9909487814701571
104 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-cluster-details.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-details.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-cluster-dns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-dns.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-cluster-ssh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-ssh.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-cluster-waiting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-cluster-waiting.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-stdout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-stdout.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-step-four-security.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-four-security.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-step-one-s3-copy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-one-s3-copy.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-step-one-software-and-steps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-one-software-and-steps.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-step-three-general-cluster-settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-three-general-cluster-settings.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-step-two-hardware.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-step-two-hardware.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/emr-view-logs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/emr-view-logs.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-config-move.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-config-move.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-config-updated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-config-updated.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-curl-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-curl-output.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-info-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-info-output.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-jupyter-new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-jupyter-new.gif
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-kernel-restart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-kernel-restart.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-notebook-instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-notebook-instance.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-output.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-permission.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-permission.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/aws/pics/sagemaker-tcp-port.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/spark-examples/4f25d53a1eed30ee46549b4dc1bbd5b8ce201f1c/getting-started-guides/csp/aws/pics/sagemaker-tcp-port.png
--------------------------------------------------------------------------------
/getting-started-guides/csp/databricks/databricks.md:
--------------------------------------------------------------------------------
1 | Get Started with XGBoost4J-Spark on Databricks
2 | ======================================================
3 | This is a getting started guide to XGBoost4J-Spark on Databricks. At the end of this guide, the reader will be able to run a sample Apache Spark application that runs on NVIDIA GPUs on Databricks.
4 |
5 | Prerequisites
6 | -------------
7 | * Apache Spark 2.4+ running on Databricks Runtime 5.3 ML with GPU, 5.4 ML with GPU, or 5.5 ML with GPU. Make sure the cluster meets the hardware and software requirements below.
8 | * Hardware Requirements
9 | * NVIDIA Pascal™ GPU architecture or better
10 | * Multi-node clusters with homogenous GPU configuration
11 | * Software Requirements
12 | * Ubuntu 16.04/CentOS
13 | * CUDA V10.1/10.0/9.2
14 | * NVIDIA driver compatible with your CUDA
15 | * NCCL 2.4.7
16 |
17 | The number of GPUs per node dictates the number of Spark executors that can run on that node, and each executor should be allowed to run only one task at any given time.
18 |
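As a sketch of what this looks like in Spark configuration (the values are illustrative for a 4-core, single-GPU worker and mirror the settings used in the EMR example notebook in this repo, not Databricks defaults): setting `spark.task.cpus` equal to `spark.executor.cores` yields exactly one concurrent task per executor.

```scala
import org.apache.spark.sql.SparkSession

// cores / task.cpus = 4 / 4 = 1 task slot per executor, so each
// XGBoost worker task owns its executor's single GPU exclusively.
val spark = SparkSession.builder()
  .appName("xgboost-gpu")
  .config("spark.executor.cores", "4")
  .config("spark.task.cpus", "4")
  .config("spark.dynamicAllocation.enabled", "false")
  .getOrCreate()
```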
19 | Start A Databricks Cluster
20 | --------------------------
21 | Create a Databricks cluster (`Clusters` -> `+ Create Cluster`) that meets the above prerequisites.
22 | 1. Make sure to use one of the 5.3 ML with GPU, 5.4 ML with GPU, or 5.5 LTS ML with GPU Databricks runtimes.
23 | 2. Use nodes with 1 GPU each such as p3.xlarge or Standard\_NC6s\_v3. We currently don't support nodes with multiple GPUs. p2 (AWS) and NC12/24 (Azure) nodes do not meet the architecture requirements for the XGBoost worker (although they can be used for the driver node).
24 | 3. Under Autopilot Options, disable autoscaling.
25 | 4. Choose the number of workers that matches the number of GPUs you want to use.
26 | 5. Select a worker type that has 1 GPU for the worker like p3.xlarge or NC6s_v3, for example.
27 |
28 |
29 | * After you start a Databricks cluster, use the initialization notebooks -- [5.3 & 5.4 notebook](/getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.3-5.4.ipynb) or
30 | [5.5 notebook](/getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.5.ipynb)
31 | to set up execution.
32 |
33 | The initialization notebooks perform the following steps:
34 | 1. Download the CUDA and RAPIDS XGBoost4j Spark jars
35 | 2. Create a new directory for the initialization script in the Databricks file system (DBFS)
36 | 3. Create an initialization script inside the new directory that copies the jars into the Databricks jar directory
37 | 4. Download and decompress the Sample Mortgage Notebook dataset
38 |
39 | After executing the steps in the initialization notebook, please follow the next two sections, "Add cluster initialization script" and "Install the xgboost4j_spark jar in the cluster", to ensure the cluster is ready for XGBoost training.
40 |
41 | Add cluster initialization script
42 | ---------------------------
43 | 1. See [Initialization scripts](https://docs.databricks.com/user-guide/clusters/init-scripts.html) for how to configure cluster initialization scripts.
44 | 2. Edit your cluster and add the initialization script dbfs:/databricks/init_scripts/init.sh under the "Init Scripts" tab in "Advanced Options"
45 | 3. Reboot the cluster
46 |
47 |
48 | Install the xgboost4j_spark jar in the cluster
49 | ---------------------------
50 | 1. See [Libraries](https://docs.databricks.com/user-guide/libraries.html) for how to install jars from DBFS
51 | 2. Go to "Libraries" tab under your cluster and install dbfs:/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar in your cluster by selecting the "DBFS" option for installing jars
52 |
53 | These steps ensure you have a GPU cluster ready for importing the XGBoost notebooks or for creating your own XGBoost application for training.
54 |
55 |
56 | Import the GPU Mortgage Example Notebook
57 | ---------------------------
58 | 1. See [Managing Notebooks](https://docs.databricks.com/user-guide/notebooks/notebook-manage.html) on how to import a notebook.
59 | 2. Import the example notebook: [XGBoost4j-Spark mortgage notebook](/examples/notebooks/python/mortgage-gpu.ipynb)
60 | 3. Inside the mortgage example notebook, update the data paths from
61 | "/data/datasets/mortgage-small/train" to "dbfs:/FileStore/tables/mortgage/csv/train/mortgage_train_merged.csv" and from
62 | "/data/datasets/mortgage-small/eval" to "dbfs:/FileStore/tables/mortgage/csv/test/mortgage_eval_merged.csv"
63 |
64 | The example notebook comes with the following configuration; you can adjust it according to your setup.
65 | See supported configuration options here: [xgboost parameters](/examples/app-parameters/supported_xgboost_parameters_python.md)
66 | ```
67 | params = {
68 | 'eta': 0.1,
69 | 'gamma': 0.1,
70 | 'missing': 0.0,
71 | 'treeMethod': 'gpu_hist',
72 | 'maxDepth': 10,
73 | 'maxLeaves': 256,
74 | 'growPolicy': 'depthwise',
75 | 'minChildWeight': 30.0,
76 | 'lambda_': 1.0,
77 | 'scalePosWeight': 2.0,
78 | 'subsample': 1.0,
79 | 'nthread': 1,
80 | 'numRound': 100,
81 | 'numWorkers': 1,
82 | }
83 |
84 | ```
85 |
86 | 4. Run all the cells in the notebook.
87 |
88 | 5. View the results
89 | In cells 5 (Training), 7 (Transforming), and 8 (Accuracy of Evaluation) you will see output like the following.
90 |
91 | ```
92 | --------------
93 | ==> Benchmark:
94 | Training takes 6.48 seconds
95 | --------------
96 |
97 | --------------
98 | ==> Benchmark: Transformation takes 3.2 seconds
99 |
100 | --------------
101 |
102 | ------Accuracy of Evaluation------
103 | Accuracy is 0.9980699597729774
104 |
105 | ```
106 |
107 | * The timings in this Getting Started guide are only illustrative. Please see our [release announcement](https://medium.com/rapids-ai/nvidia-gpus-and-apache-spark-one-step-closer-2d99e37ac8fd) for official benchmarks.
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.3-5.4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Download latest Jars"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/jars/\")"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 3,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "%sh\n",
26 | "cd ../../dbfs/FileStore/jars/\n",
27 | "wget -O cudf-0.9.2.jar https://search.maven.org/remotecontent?filepath=ai/rapids/cudf/0.9.2/cudf-0.9.2.jar\n",
28 | "wget -O xgboost4j_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar\n",
29 | "wget -O xgboost4j-spark_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar\n",
30 | "ls -ltr\n",
31 | "\n",
32 | "# Your Jars are downloaded in dbfs:/FileStore/jars directory"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "### Create a Directory for your init script"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 5,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 6,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "dbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n",
58 | "#!/bin/bash\n",
59 | "sudo cp /dbfs/FileStore/jars/xgboost4j_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j--ml.dmlc__xgboost4j__0.81.jar\n",
60 | "sudo cp /dbfs/FileStore/jars/cudf-0.9.2.jar /databricks/jars/\n",
61 | "sudo cp /dbfs/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j-spark--ml.dmlc__xgboost4j-spark__0.81.jar\"\"\", True)"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Confirm your init script is in the new directory"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 8,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "%sh\n",
78 | "cd ../../dbfs/databricks/init_scripts\n",
79 | "pwd\n",
80 | "ls -ltr"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "### Download the Mortgage Dataset into your local machine and upload Data using import Data"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 10,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/tables/\")"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 11,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "%sh\n",
106 | "cd /dbfs/FileStore/tables/\n",
107 | "wget -O mortgage.zip https://rapidsai-data.s3.us-east-2.amazonaws.com/spark/mortgage.zip\n",
108 | "ls\n",
109 | "unzip mortgage.zip"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 12,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "%sh\n",
119 | "pwd\n",
120 | "cd ../../dbfs/FileStore/tables\n",
121 | "ls -ltr mortgage/csv/*"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "### Next steps\n",
129 | "\n",
130 | "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n",
131 | "2. Reboot the cluster\n",
132 | "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n",
133 | "4. Import the mortgage example notebook from `https://github.com/rapidsai/spark-examples/blob/master/examples/notebooks/python/mortgage-gpu.ipynb`\n",
134 | "5. Inside the mortgage example notebook, update the data paths\n",
135 | " `train_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/train/mortgage_train_merged.csv')`\n",
136 | " `eval_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/test/mortgage_eval_merged.csv')`"
137 | ]
138 | }
139 | ],
140 | "metadata": {
141 | "kernelspec": {
142 | "display_name": "Python 3",
143 | "language": "python",
144 | "name": "python3"
145 | },
146 | "language_info": {
147 | "codemirror_mode": {
148 | "name": "ipython",
149 | "version": 3
150 | },
151 | "file_extension": ".py",
152 | "mimetype": "text/x-python",
153 | "name": "python",
154 | "nbconvert_exporter": "python",
155 | "pygments_lexer": "ipython3",
156 | "version": "3.7.4"
157 | },
158 | "name": "Init Scripts_demo",
159 | "notebookId": 2585487876834579
160 | },
161 | "nbformat": 4,
162 | "nbformat_minor": 1
163 | }
164 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/databricks/init-notebook-for-rapids-spark-xgboost-on-databricks-gpu-5.5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Download latest Jars"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/jars/\")"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 3,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "%sh\n",
26 | "cd ../../dbfs/FileStore/jars/\n",
27 | "wget -O cudf-0.9.2-cuda10.jar https://search.maven.org/remotecontent?filepath=ai/rapids/cudf/0.9.2/cudf-0.9.2-cuda10.jar\n",
28 | "wget -O xgboost4j_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j_2.x/1.0.0-Beta5/xgboost4j_2.x-1.0.0-Beta5.jar\n",
29 | "wget -O xgboost4j-spark_2.x-1.0.0-Beta5.jar https://search.maven.org/remotecontent?filepath=ai/rapids/xgboost4j-spark_2.x/1.0.0-Beta5/xgboost4j-spark_2.x-1.0.0-Beta5.jar\n",
30 | "ls -ltr\n",
31 | "\n",
32 | "# Your Jars are downloaded in dbfs:/FileStore/jars directory"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "### Create a Directory for your init script"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 5,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 6,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "dbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n",
58 | "#!/bin/bash\n",
59 | "sudo cp /dbfs/FileStore/jars/xgboost4j_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j--ml.dmlc__xgboost4j__0.90.jar\n",
60 | "sudo cp /dbfs/FileStore/jars/cudf-0.9.2-cuda10.jar /databricks/jars/\n",
61 | "sudo cp /dbfs/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar /databricks/jars/spark--maven-trees--ml--xgboost--ml.dmlc--xgboost4j-spark--ml.dmlc__xgboost4j-spark__0.90.jar\"\"\", True)"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Confirm your init script is in the new directory"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 8,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "%sh\n",
78 | "cd ../../dbfs/databricks/init_scripts\n",
79 | "pwd\n",
80 | "ls -ltr"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "### Download the Mortgage Dataset into your local machine and upload Data using import Data"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 10,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "dbutils.fs.mkdirs(\"dbfs:/FileStore/tables/\")"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 11,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "%sh\n",
106 | "cd /dbfs/FileStore/tables/\n",
107 | "wget -O mortgage.zip https://rapidsai-data.s3.us-east-2.amazonaws.com/spark/mortgage.zip\n",
108 | "ls\n",
109 | "unzip mortgage.zip"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 12,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "%sh\n",
119 | "pwd\n",
120 | "cd ../../dbfs/FileStore/tables\n",
121 | "ls -ltr mortgage/csv/*"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "### Next steps\n",
129 | "\n",
130 | "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n",
131 | "2. Reboot the cluster\n",
132 | "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark_2.x-1.0.0-Beta5.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n",
133 | "4. Import the mortgage example notebook from `https://github.com/rapidsai/spark-examples/blob/master/examples/notebooks/python/mortgage-gpu.ipynb`\n",
134 | "5. Inside the mortgage example notebook, update the data paths\n",
135 | " `train_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/train/mortgage_train_merged.csv')`\n",
136 | " `eval_data = GpuDataReader(spark).schema(schema).option('header', True).csv('dbfs:/FileStore/tables/mortgage/csv/test/mortgage_eval_merged.csv')`"
137 | ]
138 | }
139 | ],
140 | "metadata": {
141 | "kernelspec": {
142 | "display_name": "Python 3",
143 | "language": "python",
144 | "name": "python3"
145 | },
146 | "language_info": {
147 | "codemirror_mode": {
148 | "name": "ipython",
149 | "version": 3
150 | },
151 | "file_extension": ".py",
152 | "mimetype": "text/x-python",
153 | "name": "python",
154 | "nbconvert_exporter": "python",
155 | "pygments_lexer": "ipython3",
156 | "version": "3.7.4"
157 | },
158 | "name": "Init Scripts_demo",
159 | "notebookId": 2585487876834616
160 | },
161 | "nbformat": 4,
162 | "nbformat_minor": 1
163 | }
164 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/databricks/xgb_python_gpu_perf_blog.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"markdown","source":["## GPU based PySpark XGBoost"],"metadata":{}},{"cell_type":"markdown","source":["##### Importing XGBoost, hyperopt, scikit learn, pandas and other helper function packages"],"metadata":{}},{"cell_type":"code","source":["import xgboost as xgb\n\nfrom hyperopt import hp, fmin, tpe, STATUS_OK, SparkTrials\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\n\nimport numpy as np\nimport pandas as pd\n\nimport os\nimport shutil\nimport tempfile"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"markdown","source":["## Data Loading"],"metadata":{}},{"cell_type":"markdown","source":["##### For a large dataset, broadcasting the dataset would take significant cluster resources. We store the data on DBFS and load it back on workers via DBFS' local file interface.\n\nSee Databricks best practices for HyperOpt: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html"],"metadata":{}},{"cell_type":"code","source":["def load(path):\n \"\"\"\n Loads saved data (a tuple of numpy arrays).\n Refernce: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html\n \"\"\"\n return list(np.load(path).values())\n \ndef save_to_dbfs(data):\n \"\"\"\n Saves input data (a tuple of numpy arrays) to a temporary file on DBFS and returns its path.\n Refernce: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html\n \"\"\"\n # Save data to a local file first.\n data_filename = \"data.npz\"\n local_data_dir = tempfile.mkdtemp()\n local_data_path = os.path.join(local_data_dir, data_filename)\n np.savez(local_data_path, *data)\n # Move it to DBFS, which is shared among cluster nodes.\n dbfs_tmp_dir = \"/dbfs/ml/tmp/hyperopt\"\n os.makedirs(dbfs_tmp_dir, exist_ok=True)\n dbfs_data_dir = tempfile.mkdtemp(dir=dbfs_tmp_dir) \n dbfs_data_path = os.path.join(dbfs_data_dir, data_filename) \n shutil.move(local_data_path, dbfs_data_path)\n return dbfs_data_path"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"markdown","source":["##### Preparing XGBoost Data"],"metadata":{}},{"cell_type":"code","source":["def prepare_xgb_data(data, id_col=\"Id\", label_col=\"Label\", test_size=0.2):\n \"\"\"\n Prepare data for xgboost training\n \"\"\"\n # Make sure last column is label, first col\n data[label_col+\"Temp\"] = data[label_col]\n data = data.drop([id_col, label_col], axis=1)\n data.rename(columns={label_col+\"Temp\": label_col}, inplace=True)\n \n # Prepare data\n X, y = data.iloc[:,:-1],data.iloc[:,-1]\n data_dmatrix = xgb.DMatrix(data=X,label=y)\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=21)\n return X_train, X_test, y_train, y_test\n \ndef get_raw_data(file_name, sample_size):\n input_file_location = \"/dbfs/FileStore/tables/\" + file_name\n pdf = pd.read_csv(input_file_location).dropna().sample(n=sample_size)\n return pdf"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"markdown","source":["## XGBoost Regression with Hyperopt + Spark Trials"],"metadata":{}},{"cell_type":"code","source":["def objective(space):\n \"\"\"\n Train and search input space\n \"\"\"\n clf = xgb.XGBRegressor(objective ='reg:squarederror', \n n_estimators = int(space['n_estimators']),\n colsample_bytree = space['colsample_bytree'],\n learning_rate = space['learning_rate'],\n max_depth = int(space['max_depth']),\n alpha = 
space['alpha'],\n tree_method= space['tree_method']\n )\n \n # Load data\n data = load(data_large_path)\n X_train, X_test, y_train, y_test = data[0], data[1], data[2], data[3]\n eval_set = [(X_train, y_train), (X_test, y_test)]\n\n # Train\n clf.fit(X_train, y_train,\n eval_set=eval_set, eval_metric=\"rmse\",\n early_stopping_rounds=10,verbose=False)\n \n # Validate\n pred = clf.predict(X_test)\n mse_scr = mean_squared_error(y_test, pred)\n\n return {'loss': mse_scr, 'status': STATUS_OK}\n\ndef run_hyperopt(df, treemethod, parallelism, max_evals):\n \"\"\"\n Run hyperopt and return best params\n \"\"\"\n # Hyperopt search space\n space ={'max_depth': hp.quniform('max_depth', 4, 16, 1),\n 'alpha' : hp.uniform('alpha', 1, 10),\n 'colsample_bytree' : hp.uniform('colsample_bytree', 0.1, 1),\n 'learning_rate' : hp.uniform('learning_rate', 0.1, 1),\n 'n_estimators': hp.quniform('n_estimators', 25, 500, 25),\n 'tree_method': treemethod\n }\n if parallelism is None:\n trials = SparkTrials()\n else:\n trials = SparkTrials(parallelism=parallelism)\n\n # Hyperopt\n best_param = fmin(fn=objective,\n space=space,\n algo=tpe.suggest,\n max_evals=max_evals,\n trials=trials)\n print(best_param)\n \n return best_param"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"markdown","source":["## Train"],"metadata":{}},{"cell_type":"markdown","source":["##### Parallelism parameter is set \"2\" for 2 GPUs, which is effectively using 2 GPUs in parallel. Each new hyperparameter setting tested will be chosen based on previous results. Setting parallelism in between 1 and max_evals allows you to trade off scalability (getting results faster) and adaptiveness (sometimes getting better models). For GPU, is is advised to set number of GPUs used for training."],"metadata":{}},{"cell_type":"code","source":["# Dataset\nfile_name = \"your_file_name.csv\" # dataset file name\nid_col=\"unique_id_column_name\" # unique id for each row\nlabel_col=\"label_column_name\" # label column name\n\n# Load data\ndf = get_raw_data(file_name=file_name, sample_size=10000)\ndata_large = prepare_xgb_data(df, id_col=id_col, label_col=label_col, test_size=0.2)\ndata_large_path = save_to_dbfs(data_large)\n\n# Run training\nbest_param = run_hyperopt(df, treemethod='gpu_hist', parallelism=2, max_evals=10) # Set parallelism = Number of GPUs\n\n# Cleanup\nshutil.rmtree(data_large_path, ignore_errors=True)"],"metadata":{},"outputs":[],"execution_count":13}],"metadata":{"name":"xgb_python_gpu_perf_blog","notebookId":323},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/gcp/spark-gpu/README.md:
--------------------------------------------------------------------------------
1 | # RAPIDS Spark GPU
2 |
3 | This initialization action deploys the dependencies of [RAPIDS Spark GPU](https://github.com/rapidsai/spark-examples) on a
4 | [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster.
5 |
6 | Prerequisites
7 | -------------
8 | * Apache Spark 2.3+
9 | * Hardware Requirements
10 | * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later)
11 | * Multi-node clusters with homogeneous GPU configuration
12 | * Software Requirements
13 | * NVIDIA driver 410.48+
14 | * CUDA V10.1/10.0/9.2
15 | * NCCL 2.4.7 and later
16 | * `EXCLUSIVE_PROCESS` must be set for all GPUs in each NodeManager. (The initialization script provided in this guide sets this mode by default.)
17 | * `spark.dynamicAllocation.enabled` must be set to `false` for Spark (see the example below)
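
  As an illustration (a sketch only; `your-app.jar` is a placeholder), dynamic allocation can also be disabled per job at submit time:

  ```bash
  spark-submit \
    --conf spark.dynamicAllocation.enabled=false \
    --conf spark.shuffle.service.enabled=false \
    your-app.jar
  ```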
18 |
19 | Our initialization action does the following:
20 |
21 | ### Step 1. Initialization steps to download required files for Spark RAPIDS XGBoost app
22 |
23 | 1. Git clone the [spark-examples repository](https://github.com/rapidsai/spark-examples) to your local machine.
24 | 2. Upload the necessary files into your GCP bucket by executing the following commands.
25 |
26 | ```bash
27 | cd spark-examples/
28 | export GCS_BUCKET=my-bucket
29 | export RAPIDS_SPARK_VERSION='2.x-1.0.0-Beta5'
30 | export RAPIDS_CUDF_VERSION='0.9.2-cuda10'
31 | pushd datasets/
32 | tar -xvf mortgage-small.tar.gz
33 | gsutil cp -r mortgage-small/ gs://$GCS_BUCKET/
34 | popd
35 | wget -O cudf-${RAPIDS_CUDF_VERSION}.jar https://repo1.maven.org/maven2/ai/rapids/cudf/${RAPIDS_CUDF_VERSION%-*}/cudf-${RAPIDS_CUDF_VERSION}.jar
36 | wget -O xgboost4j_${RAPIDS_SPARK_VERSION}.jar https://repo1.maven.org/maven2/ai/rapids/xgboost4j_${RAPIDS_SPARK_VERSION/-/\/}/xgboost4j_${RAPIDS_SPARK_VERSION}.jar
37 | wget -O xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar https://repo1.maven.org/maven2/ai/rapids/xgboost4j-spark_${RAPIDS_SPARK_VERSION/-/\/}/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar
38 | gsutil cp cudf-${RAPIDS_CUDF_VERSION}.jar xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar xgboost4j_${RAPIDS_SPARK_VERSION}.jar gs://$GCS_BUCKET/
39 | ```
40 |
41 | After that, open the Google Cloud Platform console in your browser and verify that your Google storage bucket "my-bucket" contains the following files:
42 | * gs://my-bucket/cudf-${RAPIDS_CUDF_VERSION}.jar
43 | * gs://my-bucket/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar
44 | * gs://my-bucket/xgboost4j_${RAPIDS_SPARK_VERSION}.jar
45 | * gs://my-bucket/mortgage-small/train/mortgage-small.csv
46 | * gs://my-bucket/mortgage-small/eval/mortgage-small.csv
47 | * gs://my-bucket/mortgage-small/trainWithEval/test.csv
48 |
49 |
50 | ### Step 2. Use the `gcloud` command to create a new cluster with this initialization action
51 |
52 | The following command will create a new cluster named `$CLUSTER_NAME`.
53 |
54 | ```bash
55 | export CLUSTER_NAME=my-gpu-cluster
56 | export ZONE=us-central1-b
57 | export REGION=us-central1
58 | export GCS_BUCKET=my-bucket
59 | export INIT_ACTIONS_BUCKET=my-bucket
60 | export NUM_GPUS=2
61 | export NUM_WORKERS=2
62 | export RAPIDS_SPARK_VERSION='2.x-1.0.0-Beta5'
63 | export RAPIDS_CUDF_VERSION='0.9.2-cuda10'
64 |
65 | gcloud beta dataproc clusters create $CLUSTER_NAME \
66 | --zone $ZONE \
67 | --region $REGION \
68 | --master-machine-type n1-standard-32 \
69 | --master-boot-disk-size 50 \
70 | --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \
71 | --worker-machine-type n1-standard-32 \
72 | --worker-boot-disk-size 50 \
73 | --num-worker-local-ssds 1 \
74 | --num-workers $NUM_WORKERS \
75 | --image-version 1.4-ubuntu18 \
76 | --bucket $GCS_BUCKET \
77 | --metadata JUPYTER_PORT=8123,INIT_ACTIONS_REPO="gs://$INIT_ACTIONS_BUCKET",linux-dist="ubuntu",GCS_BUCKET="gs://$GCS_BUCKET" \
78 |     --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh \
79 | --optional-components=ANACONDA,JUPYTER \
80 | --subnet=default \
81 | --properties "^#^spark:spark.dynamicAllocation.enabled=false#spark:spark.shuffle.service.enabled=false#spark:spark.submit.pyFiles=/usr/lib/spark/python/lib/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar#spark:spark.jars=/usr/lib/spark/jars/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar,/usr/lib/spark/jars/xgboost4j_${RAPIDS_SPARK_VERSION}.jar,/usr/lib/spark/jars/cudf-${RAPIDS_CUDF_VERSION}.jar" \
82 | --enable-component-gateway
83 | ```
84 |
85 | ### Step 3. Execute the sample app
86 |
87 | Once the cluster has been created, the YARN ResourceManager can be accessed on port `8088` on the Dataproc master
88 | node.
89 |
90 | To connect to the Dataproc web interfaces, you will need to create an SSH tunnel as
91 | described in the
92 | [dataproc web interfaces](https://cloud.google.com/dataproc/cluster-web-interfaces)
93 | documentation.
94 |
95 | See
96 | [the Mortgage example](https://github.com/rapidsai/spark-examples/tree/master/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage)
97 | that demonstrates end-to-end XGBoost4j in Spark, including data pre-processing and model
98 | training with RAPIDS Spark GPU APIs. Additional examples
99 | [are available](https://github.com/rapidsai/spark-examples/tree/master/examples). See the
100 | [RAPIDS Spark GPU API documentation](https://github.com/rapidsai/spark-examples/tree/master/api-docs) for API details.
101 |
102 | To submit such a job run:
103 |
104 | ```bash
105 | export MAIN_CLASS=ai.rapids.spark.examples.mortgage.GPUMain
106 | export RAPIDS_JARS=gs://$GCS_BUCKET/spark-gpu/sample_xgboost_apps-0.1.5-jar-with-dependencies.jar
107 | export DATA_PATH=gs://$GCS_BUCKET
108 | export TREE_METHOD=gpu_hist
109 | export SPARK_NUM_EXECUTORS=4
110 | export CLUSTER_NAME=my-gpu-cluster
111 | export REGION=us-central1
112 |
113 | gcloud beta dataproc jobs submit spark \
114 | --cluster=$CLUSTER_NAME \
115 | --region=$REGION \
116 | --class=$MAIN_CLASS \
117 | --jars=$RAPIDS_JARS \
118 | --properties=spark.executor.cores=1,spark.executor.instances=${SPARK_NUM_EXECUTORS},spark.executor.memory=8G,spark.executorEnv.LD_LIBRARY_PATH=/usr/local/lib/x86_64-linux-gnu:/usr/local/cuda-10.0/lib64:${LD_LIBRARY_PATH} \
119 | -- \
120 | -format=csv \
121 | -numRound=100 \
122 | -numWorkers=${SPARK_NUM_EXECUTORS} \
123 | -treeMethod=${TREE_METHOD} \
124 | -trainDataPath=${DATA_PATH}/mortgage-small/train/mortgage_small.csv \
125 | -evalDataPath=${DATA_PATH}/mortgage-small/eval/mortgage_small.csv \
126 | -maxDepth=8
127 | ```
128 |
129 |
130 | RAPIDS Spark GPU is a relatively young project with APIs evolving quickly. If you
131 | encounter unexpected errors or have feature requests, please file them at the
132 | relevant [RAPIDS Spark example repo](https://github.com/rapidsai/spark-examples).
133 |
134 | ### Options
135 |
136 | #### GPU Types & Driver Configuration
137 |
138 | By default, these initialization actions install CUDA 10.0 with the NVIDIA 418 driver. If you wish
139 | to install a different version, pass the following `metadata` to the initialization action. The available options are:
140 |
141 | ```
142 | cuda-version='10-0'
143 | nccl-url='https://developer.nvidia.com/compute/machine-learning/nccl/secure/v2.4/prod/nccl-repo-ubuntu1804-2.4.8-ga-cuda10.0_1-1_amd64.deb'
144 | nccl-version='2.4.8'
145 | ```
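
For example (a sketch; the remaining flags are the same as in the Step 2 command above), the metadata can be supplied at cluster-creation time:

```bash
gcloud beta dataproc clusters create $CLUSTER_NAME \
  ... \
  --metadata cuda-version='10-0',nccl-version='2.4.8'
```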
146 |
147 | ## Important notes
148 |
149 | * RAPIDS Spark GPU is supported on Pascal or newer GPU architectures (Tesla K80s will
150 |   _not_ work with RAPIDS). See the
151 | [list](https://cloud.google.com/compute/docs/gpus/) of available GPU types
152 | by GCP region.
153 | * You must set a GPU accelerator type for worker nodes, otherwise
154 | the GPU driver install will fail and the cluster will report an error state.
155 | * When running RAPIDS Spark GPU with multiple attached GPUs, we recommend an
156 | n1-standard-32 worker machine type or better to ensure sufficient
157 |   host memory for buffering data to and from GPUs. When running with a single
158 | attached GPU, GCP only permits machine types up to 24 vCPUs.
159 |
160 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/gcp/spark-gpu/internal/install-gpu-driver-debian.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -euxo pipefail
4 |
5 | readonly DEFAULT_GPU_DRIVER_URL='http://us.download.nvidia.com/tesla/418.87/NVIDIA-Linux-x86_64-418.87.00.run'
6 | readonly GPU_DRIVER_URL=$(/usr/share/google/get_metadata_value attributes/gpu-driver-url ||
7 | echo -n "${DEFAULT_GPU_DRIVER_URL}")
8 |
9 | readonly DEFAULT_CUDA_URL='https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux'
10 | readonly CUDA_URL=$(/usr/share/google/get_metadata_value attributes/gpu-cuda-url ||
11 | echo -n "${DEFAULT_CUDA_URL}")
12 |
13 | readonly DEFAULT_CUDA_VERSION='10-0'
14 | readonly CUDA_VERSION=$(/usr/share/google/get_metadata_value attributes/cuda-version ||
15 | echo -n "${DEFAULT_CUDA_VERSION}")
16 |
17 | readonly DEFAULT_NCCL_URL='https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb'
18 | readonly NCCL_URL=$(/usr/share/google/get_metadata_value attributes/nccl-url ||
19 | echo -n "${DEFAULT_NCCL_URL}")
20 |
21 | readonly DEFAULT_NCCL_VERSION='2.4.8'
22 | readonly NCCL_VERSION=$(/usr/share/google/get_metadata_value attributes/nccl-version ||
23 | echo -n "${DEFAULT_NCCL_VERSION}")
24 |
25 | apt-get update
26 | DEBIAN_FRONTEND=noninteractive apt-get install -y pciutils "linux-headers-$(uname -r)"
27 |
28 | wget --progress=dot:mega -O driver.run "${GPU_DRIVER_URL}"
29 | chmod +x "./driver.run"
30 | "./driver.run" --silent
31 |
32 | wget --progress=dot:mega -O cuda.run "${CUDA_URL}"
33 | chmod +x "./cuda.run"
34 | "./cuda.run" --silent --toolkit --no-opengl-libs
35 |
36 | wget --progress=dot:mega -O nccl.deb "${NCCL_URL}"
37 | chmod +x "./nccl.deb"
38 | dpkg -i nccl.deb
39 | apt update
40 | apt install "libnccl2=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" "libnccl-dev=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" -y
41 |
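# Put the GPUs into EXCLUSIVE_PROCESS compute mode, as required for the XGBoost workers.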
42 | /usr/bin/nvidia-smi -c EXCLUSIVE_PROCESS
43 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/gcp/spark-gpu/internal/install-gpu-driver-ubuntu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -euxo pipefail
4 |
5 | readonly DEFAULT_CUDA_VERSION='10-0'
6 | readonly CUDA_VERSION=$(/usr/share/google/get_metadata_value attributes/cuda-version ||
7 | echo -n "${DEFAULT_CUDA_VERSION}")
8 |
9 | readonly DEFAULT_NCCL_URL='https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb'
10 | readonly NCCL_URL=$(/usr/share/google/get_metadata_value attributes/nccl-url ||
11 | echo -n "${DEFAULT_NCCL_URL}")
12 |
13 | readonly DEFAULT_NCCL_VERSION='2.4.8'
14 | readonly NCCL_VERSION=$(/usr/share/google/get_metadata_value attributes/nccl-version ||
15 | echo -n "${DEFAULT_NCCL_VERSION}")
16 |
17 | apt-get update
18 | apt-get install -y build-essential
19 |
20 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
21 | mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
22 | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
23 | add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
24 | apt-get update
25 |
26 | if [[ "${CUDA_VERSION}" != '10-0' ]]; then
27 | apt-get -y install cuda
28 | else
29 | apt-get -y install cuda-10-0
30 | fi
31 |
32 | wget --progress=dot:mega -O nccl.deb "${NCCL_URL}"
33 | dpkg -i nccl.deb
34 | apt update
35 | apt install "libnccl2=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" "libnccl-dev=${NCCL_VERSION}-1+cuda${CUDA_VERSION//\-/\.}" -y
36 |
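# Put the GPUs into EXCLUSIVE_PROCESS compute mode, as required for the XGBoost workers.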
37 | /usr/bin/nvidia-smi -c EXCLUSIVE_PROCESS
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/getting-started-guides/csp/gcp/spark-gpu/rapids.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -euxo pipefail
4 |
5 | readonly ROLE=$(/usr/share/google/get_metadata_value attributes/dataproc-role)
6 | readonly LINUX_DIST=$(/usr/share/google/get_metadata_value attributes/linux-dist)
7 |
8 | readonly DEFAULT_INIT_ACTIONS_REPO=gs://dataproc-initialization-actions
9 | readonly INIT_ACTIONS_REPO="$(/usr/share/google/get_metadata_value attributes/INIT_ACTIONS_REPO ||
10 | echo ${DEFAULT_INIT_ACTIONS_REPO})"
11 |
12 | readonly DEFAULT_GCS_BUCKET=gs://my-bucket
13 | readonly GCS_BUCKET="$(/usr/share/google/get_metadata_value attributes/GCS_BUCKET ||
14 | echo ${DEFAULT_GCS_BUCKET})"
15 |
16 | readonly DEFAULT_RAPIDS_SPARK_VERSION=2.x-1.0.0-Beta5
17 | readonly RAPIDS_SPARK_VERSION="$(/usr/share/google/get_metadata_value attributes/RAPIDS_SPARK_VERSION ||
18 | echo ${DEFAULT_RAPIDS_SPARK_VERSION})"
19 |
20 | readonly DEFAULT_RAPIDS_CUDF_VERSION=0.9.2-cuda10
21 | readonly RAPIDS_CUDF_VERSION="$(/usr/share/google/get_metadata_value attributes/RAPIDS_CUDF_VERSION ||
22 | echo ${DEFAULT_RAPIDS_CUDF_VERSION})"
23 |
24 | echo "Cloning RAPIDS initialization action from '${INIT_ACTIONS_REPO}' ..."
25 | RAPIDS_INIT_ACTION_DIR=$(mktemp -d -t rapids-init-action-XXXX)
26 | readonly RAPIDS_INIT_ACTION_DIR
27 | gsutil -m rsync -r "${INIT_ACTIONS_REPO}/spark-gpu" "${RAPIDS_INIT_ACTION_DIR}"
28 |
29 | if [[ "${LINUX_DIST}" == 'ubuntu' ]]; then
30 | mv "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver-ubuntu.sh" "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver.sh"
31 | else
32 | mv "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver-debian.sh" "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver.sh"
33 | fi
34 | find "${RAPIDS_INIT_ACTION_DIR}" -name '*.sh' -exec chmod +x {} \;
35 |
36 | if [[ "${ROLE}" != 'Master' ]]; then
37 | # Ensure we have GPU drivers installed.
38 | "${RAPIDS_INIT_ACTION_DIR}/internal/install-gpu-driver.sh"
39 | else
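# On the master node, stage the XGBoost4J and cuDF jars where Spark and PySpark will pick them up.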
40 | gsutil cp ${GCS_BUCKET}/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar /usr/lib/spark/python/lib/
41 | gsutil cp ${GCS_BUCKET}/xgboost4j-spark_${RAPIDS_SPARK_VERSION}.jar /usr/lib/spark/jars/
42 | gsutil cp ${GCS_BUCKET}/xgboost4j_${RAPIDS_SPARK_VERSION}.jar /usr/lib/spark/jars/
43 | gsutil cp ${GCS_BUCKET}/cudf-${RAPIDS_CUDF_VERSION}.jar /usr/lib/spark/jars/
44 | fi
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/getting-started-guides/notebook/python-notebook.md:
--------------------------------------------------------------------------------
1 | Get Started with XGBoost4J-Spark with Jupyter Notebook
2 | ===================================================================
3 | This is a getting started guide to XGBoost4J-Spark using a [Jupyter notebook](https://jupyter.org/). At the end of this guide, the reader will be able to run a sample notebook that runs on NVIDIA GPUs.
4 |
5 | Before you begin, please ensure that you have set up a [Spark Standalone Cluster](/getting-started-guides/on-prem-cluster/standalone-python.md).
6 |
7 | It is assumed that the `SPARK_MASTER` and `SPARK_HOME` environment variables are defined, pointing to the Spark master URL (e.g. `spark://localhost:7077`) and the Apache Spark home directory, respectively.
8 |
9 | 1. Make sure you have [Jupyter notebook installed](https://jupyter.org/install.html). If you install it with conda, please make sure your Python version is consistent.
10 |
11 | 2. Make sure `SPARK_JARS` and `SPARK_PY_FILES` are set properly. Note that *cudf-0.9.2-cuda10.jar* is used here as an example; choose the *cudf-0.9.2* jar that matches your environment. You may need to update these environment variables if your working directory changes:
12 | ```
13 | export LIBS_PATH=[full path to xgboost4j_spark/libs]
14 | export SPARK_JARS=${LIBS_PATH}/cudf-0.9.2-cuda10.jar,${LIBS_PATH}/xgboost4j_2.x-1.0.0-Beta5.jar,${LIBS_PATH}/xgboost4j-spark_2.x-1.0.0-Beta5.jar
15 | export SPARK_PY_FILES=${LIBS_PATH}/xgboost4j-spark_2.x-1.0.0-Beta5.jar
16 | ```
17 |
18 | 3. Go to the project root directory and launch the notebook:
19 | ```
20 | PYSPARK_DRIVER_PYTHON=jupyter \
21 | PYSPARK_DRIVER_PYTHON_OPTS=notebook \
22 | pyspark \
23 | --master ${SPARK_MASTER} \
24 | --jars ${SPARK_JARS} \
25 | --py-files ${SPARK_PY_FILES}
26 | ```
27 |
28 | Then, once your notebook has started, open [`mortgage-gpu.ipynb`](/examples/notebooks/python/mortgage-gpu.ipynb) to explore.
29 |
--------------------------------------------------------------------------------
/getting-started-guides/notebook/toree.md:
--------------------------------------------------------------------------------
1 | Get Started with XGBoost4J-Spark with Apache Toree Jupyter Notebook
2 | ===================================================================
3 | This is a getting started guide to XGBoost4J-Spark using an [Apache Toree](https://toree.apache.org/) Jupyter notebook. At the end of this guide, the reader will be able to run a sample notebook that runs on NVIDIA GPUs.
4 |
5 | Before you begin, please ensure that you have setup a [Spark Standalone Cluster](/getting-started-guides/on-prem-cluster/standalone-scala.md).
6 |
7 | It is assumed that the `SPARK_MASTER` and `SPARK_HOME` environment variables are defined, pointing to the Spark master URL (e.g. `spark://localhost:7077`) and the Apache Spark home directory, respectively.
8 |
9 | 1. Make sure you have Jupyter notebook installed, then
10 | install Toree:
11 | ```
12 | pip install toree
13 | ```
14 |
15 | 2. Install kernel configured for our example:
16 | ```
17 | export SPARK_EXAMPLES=[full path to spark-examples repo]
18 | export SPARK_JARS=${SPARK_EXAMPLES}/sample_xgboost_apps-0.1.5-jar-with-dependencies.jar
19 |
20 | jupyter toree install \
21 | --spark_home=${SPARK_HOME} \
22 | --user \
23 | --kernel_name="XGBoost4j-Spark" \
24 | --spark_opts="--master ${SPARK_MASTER} --jars ${SPARK_JARS}"
25 | ```
26 |
27 | 3. Launch the notebook:
28 | ```
29 | jupyter notebook
30 | ```
31 |
32 | Then, once your notebook has started, open [`mortgage-gpu.ipynb`](/examples/notebooks/scala/mortgage-gpu.ipynb) to explore.
33 |
34 | Please ensure that the *XGBoost4j-Spark* kernel is running.
35 |
--------------------------------------------------------------------------------
/gpu_executor_template.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | spec:
4 | containers:
5 | - name: executor
6 | resources:
7 | limits:
8 | nvidia.com/gpu: 1
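# A minimal usage sketch (assumption: Spark 3.0+ pod-template support on
# Kubernetes; the submit command below is illustrative, not part of this repo):
#
#   spark-submit \
#     --conf spark.kubernetes.executor.podTemplateFile=gpu_executor_template.yaml \
#     ...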
9 |
10 |
--------------------------------------------------------------------------------
/tools/jupyter_gpu_count_estimation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Script to estimate at least GPU count\n",
8 | "\n",
9 | "This script which typically simulates the GPU memory consumption flow is used to estimate the GPU minimum count by giving some parameters including rows and columns and others. The more precise parameters are, the more accurate the output is.\n",
10 | "\n",
11 | "| parameters | |\n",
12 | "|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n",
13 | "| SINGLE_GPU_MEMORY_SIZE | The size of one gpu memory on device, you can get it by `nvidia-smi` |\n",
14 | "| NUM_OF_FEATURE_COLUMNS | The total feature columns of input dataset. |\n",
15 | "| NUM_OF_WEIGHT_COLUMNS | The total weight columns of input dataset. If no weight column, it should be set to 0. |\n",
16 | "| NUM_OF_GROUPS | the size of prediction per instance. This value is set to 1 for all tasks except multi-class classification. For multi-class classification, NUM_OF_GROUPS must be set to the number of classes |\n",
17 | "| SPARSITY | sparsity of input dataset. (1 - NON_ZEROR_COUNT(A)/TOTAL_COUNT_A) |\n",
18 | "| MAX_BIN | maximum number of discrete bins to bucket continuous features. Default is 16 |\n",
19 | "\n",
20 | "---\n",
21 | "\n",
22 | "- ROW_STRIDE\n",
23 | "\n",
24 | "As to ROW_STRIDE, which is the largest number of features/items across all rows the input dataset. \n",
25 | "You can calculate it by \n",
26 | "```shell\n",
27 | "cat xxxx | awk -F, '{$NF=\"\"; print $0}' | sort -n -r | head -1 | awk '{for(i=0;i 0\n",
49 | "\n",
50 | "SPARSITY = 0.5 #(1 - NON_ZEROR_COUNT(A)/TOTAL_COUNT_A)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 2,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "# Below parameters can also affects the result, all of them are default values\n",
60 | "\n",
61 | "MAX_BIN = 16 # max_bin default value. It is 256 in native xgboost, while it is 16 in xgboost-4j\n",
62 | "\n",
63 | "# ROW_STRIDE: it should be <= NUM_OF_FEATURE_COLUMNS\n",
64 | "# let's assume last column is feature column, so the ROW_STRIDE can be calculated with below script\n",
65 | "# cat xxxx | awk -F, '{$NF=\"\"; print $0}' | sort -n -r | head -1 | awk '{for(i=0;i 0\n",
48 | "\n",
49 | "SPARSITY = 1 #(1 - NON_ZEROR_COUNT(A)/TOTAL_COUNT_A)"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "MAX_BIN = 16 # max_bin default value, It is 256 in native xgboost, while it is 16 in xgboost-4j\n",
59 | "\n",
60 |     "# ROW_STRIDE: it should be <= NUM_OF_FEATURE_COLUMNS\n",
61 |     "# let's assume last column is feature column, so the ROW_STRIDE can be calculated with below script\n",
62 |     "# cat xxxx | awk -F, '{$NF=\"\"; print $0}' | sort -n -r | head -1 | awk '{for(i=0;i<NF;i++) if ($i != \"\") n++; print n}'\n",
169 |     " if ... > GPU_MEMORY:\n",
170 | " print(\"\\nMax loadable rows:%d Given cols:%d on GPU:%d G\\n\" % (rows_to_load, TOTAL_COLUMNS, GPU_MEMORY/1024/1024/1024))\n",
171 | " break\n",
172 | " loadable\n",
173 | " rows_to_load += step # speed up"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": []
182 | }
183 | ],
184 | "metadata": {
185 | "kernelspec": {
186 | "display_name": "rapids",
187 | "language": "python",
188 | "name": "rapids"
189 | },
190 | "language_info": {
191 | "codemirror_mode": {
192 | "name": "ipython",
193 | "version": 3
194 | },
195 | "file_extension": ".py",
196 | "mimetype": "text/x-python",
197 | "name": "python",
198 | "nbconvert_exporter": "python",
199 | "pygments_lexer": "ipython3",
200 | "version": "3.6.7"
201 | }
202 | },
203 | "nbformat": 4,
204 | "nbformat_minor": 2
205 | }
206 |
--------------------------------------------------------------------------------