├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation-request.md │ ├── feature_request.md │ └── submit-question.md └── workflows │ ├── add-to-project.yml │ ├── license-header-check.yml │ └── signoff-check.yml ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── SECURITY.md ├── TPC EULA.txt ├── nds-h ├── README.md ├── nds_h_gen_data.py ├── nds_h_gen_query_stream.py ├── nds_h_power.py ├── nds_h_schema.py ├── nds_h_transcode.py ├── nds_h_validate.py └── tpch-gen │ ├── Makefile │ ├── patches │ └── template.patch │ ├── pom.xml │ └── src │ └── main │ └── java │ └── org │ └── nvidia │ └── nds_h │ └── GenTable.java ├── nds ├── PysparkBenchReport.py ├── README.md ├── base.template ├── bench.yml ├── check.py ├── cicd │ └── settings.xml ├── convert_submit_cpu.template ├── convert_submit_cpu_delta.template ├── convert_submit_cpu_iceberg.template ├── convert_submit_gpu.template ├── data_maintenance │ ├── DF_CS.sql │ ├── DF_I.sql │ ├── DF_SS.sql │ ├── DF_WS.sql │ ├── LF_CR.sql │ ├── LF_CS.sql │ ├── LF_I.sql │ ├── LF_SR.sql │ ├── LF_SS.sql │ ├── LF_WR.sql │ └── LF_WS.sql ├── jvm_listener │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── com │ │ └── nvidia │ │ └── spark │ │ └── rapids │ │ └── listener │ │ ├── Listener.scala │ │ ├── Manager.scala │ │ └── TaskFailureListener.scala ├── maintenance_delta.template ├── maintenance_iceberg.template ├── nds-throughput ├── nds_bench.py ├── nds_gen_data.py ├── nds_gen_query_stream.py ├── nds_maintenance.py ├── nds_power.py ├── nds_rollback.py ├── nds_schema.py ├── nds_transcode.py ├── nds_validate.py ├── power_run_cpu.template ├── power_run_cpu_delta.template ├── power_run_cpu_iceberg.template ├── power_run_gpu.template ├── power_run_gpu_delta.template ├── power_run_gpu_iceberg.template ├── properties │ └── aqe-on.properties ├── python_listener │ ├── PythonListener.py │ └── __init__.py ├── spark-submit-template └── tpcds-gen │ ├── Makefile │ ├── README.md │ ├── patches │ ├── code.patch │ └── templates.patch │ ├── pom.xml │ └── src │ └── main │ └── java │ └── org │ └── notmysock │ └── tpcds │ └── GenTable.java ├── scripts └── auto-copyrighter.sh ├── shared ├── base.template ├── convert_submit_cpu.template ├── convert_submit_gpu.template ├── power_run_cpu.template ├── power_run_gpu.template └── spark-submit-template └── utils ├── check.py ├── jvm_listener ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── nvidia │ └── spark │ └── rapids │ └── listener │ ├── Listener.scala │ ├── Manager.scala │ └── TaskFailureListener.scala ├── properties └── aqe-on.properties └── python_benchmark_reporter ├── PysparkBenchReport.py ├── PythonListener.py └── __init__.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a bug report to help us improve RAPIDS Accelerator for Apache Spark benchmark repository 4 | title: "[BUG]" 5 | labels: "? - Needs Triage, bug" 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Steps/Code to reproduce bug** 14 | Please provide a list of steps or a code sample to reproduce the issue. 15 | Avoid posting private or sensitive data. 16 | 17 | **Expected behavior** 18 | A clear and concise description of what you expected to happen. 
19 | 20 | **Environment details (please complete the following information)** 21 | - Environment 22 | - Configuration settings related to the issue 23 | 24 | **Additional context** 25 | Add any other context about the problem here. 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation request 3 | about: Report incorrect or needed documentation 4 | title: "[DOC]" 5 | labels: "? - Needs Triage, documentation" 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Report incorrect documentation 11 | 12 | **Location of incorrect documentation** 13 | Provide links and line numbers if applicable. 14 | 15 | **Describe the problems or issues found in the documentation** 16 | A clear and concise description of what you found to be incorrect. 17 | 18 | **Steps taken to verify documentation is incorrect** 19 | List any steps you have taken: 20 | 21 | **Suggested fix for documentation** 22 | Detail proposed changes to fix the documentation if you have any. 23 | 24 | --- 25 | 26 | ## Report needed documentation 27 | 28 | **Report needed documentation** 29 | A clear and concise description of what documentation you believe is needed and why. 30 | 31 | **Describe the documentation you'd like** 32 | A clear and concise description of what you want to happen. 33 | 34 | **Steps taken to search for needed documentation** 35 | List any steps you have taken: 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for the RAPIDS Accelerator for Apache Spark benchmarks repository 4 | title: "[FEA]" 5 | labels: "? - Needs Triage, feature request" 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I wish the RAPIDS Accelerator for Apache Spark benchmark scripts would [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context, code examples, or references to existing implementations about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/submit-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Submit question 3 | about: Ask a general question about RAPIDS Accelerator for Apache Spark benchmarks, or open a thread in the Discussions tab in the https://github.com/nvidia/spark-rapids repository 4 | title: "[QST]" 5 | labels: "? - Needs Triage, question" 6 | assignees: '' 7 | 8 | --- 9 | 10 | **What is your question?** 11 | -------------------------------------------------------------------------------- /.github/workflows/add-to-project.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, NVIDIA CORPORATION.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: Add new issues and pull requests to project 16 | 17 | on: 18 | issues: 19 | types: 20 | - opened 21 | pull_request_target: 22 | types: 23 | - opened 24 | 25 | jobs: 26 | Add-to-project: 27 | if: github.repository_owner == 'NVIDIA' # avoid adding issues from forks 28 | runs-on: ubuntu-latest 29 | steps: 30 | - name: add-to-project 31 | uses: NVIDIA/spark-rapids-common/add-to-project@main 32 | with: 33 | token: ${{ secrets.PROJECT_TOKEN }} 34 | -------------------------------------------------------------------------------- /.github/workflows/license-header-check.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # A workflow to check copyright/license header 16 | name: license header check 17 | 18 | on: 19 | pull_request: 20 | types: [opened, synchronize, reopened] 21 | 22 | jobs: 23 | license-header-check: 24 | runs-on: ubuntu-latest 25 | if: "!contains(github.event.pull_request.title, '[bot]')" 26 | steps: 27 | - name: Get checkout depth 28 | run: | 29 | echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV 30 | 31 | - name: Checkout code 32 | uses: actions/checkout@v4 33 | with: 34 | fetch-depth: ${{ env.PR_FETCH_DEPTH }} 35 | 36 | - name: license-header-check 37 | uses: NVIDIA/spark-rapids-common/license-header-check@main 38 | with: 39 | included_file_patterns: | 40 | *.sh, 41 | *.template, 42 | *.py, 43 | *.yaml, 44 | *.yml, 45 | *.xml, 46 | *.scala, 47 | *.properties, 48 | *.java, 49 | *Makefile*, 50 | *.sql 51 | -------------------------------------------------------------------------------- /.github/workflows/signoff-check.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # A workflow to check if PR got sign-off 16 | name: signoff check 17 | 18 | on: 19 | pull_request_target: 20 | types: [opened, synchronize, reopened] 21 | 22 | jobs: 23 | signoff-check: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: signoff 27 | uses: NVIDIA/spark-rapids-common/signoff-check@main 28 | with: 29 | owner: ${{ github.repository_owner }} 30 | repo: spark-rapids-benchmarks 31 | pull_number: ${{ github.event.number }} 32 | token: ${{ secrets.GITHUB_TOKEN }} 33 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | repos: 16 | - repo: local 17 | hooks: 18 | - id: auto-copyrighter 19 | name: Update copyright year 20 | entry: scripts/auto-copyrighter.sh 21 | language: script 22 | pass_filenames: true 23 | verbose: true -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Overview 4 | 5 | Define the code of conduct followed and enforced by the RAPIDS Accelerator for Apache Spark project 6 | 7 | ### Intended audience 8 | 9 | COMMUNITY | DEVELOPERS | PROJECT LEADS 10 | 11 | ## Our Pledge 12 | 13 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to creating a positive environment include: 18 | 19 | - Using welcoming and inclusive language 20 | - Being respectful of differing viewpoints and experiences 21 | - Gracefully accepting constructive criticism 22 | - Focusing on what is best for the community 23 | - Showing empathy towards other community members 24 | 25 | Examples of unacceptable behavior by participants include: 26 | 27 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 28 | - Trolling, insulting/derogatory comments, and personal or political attacks 29 | - Public or private harassment 30 | - Publishing others’ private information, such as a physical or electronic address, without explicit permission 31 | - Other conduct which could reasonably be considered inappropriate in a professional setting 32 | 33 | ## Our Responsibilities 34 | 35 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 36 | 37 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 38 | 39 | ## Scope 40 | 41 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 42 | 43 | ## Enforcement 44 | 45 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [spark-rapids-conduct@nvidia.com](mailto:spark-rapids-conduct@nvidia.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 46 | 47 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project’s leadership. 48 | 49 | ## Attribution 50 | 51 | This Code of Conduct was taken from the [NVIDIA RAPIDS](https://docs.rapids.ai/resources/conduct/) project, which was adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 52 | 53 | For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq 54 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Spark RAPIDS Benchmarks 2 | 3 | Contributions to Spark RAPIDS Benchmarks fall into the following three categories. 4 | 5 | 1.
To report a bug, request a new feature, or report a problem with 6 | documentation, please file an issue 7 | describing in detail the problem or new feature. The project team evaluates 8 | and triages issues, and schedules them for a release. If you believe the 9 | issue needs priority attention, please comment on the issue to notify the 10 | team. 11 | 2. To propose and implement a new feature, please file a new feature request 12 | issue. Describe the 13 | intended feature and discuss the design and implementation with the team and 14 | community. Once the team agrees that the plan looks good, go ahead and 15 | implement it using the [code contributions](#code-contributions) guide below. 16 | 3. To implement a feature or bug-fix for an existing outstanding issue, please 17 | follow the [code contributions](#code-contributions) guide below. If you 18 | need more context on a particular issue, please ask in a comment. 19 | 20 | ## Building From Source 21 | [Nvidia Decision Support(NDS)](./nds/): 22 | 23 | - Please refer to [nds/README](./nds/README.md#prerequisites) for prerequisites and build instructions. 24 | 25 | - Note: the build step aims to: 26 | 1. Apply code and query template modifications to the original TPC-DS toolkit to make it compatible with 27 | Spark (see [patches](./nds/tpcds-gen/patches/)) 28 | 2. Use Maven to build a project that is used only to generate data on HDFS. 29 | (see [tpcds-gen/src](./nds/tpcds-gen/src/)) 30 | 31 | ## Code contributions 32 | 33 | ### Source code layout 34 | 35 | The repository contains the following parts: 36 | 37 | [Nvidia Decision Support(NDS)](./nds): 38 | - `cicd` contains a settings.xml file used by the Maven build tool 39 | - `properties` property files that contain Spark configs used for submitting Spark jobs 40 | - `pyspark_spy` a third-party library used to add a SparkListener via pyspark 41 | - `tpcds-gen` 42 | - `patches` code changes and template modifications based on the original TPC-DS tool 43 | - `src` Hadoop application code to generate data in HDFS 44 | - `PysparkBenchReport.py` generates a JSON summary report reflecting statistics for an NDS run 45 | - `check.py` utils to check the build and validate input 46 | - `nds_gen_data.py` generates data on the local file system or HDFS 47 | - `nds_gen_query_stream.py` generates query streams or a specific query 48 | - `nds_power.py` functionality to execute a Power Run 49 | - `nds_transcode.py` used to convert CSV data to Parquet 50 | - `*.template` template files containing Spark configs to submit a Power Run 51 | - `spark-submit-template` script to process template content 52 | 53 | ### Integrated Development Environment 54 | For Python scripts, VSCode or PyCharm are recommended, but developers can choose any IDE they prefer. 55 | For Java code in [tpcds-gen/src](./nds/tpcds-gen/src/), IntelliJ IDEA is recommended. 56 | It will download the necessary dependencies once it opens the folder as a Maven project. 57 | 58 | ### Your first issue 59 | 60 | 1. Read the project's [README.md](./nds/README.md) to learn how to build the project and run scripts. 61 | 2. Find an issue to work on. 62 | 63 | ## Coding style 64 | 1. For Python, [PEP8](https://www.python.org/dev/peps/pep-0008) is used to check adherence to this style. 65 | 2. For Java, [Oracle Java code conventions](http://www.oracle.com/technetwork/java/codeconvtoc-136057.html) are used to check adherence to this style. 66 | 67 | ### Sign your work 68 | 69 | We require that all contributors sign off on their commits.
This certifies that the contribution is 70 | your original work, or you have rights to submit it under the same license, or a compatible license. 71 | 72 | Any contribution which contains commits that are not signed off will not be accepted. 73 | 74 | To sign off on a commit, use the `--signoff` (or `-s`) option when committing your changes: 75 | 76 | ```shell 77 | git commit -s -m "Add cool feature." 78 | ``` 79 | 80 | This will append the following to your commit message: 81 | 82 | ``` 83 | Signed-off-by: Your Name <your@email.com> 84 | ``` 85 | 86 | The sign-off is a simple line at the end of the explanation for the patch. Your signature certifies 87 | that you wrote the patch or otherwise have the right to pass it on as an open-source patch. Use your 88 | real name, no pseudonyms or anonymous contributions. If you set your `user.name` and `user.email` 89 | git configs, you can sign your commit automatically with `git commit -s`. 90 | 91 | 92 | The signoff means you certify the below (from [developercertificate.org](https://developercertificate.org)): 93 | 94 | ``` 95 | Developer Certificate of Origin 96 | Version 1.1 97 | 98 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 99 | 1 Letterman Drive 100 | Suite D4700 101 | San Francisco, CA, 94129 102 | 103 | Everyone is permitted to copy and distribute verbatim copies of this 104 | license document, but changing it is not allowed. 105 | 106 | 107 | Developer's Certificate of Origin 1.1 108 | 109 | By making a contribution to this project, I certify that: 110 | 111 | (a) The contribution was created in whole or in part by me and I 112 | have the right to submit it under the open source license 113 | indicated in the file; or 114 | 115 | (b) The contribution is based upon previous work that, to the best 116 | of my knowledge, is covered under an appropriate open source 117 | license and I have the right under that license to submit that 118 | work with modifications, whether created in whole or in part 119 | by me, under the same open source license (unless I am 120 | permitted to submit under a different license), as indicated 121 | in the file; or 122 | 123 | (c) The contribution was provided directly to me by some other 124 | person who certified (a), (b) or (c) and I have not modified 125 | it. 126 | 127 | (d) I understand and agree that this project and the contribution 128 | are public and that a record of the contribution (including all 129 | personal information I submit with it, including my sign-off) is 130 | maintained indefinitely and may be redistributed consistent with 131 | this project or the open source license(s) involved. 132 | ``` 133 | 134 | ### Pre-commit hooks 135 | 136 | We provide a basic config `.pre-commit-config.yaml` for [pre-commit](https://pre-commit.com/) to 137 | automate some aspects of the development process. As a convenience you can enable automatic 138 | copyright year updates by following the installation instructions on the 139 | [pre-commit homepage](https://pre-commit.com/). 140 | 141 | To this end, first install `pre-commit` itself using the method most suitable for your development 142 | environment. Then you will need to run `pre-commit install` to enable it in your local git 143 | repository. Using `--allow-missing-config` will make it easy to work with older branches 144 | that do not have `.pre-commit-config.yaml`.
145 | 146 | ```bash 147 | pre-commit install --allow-missing-config 148 | ``` 149 | 150 | and set the environment variable: 151 | 152 | ```bash 153 | export SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER=ON 154 | ``` 155 | The default value of `SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER` is `OFF`. 156 | 157 | When the automatic copyright updater is enabled and you modify a file with a prior 158 | year in the copyright header, it will be updated on `git commit` to the current year automatically. 159 | However, this will abort the [commit process](https://github.com/pre-commit/pre-commit/issues/532) 160 | with the following error message: 161 | ``` 162 | Update copyright year....................................................Failed 163 | - hook id: auto-copyrighter 164 | - duration: 0.01s 165 | - files were modified by this hook 166 | ``` 167 | You can confirm that the update has actually happened by either inspecting its effect with 168 | `git diff` first or simply re-executing `git commit` right away. The second time no file 169 | modification should be triggered by the copyright year update hook and the commit should succeed. 170 | 171 | There is a known issue for macOS users if they use the default version of `sed`. The copyright update 172 | script may fail and generate an unexpected file named `source-file-E`. As a workaround, please 173 | install GNU sed: 174 | 175 | ```bash 176 | brew install gnu-sed 177 | # and add it to PATH to make it the default sed for your shell 178 | export PATH="/usr/local/opt/gnu-sed/libexec/gnubin:$PATH" 179 | ``` 180 | 181 | ## Attribution 182 | Portions adopted from https://github.com/NVIDIA/spark-rapids/blob/main/CONTRIBUTING.md 183 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | NDS 2 | Copyright (c) 2022, NVIDIA CORPORATION 3 | 4 | // ------------------------------------------------------------------ 5 | // NOTICE file corresponding to the section 4d of The Apache License, 6 | // Version 2.0, in this case for 7 | // ------------------------------------------------------------------ 8 | 9 | pyspark-spy 10 | The MIT License (MIT) 11 | 12 | Copyright (c) 2016 Alexander Gorokhov 13 | 14 | This product includes software developed by 15 | Alexander Gorokhov in project pyspark-spy at https://github.com/sashgorokhov/pyspark-spy -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark RAPIDS Benchmarks 2 | 3 | A repo for Spark-related benchmark sets and utilities using the 4 | [RAPIDS Accelerator For Apache Spark](https://github.com/NVIDIA/spark-rapids). 5 | 6 | ## Benchmark sets: 7 | - [Nvidia Decision Support ( NDS )](./nds/) 8 | - [Nvidia Decision Support-H ( NDS-H )](./nds-h/) 9 | 10 | Please see the README in each benchmark set for more details including building instructions and usage 11 | descriptions. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | NVIDIA is dedicated to the security and trust of our software products and services, including all 4 | source code repositories managed through our organization. 5 | 6 | If you need to report a security issue, please use the appropriate contact points outlined 7 | below.
**Please do not report security vulnerabilities through GitHub/GitLab.** 8 | 9 | ## Reporting Potential Security Vulnerability in an NVIDIA Product 10 | 11 | To report a potential security vulnerability in any NVIDIA product: 12 | - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) 13 | - E-Mail: psirt@nvidia.com 14 | - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) 15 | - Please include the following information: 16 | - Product/Driver name and version/branch that contains the vulnerability 17 | -------------------------------------------------------------------------------- /nds-h/nds_h_gen_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-H version 3.0.1 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-H Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-H Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-H Benchmark. 31 | # 32 | 33 | import argparse 34 | import os 35 | import sys 36 | import subprocess 37 | import shutil 38 | 39 | #For adding utils to path 40 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 41 | utils_dir = os.path.join(parent_dir, 'utils') 42 | sys.path.insert(0, utils_dir) 43 | 44 | from check import check_build_nds_h, check_version, get_abs_path, get_dir_size, parallel_value_type, valid_range 45 | 46 | check_version() 47 | 48 | # Source tables contained in the schema for TPC-H. For more information, check - 49 | # https://www.tpc.org/TPC_Documents_Current_Versions/pdf/TPC-H_v3.0.1.pdf 50 | 51 | source_table_names = [ 52 | 'customer', 53 | 'lineitem', 54 | 'nation', 55 | 'orders', 56 | 'part', 57 | 'partsupp', 58 | 'region', 59 | 'supplier' 60 | ] 61 | 62 | 63 | def generate_data_local(args, range_start, range_end, tool_path): 64 | """Generate data to local file system. 
The TPC-H dbgen tool will generate all table data under the target 65 | folder without creating sub-folders for each table. So we add extra code to create a sub-folder 66 | for each table and move the data there respectively. 67 | 68 | Args: 69 | args (Namespace): Namespace from argparser 70 | tool_path (str): path to the dbgen tool 71 | 72 | Raises: 73 | Exception: if data already exists and overwrite_output is not honored 74 | Exception: dbgen failed 75 | """ 76 | data_dir = get_abs_path(args.data_dir) 77 | if not os.path.isdir(data_dir): 78 | os.makedirs(data_dir) 79 | else: 80 | # Verify if there's already data in this path 81 | if get_dir_size(data_dir) > 0 and not args.overwrite_output: 82 | raise Exception( 83 | "Data already exists in directory {}.".format(data_dir) + 84 | " Use '--overwrite_output' to overwrite.") 85 | 86 | # working directory for dbgen 87 | work_dir = tool_path.parent 88 | print(work_dir) 89 | procs = [] 90 | for i in range(range_start, range_end + 1): 91 | dbgen = ["-s", args.scale, 92 | "-C", args.parallel, 93 | "-S", str(i), 94 | "-v", "Y", 95 | "-f", "Y"] 96 | procs.append(subprocess.Popen( 97 | ["./dbgen"] + dbgen, cwd=str(work_dir))) 98 | # wait for data generation to complete 99 | for p in procs: 100 | p.wait() 101 | if p.returncode != 0: 102 | print("dbgen failed with return code {}".format(p.returncode)) 103 | raise Exception("dbgen failed") 104 | # move multi-partition files into table folders 105 | table_names = source_table_names 106 | for table in table_names: 107 | print('mkdir -p {}/{}'.format(data_dir, table)) 108 | subprocess.run(['mkdir', '-p', data_dir + '/' + table]) 109 | if (table != 'region' and table != 'nation'): 110 | for i in range(range_start, range_end + 1): 111 | subprocess.run(['mv', f'{work_dir}/{table}.tbl.{i}', 112 | f'{data_dir}/{table}/'], stderr=subprocess.DEVNULL) 113 | else: 114 | subprocess.run(['mv', f'{work_dir}/{table}.tbl', 115 | f'{data_dir}/{table}/'], stderr=subprocess.DEVNULL) 116 | # region and nation are generated as single files with no parallel number suffix, moved in the else branch above 117 | # show summary 118 | subprocess.run(['du', '-h', '-d1', data_dir]) 119 | 120 | 121 | def clean_temp_data(temp_data_path): 122 | cmd = ['hadoop', 'fs', '-rm', '-r', '-skipTrash', temp_data_path] 123 | print(" ".join(cmd)) 124 | subprocess.run(cmd) 125 | 126 | 127 | def merge_temp_tables(temp_data_path, parent_data_path): 128 | """Helper function for incremental data generation. Move data in the temporary child range path to 129 | the parent directory. 130 | 131 | Args: 132 | temp_data_path (str): temporary child range data path 133 | parent_data_path (str): parent data path 134 | """ 135 | table_names = source_table_names 136 | for table_name in table_names: 137 | # manually create table sub-folders 138 | # redundant step if it's not the first range part. 139 | cmd = ['hadoop', 'fs', '-mkdir', parent_data_path + '/' + table_name] 140 | print(" ".join(cmd)) 141 | subprocess.run(cmd) 142 | # move temp content to upper folder 143 | # note that not every table is generated in each child range step 144 | # please ignore messages like "mv: `.../reason/*': No such file or directory" 145 | temp_table_data_path = temp_data_path + '/' + table_name + '/*' 146 | cmd = ['hadoop', 'fs', '-mv', temp_table_data_path, 147 | parent_data_path + '/' + table_name + '/'] 148 | print(" ".join(cmd)) 149 | subprocess.run(cmd) 150 | clean_temp_data(temp_data_path) 151 | 152 | 153 | def generate_data_hdfs(args, jar_path): 154 | """Generate data to HDFS using the TPC-H dbgen tool.
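The generation itself runs as a Hadoop job submitted with the packaged dbgen jar, so a working Hadoop installation is required (a note added here; the submission command is visible in the function body below).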
Supports incremental generation: due to the 155 | limits of HDFS, each data range will be generated under a temporary folder and then moved to the target 156 | folder. 157 | 158 | Args: 159 | args (Namespace): Namespace from argparser 160 | jar_path (str): path to the target jar 161 | 162 | Raises: 163 | Exception: if Hadoop binary is not installed. 164 | """ 165 | # Check if hadoop is installed. 166 | if shutil.which('hadoop') is None: 167 | raise Exception('No Hadoop binary found in current environment, ' + 168 | 'please install Hadoop for data generation in a cluster.') 169 | # Submit hadoop MR job to generate data 170 | cmd = ['hadoop', 'jar', str(jar_path)] 171 | cmd += ['-p', args.parallel, '-s', args.scale] 172 | # get dbgen.jar path, assume user won't change file structure 173 | tpch_gen_path = jar_path.parent.parent.absolute() 174 | if args.overwrite_output: 175 | cmd += ['-o'] 176 | if args.range: 177 | # use a temp folder to save the specific range data. 178 | # will move the content to parent folder afterwards. 179 | # it's a workaround for "Output directory ... already exists" in incremental generation 180 | temp_data_path = args.data_dir + '/_temp_' 181 | # before generation, we remove "_temp_" folders in case they contain garbage generated by 182 | # previous user runs. 183 | clean_temp_data(temp_data_path) 184 | cmd.extend(["-r", args.range]) 185 | cmd.extend(["-d", temp_data_path]) 186 | try: 187 | subprocess.run(cmd, check=True, cwd=str(tpch_gen_path)) 188 | # move the generated range data from the temp folder to the target folder 189 | merge_temp_tables(temp_data_path, args.data_dir) 190 | finally: 191 | clean_temp_data(temp_data_path) 192 | else: 193 | cmd.extend(["-d", args.data_dir]) 194 | subprocess.run(cmd, check=True, cwd=str(tpch_gen_path)) 195 | # in the non-range case, data is written directly to the target directory 196 | 197 | 198 | def generate_data(args): 199 | jar_path, tool_path = check_build_nds_h() 200 | range_start = 1 201 | range_end = int(args.parallel) 202 | if args.range: 203 | range_start, range_end = valid_range(args.range, args.parallel) 204 | if args.type == 'hdfs': 205 | generate_data_hdfs(args, jar_path) 206 | else: 207 | generate_data_local(args, range_start, range_end, tool_path) 208 | 209 | 210 | if __name__ == "__main__": 211 | parser = argparse.ArgumentParser() 212 | parser.add_argument("type", 213 | choices=["local", "hdfs"], 214 | help="file system to save the generated data.") 215 | parser.add_argument("scale", 216 | help="volume of data to generate in GB. Accepted scale factors: 1, 10, 100, 300, 1000, \ 217 | 3000, 10000, 30000." 218 | ) 219 | parser.add_argument("parallel", 220 | type=parallel_value_type, 221 | help="build data in separate chunks" 222 | ) 223 | parser.add_argument("data_dir", 224 | help="generate data in directory.") 225 | parser.add_argument('--range', 226 | help='Used for incremental data generation, meaning which part of the child ' + 227 | 'chunks is generated in one run. Format: "start,end", both are inclusive. ' + 228 | 'e.g. "1,100".
Note: the child range must be within the "parallel", ' + 229 | '"--parallel 100 --range 100,200" is illegal.') 230 | parser.add_argument("--overwrite_output", 231 | action="store_true", 232 | help="overwrite if data already exists in the path provided.") 233 | 234 | args = parser.parse_args() 235 | generate_data(args) 236 | -------------------------------------------------------------------------------- /nds-h/nds_h_gen_query_stream.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-H version 3.0.1 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-H Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-H Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-H Benchmark. 31 | # 32 | import argparse 33 | import os 34 | import subprocess 35 | import sys 36 | 37 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 38 | utils_dir = os.path.join(parent_dir, 'utils') 39 | sys.path.insert(0, utils_dir) 40 | 41 | from check import check_build_nds_h, check_version, get_abs_path 42 | 43 | check_version() 44 | 45 | def generate_query_streams(args, tool_path): 46 | """Call the TPC-H qgen tool to generate a specific query or query stream(s) that contain all 47 | TPC-H queries.
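The DSS_QUERY environment variable is set so that qgen reads the patched query templates from the tool's queries directory (a note added here; the assignment is visible in the function body below).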
48 | 49 | Args: 50 | args (Namespace): Namespace from argparser 51 | tool_path (str): path to the tool 52 | """ 53 | # use the qgen tool directory as the working directory 54 | work_dir = tool_path.parent 55 | output_dir = get_abs_path(args.output_dir) 56 | 57 | if not os.path.isdir(args.output_dir): 58 | os.makedirs(args.output_dir) 59 | 60 | os.environ["DSS_QUERY"] = str(work_dir / "queries") 61 | 62 | base_cmd = ['./qgen', 63 | '-s', args.scale] 64 | 65 | if args.streams: 66 | procs = [] 67 | for i in range(1,int(args.streams)+1): 68 | new_cmd = base_cmd + ['-p',str(i)] 69 | output_file = os.path.join(output_dir, f"stream_{i}.sql") 70 | with open(output_file,'w') as f: 71 | procs.append(subprocess.Popen(new_cmd, cwd=str(work_dir), stdout=f)) 72 | for p in procs: 73 | p.wait() 74 | if p.returncode != 0: 75 | print("qgen failed with return code {}".format(p.returncode)) 76 | raise Exception("qgen failed") 77 | else: 78 | output_file = os.path.join(output_dir, f"query_{args.template}.sql") 79 | base_cmd = base_cmd + ['-d',args.template] 80 | with open(output_file,"w") as f: 81 | subprocess.run(base_cmd, check=True, cwd=str(work_dir),stdout=f) 82 | 83 | if __name__ == "__main__": 84 | jar_path, tool_path = check_build_nds_h() 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument("scale", 87 | help="Assume a database of this scale factor.") 88 | parser.add_argument("output_dir", 89 | help="Generate query stream(s) in this directory.") 90 | group = parser.add_mutually_exclusive_group(required=True) 91 | group.add_argument("--template", 92 | help="build queries from this template. Only used to generate one query " + 93 | "from one template. This argument is mutually exclusive with --streams. " + 94 | "It is often used for test purposes.") 95 | group.add_argument('--streams', 96 | help='how many query streams to generate. ' + 97 | 'This argument is mutually exclusive with --template.') 98 | args = parser.parse_args() 99 | 100 | generate_query_streams(args, tool_path) 101 | -------------------------------------------------------------------------------- /nds-h/nds_h_schema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-H version 3.0.1 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”).
26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-H Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-H Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-H Benchmark. 31 | # 32 | 33 | from pyspark.sql.types import * 34 | 35 | 36 | def get_schemas(): 37 | """get the schemas of all tables. 38 | 39 | Returns: 40 | dict: {table_name: schema} 41 | """ 42 | SCHEMAS = {} 43 | 44 | # The specification states that "Identifier means that the column shall be able to hold any 45 | # key value generated for that column". Some tables have more rows than others so we can 46 | # choose to use different types per table. 47 | identifier_int = IntegerType() 48 | identifier_long = LongType() 49 | 50 | SCHEMAS["part"] = StructType([ 51 | StructField("p_partkey", LongType(), False), 52 | StructField("p_name", StringType(), False), 53 | StructField("p_mfgr", StringType(), False), 54 | StructField("p_brand", StringType(), False), 55 | StructField("p_type", StringType(), False), 56 | StructField("p_size", IntegerType(), False), 57 | StructField("p_container", StringType(), False), 58 | StructField("p_retailprice", DecimalType(11, 2), False), 59 | StructField("p_comment", StringType(), False), 60 | StructField("ignore", StringType(), True) 61 | ]) 62 | 63 | SCHEMAS['supplier'] = StructType([ 64 | StructField("s_suppkey", LongType(), False), 65 | StructField("s_name", StringType(), False), 66 | StructField("s_address", StringType(), False), 67 | StructField("s_nationkey", LongType(), False), 68 | StructField("s_phone", StringType(), False), 69 | StructField("s_acctbal", DecimalType(11, 2), False), 70 | StructField("s_comment", StringType(), False), 71 | StructField("ignore", StringType(), True) 72 | ]) 73 | 74 | SCHEMAS['partsupp'] = StructType([ 75 | StructField("ps_partkey", LongType(), False), 76 | StructField("ps_suppkey", LongType(), False), 77 | StructField("ps_availqty", IntegerType(), False), 78 | StructField("ps_supplycost", DecimalType(11, 2), False), 79 | StructField("ps_comment", StringType(), False), 80 | StructField("ignore", StringType(), True) 81 | ]) 82 | 83 | SCHEMAS['customer'] = StructType([ 84 | StructField("c_custkey", LongType(), False), 85 | StructField("c_name", StringType(), False), 86 | StructField("c_address", StringType(), False), 87 | StructField("c_nationkey", LongType(), False), 88 | StructField("c_phone", StringType(), False), 89 | StructField("c_acctbal", DecimalType(11, 2), False), 90 | StructField("c_mktsegment", StringType(), False), 91 | StructField("c_comment", StringType(), False), 92 | StructField("ignore", StringType(), True) 93 | ]) 94 | 95 | SCHEMAS['orders'] = StructType([ 96 | StructField("o_orderkey", LongType(), False), 97 | StructField("o_custkey", LongType(), False), 98 | StructField("o_orderstatus", StringType(), False), 99 | StructField("o_totalprice", DecimalType(11, 2), False), 100 | StructField("o_orderdate", DateType(), False), 101 | StructField("o_orderpriority", StringType(), False), 102 | StructField("o_clerk", StringType(), False), 103 | StructField("o_shippriority", IntegerType(), False), 104 | StructField("o_comment", StringType(), False), 105 | StructField("ignore", StringType(), True) 106 | ]) 107 | 108 | SCHEMAS['lineitem'] = StructType([ 109 | StructField("l_orderkey", LongType(), False), 110 | StructField("l_partkey", LongType(), False), 111 | 
StructField("l_suppkey", LongType(), False), 112 | StructField("l_linenumber", IntegerType(), False), 113 | StructField("l_quantity", DecimalType(11, 2), False), 114 | StructField("l_extendedprice", DecimalType(11, 2), False), 115 | StructField("l_discount", DecimalType(11, 2), False), 116 | StructField("l_tax", DecimalType(11, 2), False), 117 | StructField("l_returnflag", StringType(), False), 118 | StructField("l_linestatus", StringType(), False), 119 | StructField("l_shipdate", DateType(), False), 120 | StructField("l_commitdate", DateType(), False), 121 | StructField("l_receiptdate", DateType(), False), 122 | StructField("l_shipinstruct", StringType(), False), 123 | StructField("l_shipmode", StringType(), False), 124 | StructField("l_comment", StringType(), False), 125 | StructField("ignore", StringType(), True) 126 | ]) 127 | 128 | SCHEMAS['nation'] = StructType([ 129 | StructField("n_nationkey", LongType(), False), 130 | StructField("n_name", StringType(), False), 131 | StructField("n_regionkey", LongType(), False), 132 | StructField("n_comment", StringType(), False), 133 | StructField("ignore", StringType(), True) 134 | ]) 135 | 136 | SCHEMAS['region'] = StructType([ 137 | StructField("r_regionkey", LongType(), False), 138 | StructField("r_name", StringType(), False), 139 | StructField("r_comment", StringType(), False), 140 | StructField("ignore", StringType(), True) 141 | ]) 142 | 143 | return SCHEMAS 144 | 145 | if __name__ == "__main__": 146 | # Test code 147 | print(get_schemas()) -------------------------------------------------------------------------------- /nds-h/nds_h_transcode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-H version 3.0.1 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-H Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-H Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-H Benchmark. 
31 | # 32 | 33 | import argparse 34 | import timeit 35 | import pyspark 36 | 37 | from datetime import datetime 38 | 39 | from pyspark.sql.types import * 40 | from pyspark.sql.functions import col 41 | from nds_h_schema import * 42 | 43 | # Note the specific partitioning is applied when saving the parquet data files. 44 | TABLE_PARTITIONING = { 45 | 'part': 'p_partkey', 46 | 'supplier': 's_suppkey', 47 | 'partsupp': 'ps_partkey', 48 | 'customer': 'c_custkey', 49 | 'orders': 'o_orderkey', 50 | 'nation': 'n_nationkey', 51 | 'region':'r_regionkey' 52 | } 53 | 54 | 55 | def load(session, filename, schema, input_format, delimiter="|", header="false", prefix=""): 56 | data_path = prefix + '/' + filename 57 | if input_format == 'csv': 58 | print("Schema is {}".format(schema)) 59 | df = session.read.option("delimiter", delimiter).option("header", header)\ 60 | .option("encoding", "ISO-8859-1").csv(data_path, schema=schema) 61 | print("Head is {}".format(df.head())) 62 | return df 63 | elif input_format in ['parquet', 'orc', 'avro', 'json']: 64 | return session.read.format(input_format).load(data_path) 65 | # TODO: all of the output formats should also be supported as input format possibilities; 66 | # 'iceberg' and 'delta' remain 67 | else: 68 | raise ValueError("Unsupported input format: {}".format(input_format)) 69 | 70 | def store(session, 71 | df, 72 | filename, 73 | output_format, 74 | output_mode, 75 | prefix=""): 76 | """Save a DataFrame as a table in the specified output format 77 | 78 | Args: 79 | session (SparkSession): a working SparkSession instance 80 | df (DataFrame): DataFrame to be serialized 81 | filename (str): name of the table(file) 82 | output_format (str): parquet, orc or avro 83 | output_mode (str): save modes as defined by "https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#save-modes. 84 | prefix (str): output data path 85 | 86 | Note: the DataFrame is repartitioned into 200 partitions before it is written. 87 | """ 88 | data_path = prefix + '/' + filename 89 | df = df.repartition(200) 90 | writer = df.write 91 | writer = writer.format(output_format).mode(output_mode) 92 | writer.saveAsTable(filename, path=data_path) 93 | 94 | def transcode(args): 95 | """ 96 | Entry point that is triggered after argument parsing 97 | 98 | Parameters: 99 | args ( argparse.Namespace ): the parsed arguments in the namespace 100 | 101 | Returns: 102 | Nothing 103 | 104 | """ 105 | session_builder = pyspark.sql.SparkSession.builder 106 | session = session_builder.appName(f"NDS-H - transcode - {args.output_format}").getOrCreate() 107 | session.sparkContext.setLogLevel(args.log_level) 108 | results = {} 109 | 110 | schemas = get_schemas() 111 | 112 | trans_tables = schemas 113 | 114 | if args.tables: 115 | for t in args.tables: 116 | if t not in trans_tables.keys(): 117 | raise Exception(f"invalid table name: {t}.
Valid tables are: {schemas.keys()}") 118 | trans_tables = {t: trans_tables[t] for t in args.tables if t in trans_tables} 119 | 120 | 121 | start_time = datetime.now() 122 | print(f"Load Test Start Time: {start_time}") 123 | for fn, schema in trans_tables.items(): 124 | results[fn] = timeit.timeit( 125 | lambda: store(session, 126 | load(session, 127 | f"{fn}", 128 | schema, 129 | input_format=args.input_format, 130 | prefix=args.input_prefix), 131 | f"{fn}", 132 | args.output_format, 133 | args.output_mode, 134 | args.output_prefix), 135 | number=1) 136 | 137 | end_time = datetime.now() 138 | delta = (end_time - start_time).total_seconds() 139 | print(f"Load Test Finished at: {end_time}") 140 | print(f"Load Test Time: {delta} seconds") 141 | # format required by TPC-DS Spec 4.3.1 142 | end_time_formatted = end_time.strftime("%m%d%H%M%S%f")[:-5] 143 | print(f"RNGSEED used: {end_time_formatted}") 144 | 145 | report_text = "" 146 | report_text += f"Load Test Time: {delta} seconds\n" 147 | report_text += f"Load Test Finished at: {end_time}\n" 148 | report_text += f"RNGSEED used: {end_time_formatted}\n" 149 | 150 | for table, duration in results.items(): 151 | report_text += "Time to convert '%s' was %.04fs\n" % (table, duration) 152 | 153 | report_text += "\n\n\nSpark configuration follows:\n\n" 154 | 155 | with open(args.report_file, "w") as report: 156 | report.write(report_text) 157 | print(report_text) 158 | 159 | for conf in session.sparkContext.getConf().getAll(): 160 | report.write(str(conf) + "\n") 161 | print(conf) 162 | 163 | 164 | if __name__ == "__main__": 165 | parser = argparse.ArgumentParser() 166 | parser.add_argument( 167 | 'input_prefix', 168 | help='input folder') 169 | parser.add_argument( 170 | 'output_prefix', 171 | help='output folder') 172 | parser.add_argument( 173 | 'report_file', 174 | help='location to store a performance report (local)') 175 | parser.add_argument( 176 | '--output_mode', 177 | choices=['overwrite', 'append', 'ignore', 'error', 'errorifexists'], 178 | help="save modes as defined by " + 179 | "https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#save-modes. " + 180 | "default value is errorifexists, which is the Spark default behavior.", 181 | default="errorifexists") 182 | parser.add_argument( 183 | '--input_format', 184 | choices=['csv', 'parquet', 'orc', 'avro', 'json'], 185 | default='csv', 186 | help='input data format to be converted. default value is csv.' 187 | ) 188 | parser.add_argument( 189 | '--output_format', 190 | choices=['parquet', 'orc', 'avro', 'json', 'iceberg', 'delta'], 191 | default='parquet', 192 | help="output data format when converting CSV data sources." 193 | ) 194 | parser.add_argument( 195 | '--tables', 196 | type=lambda s: s.split(','), 197 | help="specify table names by a comma separated string. e.g. 'customer,orders'.") 198 | parser.add_argument( 199 | '--log_level', 200 | help='set log level for Spark driver log. Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN (default: INFO)', 201 | default="INFO") 202 | args = parser.parse_args() 203 | transcode(args) 204 | -------------------------------------------------------------------------------- /nds-h/tpch-gen/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | all: check-tpch-env prepare-target copy-dbgen modify-makefile modify-tpc-h build-dbgen make-jar build-package 19 | 20 | check-tpch-env: 21 | ifndef TPCH_HOME 22 | $(error "TPCH_HOME not defined, please set TPCH_HOME environment variable to your TPCH Tool directory") 23 | endif 24 | 25 | prepare-target: 26 | rm -Rf target 27 | mkdir -p target/ 28 | 29 | copy-dbgen: 30 | #Copying all patches to the current query folder 31 | cp patches/template.patch "$(TPCH_HOME)/dbgen/queries" 32 | # This is required to ensure similar line ending semantics bw patch 33 | # file and the sql files 34 | cd "$(TPCH_HOME)/dbgen/queries"; dos2unix *.sql 35 | cd "$(TPCH_HOME)/dbgen/queries"; dos2unix *.patch 36 | # apply patches to both source code and templates 37 | cd "$(TPCH_HOME)/dbgen/queries" && cat *.patch | patch -p1 38 | cp -r "$(TPCH_HOME)/dbgen" target/ 39 | 40 | modify-makefile: 41 | # Create makefile from the template suit 42 | cp target/dbgen/makefile.suite target/dbgen/Makefile 43 | sed -i '103s/$$/ gcc/' target/dbgen/Makefile 44 | sed -i '109s/$$/ SPARK/' target/dbgen/Makefile 45 | sed -i '110s/$$/ LINUX/' target/dbgen/Makefile 46 | sed -i '111s/$$/ TPCH/' target/dbgen/Makefile 47 | sed -i '172i fprintf(ofp, "\\n-- Template file: %s\\n", qtag);' target/dbgen/qgen.c 48 | 49 | modify-tpc-h: 50 | # Enter information for the SPARK replacement variables 51 | sed -i '115a\ 52 | #ifdef SPARK\ 53 | #define GEN_QUERY_PLAN ""\ 54 | #define START_TRAN ""\ 55 | #define END_TRAN ""\ 56 | #define SET_OUTPUT ""\ 57 | #define SET_ROWCOUNT "LIMIT %d"\ 58 | #define SET_DBASE ""\ 59 | #endif' target/dbgen/tpcd.h 60 | 61 | build-dbgen: 62 | # Build it 63 | cd target/dbgen ; make clean; make 2>/dev/null 64 | 65 | make-jar: 66 | cd target; (jar cvf dbgen.jar dbgen/ || gjar cvf dbgen.jar dbgen/ ) 67 | 68 | build-package: 69 | mvn package 70 | -------------------------------------------------------------------------------- /nds-h/tpch-gen/patches/template.patch: -------------------------------------------------------------------------------- 1 | diff --git a/1.sql b/1.sql 2 | index 407417e..12815a0 100644 3 | --- a/1.sql 4 | +++ b/1.sql 5 | @@ -18,11 +18,11 @@ select 6 | from 7 | lineitem 8 | where 9 | - l_shipdate <= date '1998-12-01' - interval ':1' day (3) 10 | + l_shipdate <= date '1998-12-01' - interval ':1' day 11 | group by 12 | l_returnflag, 13 | l_linestatus 14 | order by 15 | l_returnflag, 16 | - l_linestatus; 17 | -:n -1 18 | + l_linestatus 19 | +; 20 | diff --git a/10.sql b/10.sql 21 | index 2c8810c..55d13eb 100644 22 | --- a/10.sql 23 | +++ b/10.sql 24 | @@ -34,5 +34,6 @@ group by 25 | c_address, 26 | c_comment 27 | order by 28 | - revenue desc; 29 | + revenue desc 30 | :n 20 31 | +; 32 | diff --git a/11.sql b/11.sql 33 | index 885185c..c0c6372 100644 34 | --- a/11.sql 35 | +++ b/11.sql 36 | @@ -30,5 +30,5 @@ group by 37 | and n_name = ':1' 38 | ) 39 | order by 40 | - 
value desc; 41 | -:n -1 42 | + value desc 43 | +; 44 | diff --git a/12.sql b/12.sql 45 | index 0eb4aec..7b41659 100644 46 | --- a/12.sql 47 | +++ b/12.sql 48 | @@ -31,5 +31,5 @@ where 49 | group by 50 | l_shipmode 51 | order by 52 | - l_shipmode; 53 | -:n -1 54 | + l_shipmode 55 | +; 56 | diff --git a/13.sql b/13.sql 57 | index 90d0750..2d85977 100644 58 | --- a/13.sql 59 | +++ b/13.sql 60 | @@ -23,5 +23,5 @@ group by 61 | c_count 62 | order by 63 | custdist desc, 64 | - c_count desc; 65 | -:n -1 66 | + c_count desc 67 | +; 68 | diff --git a/14.sql b/14.sql 69 | index b5e45e3..eb4815a 100644 70 | --- a/14.sql 71 | +++ b/14.sql 72 | @@ -16,5 +16,5 @@ from 73 | where 74 | l_partkey = p_partkey 75 | and l_shipdate >= date ':1' 76 | - and l_shipdate < date ':1' + interval '1' month; 77 | -:n -1 78 | + and l_shipdate < date ':1' + interval '1' month 79 | +; 80 | diff --git a/15.sql b/15.sql 81 | index 8e7e974..a966331 100644 82 | --- a/15.sql 83 | +++ b/15.sql 84 | @@ -3,7 +3,7 @@ 85 | -- Functional Query Definition 86 | -- Approved February 1998 87 | :x 88 | -create view revenue:s (supplier_no, total_revenue) as 89 | +create temp view revenue:s (supplier_no, total_revenue) as 90 | select 91 | l_suppkey, 92 | sum(l_extendedprice * (1 - l_discount)) 93 | @@ -36,5 +36,5 @@ where 94 | order by 95 | s_suppkey; 96 | 97 | -drop view revenue:s; 98 | -:n -1 99 | +drop view revenue:s 100 | +; 101 | diff --git a/16.sql b/16.sql 102 | index 0dabfb5..bc347b0 100644 103 | --- a/16.sql 104 | +++ b/16.sql 105 | @@ -33,5 +33,5 @@ order by 106 | supplier_cnt desc, 107 | p_brand, 108 | p_type, 109 | - p_size; 110 | -:n -1 111 | + p_size 112 | +; 113 | diff --git a/17.sql b/17.sql 114 | index 3968f54..c4f9373 100644 115 | --- a/17.sql 116 | +++ b/17.sql 117 | @@ -20,5 +20,5 @@ where 118 | lineitem 119 | where 120 | l_partkey = p_partkey 121 | - ); 122 | -:n -1 123 | + ) 124 | +; 125 | diff --git a/18.sql b/18.sql 126 | index cce174f..6b38325 100644 127 | --- a/18.sql 128 | +++ b/18.sql 129 | @@ -35,5 +35,6 @@ group by 130 | o_totalprice 131 | order by 132 | o_totalprice desc, 133 | - o_orderdate; 134 | + o_orderdate 135 | :n 100 136 | +; 137 | diff --git a/19.sql b/19.sql 138 | index 8b7b915..5c32dcf 100644 139 | --- a/19.sql 140 | +++ b/19.sql 141 | @@ -38,5 +38,5 @@ where 142 | and p_size between 1 and 15 143 | and l_shipmode in ('AIR', 'AIR REG') 144 | and l_shipinstruct = 'DELIVER IN PERSON' 145 | - ); 146 | -:n -1 147 | + ) 148 | +; 149 | diff --git a/2.sql b/2.sql 150 | index 2308318..572a927 100644 151 | --- a/2.sql 152 | +++ b/2.sql 153 | @@ -46,5 +46,6 @@ order by 154 | s_acctbal desc, 155 | n_name, 156 | s_name, 157 | - p_partkey; 158 | + p_partkey 159 | :n 100 160 | +; 161 | \ No newline at end of file 162 | diff --git a/20.sql b/20.sql 163 | index 1d358bf..1323de7 100644 164 | --- a/20.sql 165 | +++ b/20.sql 166 | @@ -40,5 +40,5 @@ where 167 | and s_nationkey = n_nationkey 168 | and n_name = ':3' 169 | order by 170 | - s_name; 171 | -:n -1 172 | + s_name 173 | +; 174 | diff --git a/21.sql b/21.sql 175 | index 38187dc..671435d 100644 176 | --- a/21.sql 177 | +++ b/21.sql 178 | @@ -42,5 +42,6 @@ group by 179 | s_name 180 | order by 181 | numwait desc, 182 | - s_name; 183 | + s_name 184 | :n 100 185 | +; 186 | diff --git a/22.sql b/22.sql 187 | index 9a2aeb7..88a377e 100644 188 | --- a/22.sql 189 | +++ b/22.sql 190 | @@ -40,5 +40,5 @@ from 191 | group by 192 | cntrycode 193 | order by 194 | - cntrycode; 195 | -:n -1 196 | + cntrycode 197 | +; 198 | diff --git a/3.sql b/3.sql 199 | index f054d31..46f33c6 
100644 200 | --- a/3.sql 201 | +++ b/3.sql 202 | @@ -25,5 +25,6 @@ group by 203 | o_shippriority 204 | order by 205 | revenue desc, 206 | - o_orderdate; 207 | + o_orderdate 208 | :n 10 209 | +; 210 | diff --git a/4.sql b/4.sql 211 | index f068f36..ed3ebca 100644 212 | --- a/4.sql 213 | +++ b/4.sql 214 | @@ -24,5 +24,5 @@ where 215 | group by 216 | o_orderpriority 217 | order by 218 | - o_orderpriority; 219 | -:n -1 220 | + o_orderpriority 221 | +; 222 | diff --git a/5.sql b/5.sql 223 | index 998913d..ec16737 100644 224 | --- a/5.sql 225 | +++ b/5.sql 226 | @@ -27,5 +27,5 @@ where 227 | group by 228 | n_name 229 | order by 230 | - revenue desc; 231 | -:n -1 232 | + revenue desc 233 | +; 234 | diff --git a/6.sql b/6.sql 235 | index 59a6883..3bf726d 100644 236 | --- a/6.sql 237 | +++ b/6.sql 238 | @@ -12,5 +12,5 @@ where 239 | l_shipdate >= date ':1' 240 | and l_shipdate < date ':1' + interval '1' year 241 | and l_discount between :2 - 0.01 and :2 + 0.01 242 | - and l_quantity < :3; 243 | -:n -1 244 | + and l_quantity < :3 245 | +; 246 | diff --git a/7.sql b/7.sql 247 | index 26eafad..81ba730 100644 248 | --- a/7.sql 249 | +++ b/7.sql 250 | @@ -42,5 +42,5 @@ group by 251 | order by 252 | supp_nation, 253 | cust_nation, 254 | - l_year; 255 | -:n -1 256 | + l_year 257 | +; 258 | diff --git a/8.sql b/8.sql 259 | index 977d24e..9b67466 100644 260 | --- a/8.sql 261 | +++ b/8.sql 262 | @@ -40,5 +40,5 @@ from 263 | group by 264 | o_year 265 | order by 266 | - o_year; 267 | -:n -1 268 | + o_year 269 | +; 270 | diff --git a/9.sql b/9.sql 271 | index b262db4..1e7aa9e 100644 272 | --- a/9.sql 273 | +++ b/9.sql 274 | @@ -35,5 +35,5 @@ group by 275 | o_year 276 | order by 277 | nation, 278 | - o_year desc; 279 | -:n -1 280 | + o_year desc 281 | +; 282 | -- 283 | 2.34.1 284 | 285 | -------------------------------------------------------------------------------- /nds-h/tpch-gen/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 4.0.0 24 | 25 | org.nvidia.nds_h 26 | tpch-gen 27 | 1.0-SNAPSHOT 28 | jar 29 | 30 | tpch-gen 31 | http://maven.apache.org 32 | 33 | 34 | 1.8 35 | 36 | 37 | 38 | 39 | org.apache.hadoop 40 | hadoop-client 41 | 3.2.1 42 | compile 43 | 44 | 45 | commons-cli 46 | commons-cli 47 | 1.1 48 | compile 49 | 50 | 51 | org.mockito 52 | mockito-core 53 | 1.8.5 54 | test 55 | 56 | 57 | junit 58 | junit 59 | 4.13.1 60 | test 61 | 62 | 63 | 64 | 65 | 66 | 67 | maven-compiler-plugin 68 | 69 | ${tpcds-gen.jdk.version} 70 | ${tpcds-gen.jdk.version} 71 | 72 | 73 | 74 | org.apache.maven.plugins 75 | maven-jar-plugin 76 | 77 | 78 | 79 | true 80 | lib/ 81 | org.nvidia.nds_h.GenTable 82 | 83 | 84 | 85 | 86 | 87 | org.apache.maven.plugins 88 | maven-dependency-plugin 89 | 90 | 91 | copy-dependencies 92 | package 93 | 94 | copy-dependencies 95 | 96 | 97 | ${project.build.directory}/lib 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /nds/PysparkBenchReport.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-DS Benchmark. 31 | # 32 | 33 | import json 34 | import os 35 | import time 36 | import traceback 37 | from typing import Callable 38 | from pyspark.sql import SparkSession 39 | 40 | import python_listener 41 | 42 | class PysparkBenchReport: 43 | """Class to generate a json summary report for a benchmark 44 | """ 45 | def __init__(self, spark_session: SparkSession, query_name) -> None: 46 | self.spark_session = spark_session 47 | self.summary = { 48 | 'env': { 49 | 'envVars': {}, 50 | 'sparkConf': {}, 51 | 'sparkVersion': None 52 | }, 53 | 'queryStatus': [], 54 | 'exceptions': [], 55 | 'startTime': None, 56 | 'queryTimes': [], 57 | 'query': query_name, 58 | } 59 | 60 | def report_on(self, fn: Callable, warmup_iterations = 0, iterations = 1, *args): 61 | """Record a function's running environment and running status, excluding sensitive 62 | information like tokens, secrets and passwords. Generate a summary in dict format for it.
63 | 64 | Args: 65 | fn (Callable): a function to be recorded 66 | warmup_iterations (int, optional): number of untimed warmup runs of fn. Defaults to 0. 67 | iterations (int, optional): number of timed runs of fn. Defaults to 1. 68 | 69 | Returns: 70 | dict: summary of the fn 71 | """ 72 | spark_conf = dict(self.spark_session.sparkContext._conf.getAll()) 73 | env_vars = dict(os.environ) 74 | redacted = ["TOKEN", "SECRET", "PASSWORD"] 75 | # drop any env var whose name contains a sensitive marker, e.g. MY_TOKEN 76 | filtered_env_vars = {k: v for k, v in env_vars.items() if not any(r in k.upper() for r in redacted)} 77 | self.summary['env']['envVars'] = filtered_env_vars 78 | self.summary['env']['sparkConf'] = spark_conf 79 | self.summary['env']['sparkVersion'] = self.spark_session.version 80 | listener = None 81 | try: 82 | listener = python_listener.PythonListener() 83 | listener.register() 84 | except TypeError as e: 85 | print("Could not find com.nvidia.spark.rapids.listener.Manager:", str(e)) 86 | listener = None 87 | if listener is not None: 88 | print("TaskFailureListener is registered.") 89 | try: 90 | # warmup 91 | for i in range(0, warmup_iterations): 92 | fn(*args) 93 | except Exception as e: 94 | print('ERROR WHILE WARMUP BEGIN') 95 | print(e) 96 | traceback.print_tb(e.__traceback__) 97 | print('ERROR WHILE WARMUP END') 98 | 99 | start_time = int(time.time() * 1000) 100 | self.summary['startTime'] = start_time 101 | # run the query 102 | for i in range(0, iterations): 103 | try: 104 | start_time = int(time.time() * 1000) 105 | fn(*args) 106 | end_time = int(time.time() * 1000) 107 | if listener and len(listener.failures) != 0: 108 | self.summary['queryStatus'].append("CompletedWithTaskFailures") 109 | else: 110 | self.summary['queryStatus'].append("Completed") 111 | except Exception as e: 112 | # print the exception to ease debugging 113 | print('ERROR BEGIN') 114 | print(e) 115 | traceback.print_tb(e.__traceback__) 116 | print('ERROR END') 117 | end_time = int(time.time() * 1000) 118 | self.summary['queryStatus'].append("Failed") 119 | self.summary['exceptions'].append(str(e)) 120 | finally: 121 | self.summary['queryTimes'].append(end_time - start_time) 122 | if listener is not None: 123 | listener.unregister() 124 | return self.summary 125 | 126 | def write_summary(self, prefix=""): 127 | """Write the json summary of the benchmarked query to a file. 128 | 129 | Args: 130 | prefix (str, optional): prefix for the output json summary file. Defaults to "". 131 | """ 132 | # The Power BI side retrieves some information from the summary file name, so keep this 133 | # file name format for pipeline compatibility 134 | filename = prefix + '-' + self.summary['query'] + '-' + str(self.summary['startTime']) + '.json' 135 | self.summary['filename'] = filename 136 | with open(filename, "w") as f: 137 | json.dump(self.summary, f, indent=2) 138 | 139 | def is_success(self): 140 | """Check if the query succeeded, i.e. queryStatus == Completed 141 | """ 142 | return self.summary['queryStatus'][0] == 'Completed' 143 | -------------------------------------------------------------------------------- /nds/base.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # This is the base template file for the common information about the test environment, 19 | # including the information about Spark, the cluster configuration and the jar files, 20 | # which are required in the other templates. 21 | # We'll source this base file in all the other templates so that we just need to update 22 | # it here once instead of updating all the templates. 23 | # If you need a different configuration in a specific template, you can override 24 | # the variables in that template. 25 | 26 | export SPARK_HOME=${SPARK_HOME:-/usr/lib/spark} 27 | export SPARK_MASTER=${SPARK_MASTER:-yarn} 28 | export DRIVER_MEMORY=${DRIVER_MEMORY:-10G} 29 | export EXECUTOR_CORES=${EXECUTOR_CORES:-12} 30 | export NUM_EXECUTORS=${NUM_EXECUTORS:-8} 31 | export EXECUTOR_MEMORY=${EXECUTOR_MEMORY:-16G} 32 | 33 | # The NDS listener jar which is built in the jvm_listener directory. 34 | export NDS_LISTENER_JAR=${NDS_LISTENER_JAR:-./jvm_listener/target/nds-benchmark-listener-1.0-SNAPSHOT.jar} 35 | # The spark-rapids jar which is required when running on GPU 36 | export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_PLUGIN_JAR:-rapids-4-spark_2.12-22.06.0.jar} 37 | export PYTHONPATH=$SPARK_HOME/python:`echo $SPARK_HOME/python/lib/py4j-*.zip` 38 | -------------------------------------------------------------------------------- /nds/bench.yml: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | data_gen: 19 | scale_factor: 10000 20 | parallel: 100 21 | raw_data_path: PATH_FOR_RAW_DATA # raw NDS csv data 22 | local_or_hdfs: hdfs 23 | # data generation is not a timed part of the full benchmark steps, so "skip" is set to "true" by default.
24 | # the raw_data_path is the location for already generated data when "skip" is "true" 25 | skip: true 26 | load_test: 27 | # template to do Iceberg (or DeltaLake, use the "delta" keyword template) writing, GPU disabled 28 | spark_template_path: convert_submit_cpu_iceberg.template 29 | output_path: PATH_FOR_ICEBERG_OR_DELTA_WAREHOUSE # warehouse location for Iceberg or DeltaLake 30 | warehouse_type: iceberg # supports "iceberg" and "delta", uses "iceberg" by default 31 | report_path: load_test.txt # execution report path 32 | skip: false 33 | generate_query_stream: 34 | # how many streams to generate, including the stream for the Power Test 35 | # If there are 4 streams in one Throughput test, num_streams = 4*2+1 = 9 according to Spec 4.3.2 36 | num_streams: 9 37 | # template dir that contains all query templates to generate Spark compatible queries 38 | # yaml doesn't support resolving environment variables, please use an absolute path 39 | query_template_dir: PATH_TO_TPCDS_HOME/query_templates 40 | stream_output_path: bench_streams 41 | skip: false 42 | power_test: # use load test output as input to avoid duplication 43 | # template with both Iceberg (or DeltaLake) & GPU enabled, use Iceberg by default 44 | spark_template_path: power_run_gpu_iceberg.template 45 | report_path: power_test.csv 46 | property_path: properties/aqe-on.properties # property file that contains some Spark configs as an addition to the spark template file 47 | output_path: # leave it empty to use "collect" as the Spark action. Otherwise it can be used for data validation 48 | skip: false 49 | throughput_test: # use most parameters from power test to avoid duplication 50 | spark_template_path: throughput_run_gpu_iceberg.template # user can copy the one used for the Power run and add resource limits 51 | report_base_path: throughput_report 52 | skip: false 53 | maintenance_test: 54 | # template to do data maintenance on Iceberg (or DeltaLake, use the "delta" keyword template) 55 | maintenance_template_path: maintenance_iceberg.template 56 | query_dir: data_maintenance # folder that contains all data maintenance queries 57 | maintenance_report_base_path: maintenance_report 58 | skip: false 59 | metrics_report_path: metrics.csv -------------------------------------------------------------------------------- /nds/check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: Apache-2.0 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | # ----- 19 | # 20 | # Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 21 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp).
22 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 23 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 24 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 25 | # 26 | # You may not use this file except in compliance with the TPC EULA. 27 | # DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 28 | # obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 29 | # obtained from using this file do not comply with the TPC-DS Benchmark. 30 | # 31 | 32 | import argparse 33 | import os 34 | import sys 35 | from pathlib import Path 36 | 37 | 38 | def check_version(): 39 | req_ver = (3, 6) 40 | cur_ver = sys.version_info 41 | if cur_ver < req_ver: 42 | raise Exception('Minimum required Python version is 3.6, but the current Python version is {}.' 43 | .format(str(cur_ver.major) + '.' + str(cur_ver.minor)) + 44 | ' Please use a proper Python version.') 45 | 46 | 47 | def check_build(): 48 | """check that the jar and the tpcds executable are built 49 | 50 | Raises: 51 | Exception: the build is not done or is broken 52 | 53 | Returns: 54 | PosixPath, PosixPath: paths of the jar and the dsdgen executable 55 | """ 56 | # Check if the necessary executable or jars are built. 57 | # we assume the user won't move this script. 58 | src_dir = Path(__file__).parent.absolute() 59 | jar_path = list( 60 | Path(src_dir / 'tpcds-gen/target').rglob("tpcds-gen-*.jar")) 61 | tool_path = list(Path(src_dir / 'tpcds-gen/target/tools').rglob("dsdgen")) 62 | if jar_path == [] or tool_path == []: 63 | raise Exception('Target jar file is not found in `target` folder or dsdgen executable is ' + 64 | 'not found in `target/tools` folder. ' + 65 | 'Please refer to the README document and build this project first.') 66 | return jar_path[0], tool_path[0] 67 | 68 | 69 | def get_abs_path(input_path): 70 | """Receive a user input path and return its absolute path. 71 | 72 | Args: 73 | input_path (str): user's input path 74 | 75 | Returns: 76 | str: if the input is absolute, return it as is; if it's a relative path, return its 77 | absolute path resolved against the current working directory. 78 | """ 79 | if Path(input_path).is_absolute(): 80 | # it's an absolute path 81 | output_path = input_path 82 | else: 83 | # it's a path relative to the current working directory 84 | output_path = os.getcwd() + '/' + input_path 85 | return output_path 86 | 87 | 88 | def valid_range(range, parallel): 89 | """validate the range input 90 | 91 | Args: 92 | range (str): a range specified for a range data generation, e.g. "1,10" 93 | parallel (str): string type number for parallelism in TPC-DS data generation, e.g. "20" 94 | 95 | Raises: 96 | Exception: error message for invalid range input. 97 | """ 98 | if len(range.split(',')) != 2: 99 | msg = 'Invalid range: please specify a range with a comma between start and end. e.g., "1,10".'
100 | raise Exception(msg) 101 | range_start = int(range.split(',')[0]) 102 | range_end = int(range.split(',')[1]) 103 | if range_start < 1 or range_start > range_end or range_end > int(parallel): 104 | msg = 'Please provide a correct child range: 1 <= range_start <= range_end <= parallel' 105 | raise Exception(msg) 106 | return range_start, range_end 107 | 108 | 109 | def parallel_value_type(p): 110 | """helper function to check the parallel value 111 | 112 | Args: 113 | p (str): parallel value 114 | 115 | Raises: 116 | argparse.ArgumentTypeError: ArgumentTypeError exception 117 | 118 | Returns: 119 | str: parallel in string 120 | """ 121 | if int(p) < 2: 122 | raise argparse.ArgumentTypeError("PARALLEL must be >= 2") 123 | return p 124 | 125 | 126 | def get_dir_size(start_path): 127 | total_size = 0 128 | for dirpath, dirnames, filenames in os.walk(start_path): 129 | for f in filenames: 130 | fp = os.path.join(dirpath, f) 131 | # skip if it is a symbolic link 132 | if not os.path.islink(fp): 133 | total_size += os.path.getsize(fp) 134 | return total_size 135 | 136 | def check_json_summary_folder(json_summary_folder): 137 | if json_summary_folder: 138 | # prepare a folder to save json summaries of query results 139 | if not os.path.exists(json_summary_folder): 140 | os.makedirs(json_summary_folder) 141 | else: 142 | if os.listdir(json_summary_folder): 143 | raise Exception(f"json_summary_folder {json_summary_folder} is not empty. " + 144 | "There may already be some json files there. Please clean the folder " + 145 | "or specify another one.") 146 | 147 | def check_query_subset_exists(query_dict, subset_list): 148 | """check if the query subset exists in the query dictionary""" 149 | for q in subset_list: 150 | if q not in query_dict.keys(): 151 | raise Exception(f"Query {q} is not in the query dictionary. Please check the query subset.") 152 | return True 153 | -------------------------------------------------------------------------------- /nds/cicd/settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 14 | 17 | 18 | 19 | ${env.ART_CREDS_USR} 20 | ${env.ART_CREDS_PSW} 21 | central 22 | 23 | 24 | ${env.ART_CREDS_USR} 25 | ${env.ART_CREDS_PSW} 26 | snapshots 27 | 28 | 29 | ossrh 30 | ${env.SONATYPE_USR} 31 | ${env.SONATYPE_PSW} 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | false 40 | 41 | central 42 | sw-spark-maven 43 | ${env.MVN_REPO_URL} 44 | 45 | 46 | 47 | snapshots 48 | sw-spark-maven 49 | ${env.MVN_REPO_URL} 50 | 51 | 52 | 53 | 54 | 55 | false 56 | 57 | central 58 | sw-spark-maven 59 | ${env.MVN_REPO_URL} 60 | 61 | 62 | 63 | snapshots 64 | sw-spark-maven 65 | ${env.MVN_REPO_URL} 66 | 67 | 68 | artifactory 69 | 70 | 71 | mirror-apache-to-urm 72 | 73 | 74 | apache.snapshots 75 | sw-spark-maven 76 | ${env.MVN_REPO_URL} 77 | 78 | 79 | 80 | 81 | mirror-apache-https-to-urm 82 | 83 | 84 | apache.snapshots.https 85 | sw-spark-maven 86 | ${env.MVN_REPO_URL} 87 | 88 | 89 | 90 | 91 | mirror-apache2-to-urm 92 | 93 | 94 | apache-snapshots-repo 95 | sw-spark-maven 96 | ${env.MVN_REPO_URL} 97 | 98 | 99 | 100 | 101 | deploy-to-art 102 | 103 | snapshots::default::${env.MVN_REPO_URL}-local 104 | 105 | 106 | 107 | 108 | artifactory 109 | deploy-to-art 110 | 111 | -------------------------------------------------------------------------------- /nds/convert_submit_cpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | source base.template 19 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 20 | 21 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 22 | "--deploy-mode" "client" 23 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 24 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 25 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 26 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 27 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}") 28 | -------------------------------------------------------------------------------- /nds/convert_submit_cpu_delta.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 1. The io.delta:delta-core_2.12:1.1.0 only works on Spark 3.2.x 19 | # Please refer to https://docs.delta.io/latest/releases.html for other Spark versions. 20 | 21 | source base.template 22 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 23 | 24 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 25 | "--deploy-mode" "client" 26 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 27 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 28 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 29 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 30 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 31 | "--packages" "io.delta:delta-core_2.12:1.1.0" 32 | "--conf" "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" 33 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog") 34 | -------------------------------------------------------------------------------- /nds/convert_submit_cpu_iceberg.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 1. The iceberg-spark-runtime-3.2_2.12:0.13.2 used below only works on Spark 3.2.x 19 | # Please refer to https://iceberg.apache.org/releases/ for other Spark versions. 20 | # 2. The Iceberg catalog/tables are expected to be in the current directory, 21 | # see `spark.sql.catalog.spark_catalog.warehouse`. 22 | 23 | source base.template 24 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 25 | 26 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 27 | "--deploy-mode" "client" 28 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 29 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 30 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 31 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 32 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 33 | "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2" 34 | "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" 35 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog" 36 | "--conf" "spark.sql.catalog.spark_catalog.type=hadoop") 37 | -------------------------------------------------------------------------------- /nds/convert_submit_gpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
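#
# All of the resource settings below can be overridden from the environment,
# since base.template defines them with the VAR=${VAR:-default} pattern. A
# minimal sketch of an invocation (the nds_transcode.py arguments shown are
# illustrative placeholders, not the exact CLI):
#   SHUFFLE_PARTITIONS=400 EXECUTOR_MEMORY=32G \
#   ./spark-submit-template convert_submit_gpu.template \
#   nds_transcode.py <raw_data_path> <output_path> <report_file>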
16 | # 17 | 18 | source base.template 19 | export CONCURRENT_GPU_TASKS=${CONCURRENT_GPU_TASKS:-2} 20 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 21 | 22 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 23 | "--deploy-mode" "client" 24 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 25 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 26 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 27 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 28 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 29 | "--conf" "spark.executor.resource.gpu.amount=1" 30 | "--conf" "spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh" 31 | "--conf" "spark.plugins=com.nvidia.spark.SQLPlugin" 32 | "--conf" "spark.rapids.memory.pinnedPool.size=8g" 33 | "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}" 34 | "--conf" "spark.rapids.sql.explain=NOT_ON_GPU" 35 | "--conf" "spark.rapids.sql.incompatibleOps.enabled=true" 36 | "--conf" "spark.rapids.sql.variableFloatAgg.enabled=true" 37 | "--conf" "spark.sql.files.maxPartitionBytes=2g" 38 | "--conf" "spark.sql.legacy.parquet.datetimeRebaseModeInWrite=CORRECTED" 39 | "--conf" "spark.task.resource.gpu.amount=0.05" 40 | "--files" "$SPARK_HOME/examples/src/main/scripts/getGpusResources.sh" 41 | "--jars" "$SPARK_RAPIDS_PLUGIN_JAR") 42 | -------------------------------------------------------------------------------- /nds/data_maintenance/DF_CS.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 
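--
-- Note: 'DATE1' and 'DATE2' below are placeholders; they are expected to be
-- replaced with concrete dates (presumably by the data maintenance runner,
-- nds_maintenance.py) before the statements run. For illustration only, a
-- substituted predicate would read: d_date between '1999-01-03' and '1999-01-09'.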
29 | -- 30 | 31 | delete from catalog_returns where cr_order_number in (select distinct cs_order_number from catalog_sales, date_dim where cs_sold_date_sk=d_date_sk and d_date between 'DATE1' and 'DATE2'); 32 | delete from catalog_sales where cs_sold_date_sk >= (select min(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2') and 33 | cs_sold_date_sk <= (select max(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2'); 34 | -------------------------------------------------------------------------------- /nds/data_maintenance/DF_I.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 29 | -- 30 | 31 | delete from inventory where inv_date_sk >= ( select min(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2') and 32 | inv_date_sk <= ( select max(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2'); 33 | -------------------------------------------------------------------------------- /nds/data_maintenance/DF_SS.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 
16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 29 | -- 30 | 31 | delete from store_returns where sr_ticket_number in (select distinct ss_ticket_number from store_sales, date_dim where ss_sold_date_sk=d_date_sk and d_date between 'DATE1' and 'DATE2'); 32 | delete from store_sales where ss_sold_date_sk >= (select min(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2') and 33 | ss_sold_date_sk <= (select max(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2'); 34 | -------------------------------------------------------------------------------- /nds/data_maintenance/DF_WS.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 
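--
-- Note: web_returns is deleted before web_sales on purpose. The returns rows
-- are located through the order numbers of the matching web_sales rows, so
-- deleting the sales rows first would remove the join keys needed to find the
-- corresponding returns.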
29 | -- 30 | 31 | delete from web_returns where wr_order_number in (select distinct ws_order_number from web_sales, date_dim where ws_sold_date_sk=d_date_sk and d_date between 'DATE1' and 'DATE2'); 32 | delete from web_sales where ws_sold_date_sk >= (select min(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2') and 33 | ws_sold_date_sk <= (select max(d_date_sk) from date_dim where d_date between 'DATE1' and 'DATE2'); 34 | -------------------------------------------------------------------------------- /nds/data_maintenance/LF_CR.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 
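--
-- The view below resolves the business keys carried by the s_catalog_returns
-- source table into the surrogate keys used by the warehouse tables. The
-- time_dim join converts the 'HH:MM:SS' return time into seconds since
-- midnight before matching t_time, e.g. '13:05:30' -> 13*3600 + 5*60 + 30 = 47130.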
29 | -- 30 | 31 | DROP VIEW IF EXISTS crv; 32 | CREATE TEMP VIEW crv as 33 | SELECT d_date_sk cr_returned_date_sk 34 | ,t_time_sk cr_returned_time_sk 35 | ,i_item_sk cr_item_sk 36 | ,c1.c_customer_sk cr_refunded_customer_sk 37 | ,c1.c_current_cdemo_sk cr_refunded_cdemo_sk 38 | ,c1.c_current_hdemo_sk cr_refunded_hdemo_sk 39 | ,c1.c_current_addr_sk cr_refunded_addr_sk 40 | ,c2.c_customer_sk cr_returning_customer_sk 41 | ,c2.c_current_cdemo_sk cr_returning_cdemo_sk 42 | ,c2.c_current_hdemo_sk cr_returning_hdemo_sk 43 | ,c2.c_current_addr_sk cr_returing_addr_sk 44 | ,cc_call_center_sk cr_call_center_sk 45 | ,cp_catalog_page_sk CR_CATALOG_PAGE_SK 46 | ,sm_ship_mode_sk CR_SHIP_MODE_SK 47 | ,w_warehouse_sk CR_WAREHOUSE_SK 48 | ,r_reason_sk cr_reason_sk 49 | ,cret_order_id cr_order_number 50 | ,cret_return_qty cr_return_quantity 51 | ,cret_return_amt cr_return_amt 52 | ,cret_return_tax cr_return_tax 53 | ,cret_return_amt + cret_return_tax AS cr_return_amt_inc_tax 54 | ,cret_return_fee cr_fee 55 | ,cret_return_ship_cost cr_return_ship_cost 56 | ,cret_refunded_cash cr_refunded_cash 57 | ,cret_reversed_charge cr_reversed_charge 58 | ,cret_merchant_credit cr_merchant_credit 59 | ,cret_return_amt+cret_return_tax+cret_return_fee 60 | -cret_refunded_cash-cret_reversed_charge-cret_merchant_credit cr_net_loss 61 | FROM s_catalog_returns 62 | LEFT OUTER JOIN date_dim 63 | ON (cast(cret_return_date as date) = d_date) 64 | LEFT OUTER JOIN time_dim ON 65 | ((CAST(substr(cret_return_time,1,2) AS integer)*3600 66 | +CAST(substr(cret_return_time,4,2) AS integer)*60 67 | +CAST(substr(cret_return_time,7,2) AS integer)) = t_time) 68 | LEFT OUTER JOIN item ON (cret_item_id = i_item_id) 69 | LEFT OUTER JOIN customer c1 ON (cret_return_customer_id = c1.c_customer_id) 70 | LEFT OUTER JOIN customer c2 ON (cret_refund_customer_id = c2.c_customer_id) 71 | LEFT OUTER JOIN reason ON (cret_reason_id = r_reason_id) 72 | LEFT OUTER JOIN call_center ON (cret_call_center_id = cc_call_center_id) 73 | LEFT OUTER JOIN catalog_page ON (cret_catalog_page_id = cp_catalog_page_id) 74 | LEFT OUTER JOIN ship_mode ON (cret_shipmode_id = sm_ship_mode_id) 75 | LEFT OUTER JOIN warehouse ON (cret_warehouse_id = w_warehouse_id) 76 | WHERE i_rec_end_date IS NULL AND cc_rec_end_date IS NULL; 77 | ------------------------------------------------ 78 | insert into catalog_returns (select * from crv order by cr_returned_date_sk); -------------------------------------------------------------------------------- /nds/data_maintenance/LF_CS.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 
16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 29 | -- 30 | 31 | DROP VIEW IF EXISTS csv; 32 | CREATE TEMP view csv as 33 | SELECT d1.d_date_sk cs_sold_date_sk 34 | ,t_time_sk cs_sold_time_sk 35 | ,d2.d_date_sk cs_ship_date_sk 36 | ,c1.c_customer_sk cs_bill_customer_sk 37 | ,c1.c_current_cdemo_sk cs_bill_cdemo_sk 38 | ,c1.c_current_hdemo_sk cs_bill_hdemo_sk 39 | ,c1.c_current_addr_sk cs_bill_addr_sk 40 | ,c2.c_customer_sk cs_ship_customer_sk 41 | ,c2.c_current_cdemo_sk cs_ship_cdemo_sk 42 | ,c2.c_current_hdemo_sk cs_ship_hdemo_sk 43 | ,c2.c_current_addr_sk cs_ship_addr_sk 44 | ,cc_call_center_sk cs_call_center_sk 45 | ,cp_catalog_page_sk cs_catalog_page_sk 46 | ,sm_ship_mode_sk cs_ship_mode_sk 47 | ,w_warehouse_sk cs_warehouse_sk 48 | ,i_item_sk cs_item_sk 49 | ,p_promo_sk cs_promo_sk 50 | ,cord_order_id cs_order_number 51 | ,clin_quantity cs_quantity 52 | ,i_wholesale_cost cs_wholesale_cost 53 | ,i_current_price cs_list_price 54 | ,clin_sales_price cs_sales_price 55 | ,(i_current_price-clin_sales_price)*clin_quantity cs_ext_discount_amt 56 | ,clin_sales_price * clin_quantity cs_ext_sales_price 57 | ,i_wholesale_cost * clin_quantity cs_ext_wholesale_cost 58 | ,i_current_price * clin_quantity CS_EXT_LIST_PRICE 59 | ,i_current_price * cc_tax_percentage CS_EXT_TAX 60 | ,clin_coupon_amt cs_coupon_amt 61 | ,clin_ship_cost * clin_quantity CS_EXT_SHIP_COST 62 | ,(clin_sales_price * clin_quantity)-clin_coupon_amt cs_net_paid 63 | ,((clin_sales_price * clin_quantity)-clin_coupon_amt)*(1+cc_tax_percentage) cs_net_paid_inc_tax 64 | ,(clin_sales_price * clin_quantity)-clin_coupon_amt + (clin_ship_cost * clin_quantity) CS_NET_PAID_INC_SHIP 65 | ,(clin_sales_price * clin_quantity)-clin_coupon_amt + (clin_ship_cost * clin_quantity) 66 | + i_current_price * cc_tax_percentage CS_NET_PAID_INC_SHIP_TAX 67 | ,((clin_sales_price * clin_quantity)-clin_coupon_amt)-(clin_quantity*i_wholesale_cost) cs_net_profit 68 | FROM s_catalog_order 69 | LEFT OUTER JOIN date_dim d1 ON 70 | (cast(cord_order_date as date) = d1.d_date) 71 | LEFT OUTER JOIN time_dim ON (cord_order_time = t_time) 72 | LEFT OUTER JOIN customer c1 ON (cord_bill_customer_id = c1.c_customer_id) 73 | LEFT OUTER JOIN customer c2 ON (cord_ship_customer_id = c2.c_customer_id) 74 | LEFT OUTER JOIN call_center ON (cord_call_center_id = cc_call_center_id AND cc_rec_end_date IS NULL) 75 | LEFT OUTER JOIN ship_mode ON (cord_ship_mode_id = sm_ship_mode_id) 76 | JOIN s_catalog_order_lineitem ON (cord_order_id = clin_order_id) 77 | LEFT OUTER JOIN date_dim d2 ON 78 | (cast(clin_ship_date as date) = d2.d_date) 79 | LEFT OUTER JOIN catalog_page ON 80 | (clin_catalog_page_number = cp_catalog_page_number and 
clin_catalog_number = cp_catalog_number) 81 | LEFT OUTER JOIN warehouse ON (clin_warehouse_id = w_warehouse_id) 82 | LEFT OUTER JOIN item ON (clin_item_id = i_item_id AND i_rec_end_date IS NULL) 83 | LEFT OUTER JOIN promotion ON (clin_promotion_id = p_promo_id); 84 | ------------------------------------------------ 85 | insert into catalog_sales (select * from csv order by cs_sold_date_sk); -------------------------------------------------------------------------------- /nds/data_maintenance/LF_I.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 29 | -- 30 | 31 | DROP VIEW IF EXISTS iv; 32 | CREATE TEMP view iv AS 33 | SELECT d_date_sk inv_date_sk, 34 | i_item_sk inv_item_sk, 35 | w_warehouse_sk inv_warehouse_sk, 36 | invn_qty_on_hand inv_quantity_on_hand 37 | FROM s_inventory 38 | LEFT OUTER JOIN warehouse ON (invn_warehouse_id=w_warehouse_id) 39 | LEFT OUTER JOIN item ON (invn_item_id=i_item_id AND i_rec_end_date IS NULL) 40 | LEFT OUTER JOIN date_dim ON (d_date=invn_date); 41 | ------------------------------------------------ 42 | insert into inventory (select * from iv order by inv_date_sk); -------------------------------------------------------------------------------- /nds/data_maintenance/LF_SR.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 
7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 29 | -- 30 | 31 | DROP VIEW IF EXISTS srv; 32 | CREATE TEMP view srv as 33 | SELECT d_date_sk sr_returned_date_sk 34 | ,t_time_sk sr_return_time_sk 35 | ,i_item_sk sr_item_sk 36 | ,c_customer_sk sr_customer_sk 37 | ,c_current_cdemo_sk sr_cdemo_sk 38 | ,c_current_hdemo_sk sr_hdemo_sk 39 | ,c_current_addr_sk sr_addr_sk 40 | ,s_store_sk sr_store_sk 41 | ,r_reason_sk sr_reason_sk 42 | ,sret_ticket_number sr_ticket_number 43 | ,sret_return_qty sr_return_quantity 44 | ,sret_return_amt sr_return_amt 45 | ,sret_return_tax sr_return_tax 46 | ,sret_return_amt + sret_return_tax sr_return_amt_inc_tax 47 | ,sret_return_fee sr_fee 48 | ,sret_return_ship_cost sr_return_ship_cost 49 | ,sret_refunded_cash sr_refunded_cash 50 | ,sret_reversed_charge sr_reversed_charge 51 | ,sret_store_credit sr_store_credit 52 | ,sret_return_amt+sret_return_tax+sret_return_fee 53 | -sret_refunded_cash-sret_reversed_charge-sret_store_credit sr_net_loss 54 | FROM s_store_returns 55 | LEFT OUTER JOIN date_dim 56 | ON (cast(sret_return_date as date) = d_date) 57 | LEFT OUTER JOIN time_dim 58 | ON (( cast(substr(sret_return_time,1,2) AS integer)*3600 59 | +cast(substr(sret_return_time,4,2) AS integer)*60 60 | +cast(substr(sret_return_time,7,2) AS integer)) = t_time) 61 | LEFT OUTER JOIN item ON (sret_item_id = i_item_id) 62 | LEFT OUTER JOIN customer ON (sret_customer_id = c_customer_id) 63 | LEFT OUTER JOIN store ON (sret_store_id = s_store_id) 64 | LEFT OUTER JOIN reason ON (sret_reason_id = r_reason_id) 65 | WHERE i_rec_end_date IS NULL 66 | AND s_rec_end_date IS NULL; 67 | ------------------------------------------------ 68 | insert into store_returns (select * from srv order by sr_returned_date_sk); -------------------------------------------------------------------------------- /nds/data_maintenance/LF_SS.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 29 | -- 30 | 31 | DROP VIEW IF EXISTS ssv; 32 | CREATE TEMP view ssv as 33 | SELECT d_date_sk ss_sold_date_sk, 34 | t_time_sk ss_sold_time_sk, 35 | i_item_sk ss_item_sk, 36 | c_customer_sk ss_customer_sk, 37 | c_current_cdemo_sk ss_cdemo_sk, 38 | c_current_hdemo_sk ss_hdemo_sk, 39 | c_current_addr_sk ss_addr_sk, 40 | s_store_sk ss_store_sk, 41 | p_promo_sk ss_promo_sk, 42 | purc_purchase_id ss_ticket_number, 43 | plin_quantity ss_quantity, 44 | i_wholesale_cost ss_wholesale_cost, 45 | i_current_price ss_list_price, 46 | plin_sale_price ss_sales_price, 47 | (i_current_price-plin_sale_price)*plin_quantity ss_ext_discount_amt, 48 | plin_sale_price * plin_quantity ss_ext_sales_price, 49 | i_wholesale_cost * plin_quantity ss_ext_wholesale_cost, 50 | i_current_price * plin_quantity ss_ext_list_price, 51 | i_current_price * s_tax_precentage ss_ext_tax, 52 | plin_coupon_amt ss_coupon_amt, 53 | (plin_sale_price * plin_quantity)-plin_coupon_amt ss_net_paid, 54 | ((plin_sale_price * plin_quantity)-plin_coupon_amt)*(1+s_tax_precentage) ss_net_paid_inc_tax, 55 | ((plin_sale_price * plin_quantity)-plin_coupon_amt)-(plin_quantity*i_wholesale_cost) 56 | ss_net_profit 57 | FROM s_purchase 58 | LEFT OUTER JOIN customer ON (purc_customer_id = c_customer_id) 59 | LEFT OUTER JOIN store ON (purc_store_id = s_store_id) 60 | LEFT OUTER JOIN date_dim ON (cast(purc_purchase_date as date) = d_date) 61 | LEFT OUTER JOIN time_dim ON (PURC_PURCHASE_TIME = t_time) 62 | JOIN s_purchase_lineitem ON (purc_purchase_id = plin_purchase_id) 63 | LEFT OUTER JOIN promotion ON plin_promotion_id = p_promo_id 64 | LEFT OUTER JOIN item ON plin_item_id = i_item_id 65 | WHERE purc_purchase_id = plin_purchase_id 66 | AND i_rec_end_date is NULL 67 | AND s_rec_end_date is NULL; 68 | ------------------------------------------------ 69 | insert into store_sales (select * from ssv order by ss_sold_date_sk); -------------------------------------------------------------------------------- 
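Each LF_* load script above follows the same pattern: create a temporary view that joins the flat refresh-feed table (s_purchase, s_web_order, ...) to the dimension tables to resolve surrogate keys, then positionally INSERT the view into the target fact table, ordered by the date surrogate key. In the repo these scripts are driven by nds_maintenance.py; the PySpark sketch below only illustrates running one such file by hand, and the file path and the simple ';' statement splitting are assumptions, not repo code.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("lf-ss-sketch").getOrCreate()

# Assumption: the warehouse tables (store_sales, item, store, ...) and the
# refresh-feed tables (s_purchase, s_purchase_lineitem, ...) are already
# registered in the active catalog.
with open("data_maintenance/LF_SS.sql") as f:
    script = f.read()

# spark.sql() executes a single statement at a time, so split the script on
# ';'. Leading '--' comment lines within a chunk are accepted by the parser.
for statement in script.split(";"):
    if statement.strip():
        spark.sql(statement)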
/nds/data_maintenance/LF_WR.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 
29 | -- 30 | 31 | DROP VIEW IF EXISTS wrv; 32 | CREATE TEMP VIEW wrv AS 33 | SELECT d_date_sk wr_return_date_sk 34 | ,t_time_sk wr_return_time_sk 35 | ,i_item_sk wr_item_sk 36 | ,c1.c_customer_sk wr_refunded_customer_sk 37 | ,c1.c_current_cdemo_sk wr_refunded_cdemo_sk 38 | ,c1.c_current_hdemo_sk wr_refunded_hdemo_sk 39 | ,c1.c_current_addr_sk wr_refunded_addr_sk 40 | ,c2.c_customer_sk wr_returning_customer_sk 41 | ,c2.c_current_cdemo_sk wr_returning_cdemo_sk 42 | ,c2.c_current_hdemo_sk wr_returning_hdemo_sk 43 | ,c2.c_current_addr_sk wr_returing_addr_sk 44 | ,wp_web_page_sk wr_web_page_sk 45 | ,r_reason_sk wr_reason_sk 46 | ,wret_order_id wr_order_number 47 | ,wret_return_qty wr_return_quantity 48 | ,wret_return_amt wr_return_amt 49 | ,wret_return_tax wr_return_tax 50 | ,wret_return_amt + wret_return_tax AS wr_return_amt_inc_tax 51 | ,wret_return_fee wr_fee 52 | ,wret_return_ship_cost wr_return_ship_cost 53 | ,wret_refunded_cash wr_refunded_cash 54 | ,wret_reversed_charge wr_reversed_charge 55 | ,wret_account_credit wr_account_credit 56 | ,wret_return_amt+wret_return_tax+wret_return_fee 57 | -wret_refunded_cash-wret_reversed_charge-wret_account_credit wr_net_loss 58 | FROM s_web_returns LEFT OUTER JOIN date_dim ON (cast(wret_return_date as date) = d_date) 59 | LEFT OUTER JOIN time_dim ON ((CAST(SUBSTR(wret_return_time,1,2) AS integer)*3600 60 | +CAST(SUBSTR(wret_return_time,4,2) AS integer)*60+CAST(SUBSTR(wret_return_time,7,2) AS integer))=t_time) 61 | LEFT OUTER JOIN item ON (wret_item_id = i_item_id) 62 | LEFT OUTER JOIN customer c1 ON (wret_return_customer_id = c1.c_customer_id) 63 | LEFT OUTER JOIN customer c2 ON (wret_refund_customer_id = c2.c_customer_id) 64 | LEFT OUTER JOIN reason ON (wret_reason_id = r_reason_id) 65 | LEFT OUTER JOIN web_page ON (wret_web_page_id = WP_WEB_PAGE_id) 66 | WHERE i_rec_end_date IS NULL AND wp_rec_end_date IS NULL; 67 | ------------------------------------------------ 68 | insert into web_returns (select * from wrv order by wr_return_date_sk); -------------------------------------------------------------------------------- /nds/data_maintenance/LF_WS.sql: -------------------------------------------------------------------------------- 1 | -- 2 | -- SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | -- SPDX-License-Identifier: Apache-2.0 4 | -- 5 | -- Licensed under the Apache License, Version 2.0 (the "License"); 6 | -- you may not use this file except in compliance with the License. 7 | -- You may obtain a copy of the License at 8 | -- 9 | -- http://www.apache.org/licenses/LICENSE-2.0 10 | -- 11 | -- Unless required by applicable law or agreed to in writing, software 12 | -- distributed under the License is distributed on an "AS IS" BASIS, 13 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | -- See the License for the specific language governing permissions and 15 | -- limitations under the License. 16 | -- 17 | -- ----- 18 | -- 19 | -- Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 20 | -- (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 21 | -- Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 22 | -- and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 23 | -- available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 
24 | -- 25 | -- You may not use this file except in compliance with the TPC EULA. 26 | -- DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 27 | -- obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 28 | -- obtained from using this file do not comply with the TPC-DS Benchmark. 29 | -- 30 | 31 | DROP VIEW IF EXISTS wsv; 32 | CREATE TEMP VIEW wsv AS 33 | SELECT d1.d_date_sk ws_sold_date_sk, 34 | t_time_sk ws_sold_time_sk, 35 | d2.d_date_sk ws_ship_date_sk, 36 | i_item_sk ws_item_sk, 37 | c1.c_customer_sk ws_bill_customer_sk, 38 | c1.c_current_cdemo_sk ws_bill_cdemo_sk, 39 | c1.c_current_hdemo_sk ws_bill_hdemo_sk, 40 | c1.c_current_addr_sk ws_bill_addr_sk, 41 | c2.c_customer_sk ws_ship_customer_sk, 42 | c2.c_current_cdemo_sk ws_ship_cdemo_sk, 43 | c2.c_current_hdemo_sk ws_ship_hdemo_sk, 44 | c2.c_current_addr_sk ws_ship_addr_sk, 45 | wp_web_page_sk ws_web_page_sk, 46 | web_site_sk ws_web_site_sk, 47 | sm_ship_mode_sk ws_ship_mode_sk, 48 | w_warehouse_sk ws_warehouse_sk, 49 | p_promo_sk ws_promo_sk, 50 | word_order_id ws_order_number, 51 | wlin_quantity ws_quantity, 52 | i_wholesale_cost ws_wholesale_cost, 53 | i_current_price ws_list_price, 54 | wlin_sales_price ws_sales_price, 55 | (i_current_price-wlin_sales_price)*wlin_quantity ws_ext_discount_amt, 56 | wlin_sales_price * wlin_quantity ws_ext_sales_price, 57 | i_wholesale_cost * wlin_quantity ws_ext_wholesale_cost, 58 | i_current_price * wlin_quantity ws_ext_list_price, 59 | i_current_price * web_tax_percentage ws_ext_tax, 60 | wlin_coupon_amt ws_coupon_amt, 61 | wlin_ship_cost * wlin_quantity WS_EXT_SHIP_COST, 62 | (wlin_sales_price * wlin_quantity)-wlin_coupon_amt ws_net_paid, 63 | ((wlin_sales_price * wlin_quantity)-wlin_coupon_amt)*(1+web_tax_percentage) ws_net_paid_inc_tax, 64 | ((wlin_sales_price * wlin_quantity)-wlin_coupon_amt)-(wlin_quantity*i_wholesale_cost) 65 | WS_NET_PAID_INC_SHIP, 66 | (wlin_sales_price * wlin_quantity)-wlin_coupon_amt + (wlin_ship_cost * wlin_quantity) 67 | + i_current_price * web_tax_percentage WS_NET_PAID_INC_SHIP_TAX, 68 | ((wlin_sales_price * wlin_quantity)-wlin_coupon_amt)-(i_wholesale_cost * wlin_quantity) 69 | WS_NET_PROFIT 70 | FROM s_web_order 71 | LEFT OUTER JOIN date_dim d1 ON (cast(word_order_date as date) = d1.d_date) 72 | LEFT OUTER JOIN time_dim ON (word_order_time = t_time) 73 | LEFT OUTER JOIN customer c1 ON (word_bill_customer_id = c1.c_customer_id) 74 | LEFT OUTER JOIN customer c2 ON (word_ship_customer_id = c2.c_customer_id) 75 | LEFT OUTER JOIN web_site ON (word_web_site_id = web_site_id AND web_rec_end_date IS NULL) 76 | LEFT OUTER JOIN ship_mode ON (word_ship_mode_id = sm_ship_mode_id) 77 | JOIN s_web_order_lineitem ON (word_order_id = wlin_order_id) 78 | LEFT OUTER JOIN date_dim d2 ON (cast(wlin_ship_date as date) = d2.d_date) 79 | LEFT OUTER JOIN item ON (wlin_item_id = i_item_id AND i_rec_end_date IS NULL) 80 | LEFT OUTER JOIN web_page ON (wlin_web_page_id = wp_web_page_id AND wp_rec_end_date IS NULL) 81 | LEFT OUTER JOIN warehouse ON (wlin_warehouse_id = w_warehouse_id) 82 | LEFT OUTER JOIN promotion ON (wlin_promotion_id = p_promo_id); 83 | ------------------------------------------------ 84 | insert into web_sales (select * from wsv order by ws_sold_date_sk); -------------------------------------------------------------------------------- /nds/jvm_listener/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 4.0.0 20 | 21 | com.nvidia 
22 | nds-benchmark-listener 23 | jar 24 | 1.0-SNAPSHOT 25 | 26 | 27 | 8 28 | 8 29 | 30 | 31 | 32 | 33 | org.apache.spark 34 | spark-core_2.12 35 | 3.1.2 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-sql_2.12 41 | 3.1.2 42 | provided 43 | 44 | 45 | 46 | 47 | src/main/scala/ 48 | 49 | 50 | org.apache.maven.plugins 51 | maven-compiler-plugin 52 | 3.8.1 53 | 54 | 1.8 55 | 1.8 56 | 57 | 58 | 59 | org.scala-tools 60 | maven-scala-plugin 61 | 2.15.2 62 | 63 | 64 | 65 | compile 66 | testCompile 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /nds/jvm_listener/src/main/scala/com/nvidia/spark/rapids/listener/Listener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.nvidia.spark.rapids.listener 19 | 20 | trait Listener { 21 | /* Listener interface to be implemented at Python side 22 | */ 23 | def notify(x: Any): Any 24 | } 25 | -------------------------------------------------------------------------------- /nds/jvm_listener/src/main/scala/com/nvidia/spark/rapids/listener/Manager.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.nvidia.spark.rapids.listener 19 | 20 | import org.apache.spark.SparkContext 21 | 22 | object Manager { 23 | /* Manager class to manage all extra customized listeners. 24 | */ 25 | private var listeners: Map[String, Listener] = Map() 26 | private val spark_listener = new TaskFailureListener() 27 | private var isRegistered = false 28 | 29 | def register(listener: Listener): String = { 30 | /* Note this register method has nothing to do with SparkContext.addSparkListener method. 31 | * This method is only to provide an interface to developers to have a better control over 32 | * all customized listeners. 33 | */ 34 | this.synchronized { 35 | // We register to the spark listener when the first listener is registered. 
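// Because registerSparkListener() below is guarded by the isRegistered flag,
// repeated register() calls attach the underlying TaskFailureListener to the
// SparkContext only once.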
36 | registerSparkListener() 37 | val uuid = java.util.UUID.randomUUID().toString 38 | listeners = listeners + (uuid -> listener) 39 | uuid 40 | } 41 | } 42 | 43 | def unregister(uuid: String) = { 44 | this.synchronized { 45 | listeners = listeners - uuid 46 | } 47 | } 48 | 49 | def notifyAll(message: String): Unit = { 50 | for { (_, listener) <- listeners } listener.notify(message) 51 | } 52 | 53 | def registerSparkListener() : Unit = { 54 | if (!isRegistered) { 55 | SparkContext.getOrCreate().addSparkListener(spark_listener) 56 | isRegistered = true 57 | } 58 | } 59 | 60 | def unregisterSparkListener() : Unit = { 61 | if (isRegistered) { 62 | SparkContext.getOrCreate().removeSparkListener(spark_listener) 63 | isRegistered = false 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /nds/jvm_listener/src/main/scala/com/nvidia/spark/rapids/listener/TaskFailureListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.nvidia.spark.rapids.listener 19 | 20 | import org.apache.spark.{Success, TaskEndReason} 21 | import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} 22 | import scala.collection.mutable.ListBuffer 23 | 24 | 25 | /* A simple listener which captures SparkListenerTaskEnd, 26 | * extracts "reason" of the task. If the reason is not "Success", 27 | * send this reason to python side. 28 | */ 29 | class TaskFailureListener extends SparkListener { 30 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { 31 | taskEnd.reason match { 32 | case Success => 33 | case reason => Manager.notifyAll(reason.toString) 34 | } 35 | super.onTaskEnd(taskEnd) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /nds/maintenance_delta.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 1. 
The io.delta:delta-core_2.12:1.1.0 only works on Spark 3.2.x 19 | # Please refer to https://docs.delta.io/latest/releases.html for other Spark versions. 20 | 21 | source base.template 22 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 23 | 24 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 25 | "--deploy-mode" "client" 26 | "--conf" "spark.driver.maxResultSize=2GB" 27 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 28 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 29 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 30 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 31 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 32 | "--packages" "io.delta:delta-core_2.12:1.1.0" 33 | "--conf" "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" 34 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" 35 | "--jars" "$NDS_LISTENER_JAR") 36 | 37 | -------------------------------------------------------------------------------- /nds/maintenance_iceberg.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # The iceberg-spark-runtime-3.2_2.12:0.13.2 only works on Spark 3.2.x 19 | # Please refer to https://iceberg.apache.org/releases/ for other Spark versions. 20 | 21 | source base.template 22 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 23 | export ICEBERG_WAREHOUSE=${ICEBERG_WAREHOUSE:-/data/iceberg-warehouse} 24 | 25 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 26 | "--deploy-mode" "client" 27 | "--conf" "spark.driver.maxResultSize=2GB" 28 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 29 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 30 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 31 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 32 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 33 | "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2" 34 | "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" 35 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog" 36 | "--conf" "spark.sql.catalog.spark_catalog.type=hadoop" 37 | "--conf" "spark.sql.catalog.spark_catalog.warehouse=$ICEBERG_WAREHOUSE" 38 | "--jars" "$NDS_LISTENER_JAR") 39 | 40 | -------------------------------------------------------------------------------- /nds/nds-throughput: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | set -e 19 | num_streams=$(tr -dc ',' <<<"$1" | awk '{print length + 1}') 20 | echo "$num_streams" 21 | cmd="${@:2}" 22 | echo $cmd 23 | echo -n "$1" | xargs -d ',' -I '{}' -P$num_streams -r -t "${@:2}" 24 | -------------------------------------------------------------------------------- /nds/nds_gen_query_stream.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-DS Benchmark. 31 | # 32 | 33 | import argparse 34 | import os 35 | import subprocess 36 | import sys 37 | 38 | from check import check_build, check_version, get_abs_path 39 | 40 | check_version() 41 | 42 | def generate_query_streams(args, tool_path): 43 | """call TPC-DS dsqgen tool to generate a specific query or query stream(s) that contains all 44 | TPC-DS queries. 
45 | 46 | Args: 47 | args (Namespace): Namespace from argparser 48 | tool_path (str): path to the tool 49 | """ 50 | # move to the tools directory 51 | work_dir = tool_path.parent 52 | output_dir = get_abs_path(args.output_dir) 53 | template_dir = get_abs_path(args.template_dir) 54 | if not os.path.isdir(args.output_dir): 55 | os.makedirs(args.output_dir) 56 | 57 | base_cmd = ['./dsqgen', 58 | '-scale', args.scale, 59 | '-directory', template_dir, 60 | '-dialect', 'spark', 61 | '-output_dir', output_dir] 62 | 63 | if args.streams: 64 | cmd = base_cmd + ['-input', template_dir + '/' + 'templates.lst', 65 | '-streams', args.streams] 66 | else: 67 | cmd = base_cmd + ['-template', args.template] 68 | if args.rngseed: 69 | cmd += ['-rngseed', args.rngseed] 70 | subprocess.run(cmd, check=True, cwd=str(work_dir)) 71 | 72 | if args.template: 73 | # It's a specific query; rename the stream file to its template query name. 74 | # Special cases for queries 14, 23, 24 and 39: they contain two queries in one template. 75 | if any(q_num in args.template for q_num in ['14', '23', '24', '39']): 76 | with open(output_dir + '/' + 'query_0.sql', 'r') as f: 77 | full_content = f.read() 78 | part_1, part_2 = split_special_query(full_content) 79 | with open(output_dir + '/' + args.template[:-4] + '_part1.sql', 'w') as f: 80 | f.write(part_1) 81 | with open(output_dir + '/' + args.template[:-4] + '_part2.sql', 'w') as f: 82 | f.write(part_2) 83 | cmd = ['rm', output_dir + '/' + 'query_0.sql'] 84 | subprocess.run(cmd, check=True, cwd=str(work_dir)) 85 | else: 86 | subprocess.run(['mv', 87 | output_dir + '/' + 'query_0.sql', 88 | output_dir + '/' + args.template[:-4] + '.sql'], 89 | check=True, cwd=str(work_dir)) 90 | 91 | def split_special_query(q): 92 | split_q = q.split(';') 93 | # now split_q has 3 items: 94 | # 1. "query x in stream x using template query[xx].tpl query_part_1" 95 | # 2. "query_part_2" 96 | # 3. "-- end query [x] in stream [x] using template query[xx].tpl" 97 | part_1 = split_q[0].replace('.tpl', '_part1.tpl') 98 | part_1 += ';' 99 | head = split_q[0].split('\n')[0] 100 | part_2 = head.replace('.tpl', '_part2.tpl') + '\n' 101 | part_2 += split_q[1] 102 | part_2 += ';' 103 | return part_1, part_2 104 | 105 | if __name__ == "__main__": 106 | _, tool_path = check_build() 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('template_dir', 109 | help='directory to find query templates and dialect file.') 110 | parser.add_argument("scale", 111 | help="assume a database of this scale factor." 112 | ) 113 | parser.add_argument("output_dir", 114 | help="directory to generate queries in.") 115 | group = parser.add_mutually_exclusive_group(required=True) 116 | group.add_argument("--template", 117 | help="build queries from this template. Only used to generate one query " + 118 | "from one template. This argument is mutually exclusive with --streams. " + 119 | "It is often used for testing purposes.") 120 | group.add_argument('--streams', 121 | help='number of query streams to generate. ' + 122 | 'This argument is mutually exclusive with --template.') 123 | parser.add_argument('--rngseed', 124 | help='seed for the random number generator.') 125 | 126 | 127 | args = parser.parse_args() 128 | 129 | generate_query_streams(args, tool_path) 130 | -------------------------------------------------------------------------------- /nds/nds_rollback.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-DS Benchmark. 31 | # 32 | 33 | import argparse 34 | 35 | from pyspark.sql import SparkSession 36 | 37 | tables_to_rollback = [ 38 | 'catalog_sales', 39 | 'inventory', 40 | 'store_returns', 41 | 'store_sales', 42 | 'web_returns', 43 | 'web_sales'] 44 | 45 | 46 | def rollback(spark, timestamp, tables_to_rollback): 47 | """roll back the tables to the timestamp""" 48 | for table in tables_to_rollback: 49 | print(f"Rolling back {table} to {timestamp}") 50 | rollback_sql = f"CALL spark_catalog.system.rollback_to_timestamp('{table}', TIMESTAMP '{timestamp}')" 51 | spark.sql(rollback_sql) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('timestamp', help='timestamp to rollback to') 57 | args = parser.parse_args() 58 | spark = SparkSession.builder.appName("Rollback").getOrCreate() 59 | rollback(spark, args.timestamp, tables_to_rollback) 60 | spark.stop() -------------------------------------------------------------------------------- /nds/power_run_cpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | source base.template 19 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 20 | 21 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 22 | "--deploy-mode" "client" 23 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 24 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 25 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 26 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 27 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 28 | "--conf" "spark.scheduler.minRegisteredResourcesRatio=1.0" 29 | "--conf" "spark.sql.adaptive.enabled=true" 30 | "--conf" "spark.sql.broadcastTimeout=1200" 31 | "--conf" "spark.dynamicAllocation.enabled=false" 32 | "--jars" "$NDS_LISTENER_JAR") 33 | -------------------------------------------------------------------------------- /nds/power_run_cpu_delta.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 1. The io.delta:delta-core_2.12:1.1.0 only works on Spark 3.2.x 19 | # Please refer to https://docs.delta.io/latest/releases.html for other Spark versions. 20 | 21 | source base.template 22 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 23 | 24 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 25 | "--deploy-mode" "client" 26 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 27 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 28 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 29 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 30 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 31 | "--packages" "io.delta:delta-core_2.12:1.1.0" 32 | "--conf" "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" 33 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" 34 | "--jars" "$NDS_LISTENER_JAR") 35 | -------------------------------------------------------------------------------- /nds/power_run_cpu_iceberg.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 1. The iceberg-spark-runtime-3.2_2.12:0.13.2 only works on Spark 3.2.x 19 | # Please refer to https://iceberg.apache.org/releases/ for other Spark versions. 20 | # 2. The Iceberg catalog/tables are expected to be in the current directory. 21 | # see `spark.sql.catalog.spark_catalog.warehouse`. 22 | 23 | source base.template 24 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 25 | 26 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 27 | "--deploy-mode" "client" 28 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 29 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 30 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 31 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 32 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 33 | "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2" 34 | "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" 35 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog" 36 | "--conf" "spark.sql.catalog.spark_catalog.type=hadoop" 37 | "--jars" "$NDS_LISTENER_JAR") 38 | -------------------------------------------------------------------------------- /nds/power_run_gpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
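# Note on the GPU scheduling settings below: spark.task.resource.gpu.amount=0.0625
# lets up to 16 tasks (1 / 0.0625) be scheduled onto each single-GPU executor,
# while spark.rapids.sql.concurrentGpuTasks (CONCURRENT_GPU_TASKS, 2 by default)
# limits how many of those tasks may actually occupy the GPU at the same time.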
16 | # 17 | 18 | source base.template 19 | export CONCURRENT_GPU_TASKS=${CONCURRENT_GPU_TASKS:-2} 20 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 21 | 22 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 23 | "--deploy-mode" "client" 24 | "--conf" "spark.driver.maxResultSize=2GB" 25 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 26 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 27 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 28 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 29 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 30 | "--conf" "spark.sql.files.maxPartitionBytes=2gb" 31 | "--conf" "spark.sql.adaptive.enabled=true" 32 | "--conf" "spark.executor.resource.gpu.amount=1" 33 | "--conf" "spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh" 34 | "--conf" "spark.task.resource.gpu.amount=0.0625" 35 | "--conf" "spark.plugins=com.nvidia.spark.SQLPlugin" 36 | "--conf" "spark.rapids.memory.host.spillStorageSize=32G" 37 | "--conf" "spark.rapids.memory.pinnedPool.size=8g" 38 | "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}" 39 | "--files" "$SPARK_HOME/examples/src/main/scripts/getGpusResources.sh" 40 | "--jars" "$SPARK_RAPIDS_PLUGIN_JAR,$NDS_LISTENER_JAR") 41 | -------------------------------------------------------------------------------- /nds/power_run_gpu_delta.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 1. The io.delta:delta-core_2.12:1.1.0 only works on Spark 3.2.x 19 | # Please refer to https://docs.delta.io/latest/releases.html for other Spark versions. 
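# 2. Typical use is via the wrapper script, mirroring the example shown in
#    spark-submit-template (the paths below are illustrative):
#    ./spark-submit-template power_run_gpu_delta.template nds_power.py \
#        local_data_delta/ ./nds_query_streams/query_0.sql time.csv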
20 | 21 | source base.template 22 | export CONCURRENT_GPU_TASKS=${CONCURRENT_GPU_TASKS:-2} 23 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 24 | 25 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 26 | "--deploy-mode" "client" 27 | "--conf" "spark.driver.maxResultSize=2GB" 28 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 29 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 30 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 31 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 32 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 33 | "--conf" "spark.sql.files.maxPartitionBytes=2gb" 34 | "--conf" "spark.sql.adaptive.enabled=true" 35 | "--conf" "spark.executor.resource.gpu.amount=1" 36 | "--conf" "spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh" 37 | "--conf" "spark.task.resource.gpu.amount=0.0625" 38 | "--conf" "spark.plugins=com.nvidia.spark.SQLPlugin" 39 | "--conf" "spark.rapids.memory.host.spillStorageSize=32G" 40 | "--conf" "spark.rapids.memory.pinnedPool.size=8g" 41 | "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}" 42 | "--packages" "io.delta:delta-core_2.12:1.1.0" 43 | "--conf" "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" 44 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" 45 | "--files" "$SPARK_HOME/examples/src/main/scripts/getGpusResources.sh" 46 | "--jars" "$SPARK_RAPIDS_PLUGIN_JAR,$NDS_LISTENER_JAR") 47 | -------------------------------------------------------------------------------- /nds/power_run_gpu_iceberg.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 1. The iceberg-spark-runtime-3.2_2.12:0.13.2 only works on Spark 3.2.x 19 | # Please refer to https://iceberg.apache.org/releases/ for other Spark versions. 20 | # 2. The Iceberg catalog/tables are expected to be in the current directory. 21 | # see `spark.sql.catalog.spark_catalog.warehouse`.
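# 3. Typical use is via the wrapper script; see spark-submit-template for a
#    concrete argument example:
#    ./spark-submit-template power_run_gpu_iceberg.template nds_power.py <data> <query_stream.sql> <time.csv>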
22 | 23 | source base.template 24 | export CONCURRENT_GPU_TASKS=${CONCURRENT_GPU_TASKS:-2} 25 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 26 | 27 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 28 | "--deploy-mode" "client" 29 | "--conf" "spark.driver.maxResultSize=2GB" 30 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 31 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 32 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 33 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 34 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 35 | "--conf" "spark.sql.files.maxPartitionBytes=2gb" 36 | "--conf" "spark.sql.adaptive.enabled=true" 37 | "--conf" "spark.executor.resource.gpu.amount=1" 38 | "--conf" "spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh" 39 | "--conf" "spark.task.resource.gpu.amount=0.0625" 40 | "--conf" "spark.plugins=com.nvidia.spark.SQLPlugin" 41 | "--conf" "spark.rapids.memory.host.spillStorageSize=32G" 42 | "--conf" "spark.rapids.memory.pinnedPool.size=8g" 43 | "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}" 44 | "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2" 45 | "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" 46 | "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog" 47 | "--conf" "spark.sql.catalog.spark_catalog.type=hadoop" 48 | "--files" "$SPARK_HOME/examples/src/main/scripts/getGpusResources.sh" 49 | "--jars" "$SPARK_RAPIDS_PLUGIN_JAR,$NDS_LISTENER_JAR") 50 | -------------------------------------------------------------------------------- /nds/properties/aqe-on.properties: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | spark.sql.adaptive.enabled=true -------------------------------------------------------------------------------- /nds/python_listener/PythonListener.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: Apache-2.0 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | # 18 | from pyspark import SparkContext 19 | from pyspark.java_gateway import ensure_callback_server_started 20 | 21 | class PythonListener(object): 22 | package = "com.nvidia.spark.rapids.listener" 23 | 24 | @staticmethod 25 | def get_manager(): 26 | jvm = SparkContext.getOrCreate()._jvm 27 | manager = getattr(jvm, "{}.{}".format(PythonListener.package, "Manager")) 28 | return manager 29 | 30 | def __init__(self): 31 | self.uuid = None 32 | self.failures = [] 33 | 34 | def notify(self, obj): 35 | """This method is required by Scala Listener interface 36 | we defined above. 37 | """ 38 | self.failures.append(obj) 39 | 40 | def register(self): 41 | ensure_callback_server_started(gw = SparkContext.getOrCreate()._gateway) 42 | manager = PythonListener.get_manager() 43 | self.uuid = manager.register(self) 44 | return self.uuid 45 | 46 | def unregister(self): 47 | manager = PythonListener.get_manager() 48 | manager.unregister(self.uuid) 49 | self.uuid = None 50 | 51 | # should call after register 52 | def register_spark_listener(self): 53 | manager = PythonListener.get_manager() 54 | manager.registerSparkListener() 55 | 56 | def unregister_spark_listener(self): 57 | manager = PythonListener.get_manager() 58 | manager.unregisterSparkListener() 59 | 60 | class Java: 61 | implements = ["com.nvidia.spark.rapids.listener.Listener"] 62 | -------------------------------------------------------------------------------- /nds/python_listener/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-DS version 3.2.0 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 28 | # DISCLAIMER: Portions of this file is derived from the TPC-DS Benchmark and as such any results 29 | # obtained using this file are not comparable to published TPC-DS Benchmark results, as the results 30 | # obtained from using this file do not comply with the TPC-DS Benchmark. 
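# Example usage of the PythonListener exported below (an illustrative sketch;
# it assumes a live SparkContext and that the JVM listener jar is on the
# driver classpath, i.e. "--jars" "$NDS_LISTENER_JAR" as in the templates):
#
#     from python_listener import PythonListener
#     listener = PythonListener()
#     listener.register()                 # returns a UUID from the JVM-side Manager
#     listener.register_spark_listener()  # must be called after register()
#     # ... run Spark jobs; task-failure reasons accumulate in listener.failures
#     listener.unregister_spark_listener()
#     listener.unregister()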
31 | # 32 | from .PythonListener import PythonListener 33 | -------------------------------------------------------------------------------- /nds/spark-submit-template: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: Apache-2.0 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | set -ex 19 | # e.g. 20 | # ./spark-submit-template power_run_gpu.template nds_power.py \ 21 | # local_data_parquet/ 22 | # ./nds_query_streams/query_0.sql \ 23 | # time.csv 24 | 25 | # the first argument must be the template file 26 | source "$1" 27 | # build spark-submit command 28 | MORE_ARGS=("${@:2}") 29 | CMD=("$SPARK_HOME/bin/spark-submit") 30 | CMD+=("${SPARK_CONF[@]}") 31 | CMD+=("${MORE_ARGS[@]}") 32 | # submit 33 | "${CMD[@]}" 34 | -------------------------------------------------------------------------------- /nds/tpcds-gen/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
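# Prerequisite: TPCDS_HOME must point at a local copy of the TPC-DS toolkit;
# the check-env target below aborts the build otherwise. For example (the
# path is illustrative):
#     TPCDS_HOME=/opt/tpcds-kit make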
16 | # 17 | 18 | all: target/lib/dsdgen.jar target/tpcds-gen-1.0-SNAPSHOT.jar 19 | 20 | target/tpcds-gen-1.0-SNAPSHOT.jar: $(shell find -name *.java) 21 | ifdef MVN_REPO_URL 22 | mvn package -s ../cicd/settings.xml 23 | else 24 | mvn package 25 | endif 26 | 27 | target/lib/dsdgen.jar: target/tools/dsdgen 28 | cd target/; mkdir -p lib/; ( jar cvf lib/dsdgen.jar tools/ || gjar cvf lib/dsdgen.jar tools/ ) 29 | 30 | target/tools/dsdgen: check-env 31 | mkdir -p target/ 32 | cp patches/*.patch $(TPCDS_HOME)/ 33 | # unapply previously applied patches if any, ignore errors 34 | -cd $(TPCDS_HOME); cat *.patch | patch -R -p1 -N 35 | # apply patches to both source code and templates 36 | cd $(TPCDS_HOME) && cat *.patch | patch -p1 37 | test -d target/tools/ || (cd target; cp -r $(TPCDS_HOME)/tools tools) 38 | cd target/tools; make clean; make 39 | 40 | check-env: 41 | ifndef TPCDS_HOME 42 | $(error "TPCDS_HOME not defined, please set TPCDS_HOME environment variable to your TPCDS Tool directory") 43 | endif 44 | 45 | clean: 46 | mvn clean -------------------------------------------------------------------------------- /nds/tpcds-gen/README.md: -------------------------------------------------------------------------------- 1 | ## Disclaimer 2 | 3 | NDS is derived from the TPC-DS benchmarks and as such any results obtained using NDS are not 4 | comparable to published TPC-DS benchmark results, as the results obtained from using NDS do not 5 | comply with the TPC-DS benchmark. 6 | 7 | ## License 8 | 9 | NDS is licensed under Apache License, Version 2.0. 10 | 11 | Additionally, certain files in NDS are licensed subject to the accompanying [TPC EULA](TPC%20EULA.txt) (also 12 | available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). Files subject to the TPC 13 | EULA are identified as such within the files. 14 | 15 | You may not use NDS except in compliance with the Apache License, Version 2.0 and the TPC EULA. 16 | 17 | 18 | MapReduce TPC-DS Generator 19 | ========================== 20 | 21 | This simplifies creating TPC-DS datasets at large scales on a Hadoop cluster. 22 | 23 | To get set up, you need to run 24 | 25 | ``` 26 | $ make 27 | ``` 28 | 29 | This will patch and compile the TPC-DS dsdgen program from the toolkit pointed to by `TPCDS_HOME` (see the Makefile's check-env target), and use maven to build the MR app wrapped around it. 30 | 31 | To generate the data, use a variation of the following command to specify the target directory in HDFS (`-d`), the scale factor in GB (`-s 10000`, for 10TB), and the parallelism to use (`-p 100`). 32 | 33 | ``` 34 | $ hadoop jar target/tpcds-gen-1.0-SNAPSHOT.jar -d /tmp/tpc-ds/sf10000/ -p 100 -s 10000 35 | ``` 36 | 37 | This reuses the existing parallelism in TPC-DS's driver.c without modification, running the generator on multiple machines instead of in local fork mode. 38 | 39 | The command generates multiple files for each map task, resulting in each table having its own subdirectory. 40 | 41 | It is assumed that all machines in the cluster are identical in OS, architecture, and libraries.
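For example, a 100-way generation at SF 10000 lands in a layout like the following (the file names are illustrative; dsdgen's exact naming depends on its parallel/child settings):

```
/tmp/tpc-ds/sf10000/
├── call_center/
│   └── call_center_1_100.dat
├── catalog_sales/
│   ├── catalog_sales_1_100.dat
│   └── ...
└── ...
```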
42 | -------------------------------------------------------------------------------- /nds/tpcds-gen/patches/code.patch: -------------------------------------------------------------------------------- 1 | diff --git a/tools/print.c b/tools/print.c 2 | index 5ecb5d7..1baf56e 100644 3 | --- a/tools/print.c 4 | +++ b/tools/print.c 5 | @@ -68,6 +68,7 @@ print_close(int tbl) 6 | fpOutfile = NULL; 7 | if (pTdef->outfile) 8 | { 9 | + fflush(pTdef->outfile); 10 | fclose(pTdef->outfile); 11 | pTdef->outfile = NULL; 12 | } 13 | @@ -537,7 +538,7 @@ print_end (int tbl) 14 | if (add_term) 15 | fwrite(term, 1, add_term, fpOutfile); 16 | fprintf (fpOutfile, "\n"); 17 | - fflush(fpOutfile); 18 | + //fflush(fpOutfile); 19 | 20 | return (res); 21 | } 22 | diff --git a/tools/r_params.c b/tools/r_params.c 23 | index 4db16e5..9b1a8e6 100644 24 | --- a/tools/r_params.c 25 | +++ b/tools/r_params.c 26 | @@ -46,7 +46,7 @@ 27 | #include "tdefs.h" 28 | #include "release.h" 29 | 30 | -#define PARAM_MAX_LEN 80 31 | +#define PARAM_MAX_LEN PATH_MAX 32 | 33 | #ifndef TEST 34 | extern option_t options[]; 35 | @@ -275,7 +275,7 @@ set_str(char *var, char *val) 36 | nParam = fnd_param(var); 37 | if (nParam >= 0) 38 | { 39 | - strcpy(params[options[nParam].index], val); 40 | + strncpy(params[options[nParam].index], val, PARAM_MAX_LEN); 41 | options[nParam].flags |= OPT_SET; 42 | } 43 | 44 | -------------------------------------------------------------------------------- /nds/tpcds-gen/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 22 | 23 | 4.0.0 24 | 25 | org.notmysock.tpcds 26 | tpcds-gen 27 | 1.0-SNAPSHOT 28 | jar 29 | 30 | tpcds-gen 31 | http://maven.apache.org 32 | 33 | 34 | 1.8 35 | 36 | 37 | 38 | 39 | org.apache.hadoop 40 | hadoop-client 41 | 3.2.1 42 | compile 43 | 44 | 45 | commons-cli 46 | commons-cli 47 | 1.1 48 | compile 49 | 50 | 51 | org.mockito 52 | mockito-core 53 | 1.8.5 54 | test 55 | 56 | 57 | junit 58 | junit 59 | 4.13.1 60 | test 61 | 62 | 63 | 64 | 65 | 66 | 67 | maven-compiler-plugin 68 | 69 | ${tpcds-gen.jdk.version} 70 | ${tpcds-gen.jdk.version} 71 | 72 | 73 | 74 | org.apache.maven.plugins 75 | maven-jar-plugin 76 | 77 | 78 | 79 | true 80 | lib/ 81 | org.notmysock.tpcds.GenTable 82 | 83 | 84 | 85 | 86 | 87 | org.apache.maven.plugins 88 | maven-dependency-plugin 89 | 90 | 91 | copy-dependencies 92 | package 93 | 94 | copy-dependencies 95 | 96 | 97 | ${project.build.directory}/lib 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /scripts/auto-copyrighter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2024, NVIDIA CORPORATION. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
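# Worked example of the sed rewrite at the bottom of this script, assuming it
# runs in 2025:
#   "Copyright (c) 2024, NVIDIA CORPORATION"      becomes "Copyright (c) 2024-2025, NVIDIA CORPORATION"
#   "Copyright (c) 2022-2024, NVIDIA CORPORATION" becomes "Copyright (c) 2022-2025, NVIDIA CORPORATION"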
16 | 17 | 18 | SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER=${SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER:-OFF} 19 | 20 | case "$SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER" in 21 | 22 | OFF) 23 | echo "Copyright updater is DISABLED. Automatic Copyright Updater can be enabled/disabled by setting \ 24 | SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER=ON or SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER=OFF, \ 25 | correspondingly" 26 | exit 0 27 | ;; 28 | 29 | ON) 30 | ;; 31 | 32 | *) 33 | echo "Invalid value of SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER=$SPARK_RAPIDS_BENCHMARKS_AUTO_COPYRIGHTER. 34 | Only ON or OFF are allowed" 35 | exit 1 36 | ;; 37 | esac 38 | 39 | set -x 40 | echo "$@" | xargs -L1 sed -i -E \ 41 | "s/Copyright *\(c\) *([0-9,-]+)*-([0-9]{4}), *NVIDIA *CORPORATION/Copyright (c) \\1-`date +%Y`, NVIDIA CORPORATION/; /`date +%Y`/! s/Copyright *\(c\) ([0-9]{4}), *NVIDIA *CORPORATION/Copyright (c) \\1-`date +%Y`, NVIDIA CORPORATION/" -------------------------------------------------------------------------------- /shared/base.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # This is the base template file for the common information about test environment 19 | # including the information about Spark, cluster configuration and the Jar files, 20 | # which are required in the other templates. 21 | # We'll source this base file in all the other templates so that we just need to update 22 | # here once instead of updating in all the templates. 23 | # If you have any different configuration in a specific template, you can override 24 | # the variables in that template. 25 | 26 | export SPARK_HOME=${SPARK_HOME:-/usr/lib/spark} 27 | export SPARK_MASTER=${SPARK_MASTER:-local[*]} 28 | export DRIVER_MEMORY=${DRIVER_MEMORY:-10G} 29 | export EXECUTOR_CORES=${EXECUTOR_CORES:-12} 30 | export NUM_EXECUTORS=${NUM_EXECUTORS:-8} 31 | export EXECUTOR_MEMORY=${EXECUTOR_MEMORY:-16G} 32 | 33 | # The NDS listener jar which is built in jvm_listener directory. 34 | export NDS_LISTENER_JAR=${NDS_LISTENER_JAR:-../utils/jvm_listener/target/benchmark-listener-1.0-SNAPSHOT.jar} 35 | # The spark-rapids jar which is required when running on GPU 36 | export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_PLUGIN_JAR:-rapids-4-spark_2.12-24.04.0-cuda11.jar} 37 | export PYTHONPATH=$SPARK_HOME/python:`echo $SPARK_HOME/python/lib/py4j-*.zip` 38 | -------------------------------------------------------------------------------- /shared/convert_submit_cpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | source base.template 19 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 20 | 21 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 22 | "--deploy-mode" "client" 23 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 24 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 25 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 26 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 27 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 28 | "--conf" "spark.sql.legacy.charVarcharAsString=true") 29 | -------------------------------------------------------------------------------- /shared/convert_submit_gpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | source base.template 19 | export CONCURRENT_GPU_TASKS=${CONCURRENT_GPU_TASKS:-2} 20 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 21 | 22 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 23 | "--deploy-mode" "client" 24 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 25 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 26 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 27 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 28 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 29 | "--conf" "spark.executor.resource.gpu.amount=1" 30 | "--conf" "spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh" 31 | "--conf" "spark.plugins=com.nvidia.spark.SQLPlugin" 32 | "--conf" "spark.rapids.memory.pinnedPool.size=8g" 33 | "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}" 34 | "--conf" "spark.rapids.sql.explain=NOT_ON_GPU" 35 | "--conf" "spark.rapids.sql.incompatibleOps.enabled=true" 36 | "--conf" "spark.rapids.sql.variableFloatAgg.enabled=true" 37 | "--conf" "spark.sql.files.maxPartitionBytes=2g" 38 | "--conf" "spark.sql.legacy.parquet.datetimeRebaseModeInWrite=CORRECTED" 39 | "--conf" "spark.task.resource.gpu.amount=0.05" 40 | "--conf" "spark.sql.legacy.charVarcharAsString=true" 41 | "--files" "/opt/spark/getGpusResources.sh" 42 | "--jars" "$SPARK_RAPIDS_PLUGIN_JAR") 43 | -------------------------------------------------------------------------------- /shared/power_run_cpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | source base.template 19 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 20 | 21 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 22 | "--deploy-mode" "client" 23 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 24 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 25 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 26 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 27 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 28 | "--conf" "spark.scheduler.minRegisteredResourcesRatio=1.0" 29 | "--conf" "spark.sql.adaptive.enabled=true" 30 | "--conf" "spark.sql.broadcastTimeout=1200" 31 | "--conf" "spark.dynamicAllocation.enabled=false" 32 | "--jars" "$NDS_LISTENER_JAR") 33 | -------------------------------------------------------------------------------- /shared/power_run_gpu.template: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | source base.template 19 | export CONCURRENT_GPU_TASKS=${CONCURRENT_GPU_TASKS:-2} 20 | export SHUFFLE_PARTITIONS=${SHUFFLE_PARTITIONS:-200} 21 | 22 | export SPARK_CONF=("--master" "${SPARK_MASTER}" 23 | "--deploy-mode" "client" 24 | "--conf" "spark.driver.maxResultSize=2GB" 25 | "--conf" "spark.driver.memory=${DRIVER_MEMORY}" 26 | "--conf" "spark.executor.cores=${EXECUTOR_CORES}" 27 | "--conf" "spark.executor.instances=${NUM_EXECUTORS}" 28 | "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}" 29 | "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}" 30 | "--conf" "spark.sql.files.maxPartitionBytes=2gb" 31 | "--conf" "spark.sql.adaptive.enabled=true" 32 | "--conf" "spark.executor.resource.gpu.amount=1" 33 | "--conf" "spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh" 34 | "--conf" "spark.task.resource.gpu.amount=0.0625" 35 | "--conf" "spark.plugins=com.nvidia.spark.SQLPlugin" 36 | "--conf" "spark.rapids.memory.host.spillStorageSize=32G" 37 | "--conf" "spark.rapids.memory.pinnedPool.size=8g" 38 | "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}" 39 | "--files" "$SPARK_HOME/examples/src/main/scripts/getGpusResources.sh" 40 | "--jars" "$SPARK_RAPIDS_PLUGIN_JAR,$NDS_LISTENER_JAR") 41 | -------------------------------------------------------------------------------- /shared/spark-submit-template: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 4 | # SPDX-License-Identifier: Apache-2.0 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | set -ex 19 | # e.g. 20 | # ./spark-submit-template power_run_gpu.template nds_power.py \ 21 | # local_data_parquet/ 22 | # ./nds_query_streams/query_0.sql \ 23 | # time.csv 24 | 25 | # the first argument must be the template file 26 | source "$1" 27 | # build spark-submit command 28 | MORE_ARGS=("${@:2}") 29 | CMD=("$SPARK_HOME/bin/spark-submit") 30 | CMD+=("${SPARK_CONF[@]}") 31 | CMD+=("${MORE_ARGS[@]}") 32 | # submit 33 | "${CMD[@]}" 34 | -------------------------------------------------------------------------------- /utils/check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | # -----
19 | #
20 | # Certain portions of the contents of this file are derived from TPC-DS version 3.2.0
21 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp).
22 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”)
23 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also
24 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”).
25 | #
26 | # You may not use this file except in compliance with the TPC EULA.
27 | # DISCLAIMER: Portions of this file are derived from the TPC-DS Benchmark and as such any results
28 | # obtained using this file are not comparable to published TPC-DS Benchmark results, as the results
29 | # obtained from using this file do not comply with the TPC-DS Benchmark.
30 | #
31 | 
32 | import argparse
33 | import os
34 | import sys
35 | from pathlib import Path
36 | 
37 | 
38 | def check_version():
39 |     req_ver = (3, 6)
40 |     cur_ver = sys.version_info
41 |     if cur_ver < req_ver:
42 |         raise Exception('Minimum required Python version is 3.6, but current python version is {}.'
43 |                         .format(str(cur_ver.major) + '.' + str(cur_ver.minor)) +
44 |                         ' Please use a proper Python version.')
45 | 
46 | 
47 | def check_build_nds_h():
48 |     """check jar and tpch executable
49 | 
50 |     Raises:
51 |         Exception: the build is not done or broken
52 | 
53 |     Returns:
54 |         PosixPath, PosixPath: path of jar and dbgen executable
55 |     """
56 |     # Check if necessary executable or jars are built.
57 |     # we assume the user won't move this script.
58 |     src_dir = Path(__file__).parent.parent.absolute()
59 |     tool_path = list(Path(src_dir / 'nds-h/tpch-gen/target/dbgen').rglob("dbgen"))
60 |     jar_path = list(
61 |         Path(src_dir / 'nds-h/tpch-gen/target').rglob("tpch-gen-*.jar"))
62 |     print(tool_path)
63 |     if tool_path == []:
64 |         raise Exception('dbgen executable is ' +
65 |                         'not found in `target` folder. ' +
66 |                         'Please refer to README document and build this project first.')
67 |     return jar_path[0], tool_path[0]
68 | 
69 | def check_build_nds():
70 |     """check jar and tpcds executable
71 | 
72 |     Raises:
73 |         Exception: the build is not done or broken
74 | 
75 |     Returns:
76 |         PosixPath, PosixPath: path of jar and dsdgen executable
77 |     """
78 |     # Check if necessary executable or jars are built.
79 |     # we assume the user won't move this script.
80 |     src_dir = Path(__file__).parent.parent.absolute()
81 |     jar_path = list(
82 |         Path(src_dir / 'nds/tpcds-gen/target').rglob("tpcds-gen-*.jar"))
83 |     tool_path = list(Path(src_dir / 'nds/tpcds-gen/target/tools').rglob("dsdgen"))
84 |     if jar_path == [] or tool_path == []:
85 |         raise Exception('Target jar file is not found in `target` folder or dsdgen executable is ' +
86 |                         'not found in `target/tools` folder. ' +
87 |                         'Please refer to README document and build this project first.')
88 |     return jar_path[0], tool_path[0]
89 | 
90 | 
91 | 
92 | def get_abs_path(input_path):
93 |     """receive a user input path and return absolute path of it.
94 | 
95 |     Args:
96 |         input_path (str): user's input path
97 | 
98 |     Returns:
99 |         str: if the input is absolute, return it; if it's a relative path, return the absolute
100 |         path of it.
101 |     """
102 |     if Path(input_path).is_absolute():
103 |         # it's absolute path
104 |         output_path = input_path
105 |     else:
106 |         # it's relative path where this script is executed
107 |         output_path = os.getcwd() + '/' + input_path
108 |     return output_path
109 | 
110 | 
111 | def valid_range(range, parallel):
112 |     """check that the range input is valid
113 | 
114 |     Args:
115 |         range (str): a range specified for a range data generation, e.g. "1,10"
116 |         parallel (str): string type number for parallelism in TPC-DS data generation, e.g. "20"
117 | 
118 |     Raises:
119 |         Exception: error message for invalid range input.
120 |     """
121 |     if len(range.split(',')) != 2:
122 |         msg = 'Invalid range: please specify a range with a comma between start and end. e.g., "1,10".'
123 |         raise Exception(msg)
124 |     range_start = int(range.split(',')[0])
125 |     range_end = int(range.split(',')[1])
126 |     if range_start < 1 or range_start > range_end or range_end > int(parallel):
127 |         msg = 'Please provide correct child range: 1 <= range_start <= range_end <= parallel'
128 |         raise Exception(msg)
129 |     return range_start, range_end
130 | 
131 | 
132 | def parallel_value_type(p):
133 |     """helper function to check the parallel value
134 | 
135 |     Args:
136 |         p (str): parallel value
137 | 
138 |     Raises:
139 |         argparse.ArgumentTypeError: ArgumentTypeError exception
140 | 
141 |     Returns:
142 |         str: parallel in string
143 |     """
144 |     if int(p) < 2:
145 |         raise argparse.ArgumentTypeError("PARALLEL must be >= 2")
146 |     return p
147 | 
148 | 
149 | def get_dir_size(start_path):
150 |     total_size = 0
151 |     for dirpath, dirnames, filenames in os.walk(start_path):
152 |         for f in filenames:
153 |             fp = os.path.join(dirpath, f)
154 |             # skip if it is symbolic link
155 |             if not os.path.islink(fp):
156 |                 total_size += os.path.getsize(fp)
157 |     return total_size
158 | 
159 | def check_json_summary_folder(json_summary_folder):
160 |     if json_summary_folder:
161 |         # prepare a folder to save json summaries of query results
162 |         if not os.path.exists(json_summary_folder):
163 |             os.makedirs(json_summary_folder)
164 |         else:
165 |             if os.listdir(json_summary_folder):
166 |                 raise Exception(f"json_summary_folder {json_summary_folder} is not empty. " +
167 |                                 "There may already be some json files there. Please clean the folder " +
168 |                                 "or specify another one.")
169 | 
170 | def check_query_subset_exists(query_dict, subset_list):
171 |     """check if the query subset exists in the query dictionary"""
172 |     for q in subset_list:
173 |         if q not in query_dict.keys():
174 |             raise Exception(f"Query {q} is not in the query dictionary. Please check the query subset.")
175 |     return True
176 | 
--------------------------------------------------------------------------------
/utils/jvm_listener/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!--
3 |   SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 |   SPDX-License-Identifier: Apache-2.0
5 | 
6 |   Licensed under the Apache License, Version 2.0 (the "License");
7 |   you may not use this file except in compliance with the License.
8 |   You may obtain a copy of the License at
9 | 
10 |       http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 |   Unless required by applicable law or agreed to in writing, software
13 |   distributed under the License is distributed on an "AS IS" BASIS,
14 |   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |   See the License for the specific language governing permissions and limitations under the License.
16 | -->
17 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
18 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
19 |   <modelVersion>4.0.0</modelVersion>
20 | 
21 |   <groupId>com.nvidia</groupId>
22 |   <artifactId>benchmark-listener</artifactId>
23 |   <packaging>jar</packaging>
24 |   <version>1.0-SNAPSHOT</version>
25 | 
26 |   <properties>
27 |     <maven.compiler.source>8</maven.compiler.source>
28 |     <maven.compiler.target>8</maven.compiler.target>
29 |   </properties>
30 | 
31 |   <dependencies>
32 |     <dependency>
33 |       <groupId>org.apache.spark</groupId>
34 |       <artifactId>spark-core_2.12</artifactId>
35 |       <version>3.1.2</version>
36 |     </dependency>
37 | 
38 |     <dependency>
39 |       <groupId>org.apache.spark</groupId>
40 |       <artifactId>spark-sql_2.12</artifactId>
41 |       <version>3.1.2</version>
42 |       <scope>provided</scope>
43 |     </dependency>
44 |   </dependencies>
45 | 
46 |   <build>
47 |     <sourceDirectory>src/main/scala/</sourceDirectory>
48 |     <plugins>
49 |       <plugin>
50 |         <groupId>org.apache.maven.plugins</groupId>
51 |         <artifactId>maven-compiler-plugin</artifactId>
52 |         <version>3.8.1</version>
53 |         <configuration>
54 |           <source>1.8</source>
55 |           <target>1.8</target>
56 |         </configuration>
57 |       </plugin>
58 |       <plugin>
59 |         <groupId>org.scala-tools</groupId>
60 |         <artifactId>maven-scala-plugin</artifactId>
61 |         <version>2.15.2</version>
62 |         <executions>
63 |           <execution>
64 |             <goals>
65 |               <goal>compile</goal>
66 |               <goal>testCompile</goal>
67 |             </goals>
68 |           </execution>
69 |         </executions>
70 |       </plugin>
71 |     </plugins>
72 |   </build>
73 | 
74 | </project>
75 | 
--------------------------------------------------------------------------------
/utils/jvm_listener/src/main/scala/com/nvidia/spark/rapids/listener/Listener.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.nvidia.spark.rapids.listener
19 | 
20 | trait Listener {
21 |   /* Listener interface to be implemented at Python side
22 |    */
23 |   def notify(x: Any): Any
24 | }
25 | 
--------------------------------------------------------------------------------
/utils/jvm_listener/src/main/scala/com/nvidia/spark/rapids/listener/Manager.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 |  * SPDX-License-Identifier: Apache-2.0
4 |  *
5 |  * Licensed under the Apache License, Version 2.0 (the "License");
6 |  * you may not use this file except in compliance with the License.
7 |  * You may obtain a copy of the License at
8 |  *
9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.nvidia.spark.rapids.listener
19 | 
20 | import org.apache.spark.SparkContext
21 | 
22 | object Manager {
23 |   /* Manager class to manage all extra customized listeners.
24 |    */
25 |   private var listeners: Map[String, Listener] = Map()
26 |   private val spark_listener = new TaskFailureListener()
27 |   private var isRegistered = false
28 | 
29 |   def register(listener: Listener): String = {
30 |     /* Note this register method has nothing to do with SparkContext.addSparkListener method.
31 | * This method is only to provide an interface to developers to have a better control over 32 | * all customized listeners. 33 | */ 34 | this.synchronized { 35 | // We register to the spark listener when the first listener is registered. 36 | registerSparkListener() 37 | val uuid = java.util.UUID.randomUUID().toString 38 | listeners = listeners + (uuid -> listener) 39 | uuid 40 | } 41 | } 42 | 43 | def unregister(uuid: String) = { 44 | this.synchronized { 45 | listeners = listeners - uuid 46 | } 47 | } 48 | 49 | def notifyAll(message: String): Unit = { 50 | for { (_, listener) <- listeners } listener.notify(message) 51 | } 52 | 53 | def registerSparkListener() : Unit = { 54 | if (!isRegistered) { 55 | SparkContext.getOrCreate().addSparkListener(spark_listener) 56 | isRegistered = true 57 | } 58 | } 59 | 60 | def unregisterSparkListener() : Unit = { 61 | if (isRegistered) { 62 | SparkContext.getOrCreate().removeSparkListener(spark_listener) 63 | isRegistered = false 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /utils/jvm_listener/src/main/scala/com/nvidia/spark/rapids/listener/TaskFailureListener.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.nvidia.spark.rapids.listener 19 | 20 | import org.apache.spark.{Success, TaskEndReason} 21 | import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} 22 | import scala.collection.mutable.ListBuffer 23 | 24 | 25 | /* A simple listener which captures SparkListenerTaskEnd, 26 | * extracts "reason" of the task. If the reason is not "Success", 27 | * send this reason to python side. 28 | */ 29 | class TaskFailureListener extends SparkListener { 30 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { 31 | taskEnd.reason match { 32 | case Success => 33 | case reason => Manager.notifyAll(reason.toString) 34 | } 35 | super.onTaskEnd(taskEnd) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /utils/properties/aqe-on.properties: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | spark.sql.adaptive.enabled=true
--------------------------------------------------------------------------------
/utils/python_benchmark_reporter/PysparkBenchReport.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 | # SPDX-License-Identifier: Apache-2.0
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 | # -----
20 | #
21 | # Certain portions of the contents of this file are derived from TPC-H version 3.2.0
22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp).
23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”)
24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also
25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”).
26 | #
27 | # You may not use this file except in compliance with the TPC EULA.
28 | # DISCLAIMER: Portions of this file are derived from the TPC-H Benchmark and as such any results
29 | # obtained using this file are not comparable to published TPC-H Benchmark results, as the results
30 | # obtained from using this file do not comply with the TPC-H Benchmark.
31 | #
32 | 
33 | import json
34 | import os
35 | import time
36 | import traceback
37 | from typing import Callable
38 | from pyspark.sql import SparkSession
39 | from python_benchmark_reporter.PythonListener import PythonListener
40 | 
41 | class PysparkBenchReport:
42 |     """Class to generate json summary report for a benchmark
43 |     """
44 |     def __init__(self, spark_session: SparkSession, query_name) -> None:
45 |         self.spark_session = spark_session
46 |         self.summary = {
47 |             'env': {
48 |                 'envVars': {},
49 |                 'sparkConf': {},
50 |                 'sparkVersion': None
51 |             },
52 |             'queryStatus': [],
53 |             'exceptions': [],
54 |             'startTime': None,
55 |             'queryTimes': [],
56 |             'query': query_name,
57 |         }
58 | 
59 |     def report_on(self, fn: Callable, warmup_iterations = 0, iterations = 1, *args):
60 |         """Record a function's running environment and status, excluding sensitive
61 |         information such as tokens, secrets and passwords, and generate a summary in dict format for it.
62 | 
63 |         Args:
64 |             fn (Callable): a function to be recorded
65 |             warmup_iterations (int): number of untimed warmup runs before measurement
66 |             iterations (int): number of timed runs of fn
67 |         Returns:
68 |             dict: summary of the fn
69 |         """
70 |         spark_conf = dict(self.spark_session.sparkContext._conf.getAll())
71 |         env_vars = dict(os.environ)
72 |         redacted = ["TOKEN", "SECRET", "PASSWORD"]
73 |         filtered_env_vars = dict((k, env_vars[k]) for k in env_vars.keys() if not (k in redacted))
74 |         self.summary['env']['envVars'] = filtered_env_vars
75 |         self.summary['env']['sparkConf'] = spark_conf
76 |         self.summary['env']['sparkVersion'] = self.spark_session.version
77 |         listener = None
78 |         try:
79 |             listener = PythonListener()
80 |             listener.register()
81 |         except TypeError as e:
82 |             print("Not found com.nvidia.spark.rapids.listener.Manager", str(e))
83 |             listener = None
84 |         if listener is not None:
85 |             print("TaskFailureListener is registered.")
86 |         try:
87 |             # warmup
88 |             for i in range(0, warmup_iterations):
89 |                 fn(*args)
90 |         except Exception as e:
91 |             print('ERROR WHILE WARMUP BEGIN')
92 |             print(e)
93 |             traceback.print_tb(e.__traceback__)
94 |             print('ERROR WHILE WARMUP END')
95 | 
96 |         start_time = int(time.time() * 1000)
97 |         self.summary['startTime'] = start_time
98 |         # run the query
99 |         for i in range(0, iterations):
100 |             try:
101 |                 start_time = int(time.time() * 1000)
102 |                 fn(*args)
103 |                 end_time = int(time.time() * 1000)
104 |                 if listener and len(listener.failures) != 0:
105 |                     self.summary['queryStatus'].append("CompletedWithTaskFailures")
106 |                 else:
107 |                     self.summary['queryStatus'].append("Completed")
108 |             except Exception as e:
109 |                 # print the exception to ease debugging
110 |                 print('ERROR BEGIN')
111 |                 print(e)
112 |                 traceback.print_tb(e.__traceback__)
113 |                 print('ERROR END')
114 |                 end_time = int(time.time() * 1000)
115 |                 self.summary['queryStatus'].append("Failed")
116 |                 self.summary['exceptions'].append(str(e))
117 |             finally:
118 |                 self.summary['queryTimes'].append(end_time - start_time)
119 |         if listener is not None:
120 |             listener.unregister()
121 |         return self.summary
122 | 
123 |     def write_summary(self, prefix=""):
124 |         """Write the json summary of this benchmark run to a file.
125 | 
126 |         Args:
127 |             prefix (str, optional): prefix for the output json summary file name.
128 |                 Defaults to "".
129 |         """
130 |         # The Power BI side retrieves some information from the summary file name, so keep this file
131 |         # name format for pipeline compatibility
132 |         filename = prefix + '-' + self.summary['query'] + '-' + str(self.summary['startTime']) + '.json'
133 |         self.summary['filename'] = filename
134 |         with open(filename, "w") as f:
135 |             json.dump(self.summary, f, indent=2)
136 | 
137 |     def is_success(self):
138 |         """Check if the query succeeded, i.e. queryStatus == Completed
139 |         """
140 |         return self.summary['queryStatus'][0] == 'Completed'
141 | 
--------------------------------------------------------------------------------
/utils/python_benchmark_reporter/PythonListener.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4 | # SPDX-License-Identifier: Apache-2.0
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | from pyspark import SparkContext 19 | from pyspark.java_gateway import ensure_callback_server_started 20 | 21 | class PythonListener(object): 22 | package = "com.nvidia.spark.rapids.listener" 23 | 24 | @staticmethod 25 | def get_manager(): 26 | jvm = SparkContext.getOrCreate()._jvm 27 | manager = getattr(jvm, "{}.{}".format(PythonListener.package, "Manager")) 28 | return manager 29 | 30 | def __init__(self): 31 | self.uuid = None 32 | self.failures = [] 33 | 34 | def notify(self, obj): 35 | """This method is required by Scala Listener interface 36 | we defined above. 37 | """ 38 | self.failures.append(obj) 39 | 40 | def register(self): 41 | ensure_callback_server_started(gw = SparkContext.getOrCreate()._gateway) 42 | manager = PythonListener.get_manager() 43 | self.uuid = manager.register(self) 44 | return self.uuid 45 | 46 | def unregister(self): 47 | manager = PythonListener.get_manager() 48 | manager.unregister(self.uuid) 49 | self.uuid = None 50 | 51 | # should call after register 52 | def register_spark_listener(self): 53 | manager = PythonListener.get_manager() 54 | manager.registerSparkListener() 55 | 56 | def unregister_spark_listener(self): 57 | manager = PythonListener.get_manager() 58 | manager.unregisterSparkListener() 59 | 60 | class Java: 61 | implements = ["com.nvidia.spark.rapids.listener.Listener"] 62 | -------------------------------------------------------------------------------- /utils/python_benchmark_reporter/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 | # SPDX-License-Identifier: Apache-2.0 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | # ----- 20 | # 21 | # Certain portions of the contents of this file are derived from TPC-H version 3.2.0 22 | # (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp). 23 | # Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”) 24 | # and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also 25 | # available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”). 26 | # 27 | # You may not use this file except in compliance with the TPC EULA. 
28 | # DISCLAIMER: Portions of this file are derived from the TPC-H Benchmark and as such any results
29 | # obtained using this file are not comparable to published TPC-H Benchmark results, as the results
30 | # obtained from using this file do not comply with the TPC-H Benchmark.
--------------------------------------------------------------------------------
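
As a closing illustration of how the reporter and listener above fit together: the snippet below is a minimal sketch (not an excerpt from the repository) of how a benchmark driver such as nds_power.py can use PysparkBenchReport. It assumes the utils/ directory is on PYTHONPATH and that a Spark session is available; the query name, SQL text, and file prefix are illustrative only.

```python
from pyspark.sql import SparkSession

from python_benchmark_reporter.PysparkBenchReport import PysparkBenchReport

spark = SparkSession.builder.appName("bench-report-example").getOrCreate()

def run_query():
    # Illustrative stand-in; the real drivers execute generated NDS/NDS-H query streams.
    spark.sql("SELECT 1 AS x").collect()

report = PysparkBenchReport(spark, query_name="query_example")
# One timed iteration, no warmup. Task failures are captured when the
# benchmark-listener jar is on the driver classpath, and silently skipped otherwise
# (the TypeError branch in report_on handles the missing JVM Manager class).
summary = report.report_on(run_query)
report.write_summary(prefix="example-run")  # writes example-run-query_example-<startTime>.json
print(summary["queryStatus"], summary["queryTimes"])
```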