├── .gitignore ├── LICENSE ├── README.md ├── docs ├── README.md ├── contact │ └── report-a-bug.md ├── examples │ ├── README.md │ ├── basic.md │ ├── blas.md │ └── ptx.md ├── favicon.png ├── licensing.md ├── logo_white.svg ├── manual │ ├── CHANGELOG.md │ ├── api-driver.md │ ├── api-math.md │ ├── api-runtime.md │ ├── apis.md │ ├── comparison.md │ ├── compute-capabilities.md │ ├── diagnostic-flags.md │ ├── dialects.md │ ├── differences.md │ ├── faq.md │ ├── how-to-install.md │ ├── how-to-use.md │ ├── inline-ptx.md │ ├── language-extensions.md │ ├── optimisation-flags.md │ ├── runtime-extensions.md │ └── troubleshooting.md ├── notices.md ├── style.css └── use_of_trademarks.md ├── examples ├── .gitignore ├── example.sh └── src │ ├── basic │ ├── CMakeLists.txt │ └── basic.cu │ ├── blas │ ├── CMakeLists.txt │ └── blas.cu │ └── ptx │ ├── CMakeLists.txt │ └── ptx.cu ├── main.py ├── mkdocs.yml ├── overrides └── partials │ └── footer.html └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | site/ 2 | venv/ 3 | __pycache__/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. 
A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SCALE documentation 2 | 3 | This repository contains the source code for the [SCALE](https://scale-lang.com) 4 | documentation and examples. 5 | 6 | The documentation can be viewed at https://docs.scale-lang.com/. 7 | 8 | The root of the documentation files is at [`docs/README.md`](docs/README.md). 9 | 10 | Pull requests are welcomed! 11 | 12 | The `master` branch holds the current version of the stable documentation. 13 | `unstable` contains the current unstable documentation. 14 | 15 | The [`mike`](https://github.com/jimporter/mike) tool is used for managing 16 | documentation versions. 17 | 18 | ## Compiling the manual 19 | 20 | Create a venv and `pip install -r requirements.txt`. 21 | 22 | To launch the devserver and see changes live: 23 | 24 | ```sh 25 | mkdocs serve 26 | ``` 27 | 28 | ## Publishing 29 | 30 | ### To publish a new version of the manual for a stable release: 31 | 32 | *Make sure not to prepend 'v' to the version number* 33 | 34 | - Ensure your changes are all in master, and master is checked out. 35 | - `mike deploy --push --update-aliases stable` 36 | 37 | To deploy an unstable release of the manual: 38 | 39 | - Check out the `unstable` branch. 40 | - `mike deploy --push unstable` 41 | 42 | ### To publish a new version of the manual for an unstable release: 43 | 44 | *Make sure not to prepend 'v' to the version number* 45 | 46 | - Ensure your changes are all in `unstable`, and `unstable` is checked out. 
47 | - `mike deploy --push unstable` 48 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # SCALE by Spectral Compute 2 | 3 | ## What is SCALE? 4 | 5 | SCALE is a GPGPU programming toolkit that can natively compile CUDA 6 | applications for AMD GPUs. 7 | 8 | SCALE does not require the CUDA program or its build system to be modified. 9 | 10 | Support for more GPU vendors and CUDA APIs is in development. 11 | 12 | ## How do I use SCALE? 13 | 14 | 1. [Install SCALE](./manual/how-to-install.md). 15 | 2. Activate SCALE, e.g. `. /opt/SCALE/scaleenv gfx1100` 16 | 3. Compile your application, following the same steps you would use for 17 | NVIDIA CUDA. 18 | 19 | ## What projects have been tested? 20 | 21 | We validate SCALE by compiling open-source CUDA projects and running their 22 | tests. The list of currently-tested projects and their compatibility status 23 | can be found [here](https://github.com/spectral-compute/scale-validation/tree/master?tab=readme-ov-file#current-status). 24 | 25 | ## Which GPUs are supported? 26 | 27 | The following GPU targets are supported in the free edition of SCALE: 28 | 29 | - AMD `gfx900` (Vega 10, GCN 5.0) 30 | - AMD `gfx1030` (Navi 21, RDNA 2.0) 31 | - AMD `gfx1100` (Navi 31, RDNA 3.0) 32 | - AMD `gfx1010` 33 | - AMD `gfx1101` 34 | - AMD `gfx1102` 35 | 36 | The enterprise edition also has support for: 37 | 38 | - AMD `gfx908` 39 | - AMD `gfx90a` 40 | - AMD `gfx942` 41 | 42 | Academic/research licenses are available. 43 | 44 | [Contact us](#contact-us) if you want us to expedite support for a particular AMD GPU 45 | architecture. 46 | 47 | ## What are the components of SCALE? 48 | 49 | SCALE consists of: 50 | 51 | - An `nvcc`-compatible compiler capable of compiling nvcc-dialect CUDA for AMD 52 | GPUs, including PTX asm. 53 | - An implementation of the CUDA runtime, driver and math APIs for AMD GPUs. 54 | - Open-source wrapper libraries providing the "CUDA-X" APIs by delegating to the 55 | corresponding ROCm libraries. 56 | This is how libraries such as `cuBLAS` and `cuSOLVER` are handled. 57 | 58 | ## What are the differences between SCALE and other solutions? 59 | 60 | Instead of providing a [new way](https://xkcd.com/927/) to write GPGPU 61 | software, SCALE allows programs written using the widely-popular CUDA 62 | language to be directly compiled for AMD GPUs. 63 | 64 | SCALE aims to be fully compatible with NVIDIA CUDA. We believe that users 65 | should not have to maintain multiple codebases or compromise on performance 66 | to support multiple GPU vendors. 67 | 68 | SCALE's language is a _superset_ of NVIDIA CUDA, offering some opt-in 69 | [language extensions](./manual/language-extensions.md) 70 | that can make writing GPU code easier and more efficient for users who wish 71 | to move away from `nvcc`. 72 | 73 | SCALE is a work in progress. If there is a missing API that is blocking your 74 | attempt to use SCALE, please contact us so we can prioritise its development.
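As a quick recap of the usage steps listed earlier on this page, a typical session might look like the following sketch. The installation path `/opt/SCALE`, the `gfx1100` target, and `vector_add.cu` are placeholders; substitute your own.

```sh
# Activate SCALE for your GPU target (after installing it).
. /opt/SCALE/scaleenv gfx1100

# Build and run your CUDA project exactly as you would with NVIDIA CUDA.
nvcc -O2 -o vector_add vector_add.cu
./vector_add
```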
75 | 76 | ## Contact us 77 | 78 | There are multiple ways to get in touch with us: 79 | 80 | - Join our [Discord](https://discord.gg/KNpgGbTc38) 81 | - Send us an e-mail at [hello@spectralcompute.co.uk](mailto:hello@spectralcompute.co.uk) 82 | -------------------------------------------------------------------------------- /docs/contact/report-a-bug.md: -------------------------------------------------------------------------------- 1 | # Report a Bug 2 | 3 | SCALE is still in active development, so you may encounter bugs. If you run 4 | into problems, contact us by: 5 | 6 | - Joining our [Discord](https://discord.gg/KNpgGbTc38) 7 | - Creating [a ticket](https://github.com/spectral-compute/scale-validation/issues) 8 | - Sending us an e-mail at [hello@spectralcompute.co.uk](mailto:hello@spectralcompute.co.uk) 9 | 10 | The remainder of this page provides information about how to make your 11 | report as helpful as possible. 12 | 13 | ## "No such function: cudaSomethingSomething()" 14 | 15 | If your project fails to compile due to a missing CUDA Runtime or Driver API 16 | function, [get in touch][get-in-touch]: this helps us prioritise work by fixing 17 | the holes that have the most demand first. 18 | 19 | ## "No such function: cuBlas/cuFFt/cuSolverSomethingSomething()" 20 | 21 | If your project needs a missing "CUDA-X" API (cuBLAS, cuFFT, cuSOLVER and 22 | friends), this is most likely something you can fix yourself by submitting a 23 | patch to the [open-source library wrapper project](https://github.com/spectral-compute/scale-library-wrappers). 24 | So long as an equivalent function is available in a ROCm library, the wrapper 25 | code is trivial. 26 | 27 | ## Compiler crash 28 | 29 | When the compiler crashes, it creates temporary files containing a reproducer 30 | for the compiler crash, like this: 31 | 32 | ``` 33 | ******************** 34 | 35 | PLEASE ATTACH THE FOLLOWING FILES TO THE BUG REPORT: 36 | Preprocessed source(s) and associated run script(s) are located at: 37 | clang++: note: diagnostic msg: /tmp/a-02f191.cpp 38 | clang++: note: diagnostic msg: /tmp/a-02f191.sh 39 | clang++: note: diagnostic msg: 40 | 41 | ******************** 42 | ``` 43 | 44 | These files will contain the preprocessed version of the source file that broke 45 | the compiler, among other things. 46 | If you are able to share this with us, it will significantly increase the 47 | usefulness of the bug report. 48 | 49 | If the source file contains sensitive/proprietary information, this could be 50 | destroyed by reducing the testcase using [cvise][cvise]. Alternatively, a bug 51 | report consisting of just the compiler output is still helpful - especially if 52 | it relates to PTX. 53 | 54 | [cvise]: https://github.com/marxin/cvise/ 55 | 56 | ## GPU Crash 57 | 58 | If your GPU code crashes with SCALE but not with NVIDIA's compiler, more 59 | useful information can be harvested by enabling some environment variables 60 | that dump extra information. If you are able, sharing the output obtained 61 | from reproducing the crash with one or both of these enabled can be helpful: 62 | 63 | - `REDSCALE_CRASH_REPORT_DETAILED=1` will dump extra information from the 64 | GPU trap handler. This includes register state and some symbol names, so 65 | it is unlikely to contain any sensitive/proprietary information from your code. 66 | - `REDSCALE_CRASH_DUMP=somefilename` will write the crashing machine code to 67 | a file. 
This makes it easier to investigate the problem, but it means that you're 68 | sharing the compiled version of the crashing GPU kernel with us. 69 | 70 | ## Something else 71 | 72 | It will be helpful if you provide the output of the following commands along 73 | with your report: 74 | 75 | ``` 76 | lspci | grep VGA 77 | scaleinfo 78 | ``` 79 | 80 | Running your program with the environment variable `SCALE_EXCEPTIONS=1` set might give a more detailed error that would 81 | be helpful to us too. 82 | 83 | [get-in-touch]: ../README.md#contact-us 84 | -------------------------------------------------------------------------------- /docs/examples/README.md: -------------------------------------------------------------------------------- 1 | # SCALE Example Programs 2 | 3 | These example programs are simple CUDA programs demonstrating the 4 | capabilities of SCALE. 5 | 6 | SCALE is capable of much more, but these small demonstrations serve as a 7 | proof of concept of CUDA compatibility, as well as a starting point for 8 | users wishing to get into GPGPU programming. 9 | 10 | ## List of examples 11 | 12 | Here is the list of examples that are currently available: 13 | 14 | | Example | What it is about | 15 | | ------------------- | -------------------------- | 16 | | [Basic](./basic.md) | Usage in its simplest form | 17 | | [PTX](./ptx.md) | Using PTX Assembly | 18 | | [BLAS](./blas.md) | Using BLAS maths wrapper | 19 | 20 | ## Accessing the examples 21 | 22 | The examples are hosted in [the public GitHub repository](https://github.com/spectral-compute/scale-docs) 23 | with the rest of this manual. 24 | 25 | ```sh 26 | git clone https://github.com/spectral-compute/scale-docs.git 27 | cd scale-docs/examples 28 | ``` 29 | 30 | ## Using the examples 31 | 32 | To build an example: 33 | - [Install SCALE](../manual/how-to-install.md) 34 | - [Decide on a GPU target](../manual/how-to-use.md#identifying-gpu-target) 35 | - [Build the example using cmake](../manual/how-to-use.md#cmake) 36 | 37 | The example repository includes a helper script, `example.sh`, that can fully 38 | automate the process. Pass your SCALE installation directory as the first argument, 39 | the GPU target as the second, and the example you want to build/run as the third: 40 | 41 | ```bash 42 | # You should be in the `examples` directory of the `scale-docs` repository 43 | ./example.sh /opt/scale gfx1030 basic 44 | ``` 45 | 46 | For the specified example, this will: 47 | 48 | 1. Remove its build directory if it already exists. 49 | 2. Configure CMake for that example in a freshly-created build directory. 50 | 3. Build the example in that directory using Make. 51 | 4. Set the [`SCALE_EXCEPTIONS=1` environment variable][exceptions] for better 52 | error reporting. 53 | 5. Run the example. 54 | 55 | [exceptions]: ../manual/runtime-extensions.md#scale_exceptions 56 | -------------------------------------------------------------------------------- /docs/examples/basic.md: -------------------------------------------------------------------------------- 1 | # Basic example 2 | 3 | This is a simple vector-sum kernel using CUDA. 4 | 5 | The example: 6 | 7 | - Generates test data on the host 8 | - Sends data to the device 9 | - Launches a kernel on the device 10 | - Receives data back from the device 11 | - Checks that the data is correct 12 | 13 | Build and run the example by following the [general instructions](./README.md).
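For orientation, the steps above map onto CUDA code roughly as in the sketch below. This is only an illustration of the structure; the actual example source, which may differ in detail, is included in the next section.

```cpp
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// One thread computes one element of the output vector.
__global__ void vectorAdd(const float* a, const float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    const int n = 1 << 20;
    std::vector<float> a(n, 1.0f), b(n, 2.0f), c(n, 0.0f);  // test data on the host

    // Allocate device buffers and send the inputs to the device.
    float *dA, *dB, *dC;
    cudaMalloc(&dA, n * sizeof(float));
    cudaMalloc(&dB, n * sizeof(float));
    cudaMalloc(&dC, n * sizeof(float));
    cudaMemcpy(dA, a.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, b.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    // Launch the kernel on the device.
    const int block = 256;
    const int grid = (n + block - 1) / block;
    vectorAdd<<<grid, block>>>(dA, dB, dC, n);

    // Receive the result back from the device and check it.
    cudaMemcpy(c.data(), dC, n * sizeof(float), cudaMemcpyDeviceToHost);
    bool ok = true;
    for (int i = 0; i < n; ++i) {
        ok = ok && (c[i] == 3.0f);
    }
    std::printf("%s\n", ok ? "PASSED" : "FAILED");

    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
    return ok ? 0 : 1;
}
```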
14 | 15 | ## Example source code 16 | 17 | ```cpp 18 | ---8<--- "examples/src/basic/basic.cu" 19 | ``` 20 | 21 | ## `CMakeLists.txt` used 22 | 23 | ```cmake 24 | ---8<--- "examples/src/basic/CMakeLists.txt" 25 | ``` 26 | -------------------------------------------------------------------------------- /docs/examples/blas.md: -------------------------------------------------------------------------------- 1 | # BLAS example 2 | 3 | This example demonstrates SCALE's compatibility with cuBLAS APIs by using 4 | cuBLAS to perform a double-precision dot-product on an AMD GPU. 5 | 6 | cuBLAS APIs are forwarded to use the relevant ROCm APIs. 7 | Note that the example links to `cublas` in its [`CMakeLists.txt`](#cmakeliststxt-used). 8 | 9 | ## Example source code 10 | 11 | ```cpp 12 | ---8<--- "examples/src/blas/blas.cu" 13 | ``` 14 | 15 | ## `CMakeLists.txt` used 16 | 17 | ```cmake 18 | ---8<--- "examples/src/blas/CMakeLists.txt" 19 | ``` 20 | -------------------------------------------------------------------------------- /docs/examples/ptx.md: -------------------------------------------------------------------------------- 1 | # PTX example 2 | 3 | This example demonstrates SCALE's support for inline PTX. A lot of real-world 4 | CUDA code uses inline PTX asm blocks, which are inherently NVIDIA-only. There is no 5 | need to rewrite those when using SCALE: the compiler just digests them and 6 | outputs AMD machine code. 7 | 8 | This example uses C++ templates to access the functionality of the PTX 9 | `lop3` instruction, used in various ways throughout the kernel. 10 | 11 | Build and run the example by following the [general instructions](./README.md). 12 | 13 | ## Extra info 14 | 15 | - [Using inline PTX Assembly](https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html) 16 | - [PTX ISA reference](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html) 17 | 18 | PTX instructions used: 19 | 20 | - [`add`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add) 21 | - [`lop3`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-lop3) 22 | 23 | ## Example source code 24 | 25 | ```cpp 26 | ---8<--- "examples/src/ptx/ptx.cu" 27 | ``` 28 | 29 | ## `CMakeLists.txt` used 30 | 31 | ```cmake 32 | ---8<--- "examples/src/ptx/CMakeLists.txt" 33 | ``` 34 | -------------------------------------------------------------------------------- /docs/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spectral-compute/scale-docs/8f98d9dcfdd1cbd3ee7cc12cdbceb1094e038e8a/docs/favicon.png -------------------------------------------------------------------------------- /docs/licensing.md: -------------------------------------------------------------------------------- 1 | # Free Edition License 2 | 3 | The Free Edition of SCALE has the following licensing terms: 4 | 5 | | Permissions | Conditions | Limitations | 6 | | ------------------------------- | ------------------------------------------------------- | ------------------------------------- | 7 | | :material-check: Commercial use | :material-check-outline: License and copyright notice | :fontawesome-solid-xmark: Liability | 8 | | :material-check: Distribution | :material-check-outline: No reverse engineering | :fontawesome-solid-xmark: Warranty | 9 | | :material-check: Modification | :material-check-outline: No endorsement | | 10 | | :material-check: Private Use | :material-check-outline: 
Comply with 3rd party licenses | | 11 | 12 | This table is intended as an overview, for detailed terms, please see the full license text below. 13 | 14 | ## Full License Text 15 | 16 | SCALE Free Edition Public License v1 17 | 18 | Published: 10/7-2024 19 | 20 | Copyright (C) 2024 Spectral Compute Ltd 21 | 22 | License 23 | 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | 26 | Attribution 27 | 28 | - Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 29 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. 30 | - Neither the name of Spectral Compute nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 31 | 32 | Modifications 33 | 34 | - Any modifications made to this software must be clearly documented, stating the nature of the changes, the date of the changes, and the identity of the person or entity making the changes. 35 | 36 | Prohibition on Reverse Engineering 37 | 38 | - You may not reverse engineer, decompile, or disassemble the software, except and to only the extent that such activity is expressly permitted by applicable law notwithstanding this limitation. 39 | 40 | Third-Party Software 41 | 42 | - The software may include third-party software components. You agree to comply with all applicable third-party terms and conditions, which will be included in the NOTICES file accompanying the software. The NOTICES file will also contain attributions for any third-party software components included in this software. 43 | 44 | Disclaimer of Warranty 45 | 46 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 47 | 48 | Limitation of Liability 49 | 50 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 51 | 52 | Governing Law 53 | 54 | This License shall be governed by and construed in accordance with the laws of England, United Kingdom. 55 | 56 | Contact 57 | 58 | If you have any questions regarding this license, please contact us at [legal@spectralcompute.co.uk](mailto:legal@spectralcompute.co.uk). 
-------------------------------------------------------------------------------- /docs/logo_white.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs/manual/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # What's new? 2 | 3 | ## Release 1.3.1 (2025-05-12) 4 | 5 | ### Compiler 6 | 7 | - Fixed a bug in the handling of weak device-side symbols which broke the 8 | device binaries for certain projects. 9 | - Fixed various PTX miscompilations. 10 | - Added support for approximate-math PTX instructions (`lg2.approx` and 11 | friends). 12 | 13 | ### Library 14 | 15 | - Fixed many small bugs in the device-side APIs. 16 | - Per-thread-default-stream actually works now, rather than silently using 17 | the legacy stream. 18 | - Fixed a race condition in the fft library. 19 | 20 | ### Thirdparty Project demos 21 | 22 | - GROMACS now works. SCALE appears to support a wider selection of AMD 23 | architectures than the HIP port, and seems to perform somewhat better (on 24 | MI210, at least!). 25 | 26 | ## Release 1.3.0 (2025-04-23) 27 | 28 | ### Platform 29 | 30 | - Upgraded from llvm17 to llvm19.1.7. 31 | - Support for rocm 6.3.1. 32 | - Support for `gfx902` architecture. 33 | - Enterprise edition: Support for new architectures: 34 | - `gfx908` 35 | - `gfx90a` 36 | - `gfx942` 37 | 38 | ### Packaging 39 | 40 | - Packages for Rocky9 are now available. 41 | - Package repos for Ubuntu and Rocky9 to simplify 42 | installation/upgrades. 43 | 44 | ### New Features 45 | 46 | - Added `scaleenv`, a new and much easier way to [use SCALE](./how-to-use.md). 47 | - Support for simulating a warp size of 32 even on wave64 platforms, fixing 48 | many projects on such platforms. 49 | - Support for `bfloat16`. 50 | - Compatibility improvements with non-cmake buildsystems. 51 | - Added `SCALE_CUDA_VERSION` environment variable to tell SCALE to impersonate a 52 | specific version of CUDA. 53 | - `SCALE_EXCEPTIONS` now supports a non-fatal mode. 54 | 55 | ### Library wrappers 56 | 57 | - Added most of cuFFT. 58 | - Added lots more cuSolver and cuSPARSE. 59 | - Filled in some missing NVTX APIs. 60 | - Added homeopathic quantities of nvtx3. 61 | 62 | ### Library 63 | 64 | - Lazy-initialisation of primary contexts now works properly, fixing some 65 | subtle lifecycle issues. 66 | - Added some missing undocumented headers like `texture_types.h`. 67 | - Added the IPC memory/event APIs 68 | - Added many multi-GPU APIs 69 | - Added `cuMemcpyPeer`/`cuMemcpyPeerAsync`. 70 | - Rewritten device allocator to work around HSA bugs and performance issues. 71 | - Fix a crash when initialising SCALE with many GPUs with huge amounts of memory. 72 | - Added CUDA IPC APIs. Among other things, this enables CUDA-MPI 73 | applications to work, including AMGX's distributed mode. 74 | - Fixed lots of multi-GPU brokenness. 75 | - Implemented the `CU_CTX_SYNC_MEMOPS` context flag. 76 | - Fixed accuracy issues in some of the CUDA Math APIs. 77 | - fp16 headers no longer produce warnings for projects that include them without 78 | `-isystem`. 79 | - Improved performance and correctness of cudaMemcpy/memset. 80 | - Fix subtle issues with pointer attribute APIs. 81 | - Improvements to C89 compatibility of headers. 82 | - Added more Cooperative Groups APIs. 83 | - Support for `grid_sync()`. 84 | - Fix some wave64 issues with `cooperative_groups.h`. 
85 | 86 | ### Compiler 87 | 88 | - `__launch_bounds__` now works correctly, significantly improving performance. 89 | - Device atomics are now much more efficient. 90 | - Denorm-flushing optimisations are no longer skipped when they aren't 91 | supposed to be. 92 | - Ability to use DPP to optimise warp shuffles in some cases. Currently, 93 | this only works if the individual shfl is provably equivalent to a DPP op, 94 | not when loop analysis would be required. `__shfl_xor` is your friend. 95 | 96 | ### NVCC Interface 97 | 98 | - Corrected the behaviour of the nvcc `-odir` flag in during dependency generation. 99 | - Added the nvcc `-prec-sqrt` and `-prec-div` flags. 100 | - `use_fast_math` now matches nivida's behaviour, instead of mapping to clang's 101 | `-ffast-math`, which does too much. 102 | - `--device-c` no longer inappropriately triggers the linker. 103 | - Newly-supported `nvcc` flags: 104 | * `-arch=native` 105 | * `-jump-table-density` (ignored) 106 | * `-compress-mode` (ignored) 107 | * `-split-compile-extended` (ignored) 108 | 109 | ### NVCC Semantics 110 | 111 | - Support broken template code in more situations in nvcc mode. 112 | - Allow invalid const-correctness in unexpanded template code in nvcc mode. 113 | - Allow trailing commas in template argument lists in nvcc mode. 114 | - Fix a parser crash when explicitly calling `operator<<()` in CUDA mode. 115 | - Fix a crash when using `--compiler-options` to pass huge numbers of 116 | options through to `-Wl`. 117 | 118 | ### Diagnostics 119 | 120 | - Warning for unused PTX variables 121 | - Error for attempts to return the carry bit (undefined behaviour on NVIDIA). 122 | - Compiler diagnostic to catch some undefined behaviour patterns with CUDA 123 | atomics. 124 | 125 | ### PTX 126 | 127 | - New instructions supported 128 | - `sm_100` variants of `redux`. 129 | - Mixed-precision `add/sub/fma` FP instructions. 130 | - `membar` 131 | - `bar.warp.sync` 132 | - `fence` (partial) 133 | - `mma` (software emulated) 134 | - `wmma` (software emulated) 135 | - Fixed parsing of hex-float constants. 136 | - Support for PTX `C` constraints (dynamic asm strings). 137 | - f16/bf16 PTX instructions no longer depend on the corresponding C++ header. 138 | - asm blocks can now refer to variables declared in other asm blocks, including 139 | [absolutely cursed](https://gist.github.com/ChrisKitching/73f66a422af926a6dbdcd045442c4440) patterns. 140 | - Fixed an issue where template-dependent asm strings were mishandled. 141 | - Fixed various parsing issues (undocumented syntax quirks etc.). 142 | - Fixed a crash when trying to XOR floating point numbers together. 143 | 144 | ### Thirdparty Project demos 145 | 146 | Things that now appear to work include: 147 | 148 | - CUDA-aware MPI 149 | - MAGMA 150 | - whisper.cpp 151 | - TCLB 152 | 153 | ## Release 1.2.0 (2024-11-27) 154 | 155 | ### Library Enhancements 156 | 157 | - Support for `gfx900` architecture. 158 | - Support for `gfx1102` architecture. 159 | 160 | ### PTX 161 | 162 | - [Improved handling of wave64](./inline-ptx.md#wave64-considerations) in inline PTX. 163 | - Various inline PTX compilation fixes. 164 | 165 | ### Other 166 | 167 | - Support for Ubuntu 24.04. 168 | - Upgraded to ROCm 6.2.2. 169 | 170 | ## Release 1.1.0 (2024-10-31) 171 | 172 | ### Library Enhancements 173 | 174 | - Added much of the CUDA graph API. 175 | - Improvements to multi-GPU handling. 176 | - Fixed rare shutdown-time segfaults. 177 | - Added many random API functions. As usual, see [The diff](./apis.md). 
178 | 179 | ### PTX 180 | 181 | - `f16x2`, `u16x2` and `s16x2` types. 182 | - `fns` instruction 183 | - Fixed miscompile of `sad` instruction. 184 | 185 | ### Thirdparty Project demos 186 | 187 | The `scale-validation` repo now has working demos for the following 188 | additional projects: 189 | 190 | - FLAMEGPU2 191 | - GPUJPEG 192 | - gpu_jpeg2k 193 | 194 | ## Release 1.0.2.0 (2024-09-05) 195 | 196 | Documented a record of the CUDA APIs already available in SCALE, and those still to come: [Implemented APIs](./apis.md). 197 | 198 | ### Library Enhancements 199 | 200 | - Kernel arguments larger than 4kb no longer crash the library. 201 | - Programs that ignore CUDA error codes can no longer get stuck in a state 202 | where the library always returns the error code you ignored. 203 | - Fixed synchronisation bugs when using synchronous `cuMemset*` APIs. 204 | - Fixed implicit synchronisation behaviour of `cuMemcpy2D/cuMemcpy2DAsync()`. 205 | - Fixed precision issues in fp16 `exp2()`, `rsqrt()`, and `h2log()`. 206 | - `cudaEventRecord` for an empty event no longer returns a time in the past. 207 | - Fixed occupancy API behaviour in edgecases that are not multiples of warp 208 | size. 209 | - Fixed rare crashes during static de-initialisation when library wrappers 210 | were in use. 211 | - All flags supported by SCALE's nvcc are now also accepted by our nvrtc 212 | implementation. 213 | - Various small header fixes. 214 | 215 | ### Compiler Enhancements 216 | 217 | - `decltype()` now works correctly for `__host__ __device__` functions. 218 | - `-Winvalid-constexpr` no longer defaults to `-Werror`, for consistency 219 | with nvcc. 220 | - PTX variable names including `%` are no longer rejected. 221 | - Support for nvcc's nonstandard permissiveness surrounding missing 222 | `typename` keywords in dependent types. 223 | - Support for nvcc's wacky "split declaration" syntax for `__host__ __device` 224 | functions (with a warning): 225 | ``` 226 | int foo(); 227 | __device__ int foo(); 228 | __host__ int foo() { 229 | return 5; 230 | } 231 | // foo() is a __host__ __device__ function. :D 232 | ``` 233 | - Newly-supported compiler flags (all of which are aliases for 234 | standard flags, or combinations thereof): 235 | * `-device-c` 236 | * `-device-w` 237 | * `-pre-include` 238 | * `-library` 239 | * `-output-file` 240 | * `-define-macro` 241 | * `-undefine-macro` 242 | 243 | ### New CUDA APIs 244 | 245 | #### Math APIs 246 | 247 | - `exp10(__half)` 248 | - `exp2(__half)` 249 | - `rcp(__half)` 250 | - `rint(__half)` 251 | - `h2exp10(__half2)` 252 | - `h2exp2(__half2)` 253 | - `h2rcp(__half2)` 254 | - `h2rint(__half2)` 255 | 256 | ## Release 1.0.1.0 (2024-07-24) 257 | 258 | This release primarily fixes issues that prevent people from successfully 259 | compiling their projects with SCALE. Many thanks to those users who 260 | submitted bug reports. 261 | 262 | ### CUDA APIs 263 | 264 | - The `extra` argument to `cuLaunchKernel` is now supported. 265 | - Added support for some more undocumented NVIDIA headers. 266 | - Fix various overload resolution issues with atomic APIs. 267 | - Fix overload resolution issues with min/max. 268 | - Added various undocumented macros to support projects that are explicitly 269 | checking cuda include guard macros. 270 | - `lrint()` and `llrint()` no longer crash the compiler. 
:D 271 | - Newly supported CUDA APIs: 272 | * `nvrtcGetNumSupportedArchs` 273 | * `nvrtcGetSupportedArchs` 274 | * `cudaLaunchKernelEx`, `cuLaunchKernelEx`, `cudaLaunchKernelExC`: some 275 | of the performance-hint 276 | launch options are no-ops. 277 | * `__vavgs2`, `__vavgs4` 278 | * All the `atomic*_block()` and `atomic*_system()` variants. 279 | 280 | ### Compiler 281 | 282 | - Improved parsing of nvcc arguments: 283 | * Allow undocumented option variants (`-foo bar`, `--foo bar`, 284 | `--foo=bar`, and `-foo=bar` are always allowed, it seems). 285 | * Implement "interesting" quoting/escaping rules in nvcc arguments, such as 286 | embedded quotes and `\,`. We now correctly handle cursed arguments like: 287 | `'-Xcompiler=-Wl\,-O1' '-Xcompiler=-Wl\,-rpath\,/usr/lib,-Wl\,-rpath-link\,/usr/lib'` 288 | - Support for more nvcc arguments: 289 | * NVCC-style diagnostic flags: `-Werror`, `-disable-warnings`, etc. 290 | * `--run`, `--run-args` 291 | * `-Xlinker`, `-linker-options` 292 | * `-no-exceptions`, `-noeh` 293 | * `-minimal`: no-op. Exact semantics are undocumented, and build times 294 | are reasonably fast anyway. 295 | * `-gen-opt-lto`, `-dlink-time-opt`, `-dlto`. No-ops: device LTO not yet 296 | supported. 297 | * `-t`, `--threads`, `-split-compile`: No-ops: they're flags for making 298 | compilation faster and are specific to how nvcc is implemented. 299 | * `-device-int128`: no-op: we always enable int128. 300 | * `-extra-device-vectorization`: no-op: vectorisation optimisations are 301 | controlled by the usual `-O*` flags. 302 | * `-entries`, `-source-in-ptx`, `-src-in-ptx`: no-ops: there is no PTX. 303 | * `-use-local-env`, `-idp`, `-ddp`, `-dp`, etc.: ignored since they are 304 | meaningless except on Windows. 305 | 306 | - Allow variadic device functions in non-evaluated functions. 307 | - Don't warn about implicit conversion from `cudaLaneMask_t` to `bool`. 308 | - `__builtin_provable` no longer causes compiler crashes in `-O0`/`-O1` builds. 309 | - Fixed a bug causing PTX `asm` blocks inside non-template, non-dependent 310 | member functions of template classes to sometimes not be compiled, 311 | causing PTX to end up in the AMD binary unmodified. 312 | - CUDA launch tokens with spaces (ie.: `myKernel<< <1, 1>> >()`) are now 313 | supported. 314 | - Building non-cuda C translation units with SCALE-nvcc now works. 315 | 316 | ### Other 317 | 318 | - The `meson` build system no longer regards SCALE-nvcc as a "broken" compiler. 319 | - `hsakmtsysinfo` no longer explodes if it doesn't like your GPU. 320 | - New documentation pages. 321 | - Published more details about thirdparty testing, including the build scripts. 322 | 323 | ## Release 1.0.0.0 (2024-07-15) 324 | 325 | Initial release 326 | -------------------------------------------------------------------------------- /docs/manual/apis.md: -------------------------------------------------------------------------------- 1 | # API Coverage Report 2 | 3 | These pages provide a diff between SCALE's headers and the NVIDIA 4 | documentation, describing which APIs are supported by SCALE. 5 | 6 | - [Driver API](./api-driver.md) 7 | - [Math API](./api-math.md) 8 | - [Runtime API](./api-runtime.md) 9 | 10 | The lists are based on the official Nvidia documentation and use the same layout. 11 | 12 | ## Presentation 13 | 14 | The lists are presented using `diff` syntax highlighting of code blocks. 15 | This allows seeing which entries are available and which [may be missing](#correctness). 
16 | Missing entries are prefixed with `-` (a minus) which paints them red in the list. 17 | 18 | By default, the `__host__` qualifier is assumed for functions; it is removed if present. 19 | Functions that are qualified as `__host__ __device__` are split into two separate entries. 20 | 21 | Here is an example: 22 | 23 | ```diff 24 | const char * cudaGetErrorName(cudaError_t); 25 | __device__ const char * cudaGetErrorName(cudaError_t); 26 | const char * cudaGetErrorString(cudaError_t); 27 | __device__ const char * cudaGetErrorString(cudaError_t); 28 | cudaError_t cudaGetLastError(); 29 | -__device__ cudaError_t cudaGetLastError(); 30 | cudaError_t cudaPeekAtLastError(); 31 | -__device__ cudaError_t cudaPeekAtLastError(); 32 | ``` 33 | 34 | In this example, functions `cudaGetErrorName` and `cudaGetErrorString` are available on both host and device. 35 | Functions `cudaGetLastError` and `cudaPeekAtLastError` are available on host, and are not available on device. 36 | 37 | ## Correctness 38 | 39 | The lists may say that something is unavailable when it's not the case. 40 | This may happen for a few reasons. 41 | 42 | NVIDIA documentation may differ from what CUDA provides in reality. 43 | An example of that is differences in `const`-ness of some function arguments. 44 | In such cases SCALE may be forced to maintain "bug compatibility" and the 45 | functions stop matching what NVIDIA documentation promises. 46 | 47 | Many functions are called conditionally and may never get used in certain scenarios. 48 | For some of those functions, SCALE may provide an empty implementation. 49 | By doing this, SCALE allows more projects to pass compilation and linking. 50 | We don't want to list such empty functions as available, so we manually mark them as missing to avoid confusion. 51 | 52 | The code that compares entries from SCALE against NVIDIA documentation may contain imperfections. 53 | For this reason, some successful matches may simply get missed. 54 | 55 | Note that these lists currently **don't check members of types** such as struct fields or enum variants. 56 | 57 | [Reach out to us](../contact/report-a-bug.md) if you experience problems for any of these reasons. 58 | Possible problems with the entries require our attention on a case-by-case basis. 59 | Your feedback will help us find possible inconsistencies and prioritise our work to fix them. 60 | -------------------------------------------------------------------------------- /docs/manual/comparison.md: -------------------------------------------------------------------------------- 1 | # Comparison to other solutions 2 | 3 | ## HIP 4 | 5 | [HIP](https://github.com/ROCm/HIP) is AMD's answer to CUDA. It is superficially 6 | similar to CUDA, providing a similar programming language and similar APIs. 7 | An automatic `hipify` tool exists to partially automate the process of 8 | rewriting your code from CUDA to HIP. 9 | 10 | We believe HIP does not solve the "CUDA compatibility problem" because: 11 | 12 | - The CUDA [dialect problem](./dialects.md). HIP's language is almost 13 | identical to LLVM-dialect CUDA, which is quite different from the dialect 14 | of CUDA accepted by `nvcc`. Consequently, some CUDA programs fail in 15 | strange ways after porting. 16 | - HIP has no support for inline PTX `asm` blocks in CUDA code. These must be 17 | manually removed or guarded by macros. SCALE simply accepts them and 18 | compiles them for AMD.
19 | - HIP's support for NVIDIA is via wrapper APIs rather than simply using 20 | NVIDIA's tools directly as a SCALE-based solution does. 21 | - `hipify` is unable to handle many CUDA code constructs, such as complex 22 | macros. 23 | 24 | To avoid these issues, many projects end up maintaining separate HIP and 25 | CUDA codebases (or one codebase that converts to HIP or CUDA via complex 26 | preprocessor macros). 27 | 28 | ## ZLUDA 29 | 30 | [ZLUDA](https://github.com/vosen/ZLUDA) is a PTX JIT for AMD GPUs. On program 31 | startup, ZLUDA grabs the PTX from the CUDA binary and compiles it for your AMD 32 | GPU. 33 | 34 | ZLUDA is a useful tool for end-users to run CUDA programs on 35 | otherwise-unsupported GPUs, without the involvement of the authors of the 36 | program (or even access to the source code!). 37 | 38 | There are some downsides: 39 | 40 | - JIT on startup can lead to startup-time delays. 41 | - Reliance on dll-injection is a bit "hacky", and tends to make antivirus 42 | software angry. 43 | - ZLUDA's approach to providing AMD support inherently depends on tools 44 | provided by NVIDIA. NVIDIA controls the design of the PTX language and the 45 | compilers that produce it, and manipulate both to optimise outcomes for 46 | their hardware specifically. 47 | - Compiling source code directly to AMDGPU machine code should 48 | offer greater opportunities for optimisation than working backwards from 49 | PTX that has already been optimised for a specific NVIDIA target. 50 | 51 | We believe that ZLUDA fills a useful niche, but that software distributors 52 | should have the power to compile their CUDA source code directly to the 53 | machine code of multiple GPU vendors, without reliance on tools maintained 54 | by NVIDIA. 55 | -------------------------------------------------------------------------------- /docs/manual/compute-capabilities.md: -------------------------------------------------------------------------------- 1 | # Compute Capability Mapping 2 | 3 | "Compute capability" is a numbering system used by NVIDIA's CUDA tools to 4 | represent different GPU targets. The value of the `__CUDA_ARCH__` macro is 5 | derived from this, and it's how you communicate with `nvcc` to request the 6 | target to build for. 7 | 8 | GPUs from other vendors have their own numbering scheme, such as AMD's 9 | `gfx1234` format. 10 | 11 | CUDA projects sometimes do numeric comparisons on the compute capability 12 | value to enable/disable features using the preprocessor. This is a problem, 13 | since those comparisons are inherently meaningless when targeting non-NVIDIA 14 | hardware. 15 | 16 | There is no meaningful mapping between compute capability numbers and the 17 | hardware of other vendors. 18 | 19 | SCALE addresses this problem by providing a "CUDA installation directory" 20 | for each supported GPU target. By default, the `nvcc` in each of these 21 | directories maps *every* compute capability number to the corresponding AMD 22 | GPU target. 23 | 24 | This approach works, but has one obvious downside: it makes fat binaries 25 | unrepresentable. 26 | 27 | To resolve that, your buildsystem must be at least somewhat SCALE-aware: 28 | compute capabilities are not a sufficiently powerful abstraction to model 29 | the needs of a cross-vendor fat binary. 30 | 31 | The special `gfxany` target directory is a "CUDA installation directory" 32 | that does not perform this compute capability mapping at all. 
Instead, you 33 | may provide your own arbitrary mapping from GPU targets to CC-number - or 34 | use no such mapping at all (if your program doesn't use the CC-number for 35 | metaprogramming). We recommend CUDA programs be written using more portable 36 | and reliable means of detecting the existence of features: even within 37 | NVIDIA's universe, the CC number is a rather blunt instrument. 38 | 39 | The remainder of this document explains how the compute capability mapping 40 | configuration works for users of the `gfxany` target. 41 | 42 | ## Configuration file format 43 | 44 | The configuration file is a newline separated list of entries. Each entry 45 | consists of an ISA name, e.g: `gfx1030`, and a 46 | compute capability represented as an integer, e.g: `86`, separated by a space. 47 | The entries are tried in order, so it's 48 | possible to map more than one ISA and compute capability to each other 49 | unambiguously. If the space and compute 50 | capability are omitted, then the compiler associates all NVIDIA compute 51 | capabilities with the specified GPU. 52 | 53 | If no entry is found for the given GPU or if no configuration file is found, 54 | then the library reports a compute 55 | capability with a large major version defined by the default numbering scheme. 56 | 57 | If no entry is found for the given GPU or if no configuration file is found, the 58 | compiler does not translate a compute 59 | capability to an AMD ISA. 60 | 61 | Lines starting with `#` and empty lines are ignored. 62 | 63 | ### Example 64 | 65 | ``` 66 | # The library will report compute capability 6.1 for gfx900 devices. The compiler will use gfx900 for `sm_61` or 67 | # `compute_61`. 68 | gfx900 61 69 | 70 | # The library will report compute capability 8.6 for gfx1030 devices. The compiler will use gfx1030 for any of `sm_80`, 71 | # `compute_80`, `sm_86`, or `compute_86`. 72 | gfx1030 86 73 | gfx1030 80 74 | 75 | # The compiler will use gfx1100 for any compute capability other than 6.1, 8.0, or 8.6. 76 | gfx1100 77 | ``` 78 | 79 | ## Search locations for the library 80 | 81 | The library searches for a compute capability map in the following order: 82 | 83 | - The file pointed to by the `REDSCALE_CCMAP` environment variable. It is an error if 84 | this environment variable is set but the 85 | file to which it points does not exist. 86 | - `../share/redscale/ccmap.conf` relative to the directory 87 | containing `libredscale.so`. This search location is intended 88 | for users who build different installation trees for different GPUs. Packagers 89 | should not place a configuration here. 90 | - `${HOME}/.redscale/ccmap.conf` 91 | - `/etc/redscale/ccmap.conf` 92 | 93 | ## Search locations for the compiler 94 | 95 | The compiler searches for a compute capability map in the following order: 96 | 97 | - The file pointed to by the `--cuda-ccmap` flag. It is an error if this flag is 98 | given but the file to which it points 99 | does not exist. 100 | - The file pointed to by the `REDSCALE_CCMAP` environment variable. It is an error if 101 | this environment variable is set but the 102 | file to which it points does not exist. 103 | - `../share/redscale/ccmap.conf` relative to the directory containing the 104 | compiler binary (e.g: `nvcc`) if that 105 | directory is in a CUDA installation directory. This search location is 106 | intended for users who build different 107 | installation trees for different GPUs. Packagers should not place a 108 | configuration here.
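For example, to point both the library and the compiler at a map stored in a non-default location, the environment variable from the lists above can be used. This is a sketch; `~/ccmap.conf` and `app.cu` are placeholder names:

```sh
# Point the SCALE library and compiler at a custom compute capability map.
export REDSCALE_CCMAP="$HOME/ccmap.conf"

# Compile and run as usual: the compiler consults the map when translating
# compute capabilities to AMD ISAs, and the library consults it when
# reporting a compute capability for the device.
nvcc -o app app.cu
./app
```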
109 | -------------------------------------------------------------------------------- /docs/manual/diagnostic-flags.md: -------------------------------------------------------------------------------- 1 | # Compiler warnings 2 | 3 | There are some differences in how NVIDIA's `nvcc` and the SCALE 4 | compiler in "nvcc mode" interpret compiler options relating to warnings. 5 | 6 | ## `clang++` flags 7 | 8 | The SCALE compiler accepts all of `clang++`'s usual flags in addition to those 9 | provided by nvcc, except where doing so would create an ambiguity. 10 | 11 | ## Compiler warnings 12 | 13 | The SCALE compiler has the same default warning behaviour as `clang`, which 14 | is somewhat more strict than `nvcc`. Warnings may be disabled with the usual 15 | `-Wno-` flags documented in the [clang diagnostics reference](https://clang.llvm.org/docs/DiagnosticsReference.html). 16 | 17 | There may be value in *enabling* even more warnings to find further issues 18 | and improve your code. 19 | 20 | Note that the end of every compiler warning message tells you the name of 21 | the warning flag it is associated with, such as: 22 | 23 | ``` 24 | warning: implicit conversion from 'int' to 'float' changes value from 25 | 2147483647 to 2147483648 [-Wimplicit-const-int-float-conversion] 26 | ``` 27 | 28 | By changing `-W` to `-Wno-`, you obtain the flag required to disable that 29 | warning. 30 | 31 | The SCALE implementation of the CUDA runtime/driver APIs uses `[[nodiscard]]` 32 | for the error return codes, meaning you'll get a warning from code that 33 | ignores potential errors from CUDA APIs. This warning can be disabled via 34 | `-Wno-unused-result`. 35 | 36 | ## `-Werror` 37 | 38 | `nvcc`'s `-Werror` takes an argument specifying the types of warnings that 39 | should be errors, such as: 40 | 41 | ``` 42 | -Werror reorder,default-stream-launch 43 | ``` 44 | 45 | This differs from clang's syntax, which consists of either a lone `-Werror` 46 | to make all warnings into errors, or a set of `-Werror=name` flags to make 47 | specific things into errors. 48 | 49 | In `nvcc` mode, the SCALE compiler accepts only the `nvcc` syntax, but 50 | allows the same set of diagnostic names accepted by `clang` (as well 51 | as the special names supported by NVIDIA's `nvcc`). For example: 52 | 53 | ``` 54 | nvcc -Werror=documentation,implicit-int-conversion foo.cu 55 | clang++ -Werror=documentation -Werror=implicit-int-conversion foo.cu 56 | ``` 57 | 58 | Since SCALE enables more warnings than nvcc does by default, many projects 59 | using `-Werror` with nvcc will not compile without either disabling the flag 60 | or fixing the underlying code issues. 61 | 62 | ## Diagnostic control pragmas 63 | 64 | The SCALE compiler does not currently support `#pragma nv_diag_suppress` or 65 | `#pragma diag_suppress` because the set of integers accepted by these pragmas 66 | does not appear to be documented, so we do not know which diagnostics should 67 | be controlled by which pragmas. Using these pragmas in your program will 68 | produce an "unrecognised pragma ignored" warning, which can itself be disabled 69 | with `-Wno-unknown-pragmas`. 70 | 71 | SCALE supports clang-style diagnostic pragmas, as documented 72 | [here](https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas). 73 | This can be combined with preprocessor macros to achieve the desired effect: 74 | 75 | ```c++ 76 | #if defined(__clang__) // All clang-like compilers, including SCALE.
77 | #pragma clang diagnostic ignored "-Wunused-result" 78 | #elif defined(__NVCC__) // NVCC, but not clang. AKA: nvidia's one. 79 | #pragma nv_diag_suppress ... 80 | #endif 81 | ``` 82 | -------------------------------------------------------------------------------- /docs/manual/dialects.md: -------------------------------------------------------------------------------- 1 | # CUDA Dialects 2 | 3 | The CUDA programming language is not formally specified. The "standard" is 4 | therefore approximately "whatever `nvcc` does". Although `clang` supports 5 | compiling CUDA, it supports a somewhat different dialect compared to `nvcc`. 6 | 7 | You can read more about (some of) the specific differences in 8 | [the LLVM manual page about it](https://llvm.org/docs/CompileCudaWithLLVM.html#dialect-differences-between-clang-and-nvcc). 9 | 10 | This leads to a problem: most CUDA code is written with `nvcc` in mind, but 11 | the only open-source compiler available with a CUDA frontend is `clang`. Many 12 | real-world CUDA programs cannot be successfully compiled with `clang` 13 | because they depend on `nvcc`'s behaviour. 14 | 15 | HIP experiences the same problem: the HIP compiler is based on LLVM, so HIP 16 | is closer to "LLVM-dialect CUDA" than it is to "nvcc-dialect CUDA". This 17 | causes some CUDA programs to fail in "interesting" ways when ported to HIP. It's 18 | not really the case that you can remap all the API calls to the HIP ones and 19 | expect it to work: nvcc-CUDA, and LLVM-CUDA/HIP have quite different C++ 20 | semantics. 21 | 22 | SCALE resolves this issue by offering two compilers: 23 | 24 | - `"nvcc"`: a clang frontend that replicates the behaviour of NVIDIA's `nvcc`, 25 | allowing existing CUDA programs to compile directly. This is similar to how 26 | LLVM achieves MSVC compatibility by providing `clang-cl`. 27 | - `clang`: providing clang's usual clang-dialect-CUDA support, with our 28 | opt-in language extensions. 29 | 30 | Existing projects can be compiled without modification using the 31 | `nvcc`-equivalent compiler. Users of clang-dialect CUDA may use the provided 32 | clang compiler to compile for either platform. 33 | -------------------------------------------------------------------------------- /docs/manual/differences.md: -------------------------------------------------------------------------------- 1 | # Differences from NVIDIA CUDA 2 | 3 | There are some areas where SCALE's implementation of a certain feature also 4 | found in NVIDIA CUDA has different behaviour. This document does not 5 | enumerate _missing_ CUDA APIs/features. 6 | 7 | ## Defects 8 | 9 | ### NVRTC differences 10 | 11 | SCALE's current implementation of the nvrtc API works by calling the 12 | compiler as a subprocess instead of a library. This differs from how 13 | NVIDIA's implementation works, and means that the library must be able to 14 | locate the compiler to invoke it. 15 | 16 | If your program uses the rtc APIs and fails with errors that relate to being 17 | unable to locate the compiler, ensure that SCALE's `nvcc` is first in PATH. 18 | 19 | ### Stream synchronization 20 | 21 | SCALE does not yet support 22 | [per-thread default stream behaviour](http://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html). 23 | 24 | Instead, the default stream is used in place of the per-thread default stream. 25 | This will not break programs, but is likely to reduce performance. 
26 | 27 | A workaround which will also slightly improve the performance of your 28 | program when run on NVIDIA GPUs is to use nonblocking CUDA streams 29 | explicitly, rather than relying on the implicit CUDA stream. 30 | 31 | ### Host-side `__half` support 32 | 33 | The CUDA API allows many `__half` math functions to be used on both host and 34 | device. 35 | 36 | When compiling _non-CUDA_ translation units, you can include `` 37 | and use the `__half` math APIs in host code. When you do this, NVIDIA's CUDA 38 | implementation converts the `__half` to 32-bit `float`, does the 39 | calculation, and converts back. 40 | 41 | SCALE only allows these functions to be used on the host when the host compiler 42 | supports compiling fp16 code directly (via the `_Float16` type). Current 43 | versions of gcc and clang both support this. 44 | 45 | This difference only applies to _non-CUDA_ translation units using compilers at 46 | least 2 years old. 47 | 48 | This means: 49 | 50 | - All `__half` APIs work in both host and device code in `.cu` files. 51 | - `__half` APIs that perform floating point math will not compile in host 52 | code in non-CUDA translation units if an old host compiler is used. 53 | - The outcome of `__half` calculations on host/device will always be the same. 54 | - APIs for using `__half` as a storage type are always supported. 55 | 56 | SCALE bundles a modern host compiler at `/targets/gfxXXX/bin/clang++` 57 | you can use as a workaround if this edgecase becomes a problem. 58 | 59 | ## Enhancements 60 | 61 | ### Contexts where CUDA APIs are forbidden 62 | 63 | NVIDIA's implementation forbids CUDA APIs in various contexts, such as from 64 | host-side functions enqueued onto streams. 65 | 66 | This implementation allows CUDA API calls in such cases. 67 | 68 | ### Static initialization and deinitialization 69 | 70 | This implementation permits the use of CUDA API functions during global static 71 | initialization and `thread_local` static initialization. 72 | 73 | It is not permitted to use CUDA API functions during static deinitialization. 74 | 75 | This is more permissive than what is allowed by NVIDIA's implementation. 76 | 77 | ### Device `printf` 78 | 79 | SCALE's device `printf` accepts an unlimited number of arguments if you compile 80 | with at least C++11. 81 | 82 | If you target an older version of C++ then it is limited to 32, like NVIDIA's 83 | implementation. 84 | 85 | ### Contexts 86 | 87 | If `cuCtxDestroy()` is used to destroy the context that is current to a 88 | different CPU thread, and that CPU thread then issues an API call that 89 | depends on the context without first setting a different context to be 90 | current, the behaviour is undefined. 91 | 92 | In NVIDIA's implementation, this condition returns 93 | `CUDA_ERROR_CONTEXT_IS_DESTROYED`. 94 | 95 | Matching NVIDIA's behaviour would have incurred a small performance penalty 96 | on many operations to handle an edgecase that is not permitted. 97 | 98 | ### Kernel argument size 99 | 100 | SCALE accepts kernel arguments up to 2GB, whereas NVIDIA CUDA allows only 101 | 32kb (and 4kb before version 12.1). 102 | 103 | This is more an implementation quirk than a feature, since huge kernel 104 | arguments are unlikely to perform well compared to achieving the same effect 105 | with async copies, memory mapping, etc. 
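As a purely illustrative sketch (the struct and kernel here are invented for the example), the portable alternative is to keep large data in device memory and pass a pointer, rather than leaning on a multi-gigabyte by-value argument:

```c++
struct BigTable { float values[1 << 20]; };   // ~4MB: far beyond NVIDIA's by-value limit.

__global__ void useTable(const BigTable *table);  // hypothetical kernel

void launchWithTable(const BigTable &hostTable, cudaStream_t stream)
{
    BigTable *deviceTable = nullptr;
    cudaMalloc(&deviceTable, sizeof(BigTable));
    cudaMemcpyAsync(deviceTable, &hostTable, sizeof(BigTable),
                    cudaMemcpyHostToDevice, stream);
    useTable<<<1, 256, 0, stream>>>(deviceTable);
    cudaStreamSynchronize(stream);
    cudaFree(deviceTable);
}
```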
106 | -------------------------------------------------------------------------------- /docs/manual/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently asked questions 2 | 3 | ## How do I report a problem? 4 | 5 | Strange compiler errors? Performance not as great as expected? Something else 6 | not working as expected? 7 | 8 | [Contact us](../contact/report-a-bug.md) 9 | 10 | Bug reports - no matter how small - accelerate the SCALE project. 11 | 12 | Let's work together to democratise the GPGPU market! 13 | 14 | ## What are `unstable` builds? 15 | 16 | `unstable` builds give you access to our latest features and performance 17 | optimisations. `unstable` builds give you access to these features sooner 18 | than they would become available via our stable release channel. 19 | 20 | However, `unstable` builds do not pass through our full quality assurance 21 | process: they may contain regressions and other bugs. `unstable` builds 22 | are made available "as is", and no detailed changlogs are available for 23 | `unstable` builds. 24 | 25 | ## When will `` be supported? 26 | 27 | Expanding the set of supported GPUs is an ongoing process. At present we're 28 | being very conservative with the set of GPUs enabled with SCALE to avoid 29 | use on platforms we currently have zero ability to test on. 30 | 31 | If your GPU is supported by ROCm, it'll probably become available on SCALE a 32 | little sooner than if it is not, since it won't break our "CUDA-X" library 33 | delegation mechanism. 34 | 35 | ## When will `` be supported? 36 | 37 | We prioritise CUDA APIs based on the number and popularity of third-party 38 | projects requiring the missing API. 39 | 40 | If you'd like to bring a missing API to our attention, 41 | [Contact us](../contact/report-a-bug.md) 42 | 43 | ## Can't NVIDIA just change CUDA and break SCALE? 44 | 45 | Of course, we have no control over what NVIDIA does with the CUDA toolkit. 46 | 47 | Although it is possible for NVIDIA to change/remove APIs in CUDA or PTX, 48 | doing so would break every CUDA program that uses these functions. Those 49 | programs would then be broken on both SCALE and NVIDIA's platform. 50 | 51 | NVIDIA can *add new things* to CUDA which we don't support. Projects are free to 52 | choose whether or not to use any new features that are added in the future, 53 | and may choose to use feature detection macros to conditionalise dependence 54 | on non-essential new features. Projects face a similar choice when deciding 55 | whether or not to use SCALE's steadily growing set of features that go beyond 56 | NVIDIA's CUDA. 57 | 58 | ### Does SCALE depend on NVIDIA's compiler/assembler/etc.? 59 | 60 | No. 61 | 62 | Although much of this manual talks about "nvcc", it is important to 63 | understand the distinction between the two things this can refer to: 64 | 65 | - The SCALE compiler, which is named "nvcc" for compatibility. This is the 66 | name build scripts expect, so if we named it anything else then nothing 67 | would work! 68 | - NVIDIA's proprietary CUDA compiler, `nvcc`. 69 | 70 | SCALE provides a _thing called nvcc_, which is in fact absolutely nothing to 71 | do with NVIDIA's `nvcc`. Our "nvcc" is built on top of the open-source 72 | clang/llvm compiler, and has no dependency on NVIDIA's compiler. 73 | 74 | SCALE does not make use of "nvvm", either. 
75 | -------------------------------------------------------------------------------- /docs/manual/how-to-install.md: -------------------------------------------------------------------------------- 1 | # Install SCALE 2 | 3 | {% set rocm_message = 'First, add ROCM 6.3.1\'s package repositories, as [per AMD\'s instructions](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.3.1/install/quick-start.html). Then, add our repository and install from it.' %} 4 | 5 | Select your operating system and version below to see installation instructions. 6 | 7 | === "Ubuntu" 8 | 9 | {% for deb_os in [{"version": "22.04", "codename": "jammy"}, {"version": "24.04", "codename": "noble"}] %} 10 | 11 | === "{{deb_os.version}}" 12 | 13 | {{ rocm_message }} 14 | 15 | ```bash 16 | {% if customer_specific_repo %} 17 | # Replace with your credentials 18 | export CUSTOMER_NAME="" 19 | export CUSTOMER_PASSWORD="" 20 | 21 | # Tell apt to authenticate to the repo 22 | sudo tee /etc/apt/auth.conf.d/scale.conf < 27 | ``` 28 | 29 | In this example, the GPU target ID is `gfx1030`. 30 | 31 | If your GPU is not listed in the output of this command, it is not currently 32 | supported by SCALE. 33 | 34 | If the `scaleinfo` command is not found, ensure 35 | that `/bin` is in `PATH`. 36 | 37 | ## The easy way: `scaleenv` 38 | 39 | SCALE offers a "`venv`-flavoured" environment management script to allow 40 | "magically" building CUDA projects. 41 | 42 | The concept is simple: 43 | 44 | 1. Activate the `scaleenv` for the AMD GPU target you want to build for. 45 | 2. Run the commands you normally use to build the project for an NVIDIA GPU. 46 | 3. AMD binaries are sneakily produced instead of NVIDIA ones. 47 | 48 | To activate a scaleenv: 49 | 50 | ``` 51 | source /opt/scale/bin/scaleenv gfx1030 52 | ``` 53 | 54 | You can exit a `scaleenv` by typing `deactivate` or closing your terminal. 55 | 56 | While the environment is active: simply run the usual `cmake`/`make`/etc. 57 | commands needed to build the project, and it will build for whatever AMD 58 | target you handed to `scaleenv`. 59 | 60 | ## How it really works 61 | 62 | To allow compilation without build system changes, SCALE provides a series of 63 | directories that are recognised by build systems as being CUDA Toolkit 64 | installations. One such directory is provided for each supported AMD GPU 65 | target. These directories can be found at `/targets/gfxXXXX`, where `gfxXXXX` is the name of an AMD GPU target, 67 | such as `gfx1030`. 68 | 69 | To achieve the desired effect, we need the build system to use the "CUDA 70 | toolkit" corresponding to the desired AMD GPU target. 71 | 72 | For example: to build for `gfx1030` you would tell your build system that 73 | CUDA is installed at `/targets/gfx1030`. 74 | 75 | All `scaleenv` is actually doing is setting various environment variables up 76 | to make this happen. It's just a shell script: open it to see the variables 77 | it is manipulating. 78 | 79 | ## Finding the libraries at runtime 80 | 81 | For maximum compatibility with projects that depend on NVIDIA's "compute 82 | capability" numbering scheme, SCALE provides one "cuda mimic directory" per 83 | supported GPU target that maps the new target to "sm_86" in NVIDIA's 84 | numbering scheme. 85 | 86 | This means that each of the `target` subdirectories contains 87 | identically-named libraries, so SCALE cannot meaningfully add them to the 88 | system's library search path when it is installed. 
The built executable/library 89 | therefore needs to be told how to find the libraries via another mechanism, 90 | such as: 91 | 92 | - [rpath](https://en.wikipedia.org/wiki/Rpath). With CMake, the simplest 93 | thing that "usually just works" is to add 94 | `-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON` to your cmake incantation. 95 | - Set `LD_LIBRARY_PATH` to include `${SCALE_DIR}/lib` at runtime. `scaleenv` 96 | does this, so if you keep that enabled when running your programs things 97 | will just work. 98 | 99 | Support for multiple GPU architectures in a single binary ("Fat binaries") 100 | is in development. 101 | 102 | ## Next steps 103 | 104 | - Learn about [CUDA dialects](dialects.md) and [SCALE language extensions](language-extensions.md) 105 | - [Report a bug](../contact/report-a-bug.md) 106 | -------------------------------------------------------------------------------- /docs/manual/inline-ptx.md: -------------------------------------------------------------------------------- 1 | # Inline PTX support 2 | 3 | SCALE accepts inline PTX `asm` blocks in CUDA programs and will attempt to 4 | compile it for AMD along with the rest of your program. 5 | 6 | ## Wave64 considerations 7 | 8 | A small number of PTX instructions depend on the warp size of the GPU being 9 | used. Since all NVIDIA GPUs and many AMD ones have a warp size of 32, much 10 | code implicitly relies on this. As a result, issues can appear when 11 | targeting wave64 devices. 12 | 13 | SCALE provides several tools and compiler warnings to help you write 14 | portable PTX code. In most cases only small tweaks are required to get things 15 | working. Since so little PTX actually depends on the warp size, most 16 | projects are unaffected by the issues documented in this section. 17 | Nevertheless, it is useful to adjust your code to be warp-size-agnostic, 18 | since doing so can be done with no downsides. 19 | 20 | ### Querying warp size 21 | 22 | PTX defines the `WARP_SZ` global constant which can be used to access the 23 | warp size directly. It's a compile-time constant in nvidia's implementation 24 | as well as in SCALE, so there is no cost to using this and doing arithmetic 25 | with it (like with `warpSize` in CUDA code). 26 | 27 | ### Lanemask inputs 28 | 29 | The length of lanemask operands on instructions will always have a number of 30 | bits equal to the warp size on the target GPU. For 31 | example, when compiling for a wave64 GPU, the lanemask argument to `shfl.sync` 32 | is a `b64`, not `b32`. 33 | 34 | The following rules are applied to help detect problems with such operands: 35 | 36 | - If a non-constant lanemask operand is used, and its bit-length is <= the 37 | warp size, an error is raised. 38 | - If a constant lanemask operand is used with no 1-bits in the high 32 bits, 39 | while compiling for a wave64 architecture, a warning is raised (which can 40 | be disabled). This catches the common case of hardcoded lanemasks like 41 | `0xFFFFFFFF` which will typecheck as `b64`, but will probably not do what 42 | you want. 43 | 44 | In the common case where you want an all-ones lanemask, the most convenient 45 | thing to do is write `-1` instead of `0xFFFFFFFF`: this will give you the 46 | correct number of 1-bits in all cases, including on nvidia platforms. 47 | 48 | ### The `c` argument to `shfl` instructions 49 | 50 | The `shfl` PTX instruction has a funky operand, `c`, used for clamping etc. 
51 | See [the documentation](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync). 52 | 53 | The `c` operand is really two operands packed together: `cval` in 54 | bits 0-4, and `segmask` in bits 8-12. For wave64, an extra bit is needed. Since 55 | there is space for an extra bit in each of these values, we simply add it in 56 | the obvious place. 57 | 58 | A portable way of reasoning about this is to assume that `cval` is in bits 0-7 59 | and `segmask` in bits 8-15. 60 | 61 | Here's a concrete example of a reverse cumsum that works on either warp size: 62 | 63 | ```c++ 64 | __global__ void shuffleRevCumsumKernel(float *dst) 65 | { 66 | float out; 67 | const int C = warpSize - 1; 68 | asm( 69 | ".reg .f32 Rx;" 70 | ".reg .f32 Ry;" 71 | ".reg .pred p;" 72 | "mov.b32 Rx, %1;" 73 | "shfl.sync.down.b32 Ry|p, Rx, 0x1, %2, -1;" 74 | "@p add.f32 Rx, Ry, Rx;" 75 | "shfl.sync.down.b32 Ry|p, Rx, 0x2, %2, -1;" 76 | "@p add.f32 Rx, Ry, Rx;" 77 | "shfl.sync.down.b32 Ry|p, Rx, 0x4, %2, -1;" 78 | "@p add.f32 Rx, Ry, Rx;" 79 | "shfl.sync.down.b32 Ry|p, Rx, 0x8, %2, -1;" 80 | "@p add.f32 Rx, Ry, Rx;" 81 | "shfl.sync.down.b32 Ry|p, Rx, 0x10, %2, -1;" 82 | "@p add.f32 Rx, Ry, Rx;" 83 | 84 | // One extra shuffle is needed for the larger warp size. 85 | #if __SCALE_WARP_SIZE__ > 32 86 | "shfl.sync.down.b32 Ry|p, Rx, 0x20, %2, -1;" 87 | "@p add.f32 Rx, Ry, Rx;" 88 | #endif // __SCALE_WARP_SIZE__ 89 | "mov.b32 %0, Rx;" 90 | : "=f"(out) : "f"(1.0f), "n"(C) 91 | ); 92 | 93 | dst[threadIdx.x] = out; 94 | } 95 | ``` 96 | 97 | And here's how to do a portable butterfly shuffle reduction: 98 | 99 | ```c++ 100 | __global__ void shuffleBflyKernel(float *dst) 101 | { 102 | const int C = warpSize - 1; 103 | 104 | float out; 105 | asm( 106 | ".reg .f32 Rx;" 107 | ".reg .f32 Ry;" 108 | ".reg .pred p;" 109 | "mov.b32 Rx, %1;" 110 | #if __SCALE_WARP_SIZE__ > 32 111 | "shfl.sync.bfly.b32 Ry, Rx, 0x20, %2, -1;" 112 | "add.f32 Rx, Ry, Rx;" 113 | #endif // __SCALE_WARP_SIZE__ 114 | "shfl.sync.bfly.b32 Ry, Rx, 0x10, %2, -1;" 115 | "add.f32 Rx, Ry, Rx;" 116 | "shfl.sync.bfly.b32 Ry, Rx, 0x8, %2, -1;" 117 | "add.f32 Rx, Ry, Rx;" 118 | "shfl.sync.bfly.b32 Ry, Rx, 0x4, %2, -1;" 119 | "add.f32 Rx, Ry, Rx;" 120 | "shfl.sync.bfly.b32 Ry, Rx, 0x2, %2, -1;" 121 | "add.f32 Rx, Ry, Rx;" 122 | "shfl.sync.bfly.b32 Ry, Rx, 0x1, %2, -1;" 123 | "add.f32 Rx, Ry, Rx;" 124 | "mov.b32 %0, Rx;" 125 | : "=f"(out) : "f"((float) threadIdx.x), "n"(C) 126 | ); 127 | 128 | dst[threadIdx.x] = out; 129 | } 130 | ``` 131 | 132 | ## Dialect differences 133 | 134 | The SCALE compiler accepts a more permissive dialect of PTX than NVIDIA's 135 | implementation does. 136 | 137 | ### Integer lengths 138 | 139 | Most PTX instructions are defined to work only for a specific, arbitrary set 140 | of integer types. We didn't bother to implement such restrictions except in 141 | cases where they are needed for correctness, so many PTX instructions accept 142 | a wider selection of types than nvcc accepts. 143 | 144 | One amusing consequence of this is that most of the simple instructions work 145 | for *any* bit-length: `add.s17` is allowed (but will of course lead to 146 | extra sext/trunc instructions, so isn't necessarily a good idea). 147 | 148 | ### Divergent `exit` 149 | 150 | AMD hardware does not seem to have a mechanism by which individual threads 151 | can terminate early (only entire warps). As a result, the `exit` 152 | instruction may be used only in converged contexts. 
We transform it into 153 | approximately: 154 | 155 | ```c++ 156 | if (__activemask() == -1) { 157 | exit_entire_warp(); 158 | } else { 159 | // This situation is unrepresentable 160 | __trap(); 161 | } 162 | ``` 163 | 164 | Code that uses `exit` as a performance optimisation for nvidia hardware may 165 | benefit from being adjusted for AMD. 166 | 167 | ## Empty `asm volatile` blocks 168 | 169 | To cater to "interesting" user code, the SCALE compiler will not touch 170 | `asm volatile` blocks containing no instructions. We've seen 171 | real-world CUDA code that uses these as a kind of ad-hoc optimisation 172 | barrier to prevent the compiler breaking programs that contain undefined 173 | behaviour. This pragmatic choice should reduce how often such broken programs 174 | fail to function, but such code is broken by definition. 175 | 176 | Note that the `volatile` on non-empty `volatile asm` blocks has no effect on the 177 | behaviour of the SCALE compiler. `volatile` asm is a conservative feature that 178 | allows the compiler to model "unknowable" implicit dependencies of the actions 179 | takenby the inline asm. Since we're compiling the asm to IR, the *actual* 180 | dependencies and properties of everything it does are known and modelled. This 181 | can improve optimisation, but may break programs that have undefined behaviour 182 | that was being hidden by the optimisation barrier effect of the volatile asm 183 | block. 184 | 185 | ## Returning the carry-bit 186 | 187 | The PTX carry-bit may not be implicitly returned from functions. 188 | 189 | Some PTX instructions have a carry flag bit, used to perform extended-precision 190 | integer arithmetic across multiple instructions. The [PTX manual](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions) 191 | notes: 192 | 193 | > The condition code register is not preserved across calls and is mainly 194 | > intended for use in straight-line code sequences for computing 195 | > extended-precision integer addition, subtraction, and multiplication. 196 | 197 | It is, therefore, undefined behaviour to write code that writes to the 198 | carry-bit in one function and then attempts to read it from another. The 199 | correct execution of such code is optimiser-dependent on NVIDIA's platform 200 | (depending on the function to inline), so will likely fail in `-O0` builds. 201 | 202 | For example: 203 | 204 | ```c++ 205 | void createCarryBit(int x, int y) { 206 | // Add with carry-out. 207 | asm("add.cc.u32 %0, %0, %1": "+r"(x) : "r"(y)); 208 | } 209 | 210 | void useCarryBit(int x, int y) { 211 | createCarryBit(x, y); 212 | 213 | // Add with carry-in. Not allowed, since it's trying to read the 214 | // carry-bit across a function boundary. 215 | int z; 216 | asm("add.cc.u32 %0, %1, %2" : "=r"(z) : "r"(x), "r"(y)); 217 | } 218 | ``` 219 | 220 | Due to how SCALE's PTX support works, it *can't* support this pattern, so this 221 | situation is a compiler error instead. To write code like this in a portable 222 | way, you can refactor it to use macros instead. 223 | 224 | It may also be worth considering if you truly still need this inline asm: both 225 | NVIDIA nvcc and SCALE have good support for `int128_t` now, which makes many 226 | common uses of these asm constructs redundant. 227 | 228 | ## `asm` input/output types 229 | 230 | `nvcc` doesn't appear to consistently follow its own tying rules for PTX asm 231 | inputs/outputs. 
It allows the following invalid things to occur in some cases 232 | (and real programs depend on this): 233 | 234 | - Writes to read-only asm bindings are permitted (such as writing to an "r") 235 | constraint. The result of the write is not visible to the caller: it's 236 | effectively a temporary inside the scope of the asm block. 237 | - `=r` (write-only) constraints can be used in a read-write fashion (as if 238 | they were `+r`). 239 | - Values passed to the asm block are sometimes, but not always, type checked, 240 | implicitly widened, or implicitly truncated. 241 | 242 | To avoid having to characterise and define the perimeter of this buggy 243 | behaviour, SCALE's implementation defines the following consistent rules 244 | which are intended to maximise compatibility (and minimise "weirdness"): 245 | 246 | - All read-only inputs may be written to. The results of these writes are 247 | visible only within the scope of the asm block (as if they were local 248 | variables being passed by value into a function). 249 | - All write-only outputs are implicitly read-write. ie.: there is no 250 | difference between `+r` and `=r`. 251 | - The type of an input or output binding is governed by the type of the 252 | expression, not the constraint letter. Once "in PTX", the usual PTX rules 253 | about implicit truncation/widening/etc. apply. This nuance won't change 254 | the behaviour of programs unless they rely on using a "too short" PTX 255 | constraint type to truncate a value, and then implicitly widen it within 256 | PTX (hence zeroing out some of the bits). Since such truncations are 257 | inconsistently applied even with nvidia nvcc mode, they are probably best 258 | achieved with an explicit cast. 259 | 260 | ## Performance considerations 261 | 262 | In most cases, there isn't a performance penalty from using PTX asm in CUDA 263 | code: it will usually convert to the same IR as the C++ you could have 264 | written instead, and may actually be faster due to not needing to be as 265 | conservative about optimisation compared to the usual rules of asm blocks. 266 | 267 | Since the compiler _effectively_ converts it to the CUDA code you could have 268 | written to achieve the same effect without the use of the PTX asm, it 269 | doesn't come with the optimisation-hindering downsides asm blocks 270 | normally imply. The compiler will respect the ordering/synchronisation/etc. 271 | requirements of each operation individually, rather than having to regard an 272 | entire `asm volatile` block as an opaque, immutable unit. 273 | 274 | Programs that have already added support for HIP might have multiple 275 | codepaths: one for CUDA that uses inline PTX, and one for AMD which doesn't. 276 | In such cases, it is worth testing both to see which is fastest. 277 | 278 | ## Supported constraints 279 | 280 | The following PTX constraint letters are supported. See above commentary on 281 | nuances regarding how they are interpreted. 282 | 283 | `h`: u16 284 | `r`: u32 285 | `l`: u64 286 | `f`: f32 287 | `d`: f64 288 | `n`: constants 289 | `C`: dynamic asm strings 290 | 291 | 292 | ## Supported instructions 293 | 294 | The following instructions are currently supported. 295 | 296 | Caveat: since the `bf16`, `fp8` and `tf32` floating point formats are not 297 | currently supported in SCALE, they are also not supported here. 
298 | 299 | 300 | | Instruction | Notes | 301 | |----------------------------------|---------------------------| 302 | | abs | | 303 | | activemask | | 304 | | add | | 305 | | addc | | 306 | | and | | 307 | | atom | | 308 | | bfe | | 309 | | bfi | | 310 | | bfind | | 311 | | bfind.shiftamt | | 312 | | bmsk | | 313 | | bra | | 314 | | brev | | 315 | | brkpt | Currently a no-op | 316 | | clz | | 317 | | cnot | | 318 | | copysign | | 319 | | cos.approx | | 320 | | cvt | | 321 | | cvt.pack | | 322 | | discard | Currently a no-op | 323 | | div | | 324 | | dp2a | | 325 | | dp4a | | 326 | | elect | | 327 | | ex2.approx | | 328 | | exit | Only from convergent code | 329 | | fence | Memory ranges unsupported | 330 | | fma | | 331 | | fns | | 332 | | griddepcontrol.launch_dependents | Currently a no-op | 333 | | griddepcontrol.wait | Currently a no-op | 334 | | isspacep | | 335 | | ld | | 336 | | ld.nc | | 337 | | ldmatrix | | 338 | | ldu | | 339 | | lg2.approx | | 340 | | lop3 | | 341 | | mad | | 342 | | mad24 | | 343 | | madc | | 344 | | match.all | | 345 | | match.any | | 346 | | max | | 347 | | max.xorsign.abs | | 348 | | membar | | 349 | | min | | 350 | | min.xorsign.abs | | 351 | | mma | `wmma.mma` likely faster | 352 | | mov | | 353 | | mul | | 354 | | mul24 | | 355 | | nanosleep | | 356 | | neg | | 357 | | not | | 358 | | or | | 359 | | pmevent | Currently a no-op | 360 | | popc | | 361 | | prefetch | | 362 | | prefetchu | | 363 | | prmt | | 364 | | prmt.b4e | | 365 | | prmt.ecl | | 366 | | prmt.ecr | | 367 | | prmt.f4e | | 368 | | prmt.rc16 | | 369 | | prmt.rc8 | | 370 | | rcp | | 371 | | red | | 372 | | redux | | 373 | | rem | | 374 | | sad | | 375 | | sin.approx | | 376 | | selp | | 377 | | set | | 378 | | setp | | 379 | | shf.l | | 380 | | shfl.bfly | | 381 | | shfl.down | | 382 | | shfl.idx | | 383 | | shfl.up | | 384 | | shf.r | | 385 | | shl | | 386 | | shr | | 387 | | slct | | 388 | | st | | 389 | | stmatrix | | 390 | | sub | | 391 | | subc | | 392 | | szext | | 393 | | testp.finite | | 394 | | testp.infinite | | 395 | | testp.normal | | 396 | | testp.notanumber | | 397 | | testp.number | | 398 | | testp.subnormal | | 399 | | trap | | 400 | | vabsdiff | | 401 | | vadd | | 402 | | vmax | | 403 | | vmin | | 404 | | vote.all | | 405 | | vote.any | | 406 | | vote.ballot | | 407 | | vote.uni | | 408 | | vshl | | 409 | | vshr | | 410 | | vsub | | 411 | | wmma.load | | 412 | | wmma.store | | 413 | | wmma.mma | | 414 | | xor | | 415 | -------------------------------------------------------------------------------- /docs/manual/language-extensions.md: -------------------------------------------------------------------------------- 1 | # Language Extensions 2 | 3 | SCALE has various opt-in language extensions that aim to improve the 4 | experience of writing GPU code. More language extensions are in development. 5 | 6 | Unless otherwise noted, SCALE accepts these language extensions in both 7 | `clang` and `nvcc` modes. Note that there are [dialect differences](dialects.md) 8 | between the two modes. 9 | 10 | Since NVIDIA's compiler does not support SCALE language extensions, if you 11 | want to retain the ability to compile for NVIDIA GPUs you must do one of two 12 | things: 13 | 14 | - Guard use of language extensions behind the `__REDSCALE__` macro, hiding 15 | it from NVIDIA's `nvcc`. 16 | - Use SCALE's `clang` compiler to compile for both NVIDIA and AMD targets. 17 | This will require changes to your build system. 
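For example, the first option can be a simple guard around the extension (a minimal sketch using the `[[clang::loop_unroll]]` attribute described in the next section; the function itself is invented for illustration):

```c++
template <int UnrollAmount>
__device__ float sum(const float *in, int n) {
    float total = 0.0f;
#if defined(__REDSCALE__)
    // SCALE-only attribute, hidden from NVIDIA's nvcc.
    [[clang::loop_unroll UnrollAmount]]
#endif
    for (int i = 0; i < n; i++) {
        total += in[i];
    }
    return total;
}
```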
18 | 19 | ## `[[clang::loop_unroll]]` 20 | 21 | GPU code frequently contains loops that need to be partially unrolled, and which 22 | have the property that the degree of unrolling is a tradeoff between ILP and 23 | register usage. 24 | 25 | Finding the optimal amount to unroll is not usually possible in the compiler 26 | because the number of threads to be used is a runtime value. Programmers 27 | therefore usually want to set unroll depth by hand. 28 | 29 | The existing `#pragma unroll N` allows this to be set at the preprocessor level. 30 | The new `[[clang::loop_unroll N]]` allows doing this in a template-dependent 31 | way: 32 | 33 | ```c++ 34 | template 35 | __device__ void example(int n) { 36 | [[clang::loop_unroll UnrollAmount]] 37 | for (int i = 0; i < n; i++) { 38 | // ... 39 | } 40 | } 41 | ``` 42 | 43 | ## `__builtin_provable(bool X)` 44 | 45 | `__builtin_provable(X)` accepts a boolean, `X`, and: 46 | 47 | - If the compiler is able to prove, during optimisation, that `X` is a 48 | compile-time constant true, the entire expression evaluates to a 49 | compile-time constant true. 50 | - Otherwise (if `X` is unknown, or provably false), the entire expression 51 | evaluates to a compile-time constant false. 52 | 53 | This allows you to write code that opportunistically optimises for a special 54 | case, without the risk of runtime branching overhead or the inconvenience of 55 | propagating this information through your entire program using templates. 56 | For example: 57 | 58 | ```c++ 59 | __device__ int myCleverFunction(int input) { 60 | if (__builtin_provable(input % 2 == 0)) { 61 | // Special fast code for the case where `input` is divisible 62 | // by 2 goes here. 63 | } else { 64 | // Slow, general case goes here. 65 | } 66 | } 67 | ``` 68 | 69 | During optimisation, as calls to `myCleverFunction` get inlined, the 70 | compiler may be able to prove that `input % 2 == 0` for specific calls to 71 | this function. Those cases will be compiled with the "fast path", while all 72 | others will be compiled to the "slow path". The `if` statement will never 73 | compile to an actual conditional. 74 | 75 | Since there are no guarantees that the optimiser is able to prove the 76 | condition, the program must produce identical outputs from either path, or 77 | the behaviour is undefined. 78 | 79 | This feature differs from the standard c++17 `if constexpr` in that it is 80 | not required that the input boolean be `constexpr`. `__builtin_provable()` 81 | communicates with the optimiser, not the template system. Consequently: 82 | 83 | - You don't need to use templates to propagate "optimisation knowledge" 84 | throughout the program. 85 | - Compilation may be faster, as a result of not having to template everything. 86 | - Some cases may be missed where optimisation fails. Such cases are probably 87 | independently worth investigating (_Why_ did optimisation fail? That's a 88 | source of additional slowness). 89 | 90 | ## Improved support for non-32 warpSize 91 | 92 | Not all AMD GPUs have a warp size of 32. To mitigate this, we offer a variety 93 | of compiler and API features: 94 | 95 | - `cudaLaneMask_t`: A type that is an integer with the number of bits as a CUDA 96 | warp. This should be used when using functions such as `__ballot()` to avoid 97 | discarding half the bits. 98 | - Use of `cudaLaneMask_t` in appropriate places in the CUDA APIs (such as 99 | the return value of `__ballot()`) 100 | - Diagnostics to catch implicit casts from `cudaLaneMask_t` to narrower types. 
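As an illustration (the kernel is invented for this example), a warp-vote pattern written to be warp-size-agnostic might look like this:

```c++
__global__ void electLeader(int *out, bool flag) {
    // Under SCALE, __ballot() returns a cudaLaneMask_t with one bit per
    // lane (32 or 64 of them); `auto` avoids truncating it on wave64
    // devices, and keeps the code compiling unchanged with NVIDIA's nvcc.
    auto mask = __ballot(flag);

    // Lowest-numbered lane that voted "true".
    int leader = __ffs(mask) - 1;
    if ((int)threadIdx.x == leader) {
        *out = leader;
    }
}
```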
101 | 102 | In practice, this means the compiler detects the majority of cases where code 103 | is written in a way that will break on a device with a warp size of 64. 104 | 105 | Programmers should modify their CUDA code to be agnostic to warp size. 106 | NVIDIA's documentation recommends this practice, but a lot of real-world CUDA 107 | code does it incorrectly because no current NVIDIA hardware has a warp size 108 | other than 32. 109 | 110 | Since NVIDIA's `nvcc` does not have `cudaLaneMask_t`, programmers should use 111 | `auto` to declare the return types of functions such as `__ballot()` that return 112 | it. This will compile correctly on all platforms. 113 | -------------------------------------------------------------------------------- /docs/manual/optimisation-flags.md: -------------------------------------------------------------------------------- 1 | # Compiler Optimisation Flags 2 | 3 | When using the `nvcc` frontend to SCALE, it matches the behaviour of 4 | NVIDIA's compiler as closely as possible. 5 | 6 | This means disabling some optimisations that are enabled by default by clang, 7 | since those break certain programs which rely on the behaviour of NVIDIA's 8 | compiler. In some cases, such reliance represents undefined behaviour in the 9 | affected program. 10 | 11 | This page documents some of these differences, and how to opt-in to these 12 | optimisations on a case-by-case basis. In general, all `clang++` flags are 13 | also accepted by SCALE in `nvcc` mode. 14 | 15 | You may be able to simply switch some of these features on and immediately 16 | gain performance. 17 | 18 | tl;dr: try `-ffast-math -fstrict-aliasing` and see if your program explodes. 19 | 20 | ## Floating Point Optimisation 21 | 22 | NVIDIA's compiler provides a `-use_fast_math` flag that relaxes some 23 | floating point rules to improve optimisation. It's documented to do exactly 24 | these things: 25 | 26 | - Use some less precise math functions (eg. `__sinf()` instead of `sinf()`). 27 | - Enables less precise sqrt/division 28 | - Flushes denorms to zero. 29 | - Fuse multiply-adds into FMA operations. 30 | 31 | SCALE mirrors this behaviour when you use this flag in our `nvcc` emulation 32 | mode, aiming to produce the same results as NVIDIA's compiler. 33 | 34 | SCALE also provides all of `clang`'s flags, 35 | allowing access to its more aggressive floating point optimisations, 36 | such as: 37 | 38 | - Assume infinities and NaNs never happen 39 | - Allow algebraic rearrangement of floating point calculations 40 | 41 | Full details about these flags are available in the [clang user manal](https://releases.llvm.org/19.1.0/tools/clang/docs/UsersManual.html). 42 | 43 | These optimisations can be controlled per-line using the [fp control pragmas](https://releases.llvm.org/19.1.0/tools/clang/docs/LanguageExtensions.html#extensions-to-specify-floating-point-flags). 44 | 45 | This allows you to either: 46 | 47 | - Specify the compiler flag (to enable an optimisation by default) and then 48 | switch it off for special code regions (ie. opt-out mode). 49 | - Opt-in to the optimisation in regions of code where you know it to be safe. 50 | 51 | These flags will affect the performance of functions 52 | in the SCALE implementation of the CUDA Math API. 53 | 54 | These flags do not affect the accuracy of the results of the Math API, but 55 | do apply assumptions about the range of possible inputs. 
For example: if 56 | you enable "assume no infinities", all infinity-handling logic will be removed 57 | from the Math API functions, making them slightly more efficient. Flags like 58 | "reassociate" and "use reciprocal math" do not affect the behaviour of the 59 | math functions. 60 | 61 | Each call to a math function will be optimised separately, using the set of 62 | fp optimisation flags in effect at that point in that file. You can use 63 | pragmas to mix different optimisation flags at different points within the 64 | same file. It is OK to compile different source files with different fp 65 | optimisation flags and then link them together. 66 | 67 | ## Strict aliasing 68 | 69 | By default, in C++, the compiler assumes that pointers to unrelated types 70 | (eg `float` and `int`) never point to the same place. This can significantly 71 | improve optimisation by improving instruction reordering and ILP. 72 | 73 | Unfortunately, NVIDIA's `nvcc` does not do strict-aliasing optimisations, 74 | and enabling it breaks some CUDA programs. SCALE-nvcc therefore disables 75 | this by default. 76 | 77 | You can explicitly enable this class of optimisations in SCALE by adding 78 | `-fstrict-aliasing`. This may break your program if it contains TBAA violations. 79 | We recommend you find and fix such violations, since they are undefined 80 | behaviour. This would mean your code is only working correctly because NVIDIA's 81 | compiler doesn't currently exploit this type of optimisation: something which 82 | may change in the future! 83 | -------------------------------------------------------------------------------- /docs/manual/runtime-extensions.md: -------------------------------------------------------------------------------- 1 | # API Extensions 2 | 3 | SCALE has some runtime/library features not found in NVIDIA's CUDA Toolkit. 4 | 5 | ## Environment variables 6 | 7 | Some extra features can be enabled by environment variables. 8 | 9 | ### `SCALE_EXCEPTIONS` 10 | 11 | Errors from the CUDA API can be hard to debug, since they simply return an 12 | error code that the host program has to do something with. 13 | 14 | SCALE provides an environment variable to make any error from the CUDA API 15 | produce a observable result. 16 | 17 | Setting `SCALE_EXCEPTIONS=1` will cause all CUDA APIs to throw descriptive 18 | exceptions instead of returning C-style error codes. 19 | 20 | Setting `SCALE_EXCEPTIONS=2` will print the error messages to stderr, but not 21 | throw them. This is helpful for programs that deliberately create CUDA errors 22 | as part of their processing. 23 | 24 | In cases where CUDA APIs are expected to return a value other than 25 | `cudaSuccess` during normal operation (such as `cudaStreamQuery()`, an 26 | exception will not be thrown except if an exceptional case arises. 27 | 28 | ## API Extensions 29 | 30 | Some of SCALE's API extensions require the `scale.h` header to be included. 31 | 32 | ### Programmatic Exception Enablement 33 | 34 | SCALE exceptions (see documentation of `SCALE_EXCEPTIONS` environment 35 | variable above) may also be enabled/disabled programmatically using: 36 | 37 | ```c++ 38 | scale::Exception::enable(); // To enable. 39 | scale::Exception::enable(false); // To disable. 
40 | ``` 41 | 42 | Even when exceptions are disabled, you can access a `scale::Exception` object 43 | containing the descriptive error message from the most recent failure using 44 | `scale::Exception::last()`: 45 | 46 | ```c++ 47 | cudaError_t e = cudaSomething(); 48 | if (e != cudaSuccess) { 49 | const scale::Exception &ex = scale::Exception::last(); 50 | std::cerr << "CUDA error: " << ex.what() << '\n'; 51 | } 52 | ``` 53 | 54 | The error accessed by this API is the same one you'd get from using the CUDA 55 | API `cudaGetLastError()`, just more descriptive. 56 | -------------------------------------------------------------------------------- /docs/manual/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | This page provides tips for solving common problems encountered when trying 4 | to compile or run CUDA programs with SCALE. 5 | 6 | ## Crashes 7 | 8 | Please [report a bug](../contact/report-a-bug.md). 9 | 10 | ## "No such function: cuBlas/cuFFt/cuSolverSomethingSomething()" 11 | 12 | If your project needs a missing "CUDA-X" API (cuBLAS, cuFFT, cuSOLVER and 13 | friends), this is most likely something you can fix yourself by submitting a 14 | patch to the [open-source library wrapper project](https://github.com/spectral-compute/scale-library-wrappers). 15 | So long as an equivalent function is available in a ROCm library, the wrapper 16 | code is trivial. 17 | 18 | ## CUDA API errors 19 | 20 | The [`SCALE_EXCEPTIONS` feature](runtime-extensions.md#scale_exceptions) can 21 | be helpful for getting more information about many failures. 22 | 23 | ## wave64 issues 24 | 25 | All current NVIDIA GPUs have a warp size of 32, so many CUDA programs are 26 | written in a way that assumes this is always the case. 27 | 28 | Some AMD GPUs have a warp size of 64, which can cause problems for CUDA code 29 | written in this way. 30 | 31 | SCALE offers tools to address this problem: 32 | 33 | - APIs that operate on warp masks accept and return a new type: 34 | `cudaWarpSize_t`. This is an integer with as many bits as there are 35 | threads in a warp on the target GPU. 36 | - Some APIs (such as `__ffs()`) have extra overloads for `cudaWarpSize_t`, so 37 | common patterns (such as `__ffs(__ballot(...))`) just work. 38 | - The SCALE compiler will emit compiler warnings when values that represent 39 | warp masks are implicitly truncated to 32 bits. 40 | 41 | To write code that works correctly on both platforms: 42 | 43 | - Use `auto` instead of `uint32_t` when declaring a variable that is 44 | intended to contain a warp mask. With NVIDIA `nvcc` this will map to 45 | `uint32_t`, and with SCALE this will map to `cudaWarpSize_t`, producing 46 | correct behaviour on both platforms. 47 | - Avoid hardcoding the constant "32" to represent warp size, instead using 48 | the global `warpSize` available on all platforms. 49 | 50 | ## Initialization errors or no devices found 51 | 52 | The SCALE runtime can fail to initialise if: 53 | 54 | - The AMD kernel module is out of date. 55 | - `/dev/kfd` is not writable by the user running the program. 56 | - There are no supported GPUs attached. 
57 | 58 | This situation produces error messages such as: 59 | 60 | ``` 61 | $ SCALE_EXCEPTIONS=1 ./myProgram 62 | terminate called after throwing an instance of 'redscale::SimpleException' 63 | what(): cudaDeviceSynchronize: No usable CUDA devices found., CUDA error: "no device" 64 | Aborted (core dumped) 65 | ``` 66 | 67 | ``` 68 | $ /opt/scale/bin/scaleinfo 69 | Error getting device count: initialization error 70 | ``` 71 | 72 | ``` 73 | $ /opt/scale/bin/hsakmtsysinfo 74 | terminate called after throwing an instance of 'std::runtime_error' 75 | what(): HSAKMT Error 20: Could not open KFD 76 | Aborted (core dumped) 77 | ``` 78 | 79 | ### Verify you have a supported gpu 80 | 81 | Run `/opt/scale/bin/hsasysinfo | grep 'Name: gfx` to determine the 82 | architecture of your GPU, and determine if it is one of the supported 83 | architectures listed [here](../README.md#which-gpus-are-supported). 84 | 85 | ### Ensure `/dev/kfd` is writable 86 | 87 | Ensure your user is in the group that grants access to `/dev/kfd`. On Ubuntu, 88 | this is via membership of the `render` group: 89 | `sudo usermod -a -G render USERNAME`. 90 | 91 | You could temporarily make `/dev/kfd` world-writable via: `sudo chmod 666 92 | /dev/kfd`. 93 | 94 | ## Cannot find shared object 95 | 96 | The correct library search path for a SCALE binary can be target dependent due 97 | to [compute capability mapping](./compute-capabilities.md). This can lead 98 | to runtime errors where the SCALE libraries cannot be found, such as: 99 | 100 | ``` 101 | error while loading shared libraries: libredscale.so: cannot open shared object file: No such file or directory 102 | ``` 103 | 104 | Two ways to solve this problem are: 105 | 106 | - Set `LD_LIBRARY_PATH` to the SCALE target library directory, such as: 107 | `LD_LIBRARY_PATH=/opt/scale/targets/gfx1030/lib:$LD_LIBRARY_PATH` for `gfx1030`. 108 | - Compile your program is compiled with that directory in RPATH: 109 | [rpath](https://en.wikipedia.org/wiki/Rpath). 110 | 111 | ## Cannot compile using the nvrtc API or reported compute capabilities are huge 112 | 113 | Both of these problems are caused by using a `libredscale.so` that is not 114 | located in the correct place relative to its support files when running a 115 | program. In the case of the nvrtc API, it's because the compiler cannot be 116 | found. In the case of reported huge compute capabilities, it's because the 117 | [compute capability map](./compute-capabilities.md) cannot be found. 118 | 119 | The solution is to make sure to use the `lib` subdirectories for one of the 120 | targets, rather than the `lib` directory of the SCALE installation directory. 121 | For example, `/opt/scale/targets/gfx1030/lib` rather than `/opt/scale/lib`. The 122 | `gfxany` target is suitable for using the nvrtc API, but it does not contain a 123 | compute capability map so it will not report small compute capabilities. 124 | 125 | As with being [unable to find the shared object](#cannot-find-shared-object) at 126 | all, this can be solved either by setting `LD_LIBRARY_PATH` or by setting the 127 | binary's rpath. 
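For instance, a program that JIT-compiles through the nvrtc API could be pointed at a target-specific library directory like this (the program name is just a placeholder; the path follows the `/opt/scale` layout used elsewhere in this manual):

```bash
# Use a target-specific lib directory so the compiler and the compute
# capability map can be found relative to libredscale.so.
export LD_LIBRARY_PATH=/opt/scale/targets/gfx1030/lib:$LD_LIBRARY_PATH
./myRtcProgram
```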
128 | 129 | #### Example error: 130 | 131 | ``` 132 | $ SCALE_EXCEPTIONS=1 ./rtc 133 | terminate called after throwing an instance of 'redscale::RtcException' 134 | what(): nvrtcCompileProgram: Could not find clang-nvcc or nvcc., CUDA error: "JIT compiler not found", NVRTC error: "Compilation" 135 | Aborted (core dumped) 136 | ``` 137 | 138 | ## nvcc: cannot find libdevice for sm_52 and cannot find CUDA installation 139 | 140 | If `targets/gfxany` rather than a specific target like `targets/gfx1030` is used, then there is no default GPU to 141 | target. This leads to an error like the example below. The solution is to either use a target-specific directory like 142 | `targets/gfx1030`, or to specify a specific target such as with `-arch gfx1030`. 143 | 144 | #### Example error 145 | 146 | ``` 147 | nvcc: error: cannot find libdevice for sm_52; provide path to different CUDA installation via '--cuda-path', or pass '-nocudalib' to build without linking with libdevice 148 | nvcc: error: cannot find CUDA installation; provide its path via '--cuda-path', or pass '-nocudainc' to build without CUDA includes 149 | ``` 150 | 151 | ## Cannot find C++ standard library include 152 | 153 | Some distributions, such as Ubuntu, permit multiple versions of `gcc` and `g++` to be installed separately. It is 154 | possible to have a version of `gcc` installed without the corresponding version of `g++`. This can cause our compiler to 155 | be unable to find the C++ standard library headers. 156 | 157 | The solution is to ensure the corresponding version of `g++` is installed. For example: if the latest version of `gcc` 158 | you have installed is `gcc-12`, but you do not have `g++-12` installed, run: `sudo apt-get install g++-12`. 159 | 160 | #### Example error 161 | 162 | ``` 163 | In file included from :1: 164 | 165 | In file included from 166 | /opt/scale/targets/gfx1100/include/redscale_impl/device.h:6: 167 | 168 | In file included from 169 | /opt/scale/targets/gfx1100/include/redscale_impl/common.h:40: 170 | 171 | /opt/scale/targets/gfx1100/include/redscale_impl/../cuda.h:15:10: fatal 172 | error: 'cstddef' file not found 173 | 174 | #include 175 | 176 | ^~~~~~~~~ 177 | 178 | 1 error generated when compiling for gfx1100. 179 | ``` 180 | 181 | ## CMake: Error running link command: no such file or directory 182 | 183 | CMake tries to detect the linker to use based on the compiler. For SCALE's 184 | `nvcc`, it uses `clang++` as the linker. If this does not exist in your `PATH`, 185 | the result is an error like the one in the example below. 186 | 187 | A good solution is to make sure SCALE's `nvcc` is at the start of your `PATH`. 188 | This will place our `clang++` on your path too, avoiding the problem. 189 | 190 | ```bash 191 | # Adjust for the target you want to use. 192 | export PATH=/opt/scale/targets/gfx1030/bin:$PATH 193 | ``` 194 | 195 | #### Example error 196 | 197 | ``` 198 | -- The CUDA compiler identification is NVIDIA 12.5.999 199 | -- Detecting CUDA compiler ABI info 200 | -- Detecting CUDA compiler ABI info - failed 201 | -- Check for working CUDA compiler: /opt/scale/targets/gfx1030/bin/nvcc 202 | -- Check for working CUDA compiler: /opt/scale/targets/gfx1030/bin/nvcc - broken 203 | CMake Error at /usr/local/share/cmake-3.29/Modules/CMakeTestCUDACompiler.cmake:59 (message): 204 | The CUDA compiler 205 | 206 | "/opt/scale/targets/gfx1030/bin/nvcc" 207 | 208 | is not able to compile a simple test program. 
209 | 210 | It fails with the following output: 211 | 212 | Change Dir: '/home/user/test/cmake/build/CMakeFiles/CMakeScratch/TryCompile-vLZLYV' 213 | 214 | Run Build Command(s): /usr/local/bin/cmake -E env VERBOSE=1 /usr/bin/gmake -f Makefile cmTC_185e7/fast 215 | /usr/bin/gmake -f CMakeFiles/cmTC_185e7.dir/build.make CMakeFiles/cmTC_185e7.dir/build 216 | gmake[1]: Entering directory '/home/user/test/cmake/build/CMakeFiles/CMakeScratch/TryCompile-vLZLYV' 217 | Building CUDA object CMakeFiles/cmTC_185e7.dir/main.cu.o 218 | /opt/scale/targets/gfx1030/bin/nvcc -forward-unknown-to-host-compiler "--generate-code=arch=compute_86,code=[compute_86,sm_86]" -MD -MT CMakeFiles/cmTC_185e7.dir/main.cu.o -MF CMakeFiles/cmTC_185e7.dir/main.cu.o.d -x cu -c /home/user/test/cmake/build/CMakeFiles/CMakeScratch/TryCompile-vLZLYV/main.cu -o CMakeFiles/cmTC_185e7.dir/main.cu.o 219 | Linking CUDA executable cmTC_185e7 220 | /usr/local/bin/cmake -E cmake_link_script CMakeFiles/cmTC_185e7.dir/link.txt --verbose=1 221 | clang++ @CMakeFiles/cmTC_185e7.dir/objects1.rsp -o cmTC_185e7 @CMakeFiles/cmTC_185e7.dir/linkLibs.rsp -L"/opt/scale/targets/gfx1030/lib" 222 | Error running link command: no such file or directorygmake[1]: *** [CMakeFiles/cmTC_185e7.dir/build.make:102: cmTC_185e7] Error 2 223 | gmake[1]: Leaving directory '/home/user/test/cmake/build/CMakeFiles/CMakeScratch/TryCompile-vLZLYV' 224 | gmake: *** [Makefile:127: cmTC_185e7/fast] Error 2 225 | 226 | 227 | 228 | 229 | 230 | CMake will not be able to correctly generate this project. 231 | Call Stack (most recent call first): 232 | CMakeLists.txt:2 (project) 233 | 234 | 235 | -- Configuring incomplete, errors occurred! 236 | ``` 237 | 238 | ## Half precision intrinsics not defined in C++ 239 | 240 | If you're using `__half` in host code in a non-CUDA translation unit, you 241 | might get an error claiming the function you want does not exist: 242 | 243 | ``` 244 | error: ‘__half2float’ was not declared in this scope 245 | ``` 246 | 247 | This problem can be resolved by using newer C++ compiler. 248 | 249 | This issue is discussed in more detail in the [Differences from NVIDIA CUDA](differences.md#host-side-__half-support) 250 | section. 251 | -------------------------------------------------------------------------------- /docs/notices.md: -------------------------------------------------------------------------------- 1 | # Third party software 2 | 3 | 4 | SCALE uses a number of third-party software tools, libraries and content. 5 | 6 | The file (gratefully) attributes the authors of those works, the licences under which they 7 | are available, and indicates the terms of each license. 8 | 9 | 10 | ============================================================================== 11 | Thrust is under the Apache Licence v2.0, with some specific exceptions listed below 12 | LLVM and libcu++ is under the Apache License v2.0 with LLVM Exceptions: 13 | ============================================================================== 14 | Apache License 15 | Version 2.0, January 2004 16 | http://www.apache.org/licenses/ 17 | 18 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 19 | 20 | 1. Definitions. 21 | 22 | "License" shall mean the terms and conditions for use, reproduction, 23 | and distribution as defined by Sections 1 through 9 of this document. 24 | 25 | "Licensor" shall mean the copyright owner or entity authorized by 26 | the copyright owner that is granting the License. 
27 | 28 | "Legal Entity" shall mean the union of the acting entity and all 29 | other entities that control, are controlled by, or are under common 30 | control with that entity. For the purposes of this definition, 31 | "control" means (i) the power, direct or indirect, to cause the 32 | direction or management of such entity, whether by contract or 33 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 34 | outstanding shares, or (iii) beneficial ownership of such entity. 35 | 36 | "You" (or "Your") shall mean an individual or Legal Entity 37 | exercising permissions granted by this License. 38 | 39 | "Source" form shall mean the preferred form for making modifications, 40 | including but not limited to software source code, documentation 41 | source, and configuration files. 42 | 43 | "Object" form shall mean any form resulting from mechanical 44 | transformation or translation of a Source form, including but 45 | not limited to compiled object code, generated documentation, 46 | and conversions to other media types. 47 | 48 | "Work" shall mean the work of authorship, whether in Source or 49 | Object form, made available under the License, as indicated by a 50 | copyright notice that is included in or attached to the work 51 | (an example is provided in the Appendix below). 52 | 53 | "Derivative Works" shall mean any work, whether in Source or Object 54 | form, that is based on (or derived from) the Work and for which the 55 | editorial revisions, annotations, elaborations, or other modifications 56 | represent, as a whole, an original work of authorship. For the purposes 57 | of this License, Derivative Works shall not include works that remain 58 | separable from, or merely link (or bind by name) to the interfaces of, 59 | the Work and Derivative Works thereof. 60 | 61 | "Contribution" shall mean any work of authorship, including 62 | the original version of the Work and any modifications or additions 63 | to that Work or Derivative Works thereof, that is intentionally 64 | submitted to Licensor for inclusion in the Work by the copyright owner 65 | or by an individual or Legal Entity authorized to submit on behalf of 66 | the copyright owner. For the purposes of this definition, "submitted" 67 | means any form of electronic, verbal, or written communication sent 68 | to the Licensor or its representatives, including but not limited to 69 | communication on electronic mailing lists, source code control systems, 70 | and issue tracking systems that are managed by, or on behalf of, the 71 | Licensor for the purpose of discussing and improving the Work, but 72 | excluding communication that is conspicuously marked or otherwise 73 | designated in writing by the copyright owner as "Not a Contribution." 74 | 75 | "Contributor" shall mean Licensor and any individual or Legal Entity 76 | on behalf of whom a Contribution has been received by Licensor and 77 | subsequently incorporated within the Work. 78 | 79 | 2. Grant of Copyright License. Subject to the terms and conditions of 80 | this License, each Contributor hereby grants to You a perpetual, 81 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 82 | copyright license to reproduce, prepare Derivative Works of, 83 | publicly display, publicly perform, sublicense, and distribute the 84 | Work and such Derivative Works in Source or Object form. 85 | 86 | 3. Grant of Patent License. 
Subject to the terms and conditions of 87 | this License, each Contributor hereby grants to You a perpetual, 88 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 89 | (except as stated in this section) patent license to make, have made, 90 | use, offer to sell, sell, import, and otherwise transfer the Work, 91 | where such license applies only to those patent claims licensable 92 | by such Contributor that are necessarily infringed by their 93 | Contribution(s) alone or by combination of their Contribution(s) 94 | with the Work to which such Contribution(s) was submitted. If You 95 | institute patent litigation against any entity (including a 96 | cross-claim or counterclaim in a lawsuit) alleging that the Work 97 | or a Contribution incorporated within the Work constitutes direct 98 | or contributory patent infringement, then any patent licenses 99 | granted to You under this License for that Work shall terminate 100 | as of the date such litigation is filed. 101 | 102 | 4. Redistribution. You may reproduce and distribute copies of the 103 | Work or Derivative Works thereof in any medium, with or without 104 | modifications, and in Source or Object form, provided that You 105 | meet the following conditions: 106 | 107 | (a) You must give any other recipients of the Work or 108 | Derivative Works a copy of this License; and 109 | 110 | (b) You must cause any modified files to carry prominent notices 111 | stating that You changed the files; and 112 | 113 | (c) You must retain, in the Source form of any Derivative Works 114 | that You distribute, all copyright, patent, trademark, and 115 | attribution notices from the Source form of the Work, 116 | excluding those notices that do not pertain to any part of 117 | the Derivative Works; and 118 | 119 | (d) If the Work includes a "NOTICE" text file as part of its 120 | distribution, then any Derivative Works that You distribute must 121 | include a readable copy of the attribution notices contained 122 | within such NOTICE file, excluding those notices that do not 123 | pertain to any part of the Derivative Works, in at least one 124 | of the following places: within a NOTICE text file distributed 125 | as part of the Derivative Works; within the Source form or 126 | documentation, if provided along with the Derivative Works; or, 127 | within a display generated by the Derivative Works, if and 128 | wherever such third-party notices normally appear. The contents 129 | of the NOTICE file are for informational purposes only and 130 | do not modify the License. You may add Your own attribution 131 | notices within Derivative Works that You distribute, alongside 132 | or as an addendum to the NOTICE text from the Work, provided 133 | that such additional attribution notices cannot be construed 134 | as modifying the License. 135 | 136 | You may add Your own copyright statement to Your modifications and 137 | may provide additional or different license terms and conditions 138 | for use, reproduction, or distribution of Your modifications, or 139 | for any such Derivative Works as a whole, provided Your use, 140 | reproduction, and distribution of the Work otherwise complies with 141 | the conditions stated in this License. 142 | 143 | 5. Submission of Contributions. Unless You explicitly state otherwise, 144 | any Contribution intentionally submitted for inclusion in the Work 145 | by You to the Licensor shall be under the terms and conditions of 146 | this License, without any additional terms or conditions. 
147 | Notwithstanding the above, nothing herein shall supersede or modify 148 | the terms of any separate license agreement you may have executed 149 | with Licensor regarding such Contributions. 150 | 151 | 6. Trademarks. This License does not grant permission to use the trade 152 | names, trademarks, service marks, or product names of the Licensor, 153 | except as required for reasonable and customary use in describing the 154 | origin of the Work and reproducing the content of the NOTICE file. 155 | 156 | 7. Disclaimer of Warranty. Unless required by applicable law or 157 | agreed to in writing, Licensor provides the Work (and each 158 | Contributor provides its Contributions) on an "AS IS" BASIS, 159 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 160 | implied, including, without limitation, any warranties or conditions 161 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 162 | PARTICULAR PURPOSE. You are solely responsible for determining the 163 | appropriateness of using or redistributing the Work and assume any 164 | risks associated with Your exercise of permissions under this License. 165 | 166 | 8. Limitation of Liability. In no event and under no legal theory, 167 | whether in tort (including negligence), contract, or otherwise, 168 | unless required by applicable law (such as deliberate and grossly 169 | negligent acts) or agreed to in writing, shall any Contributor be 170 | liable to You for damages, including any direct, indirect, special, 171 | incidental, or consequential damages of any character arising as a 172 | result of this License or out of the use or inability to use the 173 | Work (including but not limited to damages for loss of goodwill, 174 | work stoppage, computer failure or malfunction, or any and all 175 | other commercial damages or losses), even if such Contributor 176 | has been advised of the possibility of such damages. 177 | 178 | 9. Accepting Warranty or Additional Liability. While redistributing 179 | the Work or Derivative Works thereof, You may choose to offer, 180 | and charge a fee for, acceptance of support, warranty, indemnity, 181 | or other liability obligations and/or rights consistent with this 182 | License. However, in accepting such obligations, You may act only 183 | on Your own behalf and on Your sole responsibility, not on behalf 184 | of any other Contributor, and only if You agree to indemnify, 185 | defend, and hold each Contributor harmless for any liability 186 | incurred by, or claims asserted against, such Contributor by reason 187 | of your accepting any such warranty or additional liability. 188 | 189 | END OF TERMS AND CONDITIONS 190 | 191 | APPENDIX: How to apply the Apache License to your work. 192 | 193 | To apply the Apache License to your work, attach the following 194 | boilerplate notice, with the fields enclosed by brackets "[]" 195 | replaced with your own identifying information. (Don't include 196 | the brackets!) The text should be enclosed in the appropriate 197 | comment syntax for the file format. We also recommend that a 198 | file or class name and description of purpose be included on the 199 | same "printed page" as the copyright notice for easier 200 | identification within third-party archives. 201 | 202 | Copyright [yyyy] [name of copyright owner] 203 | 204 | Licensed under the Apache License, Version 2.0 (the "License"); 205 | you may not use this file except in compliance with the License. 
206 | You may obtain a copy of the License at 207 | 208 | http://www.apache.org/licenses/LICENSE-2.0 209 | 210 | Unless required by applicable law or agreed to in writing, software 211 | distributed under the License is distributed on an "AS IS" BASIS, 212 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 213 | See the License for the specific language governing permissions and 214 | limitations under the License. 215 | 216 | 217 | ============================================================================== 218 | Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): 219 | ============================================================================== 220 | ---- LLVM Exceptions to the Apache 2.0 License ---- 221 | 222 | As an exception, if, as a result of your compiling your source code, portions 223 | of this Software are embedded into an Object form of such source code, you 224 | may redistribute such embedded portions in such Object form without complying 225 | with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 226 | 227 | In addition, if you combine or link compiled forms of this Software with 228 | software that is licensed under the GPLv2 ("Combined Software") and if a 229 | court of competent jurisdiction determines that the patent provision (Section 230 | 3), the indemnity provision (Section 9) or other Section of the License 231 | conflicts with the conditions of the GPLv2, you may retroactively and 232 | prospectively choose to deem waived or otherwise exclude such Section(s) of 233 | the License, but only in their entirety and only with respect to the Combined 234 | Software. 235 | 236 | ============================================================================== 237 | Software from third parties included in the LLVM Project: 238 | ============================================================================== 239 | The LLVM Project contains third party software which is under different license 240 | terms. All such code will be identified clearly using at least one of two 241 | mechanisms: 242 | 1) It will be in a separate directory tree with its own `LICENSE.txt` or 243 | `LICENSE` file at the top containing the specific license and restrictions 244 | which apply to that software, or 245 | 2) It will contain specific license and restriction terms at the top of every 246 | file. 247 | 248 | ============================================================================== 249 | Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy): 250 | ============================================================================== 251 | 252 | The libc++ library is dual licensed under both the University of Illinois 253 | "BSD-Like" license and the MIT license. As a user of this code you may choose 254 | to use it under either license. As a contributor, you agree to allow your code 255 | to be used under both. 256 | 257 | Full text of the relevant licenses is included below. 258 | 259 | ============================================================================== 260 | 261 | University of Illinois/NCSA 262 | Open Source License 263 | 264 | Copyright (c) 2009-2019 by the contributors listed in CREDITS.TXT 265 | 266 | All rights reserved. 
267 | 268 | Developed by: 269 | 270 | LLVM Team 271 | 272 | University of Illinois at Urbana-Champaign 273 | 274 | http://llvm.org 275 | 276 | Permission is hereby granted, free of charge, to any person obtaining a copy of 277 | this software and associated documentation files (the "Software"), to deal with 278 | the Software without restriction, including without limitation the rights to 279 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 280 | of the Software, and to permit persons to whom the Software is furnished to do 281 | so, subject to the following conditions: 282 | 283 | * Redistributions of source code must retain the above copyright notice, 284 | this list of conditions and the following disclaimers. 285 | 286 | * Redistributions in binary form must reproduce the above copyright notice, 287 | this list of conditions and the following disclaimers in the 288 | documentation and/or other materials provided with the distribution. 289 | 290 | * Neither the names of the LLVM Team, University of Illinois at 291 | Urbana-Champaign, nor the names of its contributors may be used to 292 | endorse or promote products derived from this Software without specific 293 | prior written permission. 294 | 295 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 296 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 297 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 298 | CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 299 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 300 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE 301 | SOFTWARE. 302 | 303 | ============================================================================== 304 | 305 | Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT 306 | 307 | Permission is hereby granted, free of charge, to any person obtaining a copy 308 | of this software and associated documentation files (the "Software"), to deal 309 | in the Software without restriction, including without limitation the rights 310 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 311 | copies of the Software, and to permit persons to whom the Software is 312 | furnished to do so, subject to the following conditions: 313 | 314 | The above copyright notice and this permission notice shall be included in 315 | all copies or substantial portions of the Software. 316 | 317 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 318 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 319 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 320 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 321 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 322 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 323 | THE SOFTWARE. 324 | 325 | ============================================================================== 326 | Some libcudacxx components not shipped with this distribution are covered by 327 | the below license. Each source file indicates which license it is under. 328 | 329 | If you find a file in our distribution with the following license, 330 | please let us know at legal@spectralcompute.co.uk immediately. 
331 | ============================================================================== 332 | 333 | NVIDIA SOFTWARE LICENSE 334 | 335 | This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). 336 | 337 | This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. 338 | 339 | You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 340 | 341 | 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. 342 | 343 | 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: 344 | a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. 345 | b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 346 | 347 | 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: 348 | a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. 349 | b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. 350 | c. You may not modify or create derivative works of any portion of the SOFTWARE. 351 | d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. 352 | e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. 353 | f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. 
NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. 354 | g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. 355 | 356 | 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 357 | 358 | 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. 359 | 360 | 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 361 | 362 | 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. 363 | 364 | 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. 365 | 366 | 9. LIMITATIONS OF LIABILITY. 
TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. 367 | 368 | 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. 369 | 370 | 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. 371 | 372 | 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. 373 | 374 | 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. 
and that you are not otherwise prohibited from receiving the SOFTWARE. 375 | 376 | 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 377 | 378 | 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. 379 | 380 | (v. August 20, 2021) 381 | 382 | ================================================================================ 383 | Some portions of Thrust may be licensed under other compatible open-source 384 | licenses. Any divergence from the Apache 2 license will be noted in the source 385 | code where applicable. 386 | Portions under other terms include, but are not limited to: 387 | ================================================================================ 388 | 389 | Various C++ utility classes in Thrust are based on the Boost Iterator, Tuple, 390 | System, and Random Number libraries, which are provided under the Boost Software 391 | License: 392 | 393 | Boost Software License - Version 1.0 - August 17th, 2003 394 | 395 | Permission is hereby granted, free of charge, to any person or organization 396 | obtaining a copy of the software and accompanying documentation covered by 397 | this license (the "Software") to use, reproduce, display, distribute, 398 | execute, and transmit the Software, and to prepare derivative works of the 399 | Software, and to permit third-parties to whom the Software is furnished to 400 | do so, all subject to the following: 401 | 402 | The copyright notices in the Software and this entire statement, including 403 | the above license grant, this restriction and the following disclaimer, 404 | must be included in all copies of the Software, in whole or in part, and 405 | all derivative works of the Software, unless such copies or derivative 406 | works are solely in the form of machine-executable object code generated by 407 | a source language processor. 408 | 409 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 410 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 411 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 412 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 413 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 414 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 415 | DEALINGS IN THE SOFTWARE. 
416 | 417 | ================================================================================ 418 | 419 | Portions of the thrust::complex implementation are derived from FreeBSD with the 420 | following terms: 421 | 422 | ================================================================================ 423 | 424 | 425 | Redistribution and use in source and binary forms, with or without 426 | modification, are permitted provided that the following conditions 427 | are met: 428 | 429 | 1. Redistributions of source code must retain the above copyright 430 | notice[1] unmodified, this list of conditions, and the following 431 | disclaimer. 432 | 2. Redistributions in binary form must reproduce the above copyright 433 | notice, this list of conditions and the following disclaimer in the 434 | documentation and/or other materials provided with the distribution. 435 | 436 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 437 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 438 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 439 | IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 440 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 441 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 442 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 443 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 444 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 445 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 446 | 447 | [1] Individual copyright notices from the original authors are included in 448 | the relevant source files. 449 | 450 | ============================================================================== 451 | CUB's source code is released under the BSD 3-Clause license: 452 | ============================================================================== 453 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 454 | Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 455 | 456 | Redistribution and use in source and binary forms, with or without 457 | modification, are permitted provided that the following conditions are met: 458 | * Redistributions of source code must retain the above copyright 459 | notice, this list of conditions and the following disclaimer. 460 | * Redistributions in binary form must reproduce the above copyright 461 | notice, this list of conditions and the following disclaimer in the 462 | documentation and/or other materials provided with the distribution. 463 | * Neither the name of the NVIDIA CORPORATION nor the 464 | names of its contributors may be used to endorse or promote products 465 | derived from this software without specific prior written permission. 466 | 467 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 468 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 469 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 470 | DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 471 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 472 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 473 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 474 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 475 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 476 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 477 | 478 | ============================================================================== 479 | The ROCm project is distributed under the following license 480 | ============================================================================== 481 | 482 | MIT License 483 | 484 | Copyright © 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved. 485 | 486 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 487 | 488 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 489 | 490 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 491 | 492 | ============================================================================== 493 | The ROCm ROCT-Thunk-Interface is distributed under the following license 494 | ============================================================================== 495 | 496 | ROCT-Thunk Interface LICENSE 497 | 498 | Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. 499 | 500 | MIT LICENSE: Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 501 | 502 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 503 | 504 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 505 | 506 | This product contains software provided by Nginx, Inc. and its contributors. 
507 | 508 | Copyright (C) 2002-2018 Igor Sysoev Copyright (C) 2011-2018 Nginx, Inc. All rights reserved. 509 | 510 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 511 | 512 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 513 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 514 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 515 | 516 | ============================================================================== 517 | The ROCm ROCR-Runtime project is distributed under the following license 518 | ============================================================================== 519 | 520 | The University of Illinois/NCSA 521 | Open Source License (NCSA) 522 | 523 | Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 524 | 525 | Developed by: 526 | 527 | AMD Research and AMD HSA Software Development 528 | 529 | Advanced Micro Devices, Inc. 530 | 531 | www.amd.com 532 | 533 | Permission is hereby granted, free of charge, to any person obtaining a copy 534 | of this software and associated documentation files (the "Software"), to 535 | deal with the Software without restriction, including without limitation 536 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 537 | and/or sell copies of the Software, and to permit persons to whom the 538 | Software is furnished to do so, subject to the following conditions: 539 | 540 | - Redistributions of source code must retain the above copyright notice, 541 | this list of conditions and the following disclaimers. 542 | - Redistributions in binary form must reproduce the above copyright 543 | notice, this list of conditions and the following disclaimers in 544 | the documentation and/or other materials provided with the distribution. 545 | - Neither the names of Advanced Micro Devices, Inc, 546 | nor the names of its contributors may be used to endorse or promote 547 | products derived from this Software without specific prior written 548 | permission. 549 | 550 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 551 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 552 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 553 | THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 554 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 555 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 556 | DEALINGS WITH THE SOFTWARE. 
557 | 558 | ============================================================================== 559 | The erfinvf() function was implemented from scratch with inspiration from 560 | the following algorithm: 561 | 562 | “Approximating the erfinv function." - M. Giles 563 | https://people.maths.ox.ac.uk/gilesm/files/gems_erfinv.pdf 564 | ============================================================================== 565 | 566 | ============================================================================== 567 | The erfcinvf() function's source code is released under the following license: 568 | ============================================================================== 569 | /* 570 | Copyright 2023, Norbert Juffa 571 | 572 | Redistribution and use in source and binary forms, with or without 573 | modification, are permitted provided that the following conditions 574 | are met: 575 | 576 | 1. Redistributions of source code must retain the above copyright 577 | notice, this list of conditions and the following disclaimer. 578 | 579 | 2. Redistributions in binary form must reproduce the above copyright 580 | notice, this list of conditions and the following disclaimer in the 581 | documentation and/or other materials provided with the distribution. 582 | 583 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 584 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 585 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 586 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 587 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 588 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 589 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 590 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 591 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 592 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 593 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 594 | */ 595 | 596 | ============================================================================== 597 | Blender Cycles is released under the Apache 2.0 license: 598 | ============================================================================== 599 | 600 | 601 | Apache License 602 | Version 2.0, January 2004 603 | http://www.apache.org/licenses/ 604 | 605 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 606 | 607 | 1. Definitions. 608 | 609 | "License" shall mean the terms and conditions for use, reproduction, 610 | and distribution as defined by Sections 1 through 9 of this document. 611 | 612 | "Licensor" shall mean the copyright owner or entity authorized by 613 | the copyright owner that is granting the License. 614 | 615 | "Legal Entity" shall mean the union of the acting entity and all 616 | other entities that control, are controlled by, or are under common 617 | control with that entity. For the purposes of this definition, 618 | "control" means (i) the power, direct or indirect, to cause the 619 | direction or management of such entity, whether by contract or 620 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 621 | outstanding shares, or (iii) beneficial ownership of such entity. 622 | 623 | "You" (or "Your") shall mean an individual or Legal Entity 624 | exercising permissions granted by this License. 
625 | 626 | "Source" form shall mean the preferred form for making modifications, 627 | including but not limited to software source code, documentation 628 | source, and configuration files. 629 | 630 | "Object" form shall mean any form resulting from mechanical 631 | transformation or translation of a Source form, including but 632 | not limited to compiled object code, generated documentation, 633 | and conversions to other media types. 634 | 635 | "Work" shall mean the work of authorship, whether in Source or 636 | Object form, made available under the License, as indicated by a 637 | copyright notice that is included in or attached to the work 638 | (an example is provided in the Appendix below). 639 | 640 | "Derivative Works" shall mean any work, whether in Source or Object 641 | form, that is based on (or derived from) the Work and for which the 642 | editorial revisions, annotations, elaborations, or other modifications 643 | represent, as a whole, an original work of authorship. For the purposes 644 | of this License, Derivative Works shall not include works that remain 645 | separable from, or merely link (or bind by name) to the interfaces of, 646 | the Work and Derivative Works thereof. 647 | 648 | "Contribution" shall mean any work of authorship, including 649 | the original version of the Work and any modifications or additions 650 | to that Work or Derivative Works thereof, that is intentionally 651 | submitted to Licensor for inclusion in the Work by the copyright owner 652 | or by an individual or Legal Entity authorized to submit on behalf of 653 | the copyright owner. For the purposes of this definition, "submitted" 654 | means any form of electronic, verbal, or written communication sent 655 | to the Licensor or its representatives, including but not limited to 656 | communication on electronic mailing lists, source code control systems, 657 | and issue tracking systems that are managed by, or on behalf of, the 658 | Licensor for the purpose of discussing and improving the Work, but 659 | excluding communication that is conspicuously marked or otherwise 660 | designated in writing by the copyright owner as "Not a Contribution." 661 | 662 | "Contributor" shall mean Licensor and any individual or Legal Entity 663 | on behalf of whom a Contribution has been received by Licensor and 664 | subsequently incorporated within the Work. 665 | 666 | 2. Grant of Copyright License. Subject to the terms and conditions of 667 | this License, each Contributor hereby grants to You a perpetual, 668 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 669 | copyright license to reproduce, prepare Derivative Works of, 670 | publicly display, publicly perform, sublicense, and distribute the 671 | Work and such Derivative Works in Source or Object form. 672 | 673 | 3. Grant of Patent License. Subject to the terms and conditions of 674 | this License, each Contributor hereby grants to You a perpetual, 675 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 676 | (except as stated in this section) patent license to make, have made, 677 | use, offer to sell, sell, import, and otherwise transfer the Work, 678 | where such license applies only to those patent claims licensable 679 | by such Contributor that are necessarily infringed by their 680 | Contribution(s) alone or by combination of their Contribution(s) 681 | with the Work to which such Contribution(s) was submitted. 
If You 682 | institute patent litigation against any entity (including a 683 | cross-claim or counterclaim in a lawsuit) alleging that the Work 684 | or a Contribution incorporated within the Work constitutes direct 685 | or contributory patent infringement, then any patent licenses 686 | granted to You under this License for that Work shall terminate 687 | as of the date such litigation is filed. 688 | 689 | 4. Redistribution. You may reproduce and distribute copies of the 690 | Work or Derivative Works thereof in any medium, with or without 691 | modifications, and in Source or Object form, provided that You 692 | meet the following conditions: 693 | 694 | (a) You must give any other recipients of the Work or 695 | Derivative Works a copy of this License; and 696 | 697 | (b) You must cause any modified files to carry prominent notices 698 | stating that You changed the files; and 699 | 700 | (c) You must retain, in the Source form of any Derivative Works 701 | that You distribute, all copyright, patent, trademark, and 702 | attribution notices from the Source form of the Work, 703 | excluding those notices that do not pertain to any part of 704 | the Derivative Works; and 705 | 706 | (d) If the Work includes a "NOTICE" text file as part of its 707 | distribution, then any Derivative Works that You distribute must 708 | include a readable copy of the attribution notices contained 709 | within such NOTICE file, excluding those notices that do not 710 | pertain to any part of the Derivative Works, in at least one 711 | of the following places: within a NOTICE text file distributed 712 | as part of the Derivative Works; within the Source form or 713 | documentation, if provided along with the Derivative Works; or, 714 | within a display generated by the Derivative Works, if and 715 | wherever such third-party notices normally appear. The contents 716 | of the NOTICE file are for informational purposes only and 717 | do not modify the License. You may add Your own attribution 718 | notices within Derivative Works that You distribute, alongside 719 | or as an addendum to the NOTICE text from the Work, provided 720 | that such additional attribution notices cannot be construed 721 | as modifying the License. 722 | 723 | You may add Your own copyright statement to Your modifications and 724 | may provide additional or different license terms and conditions 725 | for use, reproduction, or distribution of Your modifications, or 726 | for any such Derivative Works as a whole, provided Your use, 727 | reproduction, and distribution of the Work otherwise complies with 728 | the conditions stated in this License. 729 | 730 | 5. Submission of Contributions. Unless You explicitly state otherwise, 731 | any Contribution intentionally submitted for inclusion in the Work 732 | by You to the Licensor shall be under the terms and conditions of 733 | this License, without any additional terms or conditions. 734 | Notwithstanding the above, nothing herein shall supersede or modify 735 | the terms of any separate license agreement you may have executed 736 | with Licensor regarding such Contributions. 737 | 738 | 6. Trademarks. This License does not grant permission to use the trade 739 | names, trademarks, service marks, or product names of the Licensor, 740 | except as required for reasonable and customary use in describing the 741 | origin of the Work and reproducing the content of the NOTICE file. 742 | 743 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 744 | agreed to in writing, Licensor provides the Work (and each 745 | Contributor provides its Contributions) on an "AS IS" BASIS, 746 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 747 | implied, including, without limitation, any warranties or conditions 748 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 749 | PARTICULAR PURPOSE. You are solely responsible for determining the 750 | appropriateness of using or redistributing the Work and assume any 751 | risks associated with Your exercise of permissions under this License. 752 | 753 | 8. Limitation of Liability. In no event and under no legal theory, 754 | whether in tort (including negligence), contract, or otherwise, 755 | unless required by applicable law (such as deliberate and grossly 756 | negligent acts) or agreed to in writing, shall any Contributor be 757 | liable to You for damages, including any direct, indirect, special, 758 | incidental, or consequential damages of any character arising as a 759 | result of this License or out of the use or inability to use the 760 | Work (including but not limited to damages for loss of goodwill, 761 | work stoppage, computer failure or malfunction, or any and all 762 | other commercial damages or losses), even if such Contributor 763 | has been advised of the possibility of such damages. 764 | 765 | 9. Accepting Warranty or Additional Liability. While redistributing 766 | the Work or Derivative Works thereof, You may choose to offer, 767 | and charge a fee for, acceptance of support, warranty, indemnity, 768 | or other liability obligations and/or rights consistent with this 769 | License. However, in accepting such obligations, You may act only 770 | on Your own behalf and on Your sole responsibility, not on behalf 771 | of any other Contributor, and only if You agree to indemnify, 772 | defend, and hold each Contributor harmless for any liability 773 | incurred by, or claims asserted against, such Contributor by reason 774 | of your accepting any such warranty or additional liability. 775 | 776 | END OF TERMS AND CONDITIONS 777 | 778 | APPENDIX: How to apply the Apache License to your work. 779 | 780 | To apply the Apache License to your work, attach the following 781 | boilerplate notice, with the fields enclosed by brackets "[]" 782 | replaced with your own identifying information. (Don't include 783 | the brackets!) The text should be enclosed in the appropriate 784 | comment syntax for the file format. We also recommend that a 785 | file or class name and description of purpose be included on the 786 | same "printed page" as the copyright notice for easier 787 | identification within third-party archives. 788 | 789 | Copyright [yyyy] [name of copyright owner] 790 | 791 | Licensed under the Apache License, Version 2.0 (the "License"); 792 | you may not use this file except in compliance with the License. 793 | You may obtain a copy of the License at 794 | 795 | http://www.apache.org/licenses/LICENSE-2.0 796 | 797 | Unless required by applicable law or agreed to in writing, software 798 | distributed under the License is distributed on an "AS IS" BASIS, 799 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 800 | See the License for the specific language governing permissions and 801 | limitations under the License. 
802 | 803 | ============================================================================== 804 | The docopt.cpp project is released under the following license: 805 | ============================================================================== 806 | 807 | Copyright (c) 2012 Vladimir Keleshev, 808 | 809 | Permission is hereby granted, free of charge, to any person 810 | obtaining a copy of this software and associated 811 | documentation files (the "Software"), to deal in the Software 812 | without restriction, including without limitation the rights 813 | to use, copy, modify, merge, publish, distribute, sublicense, 814 | and/or sell copies of the Software, and to permit persons to 815 | whom the Software is furnished to do so, subject to the 816 | following conditions: 817 | 818 | The above copyright notice and this permission notice shall 819 | be included in all copies or substantial portions of the 820 | Software. 821 | 822 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 823 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 824 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 825 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 826 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 827 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 828 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 829 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /docs/style.css: -------------------------------------------------------------------------------- 1 | [data-md-color-scheme="slate"], [data-md-color-scheme="default"] { 2 | --md-primary-fg-color: rgb(41, 37, 36); 3 | --md-typeset-a-color: #4051b5; 4 | } -------------------------------------------------------------------------------- /docs/use_of_trademarks.md: -------------------------------------------------------------------------------- 1 | # Use of Trademarks 2 | 3 | All uses of trademarks on this website are purely for nominative and/or descriptive purposes. We do not claim any affiliations, partnerships, licensing agreements or any other kind of association with Nvidia Corporation, Advanced Micro Devices, Inc. or Intel Corporation. 4 | 5 | - CUDA is a registered trademark of the Nvidia Corporation. 6 | - AMD ROCm is a registered trademark of Advanced Micro Devices, Inc. 7 | - SCALE is a registered trademark of Spectral Compute Ltd. 8 | 9 | Please contact us at [legal@spectralcompute.co.uk](mailto:legal@spectralcompute.co.uk) for any inquiries or corrections regarding our use of trademarks. 
-------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | **/build/ 2 | 3 | -------------------------------------------------------------------------------- /examples/example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCALE_DIR="$1" 6 | SCALE_GPU_ARCH="$2" 7 | EXAMPLE="$3" 8 | 9 | source "${SCALE_DIR}/bin/scaleenv" "${SCALE_GPU_ARCH}" 10 | 11 | case "${EXAMPLE}" in 12 | 13 | "basic" | "blas" | "ptx") 14 | rm -rf "src/${EXAMPLE}/build" 15 | 16 | cmake \ 17 | -DCMAKE_CUDA_ARCHITECTURES="${CUDAARCHS}" \ 18 | -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ 19 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \ 20 | -B "src/${EXAMPLE}/build" \ 21 | "src/${EXAMPLE}" 22 | 23 | make \ 24 | -C "src/${EXAMPLE}/build" 25 | 26 | export SCALE_EXCEPTIONS=1 27 | 28 | "src/${EXAMPLE}/build/example_${EXAMPLE}" 29 | ;; 30 | 31 | *) 32 | echo "Usage: $0 {PATH_TO_SCALE} {GPU_ARCH} {basic|blas|ptx}" 33 | ;; 34 | 35 | esac 36 | -------------------------------------------------------------------------------- /examples/src/basic/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17 FATAL_ERROR) 2 | project(example_basic LANGUAGES CUDA) 3 | 4 | add_executable(example_basic basic.cu) 5 | -------------------------------------------------------------------------------- /examples/src/basic/basic.cu: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <vector> 3 | 4 | 5 | // The kernel we are going to launch 6 | __global__ void basicSum(const int * a, const int * b, size_t n, int * out) { 7 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 8 | if(idx < n) 9 | { 10 | out[idx] = a[idx] + b[idx]; 11 | } 12 | } 13 | 14 | 15 | // A generic helper function to simplify error handling. 16 | void check(cudaError_t error, const char * file, size_t line) { 17 | if (error != cudaSuccess) 18 | { 19 | std::cout << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | 25 | // A wrapper for the helper function above to include the filename and line number 26 | // where the error occurs in the output.
27 | #define CHECK(error) check(error, __FILE__, __LINE__) 28 | 29 | 30 | int main(int argc, char ** argv) { 31 | 32 | const size_t N = 4096; 33 | const size_t BYTES = N * sizeof(int); 34 | 35 | std::vector<int> a(N); 36 | std::vector<int> b(N); 37 | std::vector<int> out(N); 38 | 39 | // Generate input data 40 | for (size_t i = 0; i < N; i++) { 41 | a[i] = i * 2; 42 | b[i] = N - i; 43 | } 44 | 45 | int * devA; 46 | int * devB; 47 | int * devOut; 48 | 49 | // Allocate memory for the inputs and the output 50 | CHECK(cudaMalloc(&devA, BYTES)); 51 | CHECK(cudaMalloc(&devB, BYTES)); 52 | CHECK(cudaMalloc(&devOut, BYTES)); 53 | 54 | // Copy the input data to the device 55 | CHECK(cudaMemcpy(devA, a.data(), BYTES, cudaMemcpyHostToDevice)); 56 | CHECK(cudaMemcpy(devB, b.data(), BYTES, cudaMemcpyHostToDevice)); 57 | 58 | // Launch the kernel 59 | basicSum<<<(N + 255) / 256, 256>>>(devA, devB, N, devOut); 60 | CHECK(cudaDeviceSynchronize()); 61 | CHECK(cudaGetLastError()); 62 | 63 | // Copy the output data back to host 64 | CHECK(cudaMemcpy(out.data(), devOut, BYTES, cudaMemcpyDeviceToHost)); 65 | 66 | // Free up the memory we allocated for the inputs and the output 67 | CHECK(cudaFree(devA)); 68 | CHECK(cudaFree(devB)); 69 | CHECK(cudaFree(devOut)); 70 | 71 | // Test that the output matches our expectations 72 | for (size_t i = 0; i < N; i++) { 73 | if (a[i] + b[i] != out[i]) { 74 | std::cout << "Incorrect sum: " << a[i] << " + " << b[i] << " = " << out[i] << " ?\n"; 75 | } 76 | } 77 | 78 | std::cout << "Example finished" << std::endl; 79 | 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /examples/src/blas/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17 FATAL_ERROR) 2 | project(example_blas LANGUAGES CUDA) 3 | 4 | add_executable(example_blas blas.cu) 5 | target_link_libraries(example_blas PRIVATE cublas redscale) 6 | -------------------------------------------------------------------------------- /examples/src/blas/blas.cu: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <vector> 3 | 4 | #include <cublas_v2.h> 5 | 6 | 7 | void check(cudaError_t error, const char * file, size_t line) { 8 | if (error != cudaSuccess) 9 | { 10 | std::cout << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl; 11 | exit(1); 12 | } 13 | } 14 | 15 | 16 | void checkCublas(cublasStatus_t error, const char * file, size_t line) { 17 | if (error != CUBLAS_STATUS_SUCCESS) { 18 | std::cout << "cublas error: " << cublasGetStatusString(error) << " at " << file << ":" << line << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | 24 | #define CHECK(error) check(error, __FILE__, __LINE__) 25 | #define CHECK_CUBLAS(error) checkCublas(error, __FILE__, __LINE__) 26 | 27 | 28 | int main(int argc, char ** argv) { 29 | cublasHandle_t handle; 30 | CHECK_CUBLAS(cublasCreate(&handle)); 31 | 32 | const size_t N = 10; 33 | const size_t BYTES = N * sizeof(double); 34 | const double E = 1e-5; 35 | 36 | /* Prepare the data */ 37 | 38 | std::vector<double> A(N); 39 | std::vector<double> B(N); 40 | 41 | for (size_t i = 0; i < N; i++) { 42 | A[i] = i; 43 | B[i] = i + N; 44 | } 45 | 46 | /* Send the data */ 47 | 48 | double * devA; 49 | double * devB; 50 | 51 | CHECK(cudaMalloc(&devA, BYTES)); 52 | CHECK(cudaMalloc(&devB, BYTES)); 53 | 54 | CHECK(cudaMemcpy(devA, A.data(), BYTES, cudaMemcpyHostToDevice)); 55 | CHECK(cudaMemcpy(devB, B.data(), BYTES, cudaMemcpyHostToDevice)); 56 | 57 | 
/* Calculate */ 58 | 59 | const int strideA = 1; 60 | const int strideB = 1; 61 | double result = 0; 62 | 63 | CHECK_CUBLAS(cublasDdot(handle, A.size(), devA, strideA, devB, strideB, &result)); 64 | 65 | CHECK(cudaDeviceSynchronize()); 66 | 67 | double expected = 0; 68 | for (size_t i = 0; i < N; i++) { 69 | expected += A[i] * B[i]; 70 | } 71 | 72 | if (std::abs(result - expected) > E) { 73 | std::cout << "Result " << result << " is different from expected " << expected << std::endl; 74 | } 75 | 76 | CHECK_CUBLAS(cublasDestroy(handle)); 77 | 78 | std::cout << "Example finished." << std::endl; 79 | 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /examples/src/ptx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17 FATAL_ERROR) 2 | project(example_ptx LANGUAGES CUDA) 3 | 4 | add_executable(example_ptx ptx.cu) 5 | -------------------------------------------------------------------------------- /examples/src/ptx/ptx.cu: -------------------------------------------------------------------------------- 1 | #include <cstdint> 2 | #include <iostream> 3 | #include <vector> 4 | #include <bitset> 5 | 6 | 7 | __device__ inline uint32_t ptx_add(uint32_t x, uint32_t y) { 8 | // Calculate a sum of `x` and `y`, put the result into `x` 9 | asm( 10 | "add.u32 %0, %0, %1;" 11 | : "+r"(x) 12 | : "r"(y) 13 | ); 14 | return x; 15 | } 16 | 17 | 18 | __global__ void kernelAdd(const uint32_t * a, const uint32_t * b, size_t n, uint32_t * out) { 19 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 20 | if(idx < n) 21 | { 22 | out[idx] = ptx_add(a[idx], b[idx]); 23 | } 24 | } 25 | 26 | 27 | template <uint32_t Op> 28 | __device__ inline uint32_t ptx_lop3(uint32_t x, uint32_t y, uint32_t z) { 29 | // Compute operator `Op` on `x`, `y`, `z`, put the result into `x` 30 | 31 | asm( 32 | "lop3.b32 %0, %0, %1, %2, %3;" 33 | : "+r"(x) 34 | : "r"(y), "r"(z), "n"(Op) 35 | ); 36 | return x; 37 | } 38 | 39 | 40 | template <uint32_t Op> 41 | __global__ void kernelLop3(const uint32_t * a, const uint32_t * b, const uint32_t * c, size_t n, uint32_t * out) { 42 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 43 | if(idx < n) 44 | { 45 | out[idx] = ptx_lop3<Op>(a[idx], b[idx], c[idx]); 46 | } 47 | } 48 | 49 | 50 | void check(cudaError_t error, const char * file, size_t line) { 51 | if (error != cudaSuccess) 52 | { 53 | std::cout << "cuda error: " << cudaGetErrorString(error) << " at " << file << ":" << line << std::endl; 54 | exit(1); 55 | } 56 | } 57 | 58 | 59 | #define CHECK(error) check(error, __FILE__, __LINE__) 60 | 61 | 62 | template <typename T> 63 | constexpr T lop3op(T a, T b, T c) { 64 | return a & b ^ (~c); 65 | } 66 | 67 | 68 | int main(int argc, char ** argv) { 69 | 70 | const size_t N = 4096; 71 | const size_t BYTES = N * sizeof(uint32_t); 72 | 73 | std::vector<uint32_t> a(N); 74 | std::vector<uint32_t> b(N); 75 | std::vector<uint32_t> c(N); 76 | std::vector<uint32_t> out(N); 77 | 78 | for (size_t i = 0; i < N; i++) { 79 | a[i] = i * 2; 80 | b[i] = N - i; 81 | c[i] = i * i; 82 | } 83 | 84 | uint32_t * devA; 85 | uint32_t * devB; 86 | uint32_t * devC; 87 | uint32_t * devOut; 88 | 89 | CHECK(cudaMalloc(&devA, BYTES)); 90 | CHECK(cudaMalloc(&devB, BYTES)); 91 | CHECK(cudaMalloc(&devC, BYTES)); 92 | CHECK(cudaMalloc(&devOut, BYTES)); 93 | 94 | CHECK(cudaMemcpy(devA, a.data(), BYTES, cudaMemcpyHostToDevice)); 95 | CHECK(cudaMemcpy(devB, b.data(), BYTES, cudaMemcpyHostToDevice)); 96 | CHECK(cudaMemcpy(devC, c.data(), BYTES, cudaMemcpyHostToDevice)); 97 | 98 | // Test "add" 99 | 100 | kernelAdd<<<(N + 255) / 256, 256>>>(devA, 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | SCALE_STABLE_VERSION = "1.3.1"
4 | SCALE_UNSTABLE_VERSION = "2025.03.24"
5 | 
6 | # Find the current git branch to choose the stable or unstable variation.
7 | g = subprocess.run(["git", "branch", "--show-current"], capture_output=True)
8 | current_branch = g.stdout.decode("utf-8")[:-1]
9 | 
10 | print("Current branch: " + current_branch)
11 | 
12 | def define_env(env):
13 |     env.variables["branch"] = current_branch
14 |     env.variables["customer_specific_repo"] = "nonfree" in current_branch
15 | 
16 |     scale_pkgname = "scale"
17 |     scale_version = SCALE_UNSTABLE_VERSION if "unstable" in current_branch else SCALE_STABLE_VERSION
18 | 
19 |     repo_subdomain = "pkgs"
20 |     if "nonfree" in current_branch:
21 |         repo_subdomain = "nonfree-" + repo_subdomain
22 |     else:
23 |         scale_pkgname += "-free"
24 | 
25 |     if "unstable" in current_branch:
26 |         repo_subdomain = "unstable-" + repo_subdomain
27 |         scale_pkgname += "-unstable"
28 | 
29 |     env.variables["scale_pkgname"] = scale_pkgname
30 |     env.variables["scale_version"] = scale_version
31 |     env.variables["repo_subdomain"] = repo_subdomain
32 | 
33 |     @env.macro
34 |     def checksum(url):
35 |         g = subprocess.run(["curl", url + ".sha512"], capture_output=True)
36 |         return g.stdout.decode("utf-8").split(" ")[0]
37 | 
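The variables and the `checksum` macro registered by `define_env` above become available to the Markdown sources through the mkdocs-macros plugin (enabled under `plugins:` in `mkdocs.yml` below). A hedged sketch of how a docs page might reference them — the wording and the download URL are placeholders, not taken from this repository:

    Install version {{ scale_version }} of the `{{ scale_pkgname }}` package
    from the `{{ repo_subdomain }}` repository.

    SHA-512: {{ checksum("https://example.com/path/to/package.tar.gz") }}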
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: SCALE documentation
2 | site_url: https://docs.scale-lang.com/
3 | docs_dir: docs/
4 | extra_css:
5 |   - style.css
6 | theme:
7 |   name: material
8 |   custom_dir: overrides
9 |   logo: logo_white.svg
10 |   favicon: favicon.png
11 |   palette:
12 |     # Palette toggle for dark mode
13 |     - scheme: slate
14 |       toggle:
15 |         icon: material/brightness-4
16 |         name: Switch to light mode
17 |     # Palette toggle for light mode
18 |     - scheme: default
19 |       toggle:
20 |         icon: material/brightness-7
21 |         name: Switch to dark mode
22 |   features:
23 |     - navigation.sections
24 |     - navigation.footer
25 |     - navigation.instant
26 |     - navigation.instant.progress
27 |     - navigation.instant.prefetch
28 |     - content.code.copy
29 |     - toc.follow
30 | 
31 | plugins:
32 |   - search
33 |   - macros
34 |   - mike:
35 |       canonical_version: latest
36 |       alias_type: copy
37 |       version_selector: true
38 |       css_dir: css
39 |       javascript_dir: js
40 | 
41 | extra:
42 |   branch: master
43 |   generator: false
44 |   version:
45 |     provider: mike
46 |     alias: true
47 |     default: stable
48 |   social:
49 |     - icon: fontawesome/brands/github
50 |       link: https://github.com/spectral-compute
51 |     - icon: fontawesome/brands/linkedin
52 |       link: https://www.linkedin.com/company/spectral-compute
53 |     - icon: fontawesome/brands/discord
54 |       link: https://discord.gg/KNpgGbTc38
55 | 
56 | copyright: Copyright © 2024 Spectral Compute Ltd
57 | 
58 | markdown_extensions:
59 |   - toc:
60 |       permalink: "#"
61 |   - pymdownx.highlight
62 |   - pymdownx.inlinehilite
63 |   - pymdownx.snippets
64 |   - pymdownx.superfences
65 |   - attr_list
66 |   - pymdownx.tabbed:
67 |       alternate_style: true
68 |   - pymdownx.emoji:
69 |       emoji_index: !!python/name:material.extensions.emoji.twemoji
70 |       emoji_generator: !!python/name:material.extensions.emoji.to_svg
71 | 
72 | # Navigation and file naming conventions:
73 | #
74 | # - Please refrain from setting custom titles for pages.
75 | #   Instead, modify the top-level title in the corresponding file.
76 | # - Please try to name files in a predictable way - their names will become links.
77 | # - Don't rely on numbered filename prefixes -- we don't need to override their ordering,
78 | #   as we don't use the autogenerated `nav`.
79 | 
80 | nav:
81 |   - README.md
82 |   - manual/CHANGELOG.md
83 |   - Getting Started:
84 |       - manual/how-to-install.md
85 |       - manual/how-to-use.md
86 |       - manual/faq.md
87 |   - Examples:
88 |       - examples/README.md
89 |       - examples/basic.md
90 |       - examples/ptx.md
91 |       - examples/blas.md
92 |   - Diving deeper:
93 |       - manual/comparison.md
94 |       - manual/troubleshooting.md
95 |       - manual/dialects.md
96 |       - manual/differences.md
97 |       - manual/runtime-extensions.md
98 |   - Compiler:
99 |       - manual/inline-ptx.md
100 |       - manual/diagnostic-flags.md
101 |       - manual/optimisation-flags.md
102 |       - manual/language-extensions.md
103 |       - manual/compute-capabilities.md
104 |   - Implemented APIs:
105 |       - manual/apis.md
106 |       - manual/api-driver.md
107 |       - manual/api-math.md
108 |       - manual/api-runtime.md
109 |   - Contact Us:
110 |       - contact/report-a-bug.md
111 |   - Legal:
112 |       - licensing.md
113 |       - notices.md
114 | 
115 | not_in_nav: |
116 |   /use_of_trademarks.md
117 | 
--------------------------------------------------------------------------------
/overrides/partials/footer.html:
--------------------------------------------------------------------------------
1 | {#-
2 | This file was automatically generated - do not edit
3 | -#}
4 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Babel==2.15.0
2 | certifi==2024.7.4
3 | charset-normalizer==3.3.2
4 | click==8.1.7
5 | colorama==0.4.6
6 | ghp-import==2.1.0
7 | idna==3.7
8 | Jinja2==3.1.4
9 | Markdown==3.6
10 | MarkupSafe==2.1.5
11 | mergedeep==1.3.4
12 | mkdocs==1.6.0
13 | mkdocs-get-deps==0.2.0
14 | mkdocs-material==9.6.9
15 | mkdocs-material-extensions==1.3.1
16 | packaging==24.1
17 | paginate==0.5.6
18 | pathspec==0.12.1
19 | platformdirs==4.2.2
20 | Pygments==2.18.0
21 | pymdown-extensions==10.8.1
22 | python-dateutil==2.9.0.post0
23 | PyYAML==6.0.1
24 | pyyaml_env_tag==0.1
25 | regex==2024.5.15
26 | requests==2.32.3
27 | six==1.16.0
28 | urllib3==2.2.2
29 | watchdog==4.0.1
30 | mike==2.1.3
31 | mkdocs-macros-plugin==1.3.7
32 | 
--------------------------------------------------------------------------------
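The pinned requirements above are everything needed to build the documentation locally. A hedged sketch of one common workflow (only `mkdocs serve` and `mkdocs build` are standard mkdocs commands; the virtual-environment steps are a convention, not prescribed by the repository):

    python3 -m venv venv
    . venv/bin/activate
    pip install -r requirements.txt
    mkdocs serve   # live preview at http://127.0.0.1:8000/
    mkdocs build   # writes the static site into site/

Versioned publishing is handled by mike (see the `mike` plugin and `extra.version` in `mkdocs.yml`), typically via `mike deploy <version> <alias>`; the exact invocation depends on the release process.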