├── .circleci └── config.yml ├── .gitignore ├── .readthedocs.yml ├── 00-Title.markdown ├── 01-Introduction.markdown ├── 02-Porting.markdown ├── 03-Analyze.markdown ├── 04-Parallelize.markdown ├── 05-Data-Locality.markdown ├── 06-Loops.markdown ├── 07-Interoperability.markdown ├── 08-Advanced.markdown ├── 99-End.markdown ├── BUILD.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── build.bat ├── conf.py ├── cover-page ├── app_images.png ├── cover-page.pdf ├── logo.png └── main.tex ├── examples ├── ch6_matvec │ ├── Makefile │ ├── main.cpp │ ├── matrix.h │ ├── matrix_functions.h │ ├── vector.h │ └── vector_functions.h ├── class-data.cpp ├── laplace │ ├── ch2 │ │ ├── laplace2d.c │ │ ├── laplace2d.f90 │ │ └── timer.h │ ├── ch3 │ │ ├── laplace2d-kernels.c │ │ ├── laplace2d-kernels.f90 │ │ ├── laplace2d-parallel.c │ │ ├── laplace2d-parallel.f90 │ │ └── timer.h │ └── ch4 │ │ ├── laplace2d-kernels.c │ │ ├── laplace2d-kernels.f90 │ │ ├── laplace2d-parallel.c │ │ ├── laplace2d-parallel.f90 │ │ └── timer.h ├── mandelbrot │ ├── cpp │ │ ├── License.md │ │ ├── Makefile │ │ ├── constants.h │ │ ├── main.cpp │ │ ├── mandelbrot.cpp │ │ ├── mandelbrot.h │ │ ├── original │ │ │ ├── Makefile │ │ │ ├── constants.h │ │ │ ├── main.cpp │ │ │ ├── mandelbrot.cpp │ │ │ └── mandelbrot.h │ │ ├── task1 │ │ │ ├── Makefile │ │ │ ├── constants.h │ │ │ ├── main.cpp │ │ │ ├── main_solution.cpp │ │ │ ├── mandelbrot.cpp │ │ │ └── mandelbrot.h │ │ ├── task2 │ │ │ ├── Makefile │ │ │ ├── constants.h │ │ │ ├── main.cpp │ │ │ ├── main_solution.cpp │ │ │ ├── mandelbrot.cpp │ │ │ └── mandelbrot.h │ │ ├── task3 │ │ │ ├── Makefile │ │ │ ├── constants.h │ │ │ ├── main.cpp │ │ │ ├── main_solution.cpp │ │ │ ├── mandelbrot.cpp │ │ │ └── mandelbrot.h │ │ ├── task4 │ │ │ ├── Makefile │ │ │ ├── constants.h │ │ │ ├── main.cpp │ │ │ ├── main_solution.cpp │ │ │ ├── mandelbrot.cpp │ │ │ └── mandelbrot.h │ │ └── task5.multithread │ │ │ ├── Makefile │ │ │ ├── constants.h │ │ │ ├── main.cpp │ │ │ 
├── main_solution.cpp │ │ │ ├── mandelbrot.cpp │ │ │ └── mandelbrot.h │ └── f90 │ │ ├── License.md │ │ ├── Makefile │ │ ├── main.F90 │ │ ├── mandelbrot.F90 │ │ ├── original │ │ ├── Makefile │ │ ├── main.F90 │ │ └── mandelbrot.F90 │ │ ├── task1 │ │ ├── Makefile │ │ ├── main.F90 │ │ ├── main_solution.F90 │ │ └── mandelbrot.F90 │ │ ├── task2 │ │ ├── Makefile │ │ ├── main.F90 │ │ ├── main_solution.F90 │ │ └── mandelbrot.F90 │ │ ├── task3 │ │ ├── Makefile │ │ ├── main.F90 │ │ ├── main_solution.F90 │ │ └── mandelbrot.F90 │ │ ├── task4 │ │ ├── Makefile │ │ ├── main.F90 │ │ ├── main_solution.F90 │ │ └── mandelbrot.F90 │ │ └── task5.multithread │ │ ├── Makefile │ │ ├── main.F90 │ │ ├── main_solution.F90 │ │ └── mandelbrot.F90 ├── saxpy-kernels.c ├── saxpy-kernels.f90 ├── saxpy-parallel.c └── saxpy-parallel.f90 ├── guide.latex ├── images ├── ch2-nsight-initial.png ├── ch2-nsight-open.png ├── ch2-pgprof-initial.png ├── ch2-pgprof.png ├── ch3_profile.png ├── ch4_profile.png ├── execution_model.png ├── execution_model2.png ├── histogram.png ├── idealized_pipeline.png ├── jacobi_step1_graph.png ├── jacobi_step1_nvvp.png ├── jacobi_step2_graph.png ├── jacobi_step2_nvvp.png ├── laplace.xlsx ├── levels_of_parallelism.png ├── mandelbrot.png ├── mandelbrot_async_nsight.png ├── mandelbrot_timeline.png ├── multigpu_mandelbrot_timeline.png ├── multigpu_mandelbrot_timeline_nsight.png ├── openacc-guide-images.pub ├── spmv_speedup_num_workers.png └── spmv_speedup_vector_length.png ├── index.rst ├── outline.markdown ├── requirements.txt └── x98-Quick_Reference.markdown /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: openacc/best-practices-guide:latest 6 | steps: 7 | - checkout 8 | - run: 9 | name: Build PDF 10 | command: /usr/local/bin/pandoc -f markdown+implicit_figures -s -o openacc-guide.pdf ??-*.markdown --citeproc --highlight-style pygments --top-level-division=chapter 
-V geometry:letterpaper -H cover-page/main.tex --pdf-engine=xelatex 11 | - store_artifacts: 12 | path: openacc-guide.pdf 13 | workflows: 14 | version: 2 15 | build: 16 | jobs: 17 | - build 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.swp 3 | *.swo 4 | *.o 5 | *.a 6 | *.mod 7 | a.out 8 | core 9 | *.doc 10 | *.aux 11 | *.log 12 | *.out 13 | *.toc 14 | *.ccff 15 | ~$* 16 | *.html 17 | openacc-guide.tex 18 | .vscode 19 | _build 20 | venv 21 | .venv 22 | html/ 23 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.12" 12 | jobs: 13 | pre_build: 14 | - sed -i "s/{.c .numberLines}/c/" *.markdown 15 | - sed -i "s/{.fortran .numberLines}/fortran/" *.markdown 16 | - sed -i "s/{.cpp .numberLines}/cpp/" *.markdown 17 | 18 | python: 19 | install: 20 | - requirements: requirements.txt 21 | 22 | # Build documentation in the docs/ directory with Sphinx 23 | sphinx: 24 | configuration: conf.py 25 | -------------------------------------------------------------------------------- /00-Title.markdown: -------------------------------------------------------------------------------- 1 | --- 2 | title: OpenACC Programming and Best Practices Guide 3 | date: Unreleased 4 | toc: yes 5 | tocdepth: 2 6 | chapters: yes 7 | numbersections: yes 8 | geometry: margin=1in 9 | documentclass: book 10 | classoption: oneside 11 | --- 12 | -------------------------------------------------------------------------------- /01-Introduction.markdown:
-------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | This guide presents methods and best practices for accelerating applications 4 | in an incremental, performance portable way. Although some of the examples may 5 | show results using a given compiler or accelerator, the information presented 6 | in this document is intended to address all architectures both available at 7 | publication time and well into the future. Readers should be comfortable with 8 | C, C++, or Fortran, but do not need experience with parallel programming or 9 | accelerated computing, although such experience will be helpful. 10 | 11 | Note: This guide is a community effort. To contribute, please visit the project 12 | [on Github](https://github.com/OpenACC/openacc-best-practices-guide). 13 | 14 | Writing Portable Code 15 | --------------------- 16 | The current computing landscape is spotted with a variety of computing 17 | architectures: multi-core CPUs, GPUs, many-core devices, DSPs, ARM processors, and FPGAs, to name a few. It is now commonplace to find not just one, but several of these 18 | differing architectures within the same machine. Programmers must make 19 | portability of their code a forethought, otherwise they risk locking their 20 | application to a single architecture, which may limit the ability to run on 21 | future architectures. Although the variety of architectures may seem daunting 22 | to the programmer, closer analysis reveals trends that show a lot in common 23 | between them. The first thing to note is that all of these architectures are 24 | moving in the direction of more parallelism. CPUs are not only adding CPU cores 25 | but also expanding the length of their SIMD operations. GPUs have grown to 26 | require a high degree of block and SIMT parallelism. It is clear that going 27 | forward all architectures will require a significant degree of parallelism in 28 | order to achieve high performance. 
Modern processors need not only a large 29 | amount of parallelism, but frequently expose multiple levels of parallelism 30 | with varying degrees of coarseness. The next thing to notice is that all of 31 | these architectures have exposed hierarchies of memory. CPUs have the main 32 | system memory, typically DDR, and multiple layers of cache memory. GPUs have the main CPU memory, the main GPU memory, and various degrees of cache or scratchpad memory. Additionally on hybrid architectures, which include two or more different architectures, there exist machines where the two architectures have completely separate memories, some with physically separate but logically the same memory, and some with fully shared memory. 33 | 34 | Because of these complexities, it's important that developers choose a 35 | programming model that balances the need for portability with the need for 36 | performance. Below are four programming models of varying degrees of both 37 | portability and performance. In a real application it's frequently best to use 38 | a mixture of approaches to ensure a good balance between high portability and 39 | performance. 40 | 41 | ### Libraries ### 42 | 43 | Standard (and *de facto* standard) libraries provide the highest degree of 44 | portability because the programmer can frequently replace only the library 45 | used without even changing the source code itself when changing compute 46 | architectures. Since many hardware vendors provide highly-tuned versions of 47 | common libraries, using libraries can also result in very high performance. 48 | Although libraries can provide both high portability and high performance, few 49 | applications are able to use only libraries because of their limited scope. 50 | 51 | Some vendors provide additional libraries as a value-add for their 52 | platform, but which implement non-standard APIs. These libraries provide 53 | high performance, but little portability. 
Fortunately because libraries provide 54 | modular APIs, the impact of using non-portable libraries can be isolated to 55 | limit the impact on overall application portability. 56 | 57 | ### Standard Programming Languages ### 58 | 59 | Many standard programming languages either have or are beginning to adopt 60 | features for parallel programming. For example, Fortran 2008 added support 61 | for `do concurrent`, which exposes the potential parallelism with that loop, 62 | and C++17 added support for `std::execution`, which enables users to express 63 | parallelism with certain loop structures. 64 | Adoption of these language features is often slow, however, and many standard languages are 65 | only now beginning to discuss parallel programming features for future language 66 | releases. When these features become commonplace, they will provide high 67 | portability, since they are part of a standard language, and if well-designed 68 | can provide high performance as well. 69 | 70 | ### Compiler Directives ### 71 | 72 | When standard programming languages lack support for necessary features 73 | compiler directives can provide additional functionality. Directives, in the 74 | form of pragmas in C/C++ and comments in Fortran, provide additional 75 | information to compilers on how to build and/or optimize the code. Most 76 | compilers support their own directives, and also directives such as OpenACC and 77 | OpenMP, which are backed by industry groups and implemented by a range of 78 | compilers. When using industry-backed compiler directives the programmer can 79 | write code with a high degree of portability across compilers and 80 | architectures. Frequently, however, these compiler directives are written to 81 | remain very high level, both for simplicity and portability, meaning that 82 | performance may lag lower-level programming paradigms. 
Many developers are 83 | willing to give up 10-20% of hand-tuned performance in order to get a high 84 | degree of portability to other architectures and to enhance programmer 85 | productivity. The tolerance for this portability/performance trade-off will 86 | vary according to the needs of the programmer and application. 87 | 88 | ### Parallel Programming Extensions ### 89 | 90 | CUDA and OpenCL are examples of extensions to existing programming languages 91 | to give additional parallel programming capabilities. Code written in these 92 | languages is frequently at a lower level than that of other options, but as a 93 | result can frequently achieve higher performance. Lower level architectural 94 | details are exposed and the way that a problem is decomposed to the hardware 95 | must be explicitly managed with these languages. This is the best option when 96 | performance goals outweigh portability, as the low-level nature of these 97 | programming languages frequently makes the resulting code less portable. Good 98 | software engineering practices can reduce the impact these languages have on 99 | portability. 100 | 101 | ---- 102 | 103 | There is no one programming model that fits all needs. An application developer 104 | needs to evaluate the priorities of the project and make decisions accordingly. 105 | A best practice is to begin with the most portable and productive programming 106 | models and move to lower level programming models only as needed and in a 107 | modular fashion. In doing so the programmer can accelerate much of the 108 | application very quickly, which is often more beneficial than attempting to get 109 | the absolute highest performance out of a particular routine before moving to 110 | the next. When development time is limited, focusing on accelerating as much of 111 | the application as possible is generally more productive than focusing solely 112 | on the top time consuming routine. 113 | 114 | What is OpenACC? 
115 | ---------------- 116 | With the emergence of GPU and many-core architectures in high performance 117 | computing, programmers desire the ability to program using a familiar, high 118 | level programming model that provides both high performance and portability to 119 | a wide range of computing architectures. OpenACC emerged in 2011 as a 120 | programming model that uses high-level compiler directives to expose 121 | parallelism in the code and parallelizing compilers to build the code for a 122 | variety of parallel accelerators. This document is intended as a best practices 123 | guide for accelerating an application using OpenACC to give both good 124 | performance and portability to other devices. 125 | 126 | ### The OpenACC Accelerator Model ### 127 | In order to ensure that OpenACC would be portable to all computing 128 | architectures available at the time of its inception and into the future, 129 | OpenACC defines an abstract model for accelerated computing. This model exposes 130 | multiple levels of parallelism that may appear on a processor as well as a 131 | hierarchy of memories with varying degrees of speed and addressability. The 132 | goal of this model is to ensure that OpenACC will be applicable to more than just a 133 | particular architecture or even just the architectures in wide availability at 134 | the time, but to ensure that OpenACC could be used on future devices as well. 135 | 136 | At its core OpenACC supports offloading of both computation and data from a 137 | *host* device to an *accelerator* device. In fact, these devices may be the 138 | same or may be completely different architectures, such as the case of a CPU 139 | host and GPU accelerator. The two devices may also have separate memory spaces 140 | or a single memory space. 
In the case that the two devices have different 141 | memories the OpenACC compiler and runtime will analyze the code and handle any 142 | accelerator memory management and the transfer of data between host and device 143 | memory. Figure 1.1 shows a high level diagram of the OpenACC abstract 144 | accelerator, but remember that the devices and memories may be physically the 145 | same on some architectures. 146 | 147 | ![OpenACC's Abstract Accelerator Model](images/execution_model2.png) 148 | 149 | More details of OpenACC's abstract accelerator model will be presented 150 | throughout this guide when they are pertinent. 151 | 152 | ---- 153 | 154 | ***Best Practice:*** For developers coming to OpenACC from other accelerator 155 | programming models, such as CUDA or OpenCL, where host and accelerator memory 156 | is frequently represented by two distinct variables (`host_A[]` and 157 | `device_A[]`, for instance), it's important to remember that when using OpenACC 158 | a variable should be thought of as a single object, regardless of whether 159 | it's backed by memory in one or more memory spaces. If one assumes that a 160 | variable represents two separate memories, depending on where it is used in the 161 | program, then it is possible to write programs that access the variable in 162 | unsafe ways, resulting in code that would not be portable to devices that share 163 | a single memory between the host and device. As with any parallel or 164 | asynchronous programming paradigm, accessing the same variable from two 165 | sections of code simultaneously could result in a race condition that produces 166 | inconsistent results. By assuming that you are always accessing a single 167 | variable, regardless of how it is stored in memory, the programmer will avoid 168 | making mistakes that could cost a significant amount of effort to debug. 
169 | 170 | ### Benefits and Limitations of OpenACC ### 171 | OpenACC is designed to be a high-level, platform independent language for 172 | programming accelerators. As such, one can develop a single source code that 173 | can be run on a range of devices and achieve good performance. The simplicity 174 | and portability that OpenACC's programming model provides sometimes comes at a 175 | cost to performance. The OpenACC abstract accelerator model defines a least 176 | common denominator for accelerator devices, but cannot represent architectural 177 | specifics of these devices without making the language less portable. There 178 | will always be some optimizations that are possible in a lower-level 179 | programming model, such as CUDA or OpenCL, that cannot be represented at a high 180 | level. For instance, although OpenACC has the `cache` directive, some uses of 181 | *shared memory* on NVIDIA GPUs are more easily represented using CUDA. The same 182 | is true for any host or device: certain optimizations are too low-level for a 183 | high-level approach like OpenACC. It is up to the developers to determine the 184 | cost and benefit of selectively using a lower level programming language for 185 | performance critical sections of code. In cases where performance is too 186 | critical to take a high-level approach, it's still possible to use OpenACC for 187 | much of the application, while using another approach in certain places, as 188 | will be discussed in a later chapter on interoperability. 189 | -------------------------------------------------------------------------------- /02-Porting.markdown: -------------------------------------------------------------------------------- 1 | Accelerating an Application with OpenACC 2 | ---------------------------------------- 3 | This section will detail an incremental approach to accelerating an application 4 | using OpenACC. 
When taking this approach it is beneficial to revisit each 5 | step multiple times, checking the results of each step for correctness. Working 6 | incrementally will limit the scope of each change for improved productivity and 7 | debugging. 8 | 9 | ### OpenACC Directive Syntax ### 10 | This guide will introduce OpenACC directives incrementally, as they become 11 | useful for the porting process. All OpenACC directives have a common syntax, 12 | however, with the `acc` sentinel, designating to the compiler that the text 13 | that follows will be OpenACC, a directive, and clauses to that directive, many 14 | of which are optional but provide the compiler with additional information. 15 | 16 | In C and C++, these directives take the form of a pragma. The example code 17 | below shows the OpenACC `kernels` directive without any additional clauses 18 | 19 | ~~~~ {.c .numberLines} 20 | #pragma acc kernels 21 | ~~~~ 22 | 23 | In Fortran, the directives take the form of a special comment, as demonstrated 24 | below. 25 | 26 | ~~~~ {.fortran .numberLines} 27 | !$acc kernels 28 | ~~~~ 29 | 30 | Some OpenACC directives apply to structured blocks of code, while others are 31 | executable statements. In C and C++ a block of code can be represented by 32 | curly braces (`{` and `}`). In Fortran a block of code will begin with an 33 | OpenACC directive (`!$acc kernels`) and end with a matching ending directive 34 | (`!$acc end kernels`). 35 | 36 | 37 | ### Porting Cycle ### 38 | Programmers should take an incremental approach to accelerating applications 39 | using OpenACC to ensure correctness. This guide will follow the approach of 40 | first assessing application performance, then using OpenACC to parallelize 41 | important loops in the code, next optimizing data locality to remove 42 | unnecessary data migrations between the host and accelerator, and finally 43 | optimizing loops within the code to maximize performance on a given 44 | architecture. 
This approach has been successful in many applications because it 45 | prioritizes changes that are likely to provide the greatest returns so that the 46 | programmer can quickly and productively achieve the acceleration. 47 | 48 | There are two important things to note before detailing each step. First, at 49 | times during this process application performance may actually slow down. 50 | Developers should not become frustrated if their initial efforts result in a 51 | loss of performance. As will be explained later, this is generally the result 52 | of implicit data movement between the host and accelerator, which will be 53 | optimized as a part of the porting cycle. Second, it is critical that 54 | developers check the program results for correctness after each change. 55 | Frequent correctness checks will save a lot of debugging effort, since errors 56 | can be found and fixed immediately, before they have the chance to compound. 57 | Some developers may find it beneficial to use a source version control tool to 58 | snapshot the code after each successful change so that any breaking changes can 59 | be quickly thrown away and the code returned to a known good state. 60 | 61 | #### Assess Application Performance #### 62 | Before one can begin to accelerate an application it is important to understand 63 | in which routines and loops an application is spending the bulk of its time and 64 | why. It is critical to understand the most time-consuming parts of the 65 | application to maximize the benefit of acceleration. Amdahl's Law 66 | informs us that the speed-up achievable from running an application on a 67 | parallel accelerator will be limited by the remaining serial code. In other 68 | words, the application will see the most benefit by accelerating as much of the 69 | code as possible and by prioritizing the most time-consuming parts. A variety 70 | of tools may be used to identify important parts of the code, including simple 71 | application timers. 
72 | 73 | #### Parallelize Loops #### 74 | Once important regions of the code have been identified, OpenACC directives 75 | should be used to accelerate these regions on the target device. Parallel loops 76 | within the code should be decorated with OpenACC directives to provide OpenACC 77 | compilers the information necessary to parallelize the code for the target 78 | architecture. 79 | 80 | #### Optimize Data Locality #### 81 | Because many accelerated architectures, such as CPU + GPU architectures, use 82 | distinct memory spaces for the *host* and *device*, it is necessary for the 83 | compiler to manage data in both memories and move the data between the two 84 | memories to ensure correct results. Compilers rarely have full knowledge of the 85 | application, so they must be cautious in order to ensure correctness, which 86 | often involves copying data to and from the accelerator more often than is 87 | actually necessary. The programmer can give the compiler additional information 88 | about how to manage the memory so that it remains local to the accelerator as 89 | long as possible and is only moved between the two memories when absolutely 90 | necessary. Programmers will often realize the largest performance gains after 91 | optimizing data movement during this step. 92 | 93 | #### Optimize Loops #### 94 | Compilers will make decisions about how to map the parallelism in the code to 95 | the target accelerator based on internal heuristics and the limited knowledge 96 | they have about the application. Sometimes additional performance can be gained by 97 | providing the compiler with more information so that it can make better 98 | decisions on how to map the parallelism to the accelerator. When coming from a 99 | traditional CPU architecture to a more parallel architecture, such as a GPU, it 100 | may also be necessary to restructure loops to expose additional parallelism for 101 | the accelerator or to reduce the frequency of data movement.
Frequently, code 102 | refactoring that was motivated by improving performance on parallel 103 | accelerators is beneficial to traditional CPUs as well. 104 | 105 | --- 106 | 107 | This process is by no means the only way to accelerate using OpenACC, but it 108 | has been proven successful in numerous applications. Doing the same steps in 109 | different orders may cause both frustration and difficulty debugging, so it's 110 | advisable to perform each step of the process in the order shown above. 111 | 112 | ### Heterogeneous Computing Best Practices ### 113 | Many applications have been written with little or even no parallelism exposed 114 | in the code. The applications that do expose parallelism frequently do so in a 115 | coarse-grained manner, where a small number of threads or processes execute for 116 | a long time and compute a significant amount of work each. Modern GPUs and many-core 117 | processors, however, are designed to execute fine-grained threads, which are 118 | short-lived and execute a minimal amount of work each. These parallel 119 | architectures achieve high throughput by trading single-threaded performance in 120 | favor of more parallelism. This means that when 121 | accelerating an application with OpenACC, which was designed in light of 122 | increased hardware parallelism, it may be necessary to refactor the code to 123 | favor tightly-nested loops with a significant amount of data reuse. In many 124 | cases these same code changes also benefit more traditional CPU architectures 125 | by improving cache use and vectorization. 126 | 127 | OpenACC may be used to accelerate applications on devices that have a discrete 128 | memory or that have a memory space that's shared with the host. Even on devices 129 | that utilize a shared memory there is frequently still a hierarchy of a fast, 130 | close memory for the accelerator and a larger, slower memory used by the host.
131 | For this reason it is important to structure the application code to maximize 132 | reuse of arrays regardless of whether the underlying architecture uses discrete 133 | or unified memories. When refactoring the code for use with OpenACC it is 134 | frequently beneficial to assume a discrete memory, even if the device you are 135 | developing on has a unified memory. This forces data locality to be a primary 136 | consideration in the refactoring and will ensure that the resulting code 137 | exploits hierarchical memories and is portable to a wide range of devices. 138 | 139 | ### Case Study - Jacobi Iteration 140 | 141 | Throughout this guide we will use simple applications to demonstrate each step 142 | of the acceleration process. The first such application will solve the 143 | 2D-Laplace equation with the iterative Jacobi solver. Iterative methods are a 144 | common technique to approximate the solution of elliptic PDEs, like the 145 | 2D-Laplace equation, within some allowable tolerance. In the case of our 146 | example we will perform a simple stencil calculation where each point 147 | calculates its value as the mean of its neighbors' values. The calculation will 148 | continue to iterate until either the maximum change in value between two 149 | iterations drops below some tolerance level or a maximum number of iterations 150 | is reached. For the sake of consistent comparison throughout the document the 151 | examples will always iterate 1000 times. The main iteration loop for both C/C++ 152 | and Fortran appears below.
153 | 154 | ~~~~ {.c .numberLines startFrom="52"} 155 | while ( error > tol && iter < iter_max ) 156 | { 157 | error = 0.0; 158 | 159 | for( int j = 1; j < n-1; j++) 160 | { 161 | for( int i = 1; i < m-1; i++ ) 162 | { 163 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 164 | + A[j-1][i] + A[j+1][i]); 165 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 166 | } 167 | } 168 | 169 | for( int j = 1; j < n-1; j++) 170 | { 171 | for( int i = 1; i < m-1; i++ ) 172 | { 173 | A[j][i] = Anew[j][i]; 174 | } 175 | } 176 | 177 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 178 | 179 | iter++; 180 | } 181 | ~~~~ 182 | 183 | --- 184 | 185 | ~~~~ {.fortran .numberLines startFrom="52"} 186 | do while ( error .gt. tol .and. iter .lt. iter_max ) 187 | error=0.0_fp_kind 188 | 189 | do j=1,m-2 190 | do i=1,n-2 191 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 192 | A(i ,j-1) + A(i ,j+1) ) 193 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 194 | end do 195 | end do 196 | 197 | do j=1,m-2 198 | do i=1,n-2 199 | A(i,j) = Anew(i,j) 200 | end do 201 | end do 202 | 203 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)') iter, error 204 | iter = iter + 1 205 | 206 | end do 207 | ~~~~ 208 | 209 | The outermost loop in each example will be referred to as the *convergence 210 | loop*, since it loops until the answer has converged by reaching some maximum 211 | error tolerance or number of iterations. Notice that whether or not a loop 212 | iteration occurs depends on the error value of the previous iteration. Also, 213 | the value of each element of `A` is calculated based on the values of the 214 | previous iteration, known as a data dependency. These two facts mean that this 215 | loop cannot be run in parallel. 216 | 217 | The first loop nest within the convergence loop calculates the new value for 218 | each element based on the current values of its neighbors. Notice that it is 219 | necessary to store this new value into a different array.
If each iteration 220 | stored the new value back into itself then a data dependency would exist between 221 | the data elements, as the order in which each element is calculated would affect the 222 | final answer. By storing into a temporary array we ensure that all values are 223 | calculated using the current state of `A` before `A` is updated. As a result, 224 | each loop iteration is completely independent of each other iteration. These 225 | loop iterations may safely be run in any order or in parallel and the final 226 | result would be the same. This loop also calculates a maximum error value. The 227 | error value is the difference between the new value and the old. If the maximum 228 | amount of change between two iterations is within some tolerance, the problem 229 | is considered converged and the outer loop will exit. 230 | 231 | The second loop nest simply updates the value of `A` with the values calculated 232 | into `Anew`. If this is the last iteration of the convergence loop, `A` will be 233 | the final, converged value. If the problem has not yet converged, then `A` will 234 | serve as the input for the next iteration. As with the above loop nest, each 235 | iteration of this loop nest is independent of the others and is safe to 236 | parallelize. 237 | 238 | In the coming sections we will accelerate this simple application using the 239 | method described in this document. 240 | -------------------------------------------------------------------------------- /03-Analyze.markdown: -------------------------------------------------------------------------------- 1 | Assess Application Performance 2 | ============================== 3 | A variety of tools can be used to evaluate application performance, and which 4 | of them are available will depend on your development environment. From simple 5 | application timers to graphical performance analyzers, the choice of 6 | performance analysis tool is outside of the scope of this document.
The purpose 7 | of this section is to provide guidance on choosing important sections of code 8 | for acceleration, which is independent of the profiling tools available. 9 | 10 | Throughout this guide, the NVIDIA Nsight Systems performance analysis tool, which is provided with the CUDA toolkit, will be used for CPU profiling. When accelerator profiling is needed, the application will be run on an NVIDIA GPU and the NVIDIA Nsight Systems profiler will be used again. 11 | 12 | Baseline Profiling 13 | ------------------ 14 | Before parallelizing an application with OpenACC, the programmer must first 15 | understand where time is currently being spent in the code. Routines and loops 16 | that take up a significant percentage of the runtime are frequently referred to 17 | as *hot spots* and will be the starting point for accelerating the application. 18 | A variety of tools exist for generating application profiles, such as gprof, 19 | Vampir, Nsight Systems, and TAU. Selecting the specific tool that works 20 | best for a given application is outside of the scope of this document, but regardless 21 | of which tool or tools are used, below are some important pieces of information 22 | that will help guide the next steps in parallelizing the application. 23 | 24 | * Application performance - How much time does the application take to run? How 25 | efficiently does the program use the computing resources? 26 | * Program hotspots - In which routines is the program spending most of its 27 | time? What is being done within these important routines? Focusing on the 28 | most time-consuming parts of the application will yield the greatest results. 29 | * Performance limiters - Within the identified hotspots, what's currently 30 | limiting the application performance? Some common limiters may be I/O, memory 31 | bandwidth, cache reuse, floating point performance, communication, etc.
32 | One way to evaluate the performance limiters of a given loop nest is to 33 | evaluate its *computational intensity*, which is a measure of how many 34 | operations are performed on a data element per load or store from memory. 35 | * Available parallelism - Examine the loops within the hotspots to understand 36 | how much work each loop nest performs. Do the loops iterate 10's, 100's, 37 | 1000's of times (or more)? Do the loop iterations operate independently of 38 | each other? Look not only at the individual loops, but also at the entire loop nest 39 | to understand the bigger picture. 40 | 41 | Gathering baseline data like the above both helps inform the developer where to 42 | focus efforts for the best results and provides a basis for comparing 43 | performance throughout the rest of the process. It's important to choose input 44 | that will realistically reflect how the application will be used once it has 45 | been accelerated. It's tempting to use a known benchmark problem for profiling, 46 | but frequently these benchmark problems use a reduced problem size or reduced 47 | I/O, which may lead to incorrect assumptions about program performance. Many 48 | developers also use the baseline profile to gather the expected output of the 49 | application to use for verifying the correctness of the application as it is 50 | accelerated. 51 | 52 | Additional Profiling 53 | -------------------- 54 | Through the process of porting and optimizing an application with OpenACC it's 55 | necessary to gather additional profile data to guide the next steps in the 56 | process. Some profiling tools, such as Nsight Systems and Vampir, support profiling on 57 | CPUs and GPUs, while other tools, such as gprof, may 58 | only support profiling on a particular platform.
Additionally, some compilers 59 | build their own profiling into the application, as is the case with the NVHPC 60 | compiler, which supports setting the NVCOMPILER\_ACC\_TIME environment variable for 61 | gathering runtime information about the application. When developing on 62 | offloading platforms, such as CPU + GPU platforms, it's generally important to 63 | use a profiling tool throughout the development process that can evaluate both 64 | time spent in computation and time spent performing PCIe data transfers. This 65 | document will use the NVIDIA Nsight Systems profiler for this analysis, although 66 | it is only available on NVIDIA platforms. 67 | 68 | Case Study - Analysis 69 | --------------------- 70 | To get a better understanding of the case study program, we will use the 71 | NVIDIA Nsight Systems command line interface that comes as a part of the CUDA Toolkit and NVIDIA HPC SDK. First, 72 | it's necessary to build the executable. Remember to use the flags included in 73 | the example below to ensure that additional information about how the 74 | compiler optimized the program is displayed.
The executable is built with the 75 | following command: 76 | 77 | ~~~~ 78 | $ nvc -fast -Minfo=all laplace2d.c 79 | GetTimer: 80 | 21, include "timer.h" 81 | 61, FMA (fused multiply-add) instruction(s) generated 82 | main: 83 | 41, Loop not fused: function call before adjacent loop 84 | Loop unrolled 8 times 85 | 49, StartTimer inlined, size=2 (inline) file laplace2d.c (37) 86 | 52, FMA (fused multiply-add) instruction(s) generated 87 | 58, Generated vector simd code for the loop containing reductions 88 | 68, Memory copy idiom, loop replaced by call to __c_mcopy8 89 | 79, GetTimer inlined, size=10 (inline) file laplace2d.c (54) 90 | ~~~~ 91 | 92 | Once the executable has been built, the `nsys` command will run the 93 | executable and generate a profiling report that can be viewed offline in 94 | the NVIDIA Nsight Systems GUI 95 | 96 | ~~~~ 97 | $ nsys profile ./a.out 98 | 99 | Jacobi relaxation Calculation: 4096 x 4096 mesh 100 | 0, 0.250000 101 | 100, 0.002397 102 | 200, 0.001204 103 | 300, 0.000804 104 | 400, 0.000603 105 | 500, 0.000483 106 | 600, 0.000403 107 | 700, 0.000345 108 | 800, 0.000302 109 | 900, 0.000269 110 | total: 36.480533 s 111 | Processing events... 112 | Capturing symbol files... 113 | Saving temporary "/tmp/nsys-report-2f5b-f32e-7dec-9af0.qdstrm" file to disk... 114 | Creating final output files... 115 | 116 | Processing [==============================================================100%] 117 | Saved report file to "/tmp/nsys-report-2f5b-f32e-7dec-9af0.qdrep" 118 | Report file moved to "/home/ubuntu/openacc-programming-guide/examples/laplace/ch2/report1.qdrep" 119 | ~~~~ 120 | 121 | Once the data has been collected, and the .qdrep report has been generated, 122 | it can be visualized using the Nsight Systems GUI. You must first copy the 123 | report (report1.qdrep in the example above) to a machine that has graphical 124 | capabilities and download the Nsight Systems interface. 
Next, you must open 125 | the application and select your file via the file manager. 126 | 127 | ![Nsight Systems initial window in the GUI. You must use the toolbar at the top to find your target report file](images/ch2-nsight-open.png) 128 | 129 | When we open the report in Nsight Systems, we see that the vast majority of 130 | the time is spent in two routines: main and \_\_c\_mcopy8. A screenshot of 131 | the initial screen for Nsight Systems is shown in figure 2.1. Since the code 132 | for this case study is completely within the main function of the program, 133 | it's not surprising that nearly all of the time is spent in main, but in 134 | larger applications it's likely that the time will be spent in several other 135 | routines. 136 | 137 | ![Nsight initial profile window showing 81% of runtime in main and 17% in a memory copy routine.](images/ch2-nsight-initial.png) 138 | 139 | Clicking into the main function, we can see that nearly all of the runtime 140 | within main comes from the loop that calculates the next value for A. This is 141 | shown in figure 2.2. What is not obvious from the profiler output, 142 | however, is that the time spent in the memory copy routine shown in the initial 143 | screen is actually the second loop nest, which performs the array swap at the 144 | end of each iteration. The compiler output shown above indicates that the loop at line 145 | 68 was replaced by a memory copy, because doing so is more efficient than 146 | copying each element individually. So what the profiler is really showing us 147 | is that the major hotspots for our application are the loop nest that 148 | calculates `Anew` from `A` and the loop nest that copies from `Anew` to `A` 149 | for the next iteration, so we'll concentrate our efforts on these two loop 150 | nests. 151 | 152 | In the chapters that follow, we will optimize the loops identified in this 153 | chapter as the hotspots within our example application.
154 | -------------------------------------------------------------------------------- /99-End.markdown: -------------------------------------------------------------------------------- 1 | \appendix 2 | 3 | References 4 | ========== 5 | 6 | * [OpenACC.org](http://openacc.org) 7 | * [OpenACC on the NVIDIA Developer Blog](https://developer.nvidia.com/blog/tag/openacc/) 8 | * [PGI Insider Newsletter](https://www.pgroup.com/resources/articles.htm) 9 | * [OpenACC at the NVIDIA GPU Technology Conference](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=OpenACC&sort=date) 10 | * [OpenACC on Stack Overflow](http://stackoverflow.com/questions/tagged/openacc) 11 | * [OpenACC Community Slack](https://www.openacc.org/community#slack) 12 | * [OpenACC on the GCC Wiki](https://gcc.gnu.org/wiki/OpenACC) 13 | -------------------------------------------------------------------------------- /BUILD.md: -------------------------------------------------------------------------------- 1 | OpenACC Porting and Portability Guide 2 | ===================================== 3 | This guide is written in Pandoc markdown format 4 | (http://johnmacfarlane.net/pandoc/README.html). 5 | 6 | The build.bat file may be used to generate a PDF once Pandoc and a LaTeX 7 | package are installed, or the command may be modified to generate the 8 | document on another platform. 9 | 10 | General Build Instructions 11 | -------------------------- 12 | The included Makefile will use the `pandoc/latex:latest` docker image to 13 | build the guide. This can be overridden to point to a local install of 14 | `pandoc` by providing the `PANDOC` variable to the make command. 15 | 16 | Windows Instructions 17 | -------------------- 18 | On Windows it is necessary to install the pandoc package and 19 | MiKTeX (http://miktex.org/). The first time you build the PDF using 20 | build.bat, MiKTeX will need to install several dependencies.
This will 21 | only happen the first time the document is built. 22 | 23 | FIXME 24 | ----- 25 | * The implicit_figures feature is disabled until I can determine how to 26 | make the figures inline rather than lumped together at the end. This 27 | may simply be an issue of a different LaTeX -> PDF converter. 28 | * Need to look into installing filter for internal figure references. 29 | * Need to generate SVG diagrams instead of PNG. 30 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ------------ 3 | 4 | Please use the following guidelines when contributing to this project. 5 | 6 | Before contributing significant changes, please begin a discussion of the 7 | desired changes via a GitHub Issue to prevent doing unnecessary or overlapping 8 | work. 9 | 10 | ## License 11 | 12 | The source code provided in this project is licensed under the Apache License 13 | 2.0 (https://www.apache.org/licenses/LICENSE-2.0) and text documentation is 14 | licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0) 15 | (https://creativecommons.org/licenses/by/4.0/). Contributions under other, 16 | compatible licenses will be considered on a case-by-case basis. 17 | 18 | Contributions must include a "signed off by" tag in the commit message for the 19 | contributions asserting the signing of the Developer Certificate of Origin 20 | (https://developercertificate.org/). A GPG-signed commit with the "signed off 21 | by" tag is preferred. 22 | 23 | ## Styling 24 | 25 | Please use the following style guidelines when making contributions.
26 | 27 | ### Source Code 28 | 29 | * Two-space indentation, no tabs 30 | * To the extent possible, variable names should be descriptive 31 | * Fortran codes should use free-form source files 32 | * Fortran codes should not use implicit variable names and should use 33 | `implicit none` 34 | * The following file extensions should be used appropriately 35 | * C - `.c` 36 | * C++ - `.cpp` 37 | * CUDA C/C++ - `.cu` 38 | * CUDA Fortran - `.cuf` 39 | * Fortran - `.F90` 40 | 41 | ### Markdown 42 | 43 | * When they appear inline with the text, directive names, clauses, function or 44 | subroutine names, variable names, file names, commands and command-line 45 | arguments should appear between two back ticks. 46 | * Code blocks should begin with three back ticks and either 'cpp' or 'fortran' 47 | to enable appropriate source formatting and end with three back ticks. 48 | * Emphasis, including quotes made for emphasis and introduction of new terms, 49 | should be highlighted between a single pair of asterisks. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pandoc/latex:latest 2 | #RUN apk --no-cache add texlive-xetex texmf-dist-pictures texmf-dist-latexextra poppler-utils && texhash 3 | #ADD http://mirror.ctan.org/systems/texlive/tlnet/update-tlmgr-latest.sh /tmp 4 | #RUN sh /tmp/update-tlmgr-latest.sh -- --upgrade 5 | #RUN tlmgr update --self --all 6 | RUN tlmgr update --all 7 | RUN tlmgr install ifoddpage tikzpagenodes blindtext textpos koma-script pdfpages && luaotfload-tool -fu -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | MDFILES := $(wildcard ??-*.markdown) 2 | IMGFILES := $(wildcard images/*) 3 | 4 | # Override with pandoc executable if building without docker 5 | PANDOC ?= docker run --rm -it -u `id -u`:`id
-g` -v ${PWD}:/data openacc/best-practices-guide:latest 6 | 7 | openacc-guide.pdf: ${MDFILES} ${IMGFILES} 8 | ${PANDOC} -f markdown+implicit_figures -s -o openacc-guide.pdf ??-*.markdown --citeproc --highlight-style pygments --top-level-division=chapter -V geometry:letterpaper -H cover-page/main.tex --pdf-engine=xelatex 9 | 10 | openacc-guide.tex: ${MDFILES} 11 | ${PANDOC} -f markdown+implicit_figures -s -o openacc-guide.tex ??-*.markdown --citeproc --highlight-style pygments --top-level-division=chapter 12 | 13 | openacc-guide.html: ${MDFILES} 14 | ${PANDOC} -f markdown+implicit_figures -s -o openacc-guide.html ??-*.markdown --top-level-division=chapter --toc --toc-depth=2 -V geometry:margin=1in --citeproc -V documentclass:book -V classoption:oneside --highlight-style pygments 15 | 16 | openacc-guide.doc: ${MDFILES} ${IMGFILES} 17 | ${PANDOC} -f markdown+implicit_figures -s -o openacc-guide.doc ??-*.markdown --top-level-division=chapter --toc --toc-depth=2 -V geometry:margin=1in --citeproc 18 | 19 | outline.pdf: outline.markdown 20 | ${PANDOC} outline.markdown -o outline.pdf -V geometry:margin=1in 21 | 22 | all: openacc-guide.pdf 23 | 24 | clean: 25 | -rm -rf outline.pdf openacc-guide.pdf openacc-guide.doc openacc-guide.tex openacc-guide.html openacc-guide.rst outline.pdf _build 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | OpenACC Porting and Portability Guide 3 | ===================================== 4 | 5 | This guide is written as a resource to OpenACC developers, regardless of 6 | experience level. It contains a proposed workflow when refactoring an existing 7 | application to use OpenACC. Examples are provided in C/C++ and Fortran. The 8 | current version can always be found [on 9 | GitHub](https://github.com/OpenACC/openacc-best-practices-guide). 
A PDF copy 10 | can also be found on the [OpenACC Website](https://openacc.org). 11 | 12 | Document Organization 13 | --------------------- 14 | 15 | 1. [Introduction](01-Introduction.markdown) 16 | 2. [Porting](02-Porting.markdown) 17 | 3. [Analyze](03-Analyze.markdown) 18 | 4. [Parallelize](04-Parallelize.markdown) 19 | 5. [Data Locality](05-Data-Locality.markdown) 20 | 6. [Loop Optimization](06-Loops.markdown) 21 | 7. [Interoperability](07-Interoperability.markdown) 22 | 8. [Advanced Topics](08-Advanced.markdown) 23 | 24 | This guide is licensed under Creative Commons Attribution 4.0 International 25 | Public License and code examples are licensed under the Apache License 2.0, 26 | unless otherwise noted. Please see [CONTRIBUTING.md](CONTRIBUTING.md) for 27 | details on how to contribute. -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | pandoc.exe -f markdown+implicit_figures -s -o openacc-guide.pdf .\00-Title.markdown .\01-Introduction.markdown .\02-Porting.markdown .\03-Analyze.markdown .\04-Parallelize.markdown .\05-Data-Locality.markdown .\06-Loops.markdown .\07-Interoperability.markdown .\08-Advanced.markdown .\99-End.markdown --chapters --toc --toc-depth=2 -V geometry:margin=1in --filter pandoc-citeproc -V documentclass:book -V classoption:oneside -V geometry:letterpaper 2 | -------------------------------------------------------------------------------- /conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'OpenACC Programming and Best Practices Guide' 21 | copyright = '2023, OpenACC.org' 22 | author = 'OpenACC.org' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | # 'recommonmark' 32 | 'myst_parser' 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # List of patterns, relative to source directory, that match files and 39 | # directories to ignore when looking for source files. 40 | # This pattern also affects html_static_path and html_extra_path. 41 | exclude_patterns = [ 42 | "CONTRIBUTING.md", 43 | "x*.markdown", 44 | "outline.markdown", 45 | "README.md" 46 | ] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_theme = 'sphinx_rtd_theme' 55 | 56 | # Add any paths that contain custom static files (such as style sheets) here, 57 | # relative to this directory. 
They are copied after the builtin static files, 58 | # so a file named "default.css" will overwrite the builtin "default.css". 59 | #html_static_path = ['_static'] 60 | html_static_path = ['images'] 61 | 62 | #master_doc = 'openacc-guide' 63 | master_doc = 'index' 64 | 65 | source_suffix = { 66 | '.rst': 'restructuredtext', 67 | # '.txt': 'markdown', 68 | '.markdown': 'markdown', 69 | # '.md': 'markdown' 70 | } 71 | -------------------------------------------------------------------------------- /cover-page/app_images.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/cover-page/app_images.png -------------------------------------------------------------------------------- /cover-page/cover-page.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/cover-page/cover-page.pdf -------------------------------------------------------------------------------- /cover-page/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/cover-page/logo.png -------------------------------------------------------------------------------- /cover-page/main.tex: -------------------------------------------------------------------------------- 1 | %\documentclass[letterpaper,fontsize=20pt]{scrartcl} 2 | \usepackage{tikz} 3 | \usepackage{tikzpagenodes} 4 | \usepackage{scrlayer-scrpage} 5 | \usepackage[final]{pdfpages} 6 | %\usepackage{lipsum} 7 | \usepackage{fontspec} 8 | \usepackage[english]{babel} 9 | \usepackage{blindtext} 10 | \usepackage[absolute,overlay]{textpos} 11 | 12 | \definecolor{myblue}{RGB}{13, 43, 109} 13 | 14 | \definecolor{myblue2}{RGB}{53, 
166, 217} 15 | 16 | \newcommand{\blueborder}{\tikz[remember picture,overlay] 17 | \draw [myblue,line width=22mm] 18 | (current page.south west) 19 | rectangle 20 | (current page.north east) 21 | ;} 22 | 23 | 24 | %\usepackage{lipsum} 25 | %\begin{document} 26 | \renewcommand\maketitle{ 27 | \begin{titlepage} 28 | \chead[\blueborder]{\blueborder} % for page borders 29 | \null\vfil 30 | \blueborder 31 | \linespread{2} 32 | \begin{textblock*}{15cm}(3.5cm,6cm) % {block width} (coords) 33 | \noindent 34 | \textcolor{myblue}{\textbf{ \huge OpenACC Programming \newline and Best Practices Guide}} 35 | %\textcolor{myblue}{\textbf{\fontspec{QTHelvetCnd} \huge OpenACC Programming \newline and Best Practices Guide}} 36 | 37 | \noindent 38 | %\textcolor{myblue2}{\fontspec{Liberation Sans Narrow} November 2020} 39 | \textcolor{myblue2}{\@date} 40 | \end{textblock*} 41 | 42 | \begin{textblock*}{19.408cm}(0.36cm,13cm) 43 | \begin{figure} 44 | \includegraphics[width=\textwidth]{cover-page/app_images.png} 45 | \end{figure} 46 | \end{textblock*} 47 | 48 | \begin{textblock*}{10.7cm}(8.7cm,22.3cm) 49 | \begin{figure}[hb!] 50 | \includegraphics[width=\textwidth]{cover-page/logo.png} 51 | \end{figure} 52 | \end{textblock*} 53 | 54 | \begin{textblock*}{10.7cm}(1.8cm,26.3cm) 55 | \textcolor{myblue}{ \tiny © 2022 openacc-standard.org. All Rights Reserved.} 56 | %\textcolor{myblue}{\fontspec{QTHelvetCnd} \tiny © 2020 openacc-standard.org. 
All Rights Reserved.} 57 | \end{textblock*} 58 | 59 | \pagenumbering{gobble} 60 | %\newpage 61 | %\newpage 62 | %\cleardoublepage 63 | \linespread{1} 64 | \vfil\null 65 | \end{titlepage} 66 | } 67 | 68 | %\end{document} 69 | -------------------------------------------------------------------------------- /examples/ch6_matvec/Makefile: -------------------------------------------------------------------------------- 1 | CXX=nvc++ 2 | CXXFLAGS=-fast -acc=gpu -gpu=lineinfo -Minfo=all,intensity,ccff 3 | LDFLAGS=${CXXFLAGS} 4 | 5 | cg.x: main.o 6 | ${CXX} $^ -o $@ ${LDFLAGS} 7 | 8 | main.o: main.cpp matrix.h matrix_functions.h vector.h vector_functions.h 9 | 10 | .SUFFIXES: .o .cpp .h 11 | 12 | .PHONY: clean 13 | clean: 14 | rm -Rf cg.x *.pgprof *.o core *.nvprof 15 | -------------------------------------------------------------------------------- /examples/ch6_matvec/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #include <math.h> 17 | #include <stdio.h> 18 | #include <omp.h> 19 | 20 | #include "vector.h" 21 | #include "vector_functions.h" 22 | #include "matrix.h" 23 | #include "matrix_functions.h" 24 | 25 | #define N 200 26 | #define MAX_ITERS 100 27 | #define TOL 1e-12 28 | int main() { 29 | vector x,b; 30 | vector r,p,Ap; 31 | matrix A; 32 | 33 | double one=1.0, zero=0.0; 34 | double normr, rtrans, oldtrans, p_ap_dot , alpha, beta; 35 | int iter=0; 36 | 37 | //create matrix 38 | allocate_3d_poisson_matrix(A,N); 39 | 40 | printf("Rows: %d, nnz: %d\n", A.num_rows, A.row_offsets[A.num_rows]); 41 | 42 | allocate_vector(x,A.num_rows); 43 | allocate_vector(Ap,A.num_rows); 44 | allocate_vector(r,A.num_rows); 45 | allocate_vector(p,A.num_rows); 46 | allocate_vector(b,A.num_rows); 47 | 48 | initialize_vector(x,100000); 49 | initialize_vector(b,1); 50 | 51 | 52 | waxpby(one, x, zero, x, p); 53 | matvec(A,p,Ap); 54 | waxpby(one, b, -one, Ap, r); 55 | 56 | rtrans=dot(r,r); 57 | normr=sqrt(rtrans); 58 | 59 | double st = omp_get_wtime(); 60 | do { 61 | if(iter==0) { 62 | waxpby(one,r,zero,r,p); 63 | } else { 64 | oldtrans=rtrans; 65 | rtrans = dot(r,r); 66 | beta = rtrans/oldtrans; 67 | waxpby(one,r,beta,p,p); 68 | } 69 | 70 | normr=sqrt(rtrans); 71 | 72 | matvec(A,p,Ap); 73 | p_ap_dot = dot(Ap,p); 74 | 75 | alpha = rtrans/p_ap_dot; 76 | 77 | waxpby(one,x,alpha,p,x); 78 | waxpby(one,r,-alpha,Ap,r); 79 | 80 | if(iter%10==0) 81 | printf("Iteration: %d, Tolerance: %.4e\n", iter, normr); 82 | iter++; 83 | } while(iter < MAX_ITERS && normr > TOL); 84 | double et = omp_get_wtime(); 85 | 86 | printf("Total Iterations: %d Total Time: %lfs\n", iter, (et-st)); 87 | 88 | free_vector(x); 89 | free_vector(r); 90 | free_vector(p); 91 | free_vector(Ap); 92 | free_matrix(A); 93 | 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /examples/ch6_matvec/matrix.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 NVIDIA
Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #pragma once 17 | 18 | #include 19 | 20 | struct matrix { 21 | unsigned int num_rows; 22 | unsigned int nnz; 23 | unsigned int *row_offsets; 24 | unsigned int *cols; 25 | double *coefs; 26 | }; 27 | 28 | 29 | void allocate_3d_poisson_matrix(matrix &A, int N) { 30 | int num_rows=(N+1)*(N+1)*(N+1); 31 | int nnz=27*num_rows; 32 | A.num_rows=num_rows; 33 | A.row_offsets=(unsigned int*)malloc((num_rows+1)*sizeof(unsigned int)); 34 | A.cols=(unsigned int*)malloc(nnz*sizeof(unsigned int)); 35 | A.coefs=(double*)malloc(nnz*sizeof(double)); 36 | 37 | int offsets[27]; 38 | double coefs[27]; 39 | int zstride=N*N; 40 | int ystride=N; 41 | 42 | int i=0; 43 | for(int z=-1;z<=1;z++) { 44 | for(int y=-1;y<=1;y++) { 45 | for(int x=-1;x<=1;x++) { 46 | offsets[i]=zstride*z+ystride*y+x; 47 | if(x==0 && y==0 && z==0) 48 | coefs[i]=27; 49 | else 50 | coefs[i]=-1; 51 | i++; 52 | } 53 | } 54 | } 55 | 56 | nnz=0; 57 | for(int i=0;i=0 && n 18 | 19 | struct vector { 20 | unsigned int n; 21 | double *coefs; 22 | }; 23 | 24 | void allocate_vector(vector &v, unsigned int n) { 25 | v.n=n; 26 | v.coefs=(double*)malloc(n*sizeof(double)); 27 | #pragma acc enter data copyin(v) 28 | #pragma acc enter data create(v.coefs[:n]) 29 | } 30 | 31 | void free_vector(vector &v) { 32 | double *vcoefs=v.coefs; 33 | #pragma acc exit data delete(v.coefs) 34 | #pragma acc exit data delete(v) 35 | 
free(v.coefs); 36 | 37 | } 38 | 39 | void initialize_vector(vector &v,double val) { 40 | 41 | for(int i=0;i 18 | #include "vector.h" 19 | 20 | 21 | double dot(const vector& x, const vector& y) { 22 | double sum=0; 23 | unsigned int n=x.n; 24 | double *__restrict xcoefs=x.coefs; 25 | double *__restrict ycoefs=y.coefs; 26 | 27 | #pragma acc parallel loop reduction(+:sum) present(xcoefs,ycoefs) 28 | for(int i=0;i 17 | using namespace std; 18 | 19 | template class Data 20 | { 21 | private: 22 | /// Length of the data array 23 | int len; 24 | /// Data array 25 | ctype *arr; 26 | 27 | public: 28 | /// Class constructor 29 | Data(int length) 30 | { 31 | len = length; 32 | arr = new ctype[len]; 33 | #pragma acc enter data copyin(this) 34 | #pragma acc enter data create(arr[0:len]) 35 | } 36 | /// Copy constructor 37 | Data(const Data &d) 38 | { 39 | len = d.len; 40 | arr = new ctype[len]; 41 | #pragma acc enter data copyin(this) 42 | #pragma acc enter data create(arr[0:len]) 43 | #pragma acc parallel loop present(arr[0:len],d) 44 | for(int i = 0; i < len; i++) 45 | arr[i] = d.arr[i]; 46 | } 47 | 48 | /// Class destructor 49 | ~Data() 50 | { 51 | #pragma acc exit data delete(arr) 52 | #pragma acc exit data delete(this) 53 | delete arr; 54 | len = 0; 55 | } 56 | int size() 57 | { 58 | return len; 59 | } 60 | #pragma acc routine seq 61 | ctype &operator[](int i) 62 | { 63 | // Simple bounds protection 64 | if ( (i < 0) || (i >= len) ) return arr[0]; 65 | return arr[i]; 66 | } 67 | void populate() 68 | { 69 | #pragma acc parallel loop present(arr[0:len]) 70 | for(int i = 0; i < len; i++) 71 | arr[i] = 2*i; 72 | } 73 | #ifdef _OPENACC 74 | void update_host() 75 | { 76 | #pragma acc update self(arr[0:len]) 77 | ; 78 | } 79 | void update_device() 80 | { 81 | #pragma acc update device(arr[0:len]) 82 | ; 83 | } 84 | #endif 85 | }; 86 | 87 | int main(int argc, char **argv) 88 | { 89 | Data d_data = Data(1024); 90 | 91 | d_data.populate(); 92 | 93 | Data d_data2 = Data(d_data); 94 | 
95 | #ifdef _OPENACC 96 | d_data2.update_host(); 97 | #endif 98 | cout << d_data2.size() << endl; 99 | cout << d_data2[0] << endl; 100 | cout << d_data2[d_data2.size()-1] << endl; 101 | 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /examples/laplace/ch2/laplace2d.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <math.h> 18 | #include <string.h> 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include "timer.h" 22 | 23 | #define NN 4096 24 | #define NM 4096 25 | 26 | double A[NN][NM]; 27 | double Anew[NN][NM]; 28 | 29 | int main(int argc, char** argv) 30 | { 31 | const int n = NN; 32 | const int m = NM; 33 | const int iter_max = 1000; 34 | 35 | const double tol = 1.0e-6; 36 | double error = 1.0; 37 | 38 | memset(A, 0, n * m * sizeof(double)); 39 | memset(Anew, 0, n * m * sizeof(double)); 40 | 41 | for (int j = 0; j < n; j++) 42 | { 43 | A[j][0] = 1.0; 44 | Anew[j][0] = 1.0; 45 | } 46 | 47 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 48 | 49 | StartTimer(); 50 | int iter = 0; 51 | 52 | while ( error > tol && iter < iter_max ) 53 | { 54 | error = 0.0; 55 | 56 | for( int j = 1; j < n-1; j++) 57 | { 58 | for( int i = 1; i < m-1; i++ ) 59 | { 60 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 61 | + A[j-1][i] + A[j+1][i]); 62 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 63 | } 64 | } 65 | 66 | for( int j = 1; j < n-1; j++) 67 | { 68 | for( int i = 1; i < m-1; i++ ) 69 | { 70 | A[j][i] = Anew[j][i]; 71 | } 72 | } 73 | 74 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 75 | 76 | iter++; 77 | } 78 | 79 | double runtime = GetTimer(); 80 | 81 | printf(" total: %f s\n", runtime / 1000); 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /examples/laplace/ch2/laplace2d.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | !
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. iter_max ) 42 | error=0.0_fp_kind 43 | 44 | do j=1,m-2 45 | do i=1,n-2 46 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 47 | A(i ,j-1) + A(i ,j+1) ) 48 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 49 | end do 50 | end do 51 | 52 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 53 | iter = iter + 1 54 | 55 | do j=1,m-2 56 | do i=1,n-2 57 | A(i,j) = Anew(i,j) 58 | end do 59 | end do 60 | 61 | end do 62 | 63 | call cpu_time(stop_time) 64 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 65 | 66 | deallocate (A,Anew) 67 | end program laplace 68 | -------------------------------------------------------------------------------- /examples/laplace/ch2/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include <stdio.h> 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include <windows.h> 25 | #else 26 | #include <sys/time.h> 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /examples/laplace/ch3/laplace2d-kernels.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <math.h> 18 | #include <string.h> 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include "timer.h" 22 | 23 | #define NN 4096 24 | #define NM 4096 25 | 26 | double A[NN][NM]; 27 | double Anew[NN][NM]; 28 | 29 | int main(int argc, char** argv) 30 | { 31 | const int n = NN; 32 | const int m = NM; 33 | const int iter_max = 1000; 34 | 35 | const double tol = 1.0e-6; 36 | double error = 1.0; 37 | 38 | memset(A, 0, n * m * sizeof(double)); 39 | memset(Anew, 0, n * m * sizeof(double)); 40 | 41 | for (int j = 0; j < n; j++) 42 | { 43 | A[j][0] = 1.0; 44 | Anew[j][0] = 1.0; 45 | } 46 | 47 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 48 | 49 | StartTimer(); 50 | int iter = 0; 51 | 52 | while ( error > tol && iter < iter_max ) 53 | { 54 | error = 0.0; 55 | 56 | #pragma acc kernels 57 | { 58 | for( int j = 1; j < n-1; j++) 59 | { 60 | for( int i = 1; i < m-1; i++ ) 61 | { 62 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 63 | + A[j-1][i] + A[j+1][i]); 64 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 65 | } 66 | } 67 | 68 | for( int j = 1; j < n-1; j++) 69 | { 70 | for( int i = 1; i < m-1; i++ ) 71 | { 72 | A[j][i] = Anew[j][i]; 73 | } 74 | } 75 | } 76 | 77 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 78 | 79 | iter++; 80 | } 81 | 82 | double runtime = GetTimer(); 83 | 84 | printf(" total: %f s\n", runtime / 1000); 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /examples/laplace/ch3/laplace2d-kernels.f90:
-------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$acc kernels 45 | do j=1,m-2 46 | do i=1,n-2 47 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 48 | A(i ,j-1) + A(i ,j+1) ) 49 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 50 | end do 51 | end do 52 | 53 | do j=1,m-2 54 | do i=1,n-2 55 | A(i,j) = Anew(i,j) 56 | end do 57 | end do 58 | !$acc end kernels 59 | 60 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 61 | iter = iter + 1 62 | 63 | end do 64 | 65 | call cpu_time(stop_time) 66 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 67 | 68 | deallocate (A,Anew) 69 | end program laplace 70 | -------------------------------------------------------------------------------- /examples/laplace/ch3/laplace2d-parallel.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <math.h> 18 | #include <string.h> 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include "timer.h" 22 | 23 | #define NN 4096 24 | #define NM 4096 25 | 26 | double A[NN][NM]; 27 | double Anew[NN][NM]; 28 | 29 | int main(int argc, char** argv) 30 | { 31 | const int n = NN; 32 | const int m = NM; 33 | const int iter_max = 1000; 34 | 35 | const double tol = 1.0e-6; 36 | double error = 1.0; 37 | 38 | memset(A, 0, n * m * sizeof(double)); 39 | memset(Anew, 0, n * m * sizeof(double)); 40 | 41 | for (int j = 0; j < n; j++) 42 | { 43 | A[j][0] = 1.0; 44 | Anew[j][0] = 1.0; 45 | } 46 | 47 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 48 | 49 | StartTimer(); 50 | int iter = 0; 51 | 52 | while ( error > tol && iter < iter_max ) 53 | { 54 | error = 0.0; 55 | 56 | #pragma acc parallel loop reduction(max:error) 57 | for( int j = 1; j < n-1; j++) 58 | { 59 | for( int i = 1; i < m-1; i++ ) 60 | { 61 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 62 | + A[j-1][i] + A[j+1][i]); 63 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 64 | } 65 | } 66 | 67 | #pragma acc parallel loop 68 | for( int j = 1; j < n-1; j++) 69 | { 70 | for( int i = 1; i < m-1; i++ ) 71 | { 72 | A[j][i] = Anew[j][i]; 73 | } 74 | } 75 | 76 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 77 | 78 | iter++; 79 | } 80 | 81 | double runtime = GetTimer(); 82 | 83 | printf(" total: %f s\n", runtime / 1000); 84 | 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /examples/laplace/ch3/laplace2d-parallel.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | !
Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 42 | error=0.0_fp_kind 43 | 44 | !$acc parallel loop reduction(max:error) 45 | do j=1,m-2 46 | do i=1,n-2 47 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 48 | A(i ,j-1) + A(i ,j+1) ) 49 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 50 | end do 51 | end do 52 | 53 | !$acc parallel loop 54 | do j=1,m-2 55 | do i=1,n-2 56 | A(i,j) = Anew(i,j) 57 | end do 58 | end do 59 | 60 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 61 | iter = iter + 1 62 | 63 | end do 64 | 65 | call cpu_time(stop_time) 66 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 67 | 68 | deallocate (A,Anew) 69 | end program laplace 70 | -------------------------------------------------------------------------------- /examples/laplace/ch3/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include <stdio.h> 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include <windows.h> 25 | #else 26 | #include <sys/time.h> 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /examples/laplace/ch4/laplace2d-kernels.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | #include <math.h> 18 | #include <string.h> 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include "timer.h" 22 | 23 | #define NN 4096 24 | #define NM 4096 25 | 26 | double A[NN][NM]; 27 | double Anew[NN][NM]; 28 | 29 | int main(int argc, char** argv) 30 | { 31 | const int n = NN; 32 | const int m = NM; 33 | const int iter_max = 1000; 34 | 35 | const double tol = 1.0e-6; 36 | double error = 1.0; 37 | 38 | memset(A, 0, n * m * sizeof(double)); 39 | memset(Anew, 0, n * m * sizeof(double)); 40 | 41 | for (int j = 0; j < n; j++) 42 | { 43 | A[j][0] = 1.0; 44 | Anew[j][0] = 1.0; 45 | } 46 | 47 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 48 | 49 | StartTimer(); 50 | int iter = 0; 51 | 52 | #pragma acc data copy(A[:n][:m]) create(Anew[:n][:m]) 53 | while ( error > tol && iter < iter_max ) 54 | { 55 | error = 0.0; 56 | 57 | #pragma acc kernels 58 | { 59 | for( int j = 1; j < n-1; j++) 60 | { 61 | for( int i = 1; i < m-1; i++ ) 62 | { 63 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 64 | + A[j-1][i] + A[j+1][i]); 65 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 66 | } 67 | } 68 | 69 | for( int j = 1; j < n-1; j++) 70 | { 71 | for( int i = 1; i < m-1; i++ ) 72 | { 73 | A[j][i] = Anew[j][i]; 74 | } 75 | } 76 | } 77 | 78 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 79 | 80 | iter++; 81 | } 82 | 83 | double runtime = GetTimer(); 84 | 85 | printf(" total: %f s\n", runtime / 1000); 86 | 87 | return 0; 88 | } 89 | -------------------------------------------------------------------------------- /examples/laplace/ch4/laplace2d-kernels.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | ! 10 | !
Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | !$acc data copy(A) create(Anew) 42 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 43 | error=0.0_fp_kind 44 | 45 | !$acc kernels 46 | do j=1,m-2 47 | do i=1,n-2 48 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 49 | A(i ,j-1) + A(i ,j+1) ) 50 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 51 | end do 52 | end do 53 | 54 | do j=1,m-2 55 | do i=1,n-2 56 | A(i,j) = Anew(i,j) 57 | end do 58 | end do 59 | !$acc end kernels 60 | 61 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 62 | iter = iter + 1 63 | 64 | end do 65 | !$acc end data 66 | 67 | call cpu_time(stop_time) 68 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 69 | 70 | deallocate (A,Anew) 71 | end program laplace 72 | -------------------------------------------------------------------------------- /examples/laplace/ch4/laplace2d-parallel.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include <math.h> 18 | #include <string.h> 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include "timer.h" 22 | 23 | #define NN 4096 24 | #define NM 4096 25 | 26 | double A[NN][NM]; 27 | double Anew[NN][NM]; 28 | 29 | int main(int argc, char** argv) 30 | { 31 | const int n = NN; 32 | const int m = NM; 33 | const int iter_max = 1000; 34 | 35 | const double tol = 1.0e-6; 36 | double error = 1.0; 37 | 38 | memset(A, 0, n * m * sizeof(double)); 39 | memset(Anew, 0, n * m * sizeof(double)); 40 | 41 | for (int j = 0; j < n; j++) 42 | { 43 | A[j][0] = 1.0; 44 | Anew[j][0] = 1.0; 45 | } 46 | 47 | printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); 48 | 49 | StartTimer(); 50 | int iter = 0; 51 | 52 | #pragma acc data copy(A[:n][:m]) create(Anew[:n][:m]) 53 | while ( error > tol && iter < iter_max ) 54 | { 55 | error = 0.0; 56 | 57 | #pragma acc parallel loop reduction(max:error) 58 | for( int j = 1; j < n-1; j++) 59 | { 60 | for( int i = 1; i < m-1; i++ ) 61 | { 62 | Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] 63 | + A[j-1][i] + A[j+1][i]); 64 | error = fmax( error, fabs(Anew[j][i] - A[j][i])); 65 | } 66 | } 67 | 68 | #pragma acc parallel loop 69 | for( int j = 1; j < n-1; j++) 70 | { 71 | for( int i = 1; i < m-1; i++ ) 72 | { 73 | A[j][i] = Anew[j][i]; 74 | } 75 | } 76 | 77 | if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); 78 | 79 | iter++; 80 | } 81 | 82 | double runtime = GetTimer(); 83 | 84 | printf(" total: %f s\n", runtime / 1000); 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /examples/laplace/ch4/laplace2d-parallel.f90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright 2012 NVIDIA Corporation 3 | ! 4 | ! Licensed under the Apache License, Version 2.0 (the "License"); 5 | ! you may not use this file except in compliance with the License. 6 | ! You may obtain a copy of the License at 7 | ! 8 | ! http://www.apache.org/licenses/LICENSE-2.0 9 | !
10 | ! Unless required by applicable law or agreed to in writing, software 11 | ! distributed under the License is distributed on an "AS IS" BASIS, 12 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | ! See the License for the specific language governing permissions and 14 | ! limitations under the License. 15 | ! 16 | 17 | program laplace 18 | implicit none 19 | integer, parameter :: fp_kind=kind(1.0d0) 20 | integer, parameter :: n=4096, m=4096, iter_max=1000 21 | integer :: i, j, iter 22 | real(fp_kind), dimension (:,:), allocatable :: A, Anew 23 | real(fp_kind) :: tol=1.0e-6_fp_kind, error=1.0_fp_kind 24 | real(fp_kind) :: start_time, stop_time 25 | 26 | allocate ( A(0:n-1,0:m-1), Anew(0:n-1,0:m-1) ) 27 | 28 | A = 0.0_fp_kind 29 | Anew = 0.0_fp_kind 30 | 31 | ! Set B.C. 32 | A(0,:) = 1.0_fp_kind 33 | Anew(0,:) = 1.0_fp_kind 34 | 35 | write(*,'(a,i5,a,i5,a)') 'Jacobi relaxation Calculation:', n, ' x', m, ' mesh' 36 | 37 | call cpu_time(start_time) 38 | 39 | iter=0 40 | 41 | !$acc data copy(A) create(Anew) 42 | do while ( error .gt. tol .and. iter .lt. 
iter_max ) 43 | error=0.0_fp_kind 44 | 45 | !$acc parallel loop reduction(max:error) 46 | do j=1,m-2 47 | do i=1,n-2 48 | Anew(i,j) = 0.25_fp_kind * ( A(i+1,j ) + A(i-1,j ) + & 49 | A(i ,j-1) + A(i ,j+1) ) 50 | error = max( error, abs(Anew(i,j)-A(i,j)) ) 51 | end do 52 | end do 53 | 54 | !$acc parallel loop 55 | do j=1,m-2 56 | do i=1,n-2 57 | A(i,j) = Anew(i,j) 58 | end do 59 | end do 60 | 61 | if(mod(iter,100).eq.0 ) write(*,'(i5,f10.6)'), iter, error 62 | iter = iter + 1 63 | 64 | end do 65 | !$acc end data 66 | 67 | call cpu_time(stop_time) 68 | write(*,'(a,f10.3,a)') ' completed in ', stop_time-start_time, ' seconds' 69 | deallocate (A,Anew) 70 | end program laplace 71 | -------------------------------------------------------------------------------- /examples/laplace/ch4/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TIMER_H 18 | #define TIMER_H 19 | 20 | #include <stdio.h> 21 | 22 | #ifdef WIN32 23 | #define WIN32_LEAN_AND_MEAN 24 | #include <windows.h> 25 | #else 26 | #include <sys/time.h> 27 | #endif 28 | 29 | #ifdef WIN32 30 | double PCFreq = 0.0; 31 | __int64 timerStart = 0; 32 | #else 33 | struct timeval timerStart; 34 | #endif 35 | 36 | void StartTimer() 37 | { 38 | #ifdef WIN32 39 | LARGE_INTEGER li; 40 | if(!QueryPerformanceFrequency(&li)) 41 | printf("QueryPerformanceFrequency failed!\n"); 42 | 43 | PCFreq = (double)li.QuadPart/1000.0; 44 | 45 | QueryPerformanceCounter(&li); 46 | timerStart = li.QuadPart; 47 | #else 48 | gettimeofday(&timerStart, NULL); 49 | #endif 50 | } 51 | 52 | // time elapsed in ms 53 | double GetTimer() 54 | { 55 | #ifdef WIN32 56 | LARGE_INTEGER li; 57 | QueryPerformanceCounter(&li); 58 | return (double)(li.QuadPart-timerStart)/PCFreq; 59 | #else 60 | struct timeval timerStop, timerElapsed; 61 | gettimeofday(&timerStop, NULL); 62 | timersub(&timerStop, &timerStart, &timerElapsed); 63 | return timerElapsed.tv_sec*1000.0+timerElapsed.tv_usec/1000.0; 64 | #endif 65 | } 66 | 67 | #endif // TIMER_H 68 | -------------------------------------------------------------------------------- /examples/mandelbrot/cpp/License.md: -------------------------------------------------------------------------------- 1 | Copyright 2014 NVIDIA Corporation 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | -------------------------------------------------------------------------------- /examples/mandelbrot/cpp/Makefile: -------------------------------------------------------------------------------- 1 | CXX=nvc++ 2 | CXXFLAGS=-fast -acc=gpu -Minfo=all 3 | 4 | mandelbrot.x: mandelbrot.o main.o 5 | $(CXX) $(CXXFLAGS) -o $@ $^ 6 | mandelbrot_solution.x: mandelbrot_solution.o main_solution.o 7 | $(CXX) $(CXXFLAGS) -o $@ $^ 8 | 9 | .PHONY: clean 10 | clean: 11 | rm -f *.o core *.lst *.ptx *.pgprof *.cubin *.s *.x *.mod *.nvprof 12 | -------------------------------------------------------------------------------- /examples/mandelbrot/cpp/constants.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | const unsigned int WIDTH=16384; 17 | const unsigned int HEIGHT=16384; 18 | const unsigned int MAX_ITERS=50; 19 | const unsigned int MAX_COLOR=255; 20 | const double xmin=-1.7; 21 | const double xmax=.5; 22 | const double ymin=-1.2; 23 | const double ymax=1.2; 24 | const double dx=(xmax-xmin)/WIDTH; 25 | const double dy=(ymax-ymin)/HEIGHT; 26 | -------------------------------------------------------------------------------- /examples/mandelbrot/cpp/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include <cstdio> 18 | #include <cstdlib> 19 | #include <cstring> 20 | #include <cmath> 21 | #include <omp.h> 22 | #include <openacc.h> 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | //int num_blocks, block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example.
39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | #pragma acc parallel loop 46 | for(int y=0;y<HEIGHT;y++) { 47 | for(int x=0;x<WIDTH;x++) { 48 | image[y*WIDTH+x]=mandelbrot(x,y); 49 | } 50 | } -------------------------------------------------------------------------------- /examples/mandelbrot/cpp/mandelbrot.cpp: -------------------------------------------------------------------------------- 17 | #include <cstdio> 18 | #include <cstdlib> 19 | #include <cmath> 20 | #include "mandelbrot.h" 21 | #include "constants.h" 22 | 23 | using namespace std; 24 | 25 | unsigned char mandelbrot(int Px, int Py) { 26 | double x0=xmin+Px*dx; 27 | double y0=ymin+Py*dy; 28 | double x=0.0; 29 | double y=0.0; 30 | int i; 31 | for(i=0;x*x+y*y<4.0 && i<MAX_ITERS;i++) { 32 | double xtemp=x*x-y*y+x0; 33 | y=2*x*y+y0; 34 | x=xtemp; 35 | } 36 | return (unsigned char)((i*MAX_COLOR)/MAX_ITERS); 37 | } -------------------------------------------------------------------------------- /examples/mandelbrot/cpp/original/main.cpp: -------------------------------------------------------------------------------- 17 | #include <cstdio> 18 | #include <cstdlib> 19 | #include <cstring> 20 | #include <cmath> 21 | #include <omp.h> 22 | #include <openacc.h> 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | //int num_blocks, block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example.
39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | #pragma acc parallel loop 46 | for(int y=0;y 18 | #include 19 | #include 20 | #include "mandelbrot.h" 21 | #include "constants.h" 22 | 23 | using namespace std; 24 | 25 | unsigned char mandelbrot(int Px, int Py) { 26 | double x0=xmin+Px*dx; 27 | double y0=ymin+Py*dy; 28 | double x=0.0; 29 | double y=0.0; 30 | int i; 31 | for(i=0;x*x+y*y<4.0 && i 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | //int num_blocks, block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | #pragma acc parallel loop 46 | for(int y=0;y 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | int num_blocks; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 
39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | num_blocks = 16; 46 | for(int block = 0; block < num_blocks; block++ ) { 47 | int start = block * (HEIGHT/num_blocks), 48 | end = start + (HEIGHT/num_blocks); 49 | #pragma acc parallel loop 50 | for(int y=start;y 18 | #include 19 | #include 20 | #include "mandelbrot.h" 21 | #include "constants.h" 22 | 23 | using namespace std; 24 | 25 | unsigned char mandelbrot(int Px, int Py) { 26 | double x0=xmin+Px*dx; 27 | double y0=ymin+Py*dy; 28 | double x=0.0; 29 | double y=0.0; 30 | int i; 31 | for(i=0;x*x+y*y<4.0 && i 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | int num_blocks; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 
39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | num_blocks = 16; 46 | for(int block = 0; block < num_blocks; block++ ) { 47 | int start = block * (HEIGHT/num_blocks), 48 | end = start + (HEIGHT/num_blocks); 49 | #pragma acc parallel loop 50 | for(int y=start;y 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | int num_blocks , block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | num_blocks = 16; 46 | block_size = (HEIGHT/num_blocks)*WIDTH; 47 | #pragma acc data create(image[WIDTH*HEIGHT]) 48 | for(int block = 0; block < num_blocks; block++ ) { 49 | int start = block * (HEIGHT/num_blocks), 50 | end = start + (HEIGHT/num_blocks); 51 | #pragma acc parallel loop 52 | for(int y=start;y 18 | #include 19 | #include 20 | #include "mandelbrot.h" 21 | #include "constants.h" 22 | 23 | using namespace std; 24 | 25 | unsigned char mandelbrot(int Px, int Py) { 26 | double x0=xmin+Px*dx; 27 | double y0=ymin+Py*dy; 28 | double x=0.0; 29 | double y=0.0; 30 | int i; 31 | for(i=0;x*x+y*y<4.0 && i 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | int num_blocks , 
block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | num_blocks = 16; 46 | block_size = (HEIGHT/num_blocks)*WIDTH; 47 | #pragma acc data create(image[WIDTH*HEIGHT]) 48 | for(int block = 0; block < num_blocks; block++ ) { 49 | int start = block * (HEIGHT/num_blocks), 50 | end = start + (HEIGHT/num_blocks); 51 | #pragma acc parallel loop 52 | for(int y=start;y 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | int num_blocks , block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 
39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | num_blocks = 16; 46 | block_size = (HEIGHT/num_blocks)*WIDTH; 47 | #pragma acc data create(image[WIDTH*HEIGHT]) 48 | { 49 | for(int block = 0; block < num_blocks; block++ ) { 50 | int start = block * (HEIGHT/num_blocks), 51 | end = start + (HEIGHT/num_blocks); 52 | #pragma acc parallel loop async 53 | for(int y=start;y 18 | #include 19 | #include 20 | #include "mandelbrot.h" 21 | #include "constants.h" 22 | 23 | using namespace std; 24 | 25 | unsigned char mandelbrot(int Px, int Py) { 26 | double x0=xmin+Px*dx; 27 | double y0=ymin+Py*dy; 28 | double x=0.0; 29 | double y=0.0; 30 | int i; 31 | for(i=0;x*x+y*y<4.0 && i 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main() { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | int num_blocks , block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 
39 | #pragma acc parallel num_gangs(1) 40 | { 41 | image[0] = 0; 42 | } 43 | double st = omp_get_wtime(); 44 | 45 | num_blocks = 16; 46 | block_size = (HEIGHT/num_blocks)*WIDTH; 47 | #pragma acc data create(image[WIDTH*HEIGHT]) 48 | { 49 | for(int block = 0; block < num_blocks; block++ ) { 50 | int start = block * (HEIGHT/num_blocks), 51 | end = start + (HEIGHT/num_blocks); 52 | #pragma acc parallel loop async 53 | for(int y=start;y 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "mandelbrot.h" 24 | #include "constants.h" 25 | 26 | using namespace std; 27 | 28 | int main( int argc, char **argv ) { 29 | 30 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 31 | unsigned char *image=(unsigned char*)malloc(bytes); 32 | int num_blocks, block_size; 33 | FILE *fp=fopen("image.pgm","wb"); 34 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 35 | acc_init(acc_device_nvidia); 36 | 37 | // This region absorbs overheads that occur once in a typical run 38 | // to prevent them from skewing the results of the example. 
39 | for ( int i = 0; i < 2 ; i++ ) 40 | { 41 | #pragma acc parallel num_gangs(1) copy(image[:WIDTH*HEIGHT]) async(i) 42 | { 43 | image[i] = 0; 44 | } 45 | } 46 | double st = omp_get_wtime(); 47 | 48 | num_blocks = 16; 49 | if ( argc > 1 ) num_blocks = atoi(argv[1]); 50 | block_size = (HEIGHT/num_blocks)*WIDTH; 51 | #pragma acc data create(image[WIDTH*HEIGHT]) 52 | { 53 | for(int block = 0; block < num_blocks; block++ ) { 54 | int start = block * (HEIGHT/num_blocks), 55 | end = start + (HEIGHT/num_blocks); 56 | #pragma acc parallel loop async(block%4) 57 | for(int y=start;y 18 | #include 19 | #include 20 | #include "mandelbrot.h" 21 | #include "constants.h" 22 | 23 | using namespace std; 24 | 25 | unsigned char mandelbrot(int Px, int Py) { 26 | double x0=xmin+Px*dx; 27 | double y0=ymin+Py*dy; 28 | double x=0.0; 29 | double y=0.0; 30 | int i; 31 | for(i=0;x*x+y*y<4.0 && i 18 | #include 19 | #include 20 | #include 21 | #include "mandelbrot.h" 22 | #include "constants.h" 23 | 24 | using namespace std; 25 | 26 | int main() { 27 | 28 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 29 | unsigned char *image=(unsigned char*)malloc(bytes); 30 | int num_blocks = 8, block_size = (HEIGHT/num_blocks)*WIDTH; 31 | FILE *fp=fopen("image.pgm","wb"); 32 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 33 | double st = omp_get_wtime(); 34 | 35 | #pragma acc data create(image[WIDTH*HEIGHT]) 36 | for(int block = 0; block < num_blocks; block++ ) { 37 | int start = block * (HEIGHT/num_blocks), 38 | end = start + (HEIGHT/num_blocks); 39 | #pragma acc update device(image[block*block_size:block_size]) async(block) 40 | #pragma acc parallel loop async(block) 41 | for(int y=start;y 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "mandelbrot.h" 23 | #include "constants.h" 24 | 25 | using namespace std; 26 | 27 | int main() { 28 | 29 | size_t bytes=WIDTH*HEIGHT*sizeof(unsigned int); 30 | unsigned char *image=(unsigned char*)malloc(bytes); 31 | int 
num_blocks=64, block_size = (HEIGHT/num_blocks)*WIDTH; 32 | FILE *fp=fopen("image.pgm","wb"); 33 | fprintf(fp,"P5\n%s\n%d %d\n%d\n","#comment",WIDTH,HEIGHT,MAX_COLOR); 34 | 35 | int num_gpus = acc_get_num_devices(acc_device_nvidia); 36 | // This parallel section eats the cost of initializing the devices to 37 | // prevent the initialization time from skewing the results. 38 | #pragma omp parallel num_threads(num_gpus) 39 | { 40 | acc_init(acc_device_nvidia); 41 | acc_set_device_num(omp_get_thread_num(),acc_device_nvidia); 42 | } 43 | printf("Found %d NVIDIA GPUs.\n", num_gpus); 44 | 45 | double st = omp_get_wtime(); 46 | #pragma omp parallel num_threads(num_gpus) 47 | { 48 | int queue = 1; 49 | int my_gpu = omp_get_thread_num(); 50 | acc_set_device_num(my_gpu,acc_device_nvidia); 51 | printf("Thread %d is using GPU %d\n", my_gpu, acc_get_device_num(acc_device_nvidia)); 52 | #pragma acc data create(image[WIDTH*HEIGHT]) 53 | { 54 | #pragma omp for schedule(static,1) 55 | for(int block = 0; block < num_blocks; block++ ) { 56 | int start = block * (HEIGHT/num_blocks), 57 | end = start + (HEIGHT/num_blocks); 58 | #pragma acc parallel loop async(queue) 59 | for(int y=start;y 18 | #include 19 | #include 20 | #include "mandelbrot.h" 21 | #include "constants.h" 22 | 23 | using namespace std; 24 | 25 | unsigned char mandelbrot(int Px, int Py) { 26 | double x0=xmin+Px*dx; 27 | double y0=ymin+Py*dy; 28 | double x=0.0; 29 | double y=0.0; 30 | int i; 31 | for(i=0;x*x+y*y<4.0 && i 17 | #define N 1024 18 | 19 | int main(int argc, char **argv) 20 | { 21 | float x[N], y[N]; 22 | int i; 23 | 24 | #pragma acc kernels 25 | { 26 | for (i=0; i 17 | #define N 1024 18 | 19 | int main(int argc, char **argv) 20 | { 21 | float x[N], y[N]; 22 | int i; 23 | 24 | #pragma acc parallel loop 25 | for (i=0; i\linewidth\linewidth\else\Gin@nat@width\fi} 186 | \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} 187 | \makeatother 188 | % Scale images if necessary, so that 
they will not overflow the page 189 | % margins by default, and it is still possible to overwrite the defaults 190 | % using explicit options in \includegraphics[width, height, ...]{} 191 | \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} 192 | % Set default figure placement to htbp 193 | \makeatletter 194 | \def\fps@figure{htbp} 195 | \makeatother 196 | $endif$ 197 | $if(links-as-notes)$ 198 | % Make links footnotes instead of hotlinks: 199 | \DeclareRobustCommand{\href}[2]{#2\footnote{\url{#1}}} 200 | $endif$ 201 | $if(strikeout)$ 202 | \usepackage[normalem]{ulem} 203 | % Avoid problems with \sout in headers with hyperref 204 | \pdfstringdefDisableCommands{\renewcommand{\sout}{}} 205 | $endif$ 206 | \setlength{\emergencystretch}{3em} % prevent overfull lines 207 | \providecommand{\tightlist}{% 208 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} 209 | $if(numbersections)$ 210 | \setcounter{secnumdepth}{$if(secnumdepth)$$secnumdepth$$else$5$endif$} 211 | $else$ 212 | \setcounter{secnumdepth}{-\maxdimen} % remove section numbering 213 | $endif$ 214 | $if(block-headings)$ 215 | % Make \paragraph and \subparagraph free-standing 216 | \ifx\paragraph\undefined\else 217 | \let\oldparagraph\paragraph 218 | \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} 219 | \fi 220 | \ifx\subparagraph\undefined\else 221 | \let\oldsubparagraph\subparagraph 222 | \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} 223 | \fi 224 | $endif$ 225 | $if(pagestyle)$ 226 | \pagestyle{$pagestyle$} 227 | $endif$ 228 | $for(header-includes)$ 229 | $header-includes$ 230 | $endfor$ 231 | $if(lang)$ 232 | \ifXeTeX 233 | % Load polyglossia as late as possible: uses bidi with RTL langages (e.g. 
Hebrew, Arabic) 234 | \usepackage{polyglossia} 235 | \setmainlanguage[$for(polyglossia-lang.options)$$polyglossia-lang.options$$sep$,$endfor$]{$polyglossia-lang.name$} 236 | $for(polyglossia-otherlangs)$ 237 | \setotherlanguage[$for(polyglossia-otherlangs.options)$$polyglossia-otherlangs.options$$sep$,$endfor$]{$polyglossia-otherlangs.name$} 238 | $endfor$ 239 | \else 240 | \usepackage[$for(babel-otherlangs)$$babel-otherlangs$,$endfor$main=$babel-lang$]{babel} 241 | % get rid of language-specific shorthands (see #6817): 242 | \let\LanguageShortHands\languageshorthands 243 | \def\languageshorthands#1{} 244 | $if(babel-newcommands)$ 245 | $babel-newcommands$ 246 | $endif$ 247 | \fi 248 | $endif$ 249 | \ifLuaTeX 250 | \usepackage{selnolig} % disable illegal ligatures 251 | \fi 252 | $if(dir)$ 253 | \ifXeTeX 254 | % Load bidi as late as possible as it modifies e.g. graphicx 255 | \usepackage{bidi} 256 | \fi 257 | \ifPDFTeX 258 | \TeXXeTstate=1 259 | \newcommand{\RL}[1]{\beginR #1\endR} 260 | \newcommand{\LR}[1]{\beginL #1\endL} 261 | \newenvironment{RTL}{\beginR}{\endR} 262 | \newenvironment{LTR}{\beginL}{\endL} 263 | \fi 264 | $endif$ 265 | $if(natbib)$ 266 | \usepackage[$natbiboptions$]{natbib} 267 | \bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$} 268 | $endif$ 269 | $if(biblatex)$ 270 | \usepackage[$if(biblio-style)$style=$biblio-style$,$endif$$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$]{biblatex} 271 | $for(bibliography)$ 272 | \addbibresource{$bibliography$} 273 | $endfor$ 274 | $endif$ 275 | $if(nocite-ids)$ 276 | \nocite{$for(nocite-ids)$$it$$sep$, $endfor$} 277 | $endif$ 278 | $if(csl-refs)$ 279 | \newlength{\cslhangindent} 280 | \setlength{\cslhangindent}{1.5em} 281 | \newlength{\csllabelwidth} 282 | \setlength{\csllabelwidth}{3em} 283 | \newenvironment{CSLReferences}[2] % #1 hanging-ident, #2 entry spacing 284 | {% don't indent paragraphs 285 | \setlength{\parindent}{0pt} 286 | % turn on hanging indent if param 1 is 1 287 | 
\ifodd #1 \everypar{\setlength{\hangindent}{\cslhangindent}}\ignorespaces\fi 288 | % set entry spacing 289 | \ifnum #2 > 0 290 | \setlength{\parskip}{#2\baselineskip} 291 | \fi 292 | }% 293 | {} 294 | \usepackage{calc} 295 | \newcommand{\CSLBlock}[1]{#1\hfill\break} 296 | \newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}} 297 | \newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break} 298 | \newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} 299 | $endif$ 300 | $if(csquotes)$ 301 | \usepackage{csquotes} 302 | $endif$ 303 | 304 | $if(title)$ 305 | \title{$title$$if(thanks)$\thanks{$thanks$}$endif$} 306 | $endif$ 307 | $if(subtitle)$ 308 | \usepackage{etoolbox} 309 | \makeatletter 310 | \providecommand{\subtitle}[1]{% add subtitle to \maketitle 311 | \apptocmd{\@title}{\par {\large #1 \par}}{}{} 312 | } 313 | \makeatother 314 | \subtitle{$subtitle$} 315 | $endif$ 316 | \author{$for(author)$$author$$sep$ \and $endfor$} 317 | \date{$date$} 318 | 319 | \begin{document} 320 | $if(has-frontmatter)$ 321 | \frontmatter 322 | $endif$ 323 | $if(title)$ 324 | \maketitle 325 | $if(abstract)$ 326 | \begin{abstract} 327 | $abstract$ 328 | \end{abstract} 329 | $endif$ 330 | $endif$ 331 | 332 | $for(include-before)$ 333 | $include-before$ 334 | 335 | $endfor$ 336 | $if(toc)$ 337 | $if(toc-title)$ 338 | \renewcommand*\contentsname{$toc-title$} 339 | $endif$ 340 | { 341 | $if(colorlinks)$ 342 | \hypersetup{linkcolor=$if(toccolor)$$toccolor$$else$$endif$} 343 | $endif$ 344 | \setcounter{tocdepth}{$toc-depth$} 345 | \tableofcontents 346 | } 347 | $endif$ 348 | $if(lot)$ 349 | \listoftables 350 | $endif$ 351 | $if(lof)$ 352 | \listoffigures 353 | $endif$ 354 | $if(linestretch)$ 355 | \setstretch{$linestretch$} 356 | $endif$ 357 | $if(has-frontmatter)$ 358 | \mainmatter 359 | $endif$ 360 | $body$ 361 | 362 | $if(has-frontmatter)$ 363 | \backmatter 364 | $endif$ 365 | $if(natbib)$ 366 | $if(bibliography)$ 367 | $if(biblio-title)$ 368 | 
$if(has-chapters)$ 369 | \renewcommand\bibname{$biblio-title$} 370 | $else$ 371 | \renewcommand\refname{$biblio-title$} 372 | $endif$ 373 | $endif$ 374 | \bibliography{$for(bibliography)$$bibliography$$sep$,$endfor$} 375 | 376 | $endif$ 377 | $endif$ 378 | $if(biblatex)$ 379 | \printbibliography$if(biblio-title)$[title=$biblio-title$]$endif$ 380 | 381 | $endif$ 382 | $for(include-after)$ 383 | $include-after$ 384 | 385 | $endfor$ 386 | \end{document} 387 | -------------------------------------------------------------------------------- /images/ch2-nsight-initial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/ch2-nsight-initial.png -------------------------------------------------------------------------------- /images/ch2-nsight-open.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/ch2-nsight-open.png -------------------------------------------------------------------------------- /images/ch2-pgprof-initial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/ch2-pgprof-initial.png -------------------------------------------------------------------------------- /images/ch2-pgprof.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/ch2-pgprof.png -------------------------------------------------------------------------------- /images/ch3_profile.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/ch3_profile.png -------------------------------------------------------------------------------- /images/ch4_profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/ch4_profile.png -------------------------------------------------------------------------------- /images/execution_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/execution_model.png -------------------------------------------------------------------------------- /images/execution_model2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/execution_model2.png -------------------------------------------------------------------------------- /images/histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/histogram.png -------------------------------------------------------------------------------- /images/idealized_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/idealized_pipeline.png -------------------------------------------------------------------------------- /images/jacobi_step1_graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/jacobi_step1_graph.png -------------------------------------------------------------------------------- /images/jacobi_step1_nvvp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/jacobi_step1_nvvp.png -------------------------------------------------------------------------------- /images/jacobi_step2_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/jacobi_step2_graph.png -------------------------------------------------------------------------------- /images/jacobi_step2_nvvp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/jacobi_step2_nvvp.png -------------------------------------------------------------------------------- /images/laplace.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/laplace.xlsx -------------------------------------------------------------------------------- /images/levels_of_parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/levels_of_parallelism.png -------------------------------------------------------------------------------- /images/mandelbrot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/mandelbrot.png -------------------------------------------------------------------------------- /images/mandelbrot_async_nsight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/mandelbrot_async_nsight.png -------------------------------------------------------------------------------- /images/mandelbrot_timeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/mandelbrot_timeline.png -------------------------------------------------------------------------------- /images/multigpu_mandelbrot_timeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/multigpu_mandelbrot_timeline.png -------------------------------------------------------------------------------- /images/multigpu_mandelbrot_timeline_nsight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/multigpu_mandelbrot_timeline_nsight.png -------------------------------------------------------------------------------- /images/openacc-guide-images.pub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/openacc-guide-images.pub -------------------------------------------------------------------------------- 
/images/spmv_speedup_num_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/spmv_speedup_num_workers.png -------------------------------------------------------------------------------- /images/spmv_speedup_vector_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenACC/openacc-best-practices-guide/9134e05bd1b8f087374efd1d8f5038ef59d7ab03/images/spmv_speedup_vector_length.png -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | OpenACC Programming and Best Practices Guide 2 | ============================================ 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | 01-Introduction 8 | 02-Porting 9 | 03-Analyze 10 | 04-Parallelize 11 | 05-Data-Locality 12 | 06-Loops 13 | 07-Interoperability 14 | 08-Advanced 15 | 99-End 16 | -------------------------------------------------------------------------------- /outline.markdown: -------------------------------------------------------------------------------- 1 | OpenACC Porting and Best Practices Guide 2 | ======================================== 3 | 1. What is OpenACC? 4 | ------------------- 5 | High level overview of compiler directives 6 | 7 | * Compiler directives basics 8 | * Benefits and Tradeoffs to compiler directives 9 | * Abstract accelerator model 10 | * Multiple Levels of Parallelism 11 | * Memory hierarchies 12 | * Should I say something here about the relevance toward OpenMP target 13 | directives? 14 | 15 | 2.
Accelerating an application 16 | ------------------------------ 17 | Describe the porting process at a high level including APOD 18 | 19 | * Assess the code to identify parallelism 20 | * Parallelize the code with directives 21 | * Optimize data locality (Optimize, part 1) 22 | * Optimize loops (Optimize, part 2) 23 | * Return to step 1 (Deploy?) 24 | 25 | 3. Assess 26 | --------- 27 | Where to start 28 | 29 | * Generating a CPU profile using common tools 30 | * What tools can we point to here? 31 | * Discuss coarse vs. fine-grained parallelism 32 | * Utilizing existing OpenMP directives? 33 | 34 | 4. Parallelize 35 | -------------- 36 | Moving computation to the GPU 37 | 38 | * PARALLEL and KERNELS regions 39 | * LOOP directive 40 | * private clause 41 | * reduction clause 42 | * ROUTINE directive 43 | 44 | 5. Optimize Data Locality 45 | ------------------------- 46 | Improving data movement 47 | 48 | * Introduction to data regions 49 | * Data clauses 50 | * Introduction to unstructured data lifetimes 51 | * Show usage in C++ classes 52 | * CACHE directive 53 | * Asynchronous overlapping 54 | * Dealing with global data (is this needed?) 55 | 56 | 6. Optimize Loops 57 | ----------------- 58 | Loop-level optimizations that make a difference 59 | 60 | * Common loop transformations 61 | * COLLAPSE directive 62 | * TILE directive 63 | * I will leave this out unless someone can provide me a case where it's 64 | beneficial 65 | 66 | 7. Deploy 67 | --------- 68 | What more is there to say at this point? 69 | 70 | 8. Interoperability 71 | ------------------- 72 | How to use OpenACC with math libraries and CUDA 73 | 74 | * Reuse examples from my blog post 75 | 76 | 9. Interacting with MPI 77 | ----------------------- 78 | Expand on previous chapter with specifics around MPI 79 | 80 | * Reuse Jiri's material 81 | 82 | 10. Writing Portable OpenACC 83 | ---------------------------- 84 | Tips on writing code with portability in mind. 
I'm not sure exactly what will 85 | go here, but it seems important to me. 86 | 87 | A. Appendix - OpenACC for XX programmers 88 | ---------------------------------------- 89 | Tips for understanding OpenACC if you already have understanding of another parallel programming paradigm. 90 | * OpenACC for CUDA programmers 91 | * OpenACC for OpenCL programmers 92 | * OpenACC for OpenMP programmers 93 | 94 | NOTES: 95 | ------ 96 | 97 | * What tools can/should I show? 98 | * pgprof/gprof for CPU profiling 99 | * Identifies hotspots, but not loop info 100 | * CUDA visual profiler/Nsight 101 | * Good for data motion insights 102 | * Good for prioritizing kernels 103 | * Poor for guiding how to optimize loops 104 | * Vampir/Tau? 105 | * CPU/GPU/MPI profiling in 1 tool 106 | * DDT/Totalview? 107 | * Commercial products, limited OpenACC support 108 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==0.7.16 2 | Babel==2.15.0 3 | certifi==2024.7.4 4 | charset-normalizer==3.3.2 5 | docutils==0.20.1 6 | idna==3.7 7 | imagesize==1.4.1 8 | Jinja2==3.1.6 9 | markdown-it-py==3.0.0 10 | MarkupSafe==2.1.5 11 | mdit-py-plugins==0.4.1 12 | mdurl==0.1.2 13 | myst-parser==4.0.0 14 | packaging==24.1 15 | Pygments==2.18.0 16 | PyYAML==6.0.1 17 | requests==2.32.3 18 | snowballstemmer==2.2.0 19 | Sphinx==7.4.7 20 | sphinx-rtd-theme==2.0.0 21 | sphinxcontrib-applehelp==2.0.0 22 | sphinxcontrib-devhelp==2.0.0 23 | sphinxcontrib-htmlhelp==2.1.0 24 | sphinxcontrib-jquery==4.1 25 | sphinxcontrib-jsmath==1.0.1 26 | sphinxcontrib-qthelp==2.0.0 27 | sphinxcontrib-serializinghtml==2.0.0 28 | tomli==2.0.1 29 | urllib3==2.2.2 30 | -------------------------------------------------------------------------------- /x98-Quick_Reference.markdown: -------------------------------------------------------------------------------- 1 | \appendix 2 | 3 | OpenACC Quick 
Reference 4 | ======================= 5 | Insert a quick reference guide here for OpenACC directives so that there's a 6 | single place to look at when there's a question about a particular directive. 7 | --------------------------------------------------------------------------------
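The quick-reference placeholder above asks for a single page summarizing the directives used throughout the guide's examples. As a starting point, here is a minimal sketch in the style of the repository's saxpy examples; it is not a file from the repository, and the function names (`saxpy`, `sum_array`) are illustrative only. Each function annotates one directive/clause pair (`parallel loop` with data clauses, and the `reduction` clause); the pragmas are ignored by compilers without OpenACC support, so the code also builds and runs as plain C.

```c
#include <stddef.h>

/* parallel loop: run independent iterations across gangs/workers/vectors.
 * copyin/copy data clauses describe host-device movement for this region. */
void saxpy(int n, float a, const float *restrict x, float *restrict y)
{
    #pragma acc parallel loop copyin(x[0:n]) copy(y[0:n])
    for (int i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}

/* reduction clause: combine per-iteration partial results into one scalar. */
float sum_array(int n, const float *restrict y)
{
    float s = 0.0f;
    #pragma acc parallel loop reduction(+:s) copyin(y[0:n])
    for (int i = 0; i < n; i++)
        s += y[i];
    return s;
}
```

For longer-lived device data, the guide's later examples wrap several such loops in a `#pragma acc data create(...)`/`update` region instead of per-loop clauses, and add `async`/`wait` to overlap transfers with computation.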