├── .gitignore
├── Makefile
├── README.md
├── doc
    ├── 1d_decomp.md
    ├── api_decomposition.md
    ├── api_fft.md
    ├── api_halo.md
    ├── api_io.md
    ├── api_nonblocking.md
    ├── decomposition.md
    ├── dstar.md
    ├── hector.md
    ├── images
    │   ├── 1d_decomp.png
    │   ├── 2d_decomp.png
    │   ├── Brachos.png
    │   ├── compact.png
    │   ├── decomp-17-13-11-p_col-1.png
    │   ├── decomp-17-13-11-p_row-1.png
    │   ├── dstar-flame.png
    │   ├── fft_bgp.png
    │   ├── fft_hector_2a.png
    │   ├── fractal-grids.png
    │   ├── incompact3d-strong.png
    │   ├── incompact3d-weak.png
    │   ├── io_model-1.png
    │   ├── io_model-2.png
    │   ├── p3dfft_hector_phase1.png
    │   ├── shm1.png
    │   ├── shm2.png
    │   ├── vort-fractal.png
    │   └── yes.png
    ├── incompact3d.md
    ├── jugene.md
    ├── overview.md
    ├── p3dfft.md
    ├── papers
    │   └── 09C-Anton-Paper.pdf
    ├── samples.md
    ├── shared_memory.md
    └── vortex.md
├── examples
    ├── Makefile
    ├── README.md
    ├── fft_test_c2c
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── README
    │   ├── c06fxfe.r
    │   └── fft_test_c2c.f90
    ├── fft_test_r2c
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── README
    │   └── fft_test_r2c.f90
    ├── halo_test
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── README
    │   └── halo_test.f90
    ├── io_test
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── README
    │   ├── io_bench.f90
    │   ├── io_plane_test.f90
    │   ├── io_read.f90
    │   ├── io_test.f90
    │   ├── io_var_test.f90
    │   └── run_test.sh
    ├── non_blocking
    │   ├── Makefile
    │   ├── README.md
    │   ├── blocking.f90
    │   └── non_blocking.f90
    ├── p3dfft
    │   ├── Makefile
    │   ├── README.md
    │   └── p3dfft.f90
    ├── tecplot_view
    │   ├── 2decomp_decomp.png
    │   ├── Makefile
    │   ├── README
    │   └── tecplot_view.f90
    ├── test2d
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── README
    │   └── test2d.f90
    └── timing
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── README
    │   └── timing.f90
├── include
    └── .gitignore
├── lib
    └── Makefile
└── src
    ├── Makefile
    ├── Makefile.inc
    ├── Makefile.inc.BlueGene
    ├── Makefile.inc.Cray_XE
    ├── Makefile.inc.Fujitsu_SPARC64_VIIIfx
    ├── acml_plan.f90
    ├── alloc.f90
    ├── alloc_shm.c
    ├── cuda_fft_1m.cu
    ├── decomp_2d.f90
    ├── factor.f90
    ├── fft_acml.f90
    ├── fft_common.f90
    ├── fft_common_3d.f90
    ├── fft_cufft.f90
    ├── fft_essl.f90
    ├── fft_ffte.f90
    ├── fft_fftpack5.f90
    ├── fft_fftw3.f90
    ├── fft_fftw3_f03.f90
    ├── fft_generic.f90
    ├── fft_mkl.f90
    ├── glassman.f90
    ├── halo.f90
    ├── halo_common.f90
    ├── io.f90
    ├── io_read_one.f90
    ├── io_read_var.f90
    ├── io_write_every.f90
    ├── io_write_one.f90
    ├── io_write_plane.f90
    ├── io_write_var.f90
    ├── transpose_x_to_y.f90
    ├── transpose_y_to_x.f90
    ├── transpose_y_to_z.f90
    └── transpose_z_to_y.f90


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.a
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 2DECOMP_DIR=$(CURDIR)
 2 | 
 3 | .PHONY: lib examples clean install_dir
 4 | 
 5 | all: lib basic_test
 6 | 
 7 | lib:
 8 | 	cd lib; $(MAKE) $@
 9 | 
10 | examples:
11 | 	cd $@ ; $(MAKE) $@
12 | 
13 | basic_test: examples
14 | 	@echo "Basic Test target is examples"
15 | 
16 | clean:
17 | 	cd src; $(MAKE) $@
18 | 	cd lib; $(MAKE) $@
19 | 	cd include; rm -f *.mod
20 | 	cd examples; $(MAKE) $@
21 | 
22 | install_dir:
23 | 	mkdir -p $(DESTDIR)$(prefix)
24 | 	mkdir -p $(DESTDIR)$(prefix)/include
25 | 	mkdir -p $(DESTDIR)$(prefix)/lib
26 | 	mkdir -p $(DESTDIR)$(prefix)/doc
27 | 
28 | install: all install_dir
29 | 	cp $(2DECOMP_DIR)/include/*.mod $(DESTDIR)$(prefix)/include
30 | 	cp $(2DECOMP_DIR)/lib/lib*.a $(DESTDIR)$(prefix)/lib
31 | 	cp $(2DECOMP_DIR)/README $(DESTDIR)$(prefix)/README_2DECOMP
32 | 	cp $(2DECOMP_DIR)/doc/* $(DESTDIR)$(prefix)/doc
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 2DECOMP&FFT
 2 | 
 3 | 2DECOMP&FFT is a library for 2D pencil decomposition and highly scalable distributed 3D Fast Fourier Transforms
 4 | 
 5 | #### Table of Contents
 6 | 
 7 | - [Overview](doc/overview.md)
 8 | - Software
 9 |   - [Download](doc/download.md)
10 |   - [Installation](doc/installation.md)
11 | - [Domain decomposition strategies](doc/decomposition.md)
12 | - [Fast Fourier Transform (FFT) review](doc/fft.md)
13 | - APIs
14 |   - [2D pencil decomposition APIs](doc/api_decomposition.md)
15 |   - [FFT APIs](doc/api_fft.md)
16 |   - [Halo cell support](doc/api_halo.md)
17 |   - [Parallel I/O](doc/api_io.md)
18 |   - [Non-blocking communication](doc/api_nonblocking.md)
19 | - Performance benchmarks
20 |   - [2DECOMP&FFT vs. P3DFFT](doc/p3dfft.md)
21 |   - [HECToR](doc/hector.md)
22 |   - [JUGENE](doc/jugene.md)
23 | - Applications and case studies
24 |   - [Sample applications](doc/samples.md)
25 |   - [Case study - Vortex generation using FFT](doc/vortex.md)
26 |   - [Incompact3D - a CFD application for turbulence research](doc/incompact3d.md)
27 |   - [DSTAR - a CFD application for studies of turbulence, aeroacoustics, combustion and multiphase flow](doc/dstar.md)
28 | - Miscellaneous technical subjects
29 |   - [Interactive decomposition map](https://monet.nag.co.uk/2decomp/decomp_map.php) 
30 |   - [Using the 1D slab decompostion mode](doc/1d_decomp.md)
31 |   - [Shared-memory optimisation](doc/shared_memory.md)
32 |   - [Process grid](doc/pgrid.md)
33 |   - [Padded all-to-all optimisation](doc/padded_alltoall.md)
34 |   - [Precision guidelines](doc/precision.md)
35 |   - [Memory comsumption](doc/memory.md)
36 | 
37 | #### Software License
38 | 
39 | Copyright &copy; 2011-2021, The Numerical Algorithms Group (NAG)
40 | All rights reserved.
41 | 
42 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
43 | 
44 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
45 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
46 | - Neither the name of the copyright owner nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
47 | 
48 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/doc/1d_decomp.md:
--------------------------------------------------------------------------------
 1 | ## Using the 1D Decomposition Mode
 2 | 
 3 | While 2DECOMP&FFT implements a general-purpose 2D pencil decomposition library, 1D slab decomposition remains an attractive option for certain applications.
 4 | 
 5 | - For small-to-medium size applications, which are unlikely to hit the constraint imposed by the decomposition strategy, having simpler and more efficient implementations is often preferable.
 6 | - There are applications with algorithms that can not be easily split to multiple 1D operations (for example a Poisson solver using 1D FFT in a homogeneous direction and a 2D multigrid solver in the two remaining directions).
 7 | - For applications with multiple levels of parallelism, it may be more convenient to use 1D decomposition for the coarse-grain level data distribution. Then on each slab fine-grain parallelism can be applied (e.g. using OpenMP on shared-memory node).
 8 | 
 9 | <p align="center">
10 |   <img src="images/2d_decomp.png"><br>
11 |   <span style="font-size:smaller;">Figure 1: 2D domain decomposition example using a P<sub>row</sub>*P<sub>col</sub>=4*3 processor grid: (a) X-pencil; (b) Y-pencil; (c) Z-pencil.</span>
12 | </p>
13 | 
14 | Fig.1 shows an arbitrary 3D domain partitioned using a 2D processor grid of P<sub>row</sub>=4 by P<sub>col</sub>=3. Clearly 1D decomposition is just a special case of 2D decomposition with either P<sub>row</sub>=1 or P<sub>col</sub>=1. In both cases, the communication algorithms can be simplified significantly.
15 | 
16 | <p align="center">
17 |   <img src="images/decomp-17-13-11-p_row-1.png" border="1">
18 |   <img src="images/decomp-17-13-11-p_col-1.png" border="1"><br>
19 |   <span style="font-size:smaller;">Figure 2: 1D slab decomposition of the same domain as in Figure 1<br> Left: P<sub>row</sub>=1; Right: P<sub>col</sub>=1.</span>
20 | </p>
21 | 
22 | If P<sub>row</sub>=1, state (a) and (b) are identical, as shown in Fig.2 (left); similarly, for P<sub>col</sub>=1, state (b) and (c) are identical, shown in Fig.2 (right). So the 1D decomposition can be defined as either slabs in Y and Z or slabs in X and Y. The former is often preferred as better cache efficiency may be achieved by always keeping the X direction in local memory.
23 | 
24 | When using the 2DECOMP&FFT library with 1D decomposition, half of the global transpositions can be dropped, resulting in more efficient code. This optimisation was introduced in version 1.1 of 2DECOMP&FFT.
25 | 
26 | Finally, note that one can also rely on this arrangement to perform large distributed 2D simulations. For example one option is to define the 2D data sets in an X-Y plane by setting nz=1 and P<sub>col</sub>=1 (arrays are still to be declared as 3D to satisfy the programming interface of the library).


--------------------------------------------------------------------------------
/doc/api_fft.md:
--------------------------------------------------------------------------------
 1 | ## API for Parallel Three-dimensional FFTs
 2 | 
 3 | #### Initialisation
 4 | 
 5 | To use the FFT programming interface, first of all, one additional Fortran module has to be used:
 6 | ```
 7 |       use decomp_2d_fft
 8 | ```
 9 | 
10 | The FFT interface is built on top of the 2D decomposition library which, naturally, needs to be initialised first:
11 | ```
12 |       call decomp_2d_init(nx, ny, nz, P_row, P_col)
13 | ```
14 | where *nx\*ny\*nz* is the 3D domain size and *P_row \* P_col* is the 2D processor grid. 
15 | 
16 | Next one needs to initialise the FFT interface by:
17 | ```
18 |       call decomp_2d_fft_init
19 | ```
20 | 
21 | The initialisation routine handles planing for the underlying FFT engine (if supported) and defines global data structures (such as temporary work spaces) for the computations. By default, it assumes that physical-space data is distributed in X-pencil format. The corresponding spectral-space data is stored in transposed Z-pencil format after the FFT. To give applications more flexibility, the library also supports the opposite direction, if an optional parameter is passed to the initialisation routine:
22 | ```
23 |       call decomp_2d_fft_init(PHYSICAL_IN_Z)
24 | ```
25 | 
26 | Physical-space data in Y-pencil is not an option as it would require additional expensive transpositions which does not make economical sense. There is a third and the most flexible form of the initialisation routine:
27 | ```
28 |       call decomp_2d_fft_init(pencil, n1, n2, n3)
29 | ```
30 | It allows applications to initialise FFT computations using an arbitrary problem size *n1\*n2\*n3*, which can be different from the main domain size *nx\*ny\*nz*.
31 | 
32 | #### Complex-to-complex Transforms
33 | 
34 | The library supports three-dimensional FFTs whose data is distributed as 2D pencils and stored in ordinary ijk-ordered 3D arrays across processors. For complex-to-complex (c2c) FFTs, the user interface is:
35 | ```
36 |       call decomp_2d_fft_3d(in, out, direction)
37 | ```
38 | where direction can be either `DECOMP_2D_FFT_FORWARD` (-1) for forward transforms, or `DECOMP_2D_FFT_BACKWARD` (1) for backward transforms. The input array `in` and output array `out` are both complex and have to be either a X-pencil/Z-pencil combination or vice versa, depending on the direction of FFT and how the FFT interface is initialised earlier (`PHYSICAL_IN_X`, the optional default, or `PHYSICAL_IN_Z`).
39 | 
40 | #### Real-to-complex & Complex-to-Real Transforms
41 | 
42 | While the c2c interface is already in the simplest possible form, for r2c and c2r transforms, the 3D FFT interface can be used in a more compact form:
43 | ```
44 |       call decomp_2d_fft_3d(in, out)
45 | ```
46 | Here if `in` is a real array and `out` a complex array, then a forward FFT is implied. Similarly a backward FFT is computed if `in` is a complex array and `out` a real array.
47 | 
48 | When real input is involved, the corresponding complex output satisfies so-called ***Hermitian redundancy*** - i.e. some output values are complex conjugates of others. Taking advantage of this, FFT algorithms can normally compute r2c and c2r transforms twice as fast as c2c transforms while only using about half of the memory. Unfortunately, the price to pay is that application's data structures have to become slightly more complex. For a 3D real input data set of size nx*ny*nz, the complex output can be held in an array of size *(nx/2+1)\*ny\*nz*, with the first dimension being cut roughly in half<a href="#note1" id="note1ref"><sup>1</sup></a>. Applications can either rely on the advanced interface described in the [decomposition API](api_decomposition.md), or use the following utility routine to distribute the complex output as 2D pencils:
49 | ```
50 |       call decomp_2d_fft_get_size(start,end,size)
51 | ```
52 | 
53 | Here all three arguments are 1D array of three elements, returning to the caller the starting index, ending index and size of the sub-domain held by the current processor - information very similar to the *start/end/size* variables defined in the main decomposition library.
54 | 
55 | Note that the complex output arrays obtained from X-pencil and Z-pencil input do not contain identical information (see the output of the fft_test_r2c [sample application](samples.md)). However, if 'Hermitian redundancy' is taken into account, no physical information is lost and the real input can be fully recovered through the corresponding inverse FFT from either complex array.
56 | 
57 | Also note that 2DECOMP&FFT does not scale the transforms. So a forward transform followed by a backward transform will not recover the input unless applications normalise the results by the sizes of the transforms.
58 | 
59 | #### Finalisation
60 | 
61 | Finally, to release the memory used by the FFT interface:
62 | ```
63 |       call decomp_2d_fft_finalize
64 | ```
65 | 
66 | It is possible to re-initialise the FFT interface in the same application at the later stage after it has been finalised, if this becomes necessary.
67 | 
68 | To obtain first-hand experience on the FFT interface, users are advised to examine the [sample applications](samples.md)) distributed with the library.
69 | 
70 | <hr size="1">
71 | 
72 | <a id="note1" href="#note1ref"><sup>1</sup></a>The storage is for Fortran. In C/C++, the last dimension has to be cut in half due to different memory pattern. For Z-pencil input, the complex output is of size *nx\*ny\*(nz/2+1)* instead. Also note that the integer division is rounded down.


--------------------------------------------------------------------------------
/doc/api_halo.md:
--------------------------------------------------------------------------------
 1 | ## API for Halo-cell Support
 2 | 
 3 | While most of the communications using the 2D decomposition are via the global transposition calls, it may become necessary for neighbouring blocks to exchange data explicitly. One such scenario is in CFD applications performing large-eddy simulations (LES). While most spatial derivatives are computed using the implicit formulation to achieve a high order of accuracy, some derivatives may be evaluated quickly using local stencils and explicit formulae, such as those used by sub-grid scale models (a model by definition does not require higher-order of accuracy).
 4 | 
 5 | The halo-cell support API provides data structures and nearest-neighbour communication routines that support explicit message passing between neighbouring pencils. As with the rest of the 2DECOMP&FFT library, the API is designed to be very user-friendly:
 6 | 
 7 | ```
 8 |       call update_halo(var, var_halo, level)
 9 | ```
10 | Here the first parameter `var`, a 3D input array, contains the normal pencil-distributed data as defined by the decomposition library. After invoking the routine, the second parameter `var_halo`, an output, returns all original data plus halo data from the neighbouring processes. One can imagine that pencils are now fatter and overlap with the neighbouring pencils. The third parameter `level` defines how many layers of overlapping is required. `var_halo` should be defined from the calling routine as either a 3D allocatable array or pointer. Its memory space will be calculated and allocated by the library. When the routine returns, `var_halo` can be referenced by the calling program using the normal *i,j,k* indices.
11 | 
12 | As with the rest of the 2DECOMP&FFT library, a more general form of the routine is available (implemented using Fortran optional arguments):
13 | ```
14 |       call update_halo(var, var_halo, level, opt_decomp, opt_global)
15 | ```
16 | This supports halo-cell communications among pencils with arbitrary global sizes, as described by `opt_decomp`, the decomposition object. The last optional parameter `opt_global` is required (to be set to `.true.`) if global coordinate is used to define the pencils, i.e. the input array `var` is defined using the *start/end* variables rather than the *size* variables. This ensures the coordinate systems used by `var` and `var_halo` are consistent.
17 | 
18 | To demonstrate the use of this API, here is an example that computes spatial derivatives:
19 | 
20 | ```
21 |       ! to calculate dv/dy, assume that variables are stored in X-pencil
22 |       
23 |       real, allocatable, dimension(:,:,:) :: v, v_halo, dvdy
24 |       
25 |       allocate(v(xsize(1), xsize(2), xsize(3)))
26 |       allocate(dvdy(xsize(1), xsize(2), xsize(3)))
27 |       
28 |       call update_halo(v,v_halo,level=1)
29 |       
30 |       ! compute derivatives
31 |       do k=1,xsize(3)
32 |          do j=1,xsize(2)
33 |             do i=1,xsize(1)
34 |                dvdy(i,j,k) = (v_halo(i,j+1,k)-v_halo(i,j-1,k)) / dy
35 |             end do
36 |          end do
37 |       end do
38 | ```
39 | 
40 | As seen, the variables are stored in X-pencil and derivatives are to be evaluated over distributed data along Y direction using a central finite difference scheme. This is the perfect situation to use the halo-cell support API. Using global transpositions would be unnecessarily too expensive for this type of local/explicit calculations. After the call to `update_halo`, it is safe to refer to the *j+1* and *j-1* indices on array `v_halo` in order to compute the derivatives.
41 | 
42 | Note that for the pencils bordering the computational domain, it is up to the application to handle the physical boundary conditions. The library does support periodic condition, i.e. for processes near the boundary of the computational domain, a call to the update_halo routine will fill the halo cells of one side with values from the other side of the domain, when periodic condition is required. To specify periodic condition, one need to initialise the decomposition library with additional information:
43 | ```
44 |       call decomp_2d_init(nx, ny, nz, P_row, P_col, periodic_bc)
45 | ```
46 | The extra parameter `periodic_bc` is a 1D array containing 3 logical values that specify which dimension should be periodic. This parameter is optional and is only used with the halo-cell API. The domain decomposition should otherwise behaves exactly as normal.
47 | 
48 | Like the rest of 2DECOMP&FFT, the halo-cell support API is implemented in a black-box fashion. The library internally handles the communications between neighbouring blocks using the standard MPI non-blocking point-to-point communications.


--------------------------------------------------------------------------------
/doc/api_nonblocking.md:
--------------------------------------------------------------------------------
 1 | ## Non-blocking API for Overlap of Communication and Computation
 2 | 
 3 | Transpose-based parallelisation is inherently communication intensive. For large-scale applications, it is not unusual that communication accounts for more than half of the total cost. Application performance may be significantly improved if algorithms can be redesigned to allow overlap of communication and computation. From version 1.4, 2DECOMP&FFT provides a low-level communication API to facilitate such effort.
 4 | 
 5 | The API is based on ideas of non-blocking MPI collectives (such as MPI_IALLTOALL and MPI_IALLTOALLV) introduced in MPI version 3.
 6 | 
 7 | [Old users of 2DECOMP&FFT may recall the use of third-party library libNBC, which implemented the non-blocking MPI collectives using existing MPI 1 functions, to support such features. Using third party libraries is no longer necessary.]
 8 | 
 9 | ### The API
10 | 
11 | Each of the four transposition routines in the base [decomposition library](api_decomposition.md) contains three key elements: algorithm to pack the MPI send buffers, MPI_ALLTOALL(V) communication, and algorithms to unpack the MPI receive buffers. When the non-blocking version of the MPI_ALLTOALL(V) is used, these routines are broken into smaller routines. For example, when transposing from X pencils to Y pencils, the blocking version of the communication routine is:
12 | ```
13 | 	call transpose_x_to_y(in, out, decomp)
14 | ```
15 | The corresponding non-blocking routines are:
16 | ```
17 | 	call transpose_x_to_y_start(handle, in, out, sbuf, rbuf, decomp)
18 | 	call transpose_x_to_y_wait(handle, in, out, sbuf, rbuf, decomp)
19 | ```
20 | The *start* routine packs the MPI send buffer, starts the non-blocking MPI_ALLTOALL(V) communication, and returns immediately. Later, a call to the corresponding *wait* routine ensures the communication is completed and then unpacks the MPI receive buffer. The first parameter `handle` is used to uniquely identify each communication session. Because several non-blocking communications may be ongoing at the same time, each has to define its own send buffer *sbuf* and receive buffer *rbuf*<a href="#note1" id="note1ref"><sup>1</sup></a>. It is up to the applications to supply (and if possible, reuse) these buffers, the size and shape of which should match the corresponding input array in and output array out. Between a *start* call and the corresponding *wait* call, the content of *sbuf* should not be modified and the content of *out* should not be referenced, to avoid unpredictable results. Other unrelated computations may be carried out while the communication is ongoing.
21 | 
22 | There are similar *start/wait* routines defined to all other transposition routines.
23 | 
24 | These routines are useful on systems with dedicated networking hardware to process the communication stack. On systems without such hardware, one has to call `MPI_TEST` explicitly from the user thread to progress the non-blocking communication. A utility routine is provided for this purpose:
25 | ```
26 | 	call transpose_test(handle)
27 | ```
28 | This needs to be called from time to time from the computational part of application, in order to progress the communication identified by `handle`. Of course, the practical difficulty is where and how frequently this should be called, a matter that is entirely application dependent. 
29 | 
30 | Currently, the author is not aware of any stable and high-quality software implementation that progresses all-to-all type of communication asynchronously<a href="#note2" id="note2ref"><sup>2</sup></a>.
31 | 
32 | #### A Sample Application
33 | 
34 | To demonstrate the use of this API, a sample application (non_blocking) is provided to compute multiple independent FFTs, using both the blocking and non-blocking versions of the communication library. The idea of overlapping the communication of one 3D FFT and the computation of another, as described by Kandalla et al.[1], is implemented. The algorithm's pseudo-code looks like:
35 | ```
36 |       1D FFT in X for V_1
37 |       call transpose_x_to_y for V_1 (blocking)
38 |       1D FFT in Y for V_1
39 |       call transpose_y_z_start for V_1
40 |       do k=2,N
41 |         1D FFT in X for V_k
42 |         call transpose_x_to_y for V_k (blocking)
43 |         1D FFT in Y for V_k
44 |         call transpose_y_to_z_start for V_k
45 |         call transpose_y_to_z_wait for V_(k-1)
46 |         1D FFT in Z for V_(k-1)
47 |       end do
48 |       call transpose_y_to_z_wait for V_N to complete
49 |       1D FFT in Z for V_N
50 | ```
51 | 
52 | This algorithm compute multiple independent 3D FFTs on dataset *V<sub>k</sub> (k=1,N)*. As can be seen, the Y=>Z transpose for dataset *k* and the computation of 1D FFT in Z for dataset *k-1* are overlapped. Note that in the sample application the computations are done using loops of 1D FFTs, rather than with FFTW's advanced interface that allows multiple 1D FFTs to be done in one go. This design is to allow `MPI_TEST` calls to be inserted to progress the communication.
53 | 
54 | It is up to the application developers to identify opportunities in their algorithms that may benefit from this non-blocking API.
55 | 
56 | #### References
57 | 
58 | [1] K. Kandalla, H. Subramoni, K. Tomko, D. Pekurovsky, S. Sur and D.K. Panda, "High-performance and scalable non-blocking all-to-all with collective offload on InfiniBand clusters: a study with parallel 3D FFT", *Computer Science - Research and Development*, vol. 26(3-4):237-246, 2011.
59 | 
60 | 
61 | ---
62 | 
63 | <a id="note1" href="#note1ref"><sup>1</sup></a>The blocking version also needs to define send/recv buffers. But because there is only one communication at any time, the buffers are temporarily allocated as required by the library, or for performance reason defined globally and shared by multiple communication calls.
64 | 
65 | <a id="note2" href="#note2ref"><sup>2</sup></a>There are *asynchronous progress control* in Intel MPI library. However, the only supported non-blocking collective calls are *Ibcast*, *Ireduce* and *Iallreduce*.


--------------------------------------------------------------------------------
/doc/decomposition.md:
--------------------------------------------------------------------------------
 1 | ## Domain Decomposition Strategies
 2 | 
 3 | The discussions here apply to many applications based on three-dimensional Cartesian meshes (or to be exact, having a Cartesian topology), and in particular those using spatially implicit numerical schemes. For example, a compact finite difference scheme often results in solving a tridiagonal linear system when evaluating spatial derivatives or doing spatial interpolations; a spectral code often involves performing a Fast Fourier Transform along a global mesh line.
 4 | 
 5 | There are two approaches to performing such computations on distributed-memory systems. One can either develop distributed algorithms (such as a parallel tridiagonal solver or a parallel FFT algorithm working on distributed data), or one can at runtime redistribute (transpose) data among processors in order to apply serial algorithms in local memory. The second approach is often preferred due to its simplicity: existing serial algorithms (hopefully already optimised for a single CPU) remain unchanged; porting serial code can be straight-forward as much of the original code logic still holds, and the only major addition is the data transposition procedures.
 6 | 
 7 | #### 1D Slab Decomposition
 8 | 
 9 | In early days, many applications implemented the above idea using 1D domain decomposition (also known as slab decomposition). In Fig.1, a 3D domain is arbitrarily chosen to be decomposed in Y and X directions. It can be seen that in state (a), any computations in the X-Z planes can be done in local memories while data along a Y mesh-line is distributed. When it is necessary to calculate along Y mesh-lines (say to evaluate Y-derivatives, or to perform 1D FFTs along Y), one can redistribute the data among processors to reach state (b), in which any computation in Y becomes 'local'. If using standard MPI library, switching between state (a) and (b) can be achieved using the MPI_ALLTOALL(V) routines.
10 | 
11 | <p align="center">
12 |   <img src="images/1d_decomp.png"><br>
13 |   <span style="font-size:smaller;">Figure 1. 1D domain decomposition example using 4 processors: (a) decomposed in Y direction; (b) decomposed in X direction.</span>
14 | </p>
15 | 
16 | A 1D decomposition, while quite simple, has some limitations, especially for large-scale applications. Given a cubic mesh of size N^3 , one obvious constraint is that the maximum number of processors N<sub>proc</sub> that can be used in a 1D decomposition is N as each slab has to contain at least one plane of data. For a cubic mesh with 1 billion points (which is very large but becomes increasingly common in CFD applications, such as those for fundamental turbulence studies), the constraint is N<sub>proc</sub><=1000. This is a serious limitation as most supercomputers today have at least tens of thousands of cores. Large applications are also likely to hit the memory limit when each processor handles too much workload.
17 | 
18 | #### 2D Pencil Decomposition
19 | 
20 | A 2D pencil decomposition (also known as a 'drawer' or 'block' decomposition) is a natural extension to 1D decompositions. Fig.2 shows that the same 3D domain as in Fig.1 can be partitioned in two dimensions. States (a), (b) and (c) are referred to as X-pencil, Y-pencil and Z-pencil arrangements, respectively. While a 1D decomposition algorithm swaps between two states, in a 2D decomposition one needs to traverse 3 different states using 4 global transpositions ((a) =>(b) => (c) => (b) => (a)).
21 | 
22 | <p align="center">
23 |   <img src="images/2d_decomp.png"><br>
24 |   <span style="font-size:smaller;">Figure 2: 2D domain decomposition example using a 4*3 processor grid: (a) X-pencil; (b) Y-pencil; (c) Z-pencil.</span>
25 | </p>
26 | 
27 | An interactive view of the 2D pencil decomposition can be found from this [web application](https://monet.nag.co.uk/2decomp/decomp_map.php).
28 | 
29 | Again MPI_ALLTOALL(V) can be used to realise the transpositions. However it is significantly more complex than the 1D case. There are two separate communicator groups. For a P<sub>row</sub>*P<sub>col</sub> processor grid: P<sub>row</sub> groups of P<sub>col</sub> processors need to exchange data among themselves for (a) <=> (b) ; P<sub>col</sub> groups of P<sub>row</sub> processors need to exchange data among themselves for (b) <=> (c). For example, the red, green and blue processes in state (b) and (c) occupy exactly the same physical domain.
30 | 
31 | On one hand, the proper implementation of the communication routines can be quite tricky. For example the communications are very sensitive to the orientations of pencils and their associated memory patterns. The packing and unpacking of memory buffers for the MPI library calls must be handled with great care for efficiency. These are pure software engineering topics, which are almost certainly irrelevant to the scientific researches conducted by the applications.
32 | 
33 | On the other hand, although the idea of 2D decomposition has long been established, its adoption in real applications was not essential until recently, when ordinary researchers can realistically expect to regularly use thousands of cores on major supercomputers, therefore hitting the limitation imposed by 1D decomposition.
34 | 
35 | These motivated the author to create the 2DECOMP&FFT library - a general-purpose domain decomposition library that can be reused by many applications - to handle these technical issues properly and to hide most software-engineering details from application developers who can concentrate on their scientific studies.
36 | 
37 | 


--------------------------------------------------------------------------------
/doc/dstar.md:
--------------------------------------------------------------------------------
 1 | ## DSTAR - Direct Simulation of Turbulence And Reaction
 2 | 
 3 | DSTAR is a high-order code for **D**irect **S**imulation of **T**urbulence **A**nd **R**eaction, initially developed by Professor Kai Luo (Southampton University) and extended by co-workers over the past 20 years. It solves the complete Navier-Stokes equations as well as conservation equations for energy and chemical species. Modules for both direct numerical simulation (DNS) and large eddy simulation (LES) have been developed for high-fidelity simulation of turbulence, aeroacoustics, turbulent combustion, multiphase turbulent flow and combustion. DSTAR incorporates highly accurate numerical techniques such as 6th-order spatial discretisation, non-reflecting boundary conditions and low-storage Runge-Kutta explicit time-advancement.
 4 | 
 5 | Parallel algorithms include MPI and mixed MPI/OpenMP. Parallel operations can be performed in 1D or 2D decomposition as supported by the 2DECOMP&FFT library. In this case, only the decomposition API is required. The code solves the fluid problem in compressible form and there is no tricky Poisson problem involved. The code was ran successfully using 6144 cores in pure MPI mode and 18432 cores in hybrid mode on HECToR.
 6 | 
 7 | A typical scientific application is shown below:
 8 | 
 9 | <p align="center">
10 |   <img src="images/dstar-flame.png"><br>
11 |   <span style="font-size:smaller;">Large eddy simulation of a turbulent diffusion flame interacting with evaporating water droplets.
12 |   </span>
13 | </p>
14 | 
15 | The mathematical framework of DSTAR is described in details in:
16 | 
17 | - K. H. Luo, "Combustion effects on turbulence in a partially premixed supersonic diffusion flame", *Combustion and Flame*, vol. 119(4):417-435, 1999.
18 | - J. Xia, K. H. Luo and S. Kumar, "Large-Eddy Simulation of Interactions Between a Reacting Jet and Evaporating Droplets", *Flow Turbulence and Combustion*, vol. 80(1):133-153, 2008.
19 | - J. Xia and K. H. Luo, "Conditional statistics of inert droplet effects on turbulent combustion in reacting mixing layers", *Combustion Theory and Modelling*, vol. 13(5):901-920, 2009.
20 | 
21 | Details of the parallelisation was reported at the 2011 Cray User Group conference.
22 | 
23 | - L. Anton, N. Li and K. H. Luo, "A study of scalability performance for hybrid mode computation and asynchronous MPI transpose operation in DSTAR", *Cray User Group 2011 conference*, Fairbanks, 2011. [PDF](papers/09C-Anton-Paper.pdf)


--------------------------------------------------------------------------------
/doc/images/1d_decomp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/1d_decomp.png


--------------------------------------------------------------------------------
/doc/images/2d_decomp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/2d_decomp.png


--------------------------------------------------------------------------------
/doc/images/Brachos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/Brachos.png


--------------------------------------------------------------------------------
/doc/images/compact.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/compact.png


--------------------------------------------------------------------------------
/doc/images/decomp-17-13-11-p_col-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/decomp-17-13-11-p_col-1.png


--------------------------------------------------------------------------------
/doc/images/decomp-17-13-11-p_row-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/decomp-17-13-11-p_row-1.png


--------------------------------------------------------------------------------
/doc/images/dstar-flame.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/dstar-flame.png


--------------------------------------------------------------------------------
/doc/images/fft_bgp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/fft_bgp.png


--------------------------------------------------------------------------------
/doc/images/fft_hector_2a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/fft_hector_2a.png


--------------------------------------------------------------------------------
/doc/images/fractal-grids.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/fractal-grids.png


--------------------------------------------------------------------------------
/doc/images/incompact3d-strong.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/incompact3d-strong.png


--------------------------------------------------------------------------------
/doc/images/incompact3d-weak.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/incompact3d-weak.png


--------------------------------------------------------------------------------
/doc/images/io_model-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/io_model-1.png


--------------------------------------------------------------------------------
/doc/images/io_model-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/io_model-2.png


--------------------------------------------------------------------------------
/doc/images/p3dfft_hector_phase1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/p3dfft_hector_phase1.png


--------------------------------------------------------------------------------
/doc/images/shm1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/shm1.png


--------------------------------------------------------------------------------
/doc/images/shm2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/shm2.png


--------------------------------------------------------------------------------
/doc/images/vort-fractal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/vort-fractal.png


--------------------------------------------------------------------------------
/doc/images/yes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/yes.png


--------------------------------------------------------------------------------
/doc/jugene.md:
--------------------------------------------------------------------------------
 1 | ## Benchmarks on JUGENE
 2 | 
 3 | This set of benchmarked was performed in May 2010 on JUGENE, the big IBM Blue Gene/P system at Jülich Supercomputing Centre in Germany. The system ranked world No. 4 by that time, with a Linpack capability of 825.5 TFLOPs.
 4 | 
 5 | The work was made possible with the assistance of high performance computing resources (Tier-0) provided by PRACE. 2DECOMP&FFT was ported onto the Blue Gene/P. One major improvement achieved was the implementation of the FFT interface using ESSL, a high-performance math library native to IBM systems. The FFT interface was then benchmarked on problem sizes up to 8192^3 using up to 131072 cores.
 6 | 
 7 | <p align="center">
 8 |   <img src="images/fft_bgp.png"><br>
 9 |   <span style="font-size:smaller;">Scaling of the FFT interface on Blue Gene/P JUGENE.</span>
10 | </p>
11 | 
12 | As seen, the code scales extremely well on the system for all problem sizes. The apparent super-linear scaling for the 1024^3 case is understood to be related to the Torus network configurations that favour larger jobs.
13 | 


--------------------------------------------------------------------------------
/doc/overview.md:
--------------------------------------------------------------------------------
 1 | ## Overview
 2 | 
 3 | ### Introduction
 4 | 
 5 | The 2DECOMP&FFT library is a software framework in Fortran to build large-scale parallel applications. It is designed for applications using three-dimensional structured mesh and spatially implicit numerical algorithms. At the foundation it implements a general-purpose 2D pencil decomposition for data distribution on distributed-memory platforms. On top, it provides a highly scalable and efficient interface to perform three-dimensional distributed FFTs. The library is optimised for supercomputers and scales well to hundreds of thousands of cores. It relies on MPI but provides a user-friendly programming interface that hides communication details from application developers.
 6 | 
 7 | ### Features
 8 | 
 9 | Here is a list of 2DECOMP&FFT's main features:
10 | 
11 | * General-purpose 2D pencil decomposition module to support building large-scale parallel applications on distributed memory systems.
12 | * Highly scalable and efficient distributed Fast Fourier Transform module, supporting three dimensional FFTs (both complex-to-complex and real-to-complex/complex-to-real).
13 | * Halo-cell support allowing explicit message passing between neighbouring blocks.
14 | * Parallel I/O module to support the handling of large data sets.
15 | * Shared-memory optimisation on the communication code for multi-core systems.
16 | 
17 | 2DECOMP&FFT distinguishes itself from many other popular distributed FFT libraries by exposing its communication APIs upon which many other parallel algorithms can be built.
18 | 
19 | 2DECOMP&FFT is designed to be:
20 | 
21 | * **Scalable** - The library and applications built upon it are known to scale to o(10^5) cores on major supercomputers.
22 | * **Flexible** - Software framework to support building higher-level libraries and many types of applications.
23 | * **User-friendly** - Black-box implementation and very clean application programming interface hiding most communication details from applications.
24 | * **Portable** - Code tested on many major supercomputing architectures. The FFT library interfaces with almost every popular external FFT implementations.
25 | 
26 | ### History
27 | 
28 | This software package was originally derived from several projects funded under the HECToR Distributed Computational Science and Engineering (dCSE) programme operated by NAG Ltd. HECToR - a UK Research Councils' high end computing service - served as the UK's national supercomputer for open science between 2008 and 2014.
29 | 
30 | The active development of this library completed in 2012. It has been in production use in many research applications since then. The code quality appears to be very good with almost no major bugs reported over the years. Its performance remains very competitive as reported by a [recent study](https://www.icl.utk.edu/files/publications/2021/icl-utk-1490-2021.pdf). 
31 | 
32 | Since August 2021, this project is hosted in NAG's official GitHub account to facilitate future development and maintenance.
33 | 
34 | ### Citation
35 | 
36 | If you wish to cite this work, you are recommended to use the following paper:
37 | 
38 | * N. Li and S. Laizet, "2DECOMP&FFT – A highly scalable 2D decomposition library and FFT interface", Cray User Group 2010 conference, Edinburgh, 2010.


--------------------------------------------------------------------------------
/doc/p3dfft.md:
--------------------------------------------------------------------------------
 1 | ## 2DECOMP&FFT vs. P3DFFT
 2 | 
 3 | P3DFFT is probably the most well-known open-source distributed FFT library. The project was initiated at San Diego Supercomputer Center at UCSD by Dmitry Pekurovsky. It is highly efficient and it has been widely adopted by scientists doing large-scale simulations, such as high-resolution turbulence simulations.
 4 | 
 5 | P3DFFT was actually ported onto HECToR (my development system) at the early stage of the 2DECOMP&FFT project. Fig. 1 shows its good scaling on the old hardware (back in early 2009, the system was a Cray XT4 using dual-core AMD Opteron processors and Cray SeaStar interconnect).
 6 | 
 7 | <p align="center">
 8 |   <img src="images/p3dfft_hector_phase1.png"><br>
 9 |   <span style="font-size:smaller;">Figure 1. P3DFFT scaling on Cray XT4 HECToR.</span>
10 | </p>
11 | 
12 | What motivated the author to develop a new and somewhat competing library were the following:
13 | - P3DFFT is an FFT-only package. It is not designed as a general-purpose 2D decomposition library and its communication routines are not designed to be user callable. 2DECOMP&FFT provides a general-purpose decomposition library to support the building of a variety of applications (the applications do not necessarily need to use FFT).
14 | - P3DFFT appears to be targeting applications using spectral method and only performs real-to-complex and complex-to-real transforms. 2DECOMP&FFT is also able to support complex-to-complex transforms. **Note that the new generation of P3DFFT library (dubbed P3DFFT++ or P3DFFT v.3) is a generalization of the concept of P3DFFT and does support complex-to-complex transforms.**
15 | - The separation of communication layer and the FFT layer in 2DECOMP&FFT makes it possible to build additional libraries (such as transforms using Chebyshev or Jacobian basis functions, or a general-purpose PDE solver). It is also easier to implement advanced software features (such as the shared-memory implementation) where only the low-level communication code needs to be updated.
16 | 
17 | #### Performance Comparison
18 | 
19 | The parallel performance of 2DECOMP&FFT and P3DFFT has been studied in great detail in a [MSc thesis by E. Brachos at University of Edinburgh](https://static.epcc.ed.ac.uk/dissertations/hpc-msc/2010-2011/EvangelosBrachos.pdf). Fig. 2 shows a set of benchmark on r2c/c2r transforms of size 256^3. The MPI interface of FFTW 3.3 was also examined, although it can only run in 1D slab decomposition mode.
20 | 
21 | <p align="center">
22 |   <img src="images/Brachos.png"><br>
23 |   <span style="font-size:smaller;">Figure 2. Speedup of 2DECOMP&FFT, P3DFFT and FFTW 3.3's MPI interface.</span>
24 | </p>
25 | 
26 | The performance difference between 2DECOMP&FFT and P3DFFT is often shown to be marginal, although the best 2D processor grid to achieve the optimal performance can be very different due to the different internal architecture of the two libraries.
27 | 
28 | The scalability and the absolute performance of both 2DECOMP&FFT and P3DFFT are better than FFTW 3.3 running in MPI mode. FFTW is, however, much more efficient in OpenMP mode. This suggests that a hybrid implementation may be the future direction of 2DECOMP&FFT.


--------------------------------------------------------------------------------
/doc/papers/09C-Anton-Paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/papers/09C-Anton-Paper.pdf


--------------------------------------------------------------------------------
/doc/samples.md:
--------------------------------------------------------------------------------
 1 | ## Sample Applications
 2 | 
 3 | A list of sample applications are distributed with 2DECOMP&FFT package to validate the library and to demonstrate the proper use of it.
 4 | 
 5 | - **test2d** - This application is to test the base 2D pencil decomposition module. It arranges to transpose data among the three pencil orientations and validate the result against a copy of the global data held on each process. It also demonstrates the use of the parallel I/O library - regardless of how the global data is distributed (X-pencil, Y-pencil or Z-pencil), when processes write to files collectively using the I/O library the resulting files should be identical.
 6 | - **fft_test_c2c** - This is a simple application to validate the complex-to-complex FFT programming interface. Its input is taken from the example program of NAG library routine C06FXF (also for c2c FFTs). Its output should match the C06FXF output exactly.
 7 | - **fft_test_r2c** - This is to test the FFT library's real-to-complex and complex-to-real interface. It generates some random input, computes a serial 3D r2c transform on rank 0 to generate reference data. It then computes two sets of transforms on distributed data, with the input distributed in X-pencil and Z-pencil, respectively. In both cases, a r2c transform is computed first and its result on rank 0 printed out (which should contain a subset of numbers found in the reference serial transform output). An inverse c2r transform is then followed to recover the input to machine accuracy (system dependent, but somewhere around 10<sup>-6</sup> for single precision and 10<sup>-15</sup> for double precision).
 8 | - **timing** - This application can be used to benchmark the FFT library performance when porting it to a new system. It performs both c2c and r2c/c2r benchmarks, collects timing information and validates the results.
 9 | - **halo_test** - This application demonstrate the use of the halo-cell support API. It calculates the divergence of a random field using an explicit 3-stencil finite different method. The parallel program relies on two different communication methods: (1) the global transposition routines; (2) the halo-cell support API. Both methods should return exactly the same results. Of course the halo-cell method is more efficient for such a stencil-based calculation.
10 | - **io_test** - A collection of sample applications testing the I/O APIs thoroughly.
11 | - **tecplot_view** - This application was used to generate the visualisation of the 2D decomposition, as frequently shown in the documentation. The output is in the format of Tecplot, a popular visualisation tool mainly used by the CFD community. Data from each process is written as a zone.
12 | - **p3dfft** - This application uses 2DECOMP&FFT and P3DFFT side-by-side to perform some FFTs. It was used to validate and benchmark 2DECOMP&FFT against its famous counterpart. P3DFFT has to be built separately in order to use this test.
13 | - **non_blocking** - This contains sample applications to compute multi-variable FFTs using the both the blocking and non-blocking versions of the communication library. This demonstrates how to use the non_blocking APIs to overlap communication and computation.
14 | 
15 | Please consult the README files associated with these sample applications for more detail. 


--------------------------------------------------------------------------------
/doc/shared_memory.md:
--------------------------------------------------------------------------------
 1 | ## Shared-memory Programming using System V IPC
 2 | 
 3 | Most modern supercomputers are equipped with multi-core processors and cores on same node often share local memory. There are various programming models which can take advantage of this architecture, including the popular hybrid MPI/OpenMP model. In the context of this project, shared-memory programming is used to improve the efficiency of the communication code. 
 4 | 
 5 | For all-to-all type of communication in which each MPI rank has to send/receive messages to/from all other MPI ranks, traffics from cores on the same physical node compete for their network interface. Even if the network bandwidth is sufficient, the performance is likely to be affected by network latency when too many small messages are passed within the system. One solution is to create shared send/recv buffers on each SMP node. Then only leaders of the nodes participate MPI_ALLTOALL(V), resulting in fewer but larger messages, hopefully improving the communication performance. The interconnects of supercomputers are often optimised for handling small amount of large messages. 
 6 | 
 7 | This feature has been implemented within the communication library as a black box. It can be activated by users at compile time by using '-DSHM' flag. The shared-memory code uses the System V Inter-Process Communication (IPC) API which is widely supported on many variants of UNIX. 
 8 | 
 9 | 2DECOMP&FFT has two independent shared-memory implementations (they validate each other):
10 | 
11 | - The first version is based on code supplied by David Tanqueray of Cray Inc., who initially applied this idea to several molecular dynamics applications. This code accesses platform-dependent information<a href="#note1" id="note1ref"><sup>1</sup></a> in order to establish the share-memory configurations (such as which MPI rank belongs to which node). It has been tested on Cray hardware only.
12 | -  The second version is based on the open-source package FreeIPC, created by Ian Bush, a former NAG colleague. FreeIPC is basically a Fortran wrapper for the System V IPC API and it provides a system-independent way to gather shared-memory information. This makes it possible to write more portable shared-memory code. 
13 | 
14 | Fig. 1 below demonstrates the typical benefit of shared-memory programming. The data was collected on HECToR phase 2a system (Cray XT4 with quad-core AMD Opteron processors) from a series of simulations using 256 MPI ranks over a range of problem sizes. When the problem size is small (so is the message size), the communication routines were called more times so that the total amount of data moving within the system remains a constant. It can be seen that when the problem size is smaller, the overhead of setting up communications is much higher and the shared-memory code can improve communication efficiency by up to 30%. As the problem size increases, the benefit of using shared-memory code becomes smaller. For large message size (> 32Kb in this example), the shared-memory code is actually slower due to the extra memory copying operations required to assemble/disassemble the shared-memory buffers. 
15 | 
16 | <p align="center">
17 |   <img src="images/shm1.png"><br>
18 |   <span style="font-size:smaller;">Figure 1: Typical shared-memory code performance. 
19 |   </span>
20 | </p>
21 | 
22 | The HECToR upgrade to phase 2b (world's first production Cray XE6) presented a unique opportunity to demonstrate the benefit of shared-memory programming in real applications. The 24-core nodes were introduced to HECToR several months before the arrival of new Gemini interconnect. During the transitional period, communication intensive applications often produced more network traffic than the old SeaStar interconnect could handle. Fig.2 shows the benchmark of 2DECOMP&FFT's FFT interface with a 2592^3 problem size<a href="#note2" id="note2ref"><sup>2</sup></a>. With the slow SeaStar interconnect, the scaling was poor when using more than few thousands cores. However, switching on the shared-memory code significantly improved the application performance (by as far as 40%) and parallel efficiency of more than 90% was observed through out the scale. The new Gemini interconnect offered significant improvement in terms of both network bandwidth and latency. As a result, significant performance gain was to be expected for communication intensive codes. The FFT benchmark was almost twice as fast in some cases. However, the shared-memory code on Gemini (not shown in the figure) offered absolutely no benefit when the network was fast enough to handle all the messages efficiently. 
23 | 
24 | <p align="center">
25 |   <img src="images/shm2.png"><br>
26 |   <span style="font-size:smaller;">Figure 2: Parallel FFT performance: SeaStar (with and without shared-memory) vs. Gemini. 
27 |   </span>
28 | </p>
29 | 
30 | ---
31 | 
32 | <a id="note1" href="#note1ref"><sup>1</sup></a>On Cray XT/XE systems, this is done by checking the /proc file system of the computing nodes. 
33 | 
34 | <a id="note2" href="#note2ref"><sup>2</sup></a>The problem size of 2592 was chosen intentionally because it is divisible by 6 multiple times, which helped achieve better load balance on the system using 24-core node (containing two Magny-Cours processors, each with two six-core dies).


--------------------------------------------------------------------------------
/examples/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: test2d fft_test_c2c fft_test_r2c timing halo_test io_test
 2 | 
 3 | # Just build the examples
 4 | examples: test2d fft_test_c2c fft_test_r2c timing halo_test io_test
 5 | 	@echo "Built the examples"
 6 | 
 7 | test2d:
 8 | 	cd test2d; $(MAKE) $@
 9 | fft_test_c2c:
10 | 	cd fft_test_c2c; $(MAKE) $@
11 | fft_test_r2c:
12 | 	cd fft_test_r2c; $(MAKE) $@
13 | timing:
14 | 	cd timing; $(MAKE) $@
15 | halo_test:
16 | 	cd halo_test; $(MAKE) $@
17 | io_test:
18 | 	cd io_test; $(MAKE) $@
19 | 
20 | # test all the examples (individual Makefiles should take care of updating)
21 | basic_test:
22 | 	cd test2d; $(MAKE) $@
23 | 	cd fft_test_c2c; $(MAKE) $@
24 | 	cd fft_test_r2c; $(MAKE) $@
25 | 	cd timing; $(MAKE) $@
26 | 	cd halo_test; $(MAKE) $@
27 | 	cd io_test; $(MAKE) $@
28 | 
29 | clean:
30 | 	cd test2d; $(MAKE) $@
31 | 	cd fft_test_c2c; $(MAKE) $@
32 | 	cd fft_test_r2c; $(MAKE) $@
33 | 	cd timing; $(MAKE) $@
34 | 	cd halo_test; $(MAKE) $@
35 | 	cd io_test; $(MAKE) $@
36 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | Examples
 2 | ========
 3 | 
 4 | * test2d       - to test the base 2D pencil decomposition module 
 5 | 
 6 | * fft_test_c2c - to test the complex-to-complex FFTs 
 7 | 
 8 | * fft_test_r2c - to test the real-to-complex/complex-to-real FFTs
 9 | 
10 | * timing       - to benchmark the FFT library
11 | 
12 | * halo_test    - to test the halo-cell exchange code
13 | 
14 | * io_test      - to test various IO functions
15 | 
16 | * p3dfft       - to crosscheck the library against P3DFFT
17 | 
18 | * non_blocking - to test the idea of overlap communication and computation
19 | 
20 | * tecplot_view - to generate Tecplot visualisation of the decomposition
21 | 
22 | 
23 | Some examples may require external libraries to be built first. Refer to the README files for each example for details.
24 | 


--------------------------------------------------------------------------------
/examples/fft_test_c2c/.gitignore:
--------------------------------------------------------------------------------
1 | fft_test_c2c
2 | 


--------------------------------------------------------------------------------
/examples/fft_test_c2c/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft $(LIBFFT)
 5 | 
 6 | OBJ = fft_test_c2c.o
 7 | 
 8 | fft_test_c2c: $(OBJ)
 9 | 	$(F90) -o $@ $(OBJ) $(LIBS)
10 | 
11 | clean:
12 | 	rm -f *.o fft_test_c2c
13 | 
14 | %.o : %.f90
15 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
16 | 


--------------------------------------------------------------------------------
/examples/fft_test_c2c/README:
--------------------------------------------------------------------------------
 1 | fft_test_c2c
 2 | ------------
 3 | 
 4 | This example demonstrates the use of the FFT c2c interface. The test uses the 
 5 | input of NAG routine 'c06fxf' (also for c2c transform) and attempts to 
 6 | reproduce the output.
 7 | 
 8 | To run: use 4 MPI processes.
 9 | 
10 | What to expect: the output should match what is in 'c06fxfe.r'.
11 | 


--------------------------------------------------------------------------------
/examples/fft_test_c2c/c06fxfe.r:
--------------------------------------------------------------------------------
 1 |  C06FXF Example Program Results
 2 |  
 3 |  Original data values
 4 |  
 5 |  z(i,j,k) for i =     1
 6 |  
 7 |  Real      1.000     0.999     0.987     0.936
 8 |  Imag      0.000    -0.040    -0.159    -0.352
 9 |  
10 |  Real      0.994     0.989     0.963     0.891
11 |  Imag     -0.111    -0.151    -0.268    -0.454
12 |  
13 |  Real      0.903     0.885     0.823     0.694
14 |  Imag     -0.430    -0.466    -0.568    -0.720
15 |  
16 |  z(i,j,k) for i =     2
17 |  
18 |  Real      0.500     0.499     0.487     0.436
19 |  Imag      0.500     0.040     0.159     0.352
20 |  
21 |  Real      0.494     0.489     0.463     0.391
22 |  Imag      0.111     0.151     0.268     0.454
23 |  
24 |  Real      0.403     0.385     0.323     0.194
25 |  Imag      0.430     0.466     0.568     0.720
26 |  
27 |  Components of discrete Fourier transform
28 |  
29 |  z(i,j,k) for i =     1
30 |  
31 |  Real      3.292     0.051     0.113     0.051
32 |  Imag      0.102    -0.042     0.102     0.246
33 |  
34 |  Real      0.143     0.016    -0.024    -0.050
35 |  Imag     -0.086     0.153     0.127     0.086
36 |  
37 |  Real      0.143    -0.050    -0.024     0.016
38 |  Imag      0.290     0.118     0.077     0.051
39 |  
40 |  z(i,j,k) for i =     2
41 |  
42 |  Real      1.225     0.355     0.000    -0.355
43 |  Imag     -1.620     0.083     0.162     0.083
44 |  
45 |  Real      0.424     0.020     0.013    -0.007
46 |  Imag      0.320    -0.115    -0.091    -0.080
47 |  
48 |  Real     -0.424     0.007    -0.013    -0.020
49 |  Imag      0.320    -0.080    -0.091    -0.115
50 |  
51 |  Original sequence as restored by inverse transform
52 |  
53 |  z(i,j,k) for i =     1
54 |  
55 |  Real      1.000     0.999     0.987     0.936
56 |  Imag      0.000    -0.040    -0.159    -0.352
57 |  
58 |  Real      0.994     0.989     0.963     0.891
59 |  Imag     -0.111    -0.151    -0.268    -0.454
60 |  
61 |  Real      0.903     0.885     0.823     0.694
62 |  Imag     -0.430    -0.466    -0.568    -0.720
63 |  
64 |  z(i,j,k) for i =     2
65 |  
66 |  Real      0.500     0.499     0.487     0.436
67 |  Imag      0.500     0.040     0.159     0.352
68 |  
69 |  Real      0.494     0.489     0.463     0.391
70 |  Imag      0.111     0.151     0.268     0.454
71 |  
72 |  Real      0.403     0.385     0.323     0.194
73 |  Imag      0.430     0.466     0.568     0.720
74 | 


--------------------------------------------------------------------------------
/examples/fft_test_c2c/fft_test_c2c.f90:
--------------------------------------------------------------------------------
  1 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  2 | ! Main test program for the FFT interface
  3 | !  - use input data from a NAG FFT library for validation
  4 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  5 | 
  6 | program fft_test_c2c
  7 | 
  8 | use decomp_2d
  9 | use decomp_2d_fft
 10 | 
 11 | implicit none
 12 | 
 13 | integer, parameter :: nx=2, ny=3, nz=4
 14 | integer, parameter :: p_row=2, p_col=2
 15 | 
 16 | complex(mytype), allocatable, dimension(:,:,:) :: in, out
 17 | 
 18 | complex(mytype), dimension(nx,ny,nz) :: in1, out1
 19 | integer :: ierror, i,j,k
 20 | 
 21 | interface
 22 |    subroutine assemble_global(ndir,local,global,nx,ny,nz)
 23 |      use decomp_2d
 24 |      integer, intent(IN) :: ndir
 25 |      integer, intent(IN) :: nx,ny,nz
 26 |      complex(mytype), dimension(:,:,:), intent(IN) :: local
 27 |      complex(mytype), dimension(nx,ny,nz), intent(OUT) :: global
 28 |    end subroutine assemble_global
 29 | end interface
 30 | 
 31 | call MPI_INIT(ierror)
 32 | call decomp_2d_init(nx,ny,nz,p_row,p_col)
 33 | call decomp_2d_fft_init
 34 | 
 35 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 36 | ! (1) Testing the complex-to-complex interface (c2c) 
 37 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 38 | 
 39 | !  input is X-pencil data
 40 | ! output is Z-pencil data
 41 | allocate (in(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
 42 | allocate (out(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
 43 | 
 44 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 45 | ! Following is the testing input for NAG library C06FXF
 46 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
 47 | in1(1,1,1) = (1.000,  0.000)
 48 | in1(1,1,2) = (0.999, -0.040)
 49 | in1(1,1,3) = (0.987, -0.159)
 50 | in1(1,1,4) = (0.936, -0.352)
 51 | in1(1,2,1) = (0.994, -0.111)
 52 | in1(1,2,2) = (0.989, -0.151)
 53 | in1(1,2,3) = (0.963, -0.268)
 54 | in1(1,2,4) = (0.891, -0.454)
 55 | in1(1,3,1) = (0.903, -0.430)
 56 | in1(1,3,2) = (0.885, -0.466)
 57 | in1(1,3,3) = (0.823, -0.568)
 58 | in1(1,3,4) = (0.694, -0.720)
 59 | in1(2,1,1) = (0.500,  0.500)
 60 | in1(2,1,2) = (0.499,  0.040)
 61 | in1(2,1,3) = (0.487,  0.159)
 62 | in1(2,1,4) = (0.436,  0.352)
 63 | in1(2,2,1) = (0.494,  0.111)
 64 | in1(2,2,2) = (0.489,  0.151)
 65 | in1(2,2,3) = (0.463,  0.268)
 66 | in1(2,2,4) = (0.391,  0.454)
 67 | in1(2,3,1) = (0.403,  0.430)
 68 | in1(2,3,2) = (0.385,  0.466)
 69 | in1(2,3,3) = (0.323,  0.568)
 70 | in1(2,3,4) = (0.194,  0.720)
 71 | 
 72 | ! each processor gets its local portion of global data
 73 | do k=xstart(3),xend(3)
 74 |    do j=xstart(2),xend(2)
 75 |       do i=xstart(1),xend(1)
 76 |          in(i,j,k) = in1(i,j,k)
 77 |       end do
 78 |    end do
 79 | end do
 80 | 
 81 | ! write out input, to match the format of NAG example result file
 82 | if (nrank==0) then
 83 |    write(*,*) 'C06FXF Example Program Results'
 84 |    write(*,*) ''
 85 |    write(*,*) 'Original data values'
 86 |    write(*,*) ''
 87 |    call print_global(in1,nx,ny,nz)
 88 | end if
 89 | 
 90 | ! ===== 3D forward FFT =====
 91 | call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
 92 | 
 93 | ! normalisation - note FFTW doesn't normalise 
 94 | do k=zstart(3),zend(3)
 95 |    do j=zstart(2),zend(2)
 96 |       do i=zstart(1),zend(1)
 97 |          out(i,j,k) = out(i,j,k) / sqrt(real(nx*ny*nz))
 98 |       end do
 99 |    end do
100 | end do
101 | 
102 | call assemble_global(3,out,out1,nx,ny,nz)
103 | 
104 | ! write out forward FFT result
105 | if (nrank==0) then
106 |    write(*,*) 'Components of discrete Fourier transform'
107 |    write(*,*) ''
108 |    call print_global(out1,nx,ny,nz)
109 | end if
110 | 
111 | ! ===== 3D inverse FFT =====
112 | call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
113 | 
114 | ! normalisation - note FFTW doesn't normalise
115 | do k=xstart(3),xend(3)
116 |    do j=xstart(2),xend(2)
117 |       do i=xstart(1),xend(1)
118 |          in(i,j,k) = in(i,j,k) / sqrt(real(nx*ny*nz))
119 |       end do
120 |    end do
121 | end do
122 | 
123 | call assemble_global(1,in,in1,nx,ny,nz)
124 | 
125 | ! write out inverse FFT result
126 | if (nrank==0) then
127 |    write(*,*) 'Original sequence as restored by inverse transform'
128 |    write(*,*) ''
129 |    call print_global(in1,nx,ny,nz)
130 | end if
131 | 
132 | call decomp_2d_fft_finalize
133 | call decomp_2d_finalize
134 | call MPI_FINALIZE(ierror)
135 | 
136 | end program fft_test_c2c
137 | 
138 | 
139 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
140 | ! Collect data from each processor and assemble into a global array
141 | ! at the master rank
142 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
143 | subroutine assemble_global(ndir,local,global,nx,ny,nz)
144 |   
145 |   use decomp_2d
146 |   use MPI
147 |   
148 |   implicit none
149 |   
150 |   integer, intent(IN) :: ndir  ! 1 = X-pencil; 3 = Z-pencil
151 |   integer, intent(IN) :: nx,ny,nz
152 |   complex(mytype), dimension(:,:,:), intent(IN) :: local
153 |   complex(mytype), dimension(nx,ny,nz), intent(OUT) :: global
154 |   
155 |   complex(mytype), allocatable, dimension(:,:,:) :: rbuf
156 |   integer, dimension(9) :: sbuf1, rbuf1
157 |   
158 |   integer :: ierror, i,j,k,m, i1,i2,j1,j2,k1,k2, count
159 |   integer, dimension(MPI_STATUS_SIZE) :: status
160 |   
161 |   if (nrank==0) then
162 |      ! master writes its own data to a global array
163 |      if (ndir==3) then  ! Z-pencil 
164 |         i1 = zstart(1)
165 |         i2 = zend(1)
166 |         j1 = zstart(2)
167 |         j2 = zend(2)
168 |         k1 = zstart(3)
169 |         k2 = zend(3)
170 |      else if (ndir==1) then  ! X-pencil
171 |         i1 = xstart(1)
172 |         i2 = xend(1)
173 |         j1 = xstart(2)
174 |         j2 = xend(2)
175 |         k1 = xstart(3)
176 |         k2 = xend(3)
177 |      end if
178 |      do k=k1,k2
179 |         do j=j1,j2
180 |            do i=i1,i2
181 |               ! 'local' is assumbed shape array
182 |               ! but it is OK as starting index for rank 0 always 1
183 |               global(i,j,k)=local(i,j,k)
184 |            end do
185 |         end do
186 |      end do
187 |      ! then loop through all other ranks to collect data
188 |      do m=1,nproc-1
189 |         CALL MPI_RECV(rbuf1,9,MPI_INTEGER,m,m,MPI_COMM_WORLD, &
190 |              status,ierror)
191 |         allocate(rbuf(rbuf1(1):rbuf1(2),rbuf1(4):rbuf1(5), &
192 |              rbuf1(7):rbuf1(8)))
193 |         CALL MPI_RECV(rbuf,rbuf1(3)*rbuf1(6)*rbuf1(9),complex_type,m, &
194 |              m+nproc,MPI_COMM_WORLD,status,ierror)
195 |         do k=rbuf1(7),rbuf1(8)
196 |            do j=rbuf1(4),rbuf1(5)
197 |               do i=rbuf1(1),rbuf1(2)
198 |                  global(i,j,k)=rbuf(i,j,k)
199 |               end do
200 |            end do
201 |         end do
202 |         deallocate(rbuf)
203 |      end do
204 |   else
205 |      ! slaves send data to mater
206 |      if (ndir==3) then  ! Z-pencil
207 |         sbuf1(1) = zstart(1)
208 |         sbuf1(2) = zend(1)
209 |         sbuf1(3) = zsize(1)
210 |         sbuf1(4) = zstart(2)
211 |         sbuf1(5) = zend(2)
212 |         sbuf1(6) = zsize(2)
213 |         sbuf1(7) = zstart(3)
214 |         sbuf1(8) = zend(3)
215 |         sbuf1(9) = zsize(3)
216 |         count = zsize(1)*zsize(2)*zsize(3)
217 |      else if (ndir==1) then  ! X-pencil
218 |         sbuf1(1) = xstart(1)
219 |         sbuf1(2) = xend(1)
220 |         sbuf1(3) = xsize(1)
221 |         sbuf1(4) = xstart(2)
222 |         sbuf1(5) = xend(2)
223 |         sbuf1(6) = xsize(2)
224 |         sbuf1(7) = xstart(3)
225 |         sbuf1(8) = xend(3)
226 |         sbuf1(9) = xsize(3)
227 |         count = xsize(1)*xsize(2)*xsize(3)
228 |      end if
229 |      ! send partition information
230 |      CALL MPI_SEND(sbuf1,9,MPI_INTEGER,0,nrank,MPI_COMM_WORLD,ierror)
231 |      ! send data array
232 |      CALL MPI_SEND(local,count,complex_type,0, &
233 |           nrank+nproc,MPI_COMM_WORLD,ierror)
234 |   end if
235 |   
236 |   return
237 | end subroutine assemble_global
238 | 
239 | 
240 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
241 | ! Print out a global data array using special format that matches
242 | ! NAG library C06FXF Example Program Results for validation purpose
243 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
244 | subroutine print_global(data,nx,ny,nz)
245 | 
246 | use decomp_2d
247 | 
248 | implicit none
249 | 
250 | integer, intent(IN) :: nx,ny,nz
251 | complex(mytype), dimension(nx,ny,nz), intent(IN) :: data
252 | 
253 | integer :: i,j,k
254 | 
255 | do i=1,nx
256 |    write(*,10) i
257 |    write(*,*) ''
258 |    do j=1,ny
259 |       write(*,20) (real(data(i,j,k)),k=1,nz)
260 |       write(*,21) (aimag(data(i,j,k)),k=1,nz)
261 |       write(*,*) ''
262 |    end do
263 | end do
264 | 10 format(1x,'z(i,j,k) for i =', I6)
265 | 20 format(1x,'Real ', 4F10.3)
266 | 21 format(1x,'Imag ', 4F10.3)
267 | 
268 | return
269 | end subroutine print_global
270 | 
271 | 


--------------------------------------------------------------------------------
/examples/fft_test_r2c/.gitignore:
--------------------------------------------------------------------------------
1 | fft_test_r2c
2 | 


--------------------------------------------------------------------------------
/examples/fft_test_r2c/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft $(LIBFFT)
 5 | 
 6 | OBJ = fft_test_r2c.o
 7 | 
 8 | fft_test_r2c: $(OBJ)
 9 | 	$(F90) -o $@ $(OBJ) $(LIBS)
10 | 
11 | clean:
12 | 	rm -f *.o fft_test_r2c
13 | 
14 | %.o : %.f90
15 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
16 | 


--------------------------------------------------------------------------------
/examples/fft_test_r2c/README:
--------------------------------------------------------------------------------
 1 | fft_test_r2c
 2 | ------------
 3 | 
 4 | This example demonstrates the use of the FFT r2c/c2r interface. It generates
 5 | random input and computes a serial 3D r2c transform on rank 0 to generate 
 6 | reference results. It then performs parallel computations of the same transform
 7 | on distributed data. There are two separate tests, with input data distributed 
 8 | in X-pencil and Z-pencil, respectively. In each test, a r2c transform is 
 9 | performed first and its results on rank 0 printed out. Then an inverse c2r 
10 | transform is followed which should recover the input to machine accuracy.
11 | 
12 | To run: use 4 MPI processes.
13 | 
14 | What to expect:
15 | - The output from the distributed computations should contain a subset of
16 |   numbers as in the serial output.
17 | - The error reported should be around machine accuracy (~ 10^-6 for single
18 |   precision and 10^-15 for double)
19 | 
20 | 


--------------------------------------------------------------------------------
/examples/fft_test_r2c/fft_test_r2c.f90:
--------------------------------------------------------------------------------
  1 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  2 | ! Main test program for the FFT r2c/c2r interface
  3 | !  also demonstrate the use of the IO library 
  4 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  5 | 
  6 | program fft_test_r2c
  7 | 
  8 |   use decomp_2d
  9 |   use decomp_2d_fft
 10 |   use glassman
 11 |   use decomp_2d_io
 12 |   
 13 |   use MPI
 14 |   
 15 |   implicit none
 16 |   !include "fftw3.f"
 17 |   
 18 |   integer, parameter :: nx=4, ny=2, nz=3
 19 |   integer, parameter :: p_row=2, p_col=2
 20 |   
 21 |   real(mytype), allocatable, dimension(:,:,:) :: in, in2
 22 |   complex(mytype), allocatable, dimension(:,:,:) :: out
 23 |   
 24 |   integer, dimension(3) :: fft_start, fft_end, fft_size
 25 |   
 26 |   real(mytype), dimension(nx,ny,nz) :: in_global, in_g2, in_g3
 27 |   complex(mytype), dimension(nx/2+1,ny,nz) :: out_global
 28 |   
 29 |   integer (kind=MPI_OFFSET_KIND) :: filesize, disp
 30 |   
 31 |   real(mytype) :: err
 32 |   !integer*8 :: plan
 33 |   integer :: fh, ierror, i,j,k, n,iol
 34 |   
 35 |   call MPI_INIT(ierror)
 36 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
 37 |   call decomp_2d_fft_init
 38 |   
 39 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 40 |   ! Compute a small problem all on rank 0 as reference
 41 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 42 |   call random_number(in_global)
 43 |   
 44 |   if (nrank==0) then
 45 |      write(*,*) '*** Reference serial computation on rank 0 only'
 46 |      write(*,*) ' global real input'
 47 |      do i=1,nx
 48 |         write(*,20) ((in_global(i,j,k),j=1,ny),k=1,nz)
 49 |      end do
 50 |      
 51 |      ! Using a 3D FFT routine supplied by this library
 52 |      call glassman_3d_r2c(in_global,nx,ny,nz,out_global)
 53 |      
 54 |      ! If using FFTW library:
 55 |      !  - uncomment the FFTW include file & plan above
 56 |      !  - uncomment the follwing function calls
 57 |      !  - change names to dfftw... for double precision 
 58 |      !call sfftw_plan_dft_r2c_3d(plan,nx,ny,nz, &
 59 |      !     in_global,out_global,FFTW_ESTIMATE)
 60 |      !call sfftw_execute_dft_r2c(plan,in_global,out_global)
 61 |      
 62 |      write(*,*) ' global complex output'
 63 |      do i=1,nx/2+1
 64 |         write(*,10) ((out_global(i,j,k),j=1,ny),k=1,nz)
 65 |      end do
 66 |   end if
 67 | 10 format(1x,6(:,'(',F5.2,',',F5.2,')'))
 68 | 20 format(1x,6F5.2)
 69 |   
 70 |   ! File for testing IO
 71 |   call MPI_FILE_OPEN(MPI_COMM_WORLD, 'fftdata', &
 72 |        MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
 73 |        fh, ierror)
 74 |   filesize = 0_MPI_OFFSET_KIND
 75 |   call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
 76 |   disp = 0_MPI_OFFSET_KIND
 77 |   
 78 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 79 |   ! Test the real-to-complex interface (r2c) 
 80 |   
 81 |   !  input is X-pencil real    data whose global size is nx*ny*nz
 82 |   ! output is Z-pencil complex data whose global size is (nx/2+1)*ny*nz
 83 |   allocate (in(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
 84 |   
 85 |   call decomp_2d_fft_get_size(fft_start,fft_end,fft_size)
 86 |   allocate (out(fft_start(1):fft_end(1), &
 87 |        fft_start(2):fft_end(2), &
 88 |        fft_start(3):fft_end(3)))
 89 |   
 90 |   ! each processor gets its local portion of global data
 91 |   do k=xstart(3),xend(3)
 92 |      do j=xstart(2),xend(2)
 93 |         do i=xstart(1),xend(1)
 94 |            in(i,j,k) = in_global(i,j,k)
 95 |         end do
 96 |      end do
 97 |   end do
 98 |   
 99 |   ! write input to file
100 |   call decomp_2d_write_var(fh,disp,1,in)
101 |   
102 |   if (nrank==0) then
103 |      write(*,*) ' '
104 |      write(*,*) '*** Distributed computation (X-pencil input)'
105 |      write(*,*) ' real input held by rank 0:'
106 |      write(*,20) in
107 |   end if
108 |   
109 |   ! compute r2c transform 
110 |   call decomp_2d_fft_3d(in,out)
111 |   
112 |   if (nrank==0) then
113 |      write(*,*) ' - after forward transform'
114 |      write(*,*) ' complex output held by rank 0:'
115 |      write(*,10) out
116 |   end if
117 | 
118 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
119 |   ! Test the complex-to-real interface (c2r) 
120 | 
121 |   allocate (in2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
122 |   
123 |   ! compute c2r transform
124 |   call decomp_2d_fft_3d(out,in2)
125 |   
126 |   ! normalisation
127 |   in2 = in2 / real(nx) / real(ny) / real(nz)
128 |   
129 |   ! write the data recovered by inverse FFT to file
130 |   call decomp_2d_write_var(fh,disp,1,in2)
131 |   
132 |   if (nrank==0) then
133 |      write(*,*) ' - after backward transform and normalisation'
134 |      write(*,*) ' real output held by rank 0:'
135 |      write(*,20) in2
136 |   end if
137 |   
138 |   deallocate(in,in2,out)
139 |   call decomp_2d_fft_finalize
140 |   
141 |   call MPI_FILE_CLOSE(fh,ierror)
142 |   
143 |   ! check on rank 0 if input data is properly recovered
144 |   ! this also tests the IO routines
145 |   if (nrank==0) then
146 |      in_g2(1,1,1) = real(0., mytype)
147 |      inquire(iolength=iol) in_g2(1,1,1)
148 |      OPEN(10, FILE='fftdata', FORM='unformatted', &
149 |           ACCESS='DIRECT', RECL=iol)
150 |      n=1
151 |      do k=1,nz
152 |         do j=1,ny
153 |            do i=1,nx
154 |               read(10,rec=n) in_g2(i,j,k)
155 |               n=n+1
156 |            end do
157 |         end do
158 |      end do
159 |      do k=1,nz
160 |         do j=1,ny
161 |            do i=1,nx
162 |               read(10,rec=n) in_g3(i,j,k)
163 |               n=n+1
164 |            end do
165 |         end do
166 |      end do
167 |      err = 0._mytype
168 |      do k=1,nz
169 |         do j=1,ny
170 |            do i=1,nx
171 |               err = err + (in_g2(i,j,k)-in_g3(i,j,k))**2
172 |            end do
173 |         end do
174 |      end do
175 |      err = err / real(nx,mytype) / real(ny,mytype) / real(nz,mytype)
176 |      write(*,*) ' error / mesh point: ', sqrt(err)
177 |   end if
178 | 
179 | 
180 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
181 |   ! Repeat the above but using Z-pencil input
182 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
183 |   
184 |   call decomp_2d_fft_init(PHYSICAL_IN_Z)
185 |   
186 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
187 |   ! Test the real-to-complex interface (r2c) 
188 |   
189 |   allocate (in(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
190 |   call decomp_2d_fft_get_size(fft_start,fft_end,fft_size)
191 |   allocate (out(fft_start(1):fft_end(1), &
192 |        fft_start(2):fft_end(2), &
193 |        fft_start(3):fft_end(3)))
194 |   
195 |   ! each processor gets its local portion of global data
196 |   do k=zstart(3),zend(3)
197 |      do j=zstart(2),zend(2)
198 |         do i=zstart(1),zend(1)
199 |            in(i,j,k) = in_global(i,j,k)
200 |         end do
201 |      end do
202 |   end do
203 |   if (nrank==0) then
204 |      write(*,*) ' '
205 |      write(*,*) '*** Distributed computation (Z-pencil input)'
206 |      write(*,*) ' real input held by rank 0:'
207 |      write(*,20) in
208 |   end if
209 |   
210 |   ! compute r2c transform 
211 |   call decomp_2d_fft_3d(in,out)
212 |   
213 |   if (nrank==0) then
214 |      write(*,*) ' - after forward transform'
215 |      write(*,*) ' complex output held by rank 0:'
216 |      write(*,10) out
217 |   end if
218 |   
219 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
220 |   ! Test the complex-to-real interface (c2r) 
221 |   
222 |   allocate (in2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
223 |   
224 |   ! compute c2r transform
225 |   call decomp_2d_fft_3d(out,in2)
226 |   
227 |   ! normalisation
228 |   in2 = in2 / real(nx) / real(ny) / real(nz)
229 |   
230 |   if (nrank==0) then
231 |      write(*,*) ' - after backward transform and normalisation'
232 |      write(*,*) ' real output held by rank 0:'
233 |      write(*,20) in2
234 |   end if
235 |   
236 |   deallocate(in,in2,out)
237 |   
238 |   call decomp_2d_fft_finalize
239 |   call decomp_2d_finalize
240 |   call MPI_FINALIZE(ierror)
241 |   
242 | end program fft_test_r2c
243 | 


--------------------------------------------------------------------------------
/examples/halo_test/.gitignore:
--------------------------------------------------------------------------------
1 | halo_test
2 | 


--------------------------------------------------------------------------------
/examples/halo_test/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft
 5 | 
 6 | OBJ = halo_test.o
 7 | 
 8 | halo_test: $(OBJ)
 9 | 	$(F90) -o $@ $(OBJ) $(LIBS)
10 | 
11 | clean:
12 | 	rm -f *.o halo_test
13 | 
14 | %.o : %.f90
15 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
16 | 


--------------------------------------------------------------------------------
/examples/halo_test/README:
--------------------------------------------------------------------------------
 1 | halo_test
 2 | ---------
 3 | 
 4 | This example demonstrates the use of the halo-cell support API. It calculates
 5 | the divergency of an arbitrary field, which contains evaluation of spatial
 6 | derivatives in all three dimensions. The calculation was first implemented via
 7 | the global transposition routines, then via halo-cell exchanges. Identical
 8 | results are to be expected regardless of the communication algorithm. The 
 9 | computation is based on an explicit finite difference method so clearly using 
10 | the halo-cell support API is more efficient.
11 | 
12 | To run: use 12 MPI processes.
13 | 
14 | What to expect: the output using different communication algorithms should be 
15 | exactly the same.
16 | 


--------------------------------------------------------------------------------
/examples/io_test/.gitignore:
--------------------------------------------------------------------------------
1 | io_test
2 | 


--------------------------------------------------------------------------------
/examples/io_test/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft
 5 | ifneq (,$(findstring DT3PIO,$(OPTIONS)))
 6 |   LIBS+= -L$(T3PIO_PATH)/lib -lt3pio
 7 | endif
 8 | 
 9 | all: io_test io_read io_var_test io_plane_test io_bench
10 | 
11 | io_test: io_test.o
12 | 	$(F90) -o $@ $@.o $(LIBS)
13 | 
14 | io_read: io_read.o
15 | 	$(F90) -o $@ $@.o $(LIBS)
16 | 
17 | io_var_test: io_var_test.o
18 | 	$(F90) -o $@ $@.o $(LIBS)
19 | 
20 | io_plane_test: io_plane_test.o
21 | 	$(F90) -o $@ $@.o $(LIBS)
22 | 
23 | io_bench: io_bench.o
24 | 	$(F90) -o $@ $@.o $(LIBS)
25 | 
26 | clean:
27 | 	rm -f *.o io_test io_read io_var_test io_plane_test io_bench
28 | 
29 | realclean: clean
30 | 	rm -f *.dat io_var_data.*
31 | 
32 | %.o : %.f90
33 | 	$(F90) $(INCLUDE) $(OPTIONS) $(ARG) $(F90FLAGS) -c $<
34 | 


--------------------------------------------------------------------------------
/examples/io_test/README:
--------------------------------------------------------------------------------
1 | io_test
2 | -------
3 | 
4 | This folder contains several applications to thoroughly test 2DECOMP&FFT's
5 | IO library.
6 | 
7 | To run: If interactive run is possible on your system, adapt the script 
8 | provided to run all the tests in one go.
9 | 


--------------------------------------------------------------------------------
/examples/io_test/io_bench.f90:
--------------------------------------------------------------------------------
 1 | program io_bench
 2 | 
 3 |   use decomp_2d
 4 |   use decomp_2d_io
 5 |   use MPI
 6 | 
 7 |   implicit none
 8 |   
 9 |   integer, parameter :: nx=100, ny=100, nz=100
10 |   integer, parameter :: p_row=4, p_col=4
11 | 
12 |   real(mytype), allocatable, dimension(:,:,:) :: u1
13 |   
14 |   double precision :: t1, t2
15 |   integer :: ierror
16 | 
17 |   call MPI_INIT(ierror)
18 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
19 | 
20 |   allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
21 |   call random_number(u1)
22 | 
23 |   t1 = MPI_WTIME()
24 |   call decomp_2d_write_one(1,u1,'io.dat')
25 |   t2 = MPI_WTIME()
26 | 
27 |   if (nrank==0) write(*,*) 'I/O time: ', t2-t1
28 | 
29 |   call decomp_2d_finalize 
30 |   call MPI_FINALIZE(ierror)
31 |   deallocate(u1)
32 | 
33 | end program io_bench
34 |   
35 | 


--------------------------------------------------------------------------------
/examples/io_test/io_plane_test.f90:
--------------------------------------------------------------------------------
  1 | program io_plane_test
  2 | 
  3 |   use decomp_2d
  4 |   use decomp_2d_io
  5 | 
  6 |   implicit none
  7 | 
  8 |   integer, parameter :: nx=17, ny=13, nz=11
  9 |   integer, parameter :: p_row=4, p_col=3
 10 | 
 11 |   real(mytype), dimension(nx,ny,nz) :: data1
 12 |   real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
 13 | 
 14 |   real(mytype), allocatable, dimension(:,:,:) :: work
 15 |   
 16 |   integer :: i,j,k, m, ierror, iol
 17 | 
 18 |   call MPI_INIT(ierror)
 19 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
 20 |   
 21 |   ! ***** global data *****
 22 |   m = 1
 23 |   do k=1,nz
 24 |      do j=1,ny
 25 |         do i=1,nx
 26 |            data1(i,j,k) = real(m,mytype)
 27 |            m = m+1
 28 |         end do
 29 |      end do
 30 |   end do
 31 | 
 32 |   allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
 33 |   allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
 34 |   allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
 35 | 
 36 |   ! original X-pensil based data 
 37 |   do k=xstart(3),xend(3)
 38 |     do j=xstart(2),xend(2)
 39 |       do i=xstart(1),xend(1)
 40 |         u1(i,j,k) = data1(i,j,k)
 41 |       end do
 42 |     end do
 43 |   end do
 44 |   call decomp_2d_write_plane(1,u1,1,nx/2,'x_pencil-x_plane.dat')
 45 |   call decomp_2d_write_plane(1,u1,2,ny/2,'x_pencil-y_plane.dat')
 46 |   call decomp_2d_write_plane(1,u1,3,nz/2,'x_pencil-z_plane.dat')
 47 | 
 48 |   ! Y-pencil data
 49 |   call transpose_x_to_y(u1,u2)
 50 |   call decomp_2d_write_plane(2,u2,1,nx/2,'y_pencil-x_plane.dat')
 51 |   call decomp_2d_write_plane(2,u2,2,ny/2,'y_pencil-y_plane.dat')
 52 |   call decomp_2d_write_plane(2,u2,3,nz/2,'y_pencil-z_plane.dat')
 53 | 
 54 |   ! Z-pencil data
 55 |   call transpose_y_to_z(u2,u3)
 56 |   call decomp_2d_write_plane(3,u3,1,nx/2,'z_pencil-x_plane.dat')
 57 |   call decomp_2d_write_plane(3,u3,2,ny/2,'z_pencil-y_plane.dat')
 58 |   call decomp_2d_write_plane(3,u3,3,nz/2,'z_pencil-z_plane.dat')
 59 | 
 60 |   ! Attemp to read the files
 61 |   if (nrank==0) then
 62 |      inquire(iolength=iol) data1(1,1,1)
 63 | 
 64 |      ! X-plane
 65 |      allocate(work(1,ny,nz))
 66 |      open(10, FILE='x_pencil-x_plane.dat', FORM='unformatted', &
 67 |           ACCESS='DIRECT', RECL=iol)
 68 |      m=1
 69 |      do k=1,nz
 70 |         do j=1,ny
 71 |            read(10,rec=m) work(1,j,k)
 72 |            m=m+1
 73 |         end do
 74 |      end do
 75 |      write(*,*) ' '
 76 |      write(*,'(15I5)') int(work)
 77 |      close(10)
 78 |      deallocate(work)
 79 | 
 80 |      ! Y-plane
 81 |      allocate(work(nx,1,nz))
 82 |      open(10, FILE='x_pencil-y_plane.dat', FORM='unformatted', &
 83 |           ACCESS='DIRECT', RECL=iol)
 84 |      m=1
 85 |      do k=1,nz
 86 |         do i=1,nx
 87 |            read(10,rec=m) work(i,1,k)
 88 |            m=m+1
 89 |         end do
 90 |      end do
 91 |      write(*,*) ' '
 92 |      write(*,'(15I5)') int(work)
 93 |      close(10)
 94 |      deallocate(work)
 95 | 
 96 |      ! Z-plane
 97 |      allocate(work(nx,ny,1))
 98 |      open(10, FILE='x_pencil-z_plane.dat', FORM='unformatted', &
 99 |           ACCESS='DIRECT', RECL=iol)
100 |      m=1
101 |      do j=1,ny
102 |         do i=1,nx
103 |            read(10,rec=m) work(i,j,1)
104 |            m=m+1
105 |         end do
106 |      end do
107 |      write(*,*) ' '
108 |      write(*,'(15I5)') int(work)
109 |      close(10)
110 |      deallocate(work)
111 | 
112 |   end if
113 | 
114 |   call decomp_2d_finalize 
115 |   call MPI_FINALIZE(ierror)
116 |   deallocate(u1,u2,u3)
117 |   
118 | end program io_plane_test
119 | 


--------------------------------------------------------------------------------
/examples/io_test/io_read.f90:
--------------------------------------------------------------------------------
 1 | program io_read
 2 | 
 3 |   use decomp_2d
 4 |   use decomp_2d_io
 5 | 
 6 |   implicit none
 7 | 
 8 |   integer, parameter :: nx=17, ny=13, nz=11
 9 |   ! use different number of processes
10 |   integer, parameter :: p_row=3, p_col=2
11 | 
12 | #ifdef COMPLEX_TEST
13 |   complex(mytype), dimension(nx,ny,nz) :: data1
14 | 
15 |   complex(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
16 | #else
17 |   real(mytype), dimension(nx,ny,nz) :: data1
18 | 
19 |   real(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
20 | #endif
21 | 
22 |   real(mytype), parameter :: eps = 1.0E-7_mytype
23 |  
24 |   integer :: i,j,k, m, ierror
25 |   
26 |   call MPI_INIT(ierror)
27 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
28 | 
29 |   ! ***** global data *****
30 |   m = 1
31 |   do k=1,nz
32 |      do j=1,ny
33 |         do i=1,nx
34 | #ifdef COMPLEX_TEST
35 |            data1(i,j,k) = cmplx(real(m,mytype), real(nx*ny*nz-m,mytype)) 
36 | #else
37 |            data1(i,j,k) = real(m,mytype)
38 | #endif
39 |            m = m+1
40 |         end do
41 |      end do
42 |   end do
43 | 
44 |   allocate(u1b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
45 |   allocate(u2b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
46 |   allocate(u3b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
47 | 
48 |   ! read back to different arrays
49 |   call decomp_2d_read_one(1,u1b,'u1.dat')
50 |   call decomp_2d_read_one(2,u2b,'u2.dat')
51 |   call decomp_2d_read_one(3,u3b,'u3.dat')
52 | 
53 |   ! Check against the global data array
54 |   do k=xstart(3),xend(3)
55 |     do j=xstart(2),xend(2)
56 |       do i=xstart(1),xend(1)
57 |         if (abs((data1(i,j,k)-u1b(i,j,k))) > eps) stop 4
58 |       end do
59 |     end do
60 |   end do
61 | 
62 |   do k=ystart(3),yend(3)
63 |     do j=ystart(2),yend(2)
64 |       do i=ystart(1),yend(1)
65 |         if (abs((data1(i,j,k)-u2b(i,j,k))) > eps) stop 5
66 |       end do
67 |     end do
68 |   end do
69 |   
70 |   do k=zstart(3),zend(3)
71 |     do j=zstart(2),zend(2)
72 |       do i=zstart(1),zend(1)
73 |         if (abs((data1(i,j,k)-u3b(i,j,k))) > eps) stop 6
74 |       end do
75 |     end do
76 |   end do
77 | 
78 |   call decomp_2d_finalize 
79 |   call MPI_FINALIZE(ierror)
80 |   deallocate(u1b,u2b,u3b)
81 | 
82 | end program io_read
83 | 


--------------------------------------------------------------------------------
/examples/io_test/io_test.f90:
--------------------------------------------------------------------------------
  1 | program io_test
  2 | 
  3 |   use decomp_2d
  4 |   use decomp_2d_io
  5 | 
  6 |   implicit none
  7 | 
  8 |   integer, parameter :: nx=17, ny=13, nz=11
  9 |   integer, parameter :: p_row=4, p_col=3
 10 | 
 11 | #ifdef COMPLEX_TEST
 12 |   complex(mytype), dimension(nx,ny,nz) :: data1
 13 | 
 14 |   complex(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
 15 |   complex(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
 16 | #else
 17 |   real(mytype), dimension(nx,ny,nz) :: data1
 18 | 
 19 |   real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
 20 |   real(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
 21 | #endif
 22 | 
 23 |   real(mytype), parameter :: eps = 1.0E-7_mytype
 24 |  
 25 |   integer :: i,j,k, m, ierror
 26 |   
 27 |   call MPI_INIT(ierror)
 28 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
 29 | 
 30 |   ! ***** global data *****
 31 |   m = 1
 32 |   do k=1,nz
 33 |      do j=1,ny
 34 |         do i=1,nx
 35 | #ifdef COMPLEX_TEST
 36 |            data1(i,j,k) = cmplx(real(m,mytype), real(nx*ny*nz-m,mytype)) 
 37 | #else
 38 |            data1(i,j,k) = real(m,mytype)
 39 | #endif
 40 |            m = m+1
 41 |         end do
 42 |      end do
 43 |   end do
 44 | 
 45 |   allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
 46 |   allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
 47 |   allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
 48 | 
 49 |   allocate(u1b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
 50 |   allocate(u2b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
 51 |   allocate(u3b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
 52 | 
 53 |   ! original x-pensil based data 
 54 |   do k=xstart(3),xend(3)
 55 |     do j=xstart(2),xend(2)
 56 |       do i=xstart(1),xend(1)
 57 |         u1(i,j,k) = data1(i,j,k)
 58 |       end do
 59 |     end do
 60 |   end do
 61 | 
 62 |   ! transpose
 63 |   call transpose_x_to_y(u1,u2)
 64 |   call transpose_y_to_z(u2,u3)
 65 | 
 66 |   ! write to disk
 67 |   call decomp_2d_write_one(1,u1,'u1.dat')
 68 |   call decomp_2d_write_one(2,u2,'u2.dat')
 69 |   call decomp_2d_write_one(3,u3,'u3.dat')
 70 | 
 71 |   ! read back to different arrays
 72 |   call decomp_2d_read_one(1,u1b,'u1.dat')
 73 |   call decomp_2d_read_one(2,u2b,'u2.dat')
 74 |   call decomp_2d_read_one(3,u3b,'u3.dat')
 75 | 
 76 |   ! compare  
 77 |   do k=xstart(3),xend(3)
 78 |     do j=xstart(2),xend(2)
 79 |       do i=xstart(1),xend(1)
 80 |         if (abs((u1(i,j,k)-u1b(i,j,k))) > eps) stop 1
 81 |       end do
 82 |     end do
 83 |   end do
 84 | 
 85 |   do k=ystart(3),yend(3)
 86 |     do j=ystart(2),yend(2)
 87 |       do i=ystart(1),yend(1)
 88 |         if (abs((u2(i,j,k)-u2b(i,j,k))) > eps) stop 2
 89 |       end do
 90 |     end do
 91 |   end do
 92 | 
 93 |   do k=zstart(3),zend(3)
 94 |     do j=zstart(2),zend(2)
 95 |       do i=zstart(1),zend(1)
 96 |         if (abs((u3(i,j,k)-u3b(i,j,k))) > eps) stop 3
 97 |       end do
 98 |     end do
 99 |   end do
100 | 
101 |   ! Also check against the global data array
102 |   do k=xstart(3),xend(3)
103 |     do j=xstart(2),xend(2)
104 |       do i=xstart(1),xend(1)
105 |         if (abs((data1(i,j,k)-u1b(i,j,k))) > eps) stop 4
106 |       end do
107 |     end do
108 |   end do
109 | 
110 |   do k=ystart(3),yend(3)
111 |     do j=ystart(2),yend(2)
112 |       do i=ystart(1),yend(1)
113 |         if (abs((data1(i,j,k)-u2b(i,j,k))) > eps) stop 5
114 |       end do
115 |     end do
116 |   end do
117 |   
118 |   do k=zstart(3),zend(3)
119 |     do j=zstart(2),zend(2)
120 |       do i=zstart(1),zend(1)
121 |         if (abs((data1(i,j,k)-u3b(i,j,k))) > eps) stop 6
122 |       end do
123 |     end do
124 |   end do
125 | 
126 |   call decomp_2d_finalize 
127 |   call MPI_FINALIZE(ierror)
128 |   deallocate(u1,u2,u3)
129 |   deallocate(u1b,u2b,u3b)
130 | 
131 | end program io_test
132 | 


--------------------------------------------------------------------------------
/examples/io_test/run_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | make
 4 | if [ $? -ne 0 ] ; then
 5 |     echo "================================================="
 6 |     echo "Failed to build the applications. Fix them first!"
 7 |     echo "================================================="
 8 |     exit 1
 9 | fi
10 | 
11 | echo " "
12 | echo "writing data files using MPI-IO..."
13 | echo "mpirun -np 12 ./io_test"
14 | mpirun -np 12 ./io_test
15 | 
16 | echo " "
17 | echo "reading data files back (different number of processes)..."
18 | echo "mpirun -np 6 ./io_read"
19 | mpirun -np 6 ./io_read
20 | 
21 | # The files written by MPI-IO should be independent on # processes
22 | echo " "
23 | echo "*** testing write_var..."
24 | echo "mpirun -np 20 ./io_var_test 5 4"
25 | mpirun -np 20 ./io_var_test 5 4
26 | echo "mpirun -np 12 ./io_var_test 4 3"
27 | mpirun -np 12 ./io_var_test 4 3
28 | echo "mpirun -np 6 ./io_var_test 3 2"
29 | mpirun -np 6 ./io_var_test 3 2
30 | echo "mpirun -np 2 ./io_var_test 2 1"
31 | mpirun -np 2 ./io_var_test 2 1
32 | diff -s io_var_data.020 io_var_data.012 
33 | diff -s io_var_data.020 io_var_data.006
34 | diff -s io_var_data.020 io_var_data.002
35 | 
36 | echo " "
37 | echo "*** testing write_plane..."
38 | echo "mpirun -np 12 ./io_plane_test"
39 | mpirun -np 12 ./io_plane_test
40 | diff -s x_pencil-x_plane.dat y_pencil-x_plane.dat
41 | diff -s x_pencil-x_plane.dat z_pencil-x_plane.dat
42 | diff -s x_pencil-y_plane.dat y_pencil-y_plane.dat
43 | diff -s x_pencil-y_plane.dat z_pencil-y_plane.dat
44 | diff -s x_pencil-z_plane.dat y_pencil-z_plane.dat
45 | diff -s x_pencil-z_plane.dat z_pencil-z_plane.dat
46 | 
47 | echo " "
48 | echo "Tests PASSED, unless errors reported"
49 | 
50 | 


--------------------------------------------------------------------------------
/examples/non_blocking/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft $(LIBFFT)
 5 | 
 6 | all: blocking non_blocking
 7 | 
 8 | blocking: blocking.o
 9 | 	$(F90) -o $@ $@.o $(LIBS)
10 | 
11 | non_blocking: non_blocking.o
12 | 	$(F90) -o $@ $@.o $(LIBS)
13 | 
14 | clean:
15 | 	rm -f *.o blocking non_blocking
16 | 
17 | %.o : %.f90
18 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
19 | 


--------------------------------------------------------------------------------
/examples/non_blocking/README.md:
--------------------------------------------------------------------------------
 1 | non_blocking
 2 | ------------
 3 | 
 4 | This test contains two sample applications to compute multiple independent FFTs. The first application `blocking.f90` uses the standard blocking version of MPI communication code to transpose the data among different stages of the computation. The second application `non_blocking.f90` performs the same computation using the non-blocking communication routines supplied by 2DECOMP&FFT. 
 5 | 
 6 | These two applications are using FFTW APIs directly. Please compile them separately using the `Makefile` in this directory after building 2DECOMP&FFT with the FFTW engine.
 7 | 
 8 | Non-blocking collective communication is part of MPI 3 standard and it is now widely supported. Earlier users of 2DECOMP&FFT may remember the use of libNBC (http://www.unixer.de/research/nbcoll/libnbc/), a library implementing non-blocking MPI collectives (such as IALLTOALL) with existing MPI 1 functions. libNBC is now obsolete and reference to it has been removed from the source codes. 
 9 | 
10 | To demonstrate the idea of overlap communication and computation, the 3D FFT is implemented using loops of 1D FFTs (without using the advanced interface of FFTW) so that MPI_TEST calls can be easily inserted in the computational part of the code. This is required because the communication has to be explicitly progressed when running on the same thread as the computation.
11 | 
12 | The two applications should produce identical results.
13 | 
14 | End users are responsible for identifying opportunities in their applications to overlap communication with computation.
15 | 


--------------------------------------------------------------------------------
/examples/non_blocking/blocking.f90:
--------------------------------------------------------------------------------
  1 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  2 | ! This program computes multiple distributed 3D FFTs
  3 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  4 | 
  5 | program blocking
  6 | 
  7 |   use decomp_2d
  8 |   use MPI
  9 | 
 10 |   implicit none
 11 | 
 12 |   include "fftw3.f"
 13 | 
 14 |   integer, parameter :: nx=16, ny=16, nz=16
 15 |   integer, parameter :: p_row=2, p_col=2
 16 |   integer, parameter :: NFFT=20   ! number of independent FFTs
 17 | 
 18 |   integer :: i,j,k, m, nmax, ierror
 19 |   real(mytype) :: tmp1, tmp2
 20 | 
 21 |   double precision :: t1, t2
 22 | 
 23 |   ! FFTW plans for the 1D forward/backward transforms
 24 |   integer*8, save :: x_plan_f, x_plan_b
 25 |   integer*8, save :: y_plan_f, y_plan_b
 26 |   integer*8, save :: z_plan_f, z_plan_b
 27 | 
 28 |   ! dummy array used for planning
 29 |   complex(mytype), allocatable, dimension(:) :: buf1, buf2
 30 | 
 31 |   ! input/output of the FFT
 32 |   complex(mytype), allocatable, dimension(:,:,:) :: in, out, wk2
 33 | 
 34 | 
 35 |   call MPI_INIT(ierror)
 36 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
 37 |   
 38 |   ! ===== planning 1D FFT in x =====
 39 |   allocate(buf1(xsize(1)), buf2(xsize(1)))
 40 | 
 41 | #ifdef DOUBLE_PREC
 42 |   call dfftw_plan_dft_1d(x_plan_f, xsize(1), buf1, buf2,  &
 43 |        FFTW_FORWARD,  FFTW_MEASURE)
 44 |   call dfftw_plan_dft_1d(x_plan_b, xsize(1), buf1, buf2,  &
 45 |        FFTW_BACKWARD, FFTW_MEASURE)
 46 | #else
 47 |   call sfftw_plan_dft_1d(x_plan_f, xsize(1), buf1, buf2,  &
 48 |        FFTW_FORWARD,  FFTW_MEASURE)
 49 |   call sfftw_plan_dft_1d(x_plan_b, xsize(1), buf1, buf2,  &
 50 |        FFTW_BACKWARD, FFTW_MEASURE)
 51 | #endif
 52 | 
 53 |   deallocate(buf1,buf2)
 54 | 
 55 |   ! ===== planning 1D FFT in Y =====
 56 |   allocate(buf1(ysize(2)), buf2(ysize(2)))
 57 |   
 58 | #ifdef DOUBLE_PREC
 59 |   call dfftw_plan_dft_1d(y_plan_f, ysize(2), buf1, buf2,  &
 60 |        FFTW_FORWARD,  FFTW_MEASURE)
 61 |   call dfftw_plan_dft_1d(y_plan_b, ysize(2), buf1, buf2,  &
 62 |        FFTW_BACKWARD, FFTW_MEASURE)
 63 | #else
 64 |   call sfftw_plan_dft_1d(y_plan_f, ysize(2), buf1, buf2,  &
 65 |        FFTW_FORWARD,  FFTW_MEASURE)
 66 |   call sfftw_plan_dft_1d(y_plan_b, ysize(2), buf1, buf2,  &
 67 |        FFTW_BACKWARD, FFTW_MEASURE)
 68 | #endif
 69 |   
 70 |   deallocate(buf1,buf2)
 71 |   
 72 |   ! ===== planning 1D FFT in Z =====
 73 |   allocate(buf1(zsize(3)), buf2(zsize(3)))
 74 |   
 75 | #ifdef DOUBLE_PREC
 76 |   call dfftw_plan_dft_1d(z_plan_f, zsize(3), buf1, buf2,  &
 77 |        FFTW_FORWARD,  FFTW_MEASURE)
 78 |   call dfftw_plan_dft_1d(z_plan_b, zsize(3), buf1, buf2,  &
 79 |        FFTW_BACKWARD, FFTW_MEASURE)
 80 | #else
 81 |   call sfftw_plan_dft_1d(z_plan_f, zsize(3), buf1, buf2,  &
 82 |        FFTW_FORWARD,  FFTW_MEASURE)
 83 |   call sfftw_plan_dft_1d(z_plan_b, zsize(3), buf1, buf2,  &
 84 |        FFTW_BACKWARD, FFTW_MEASURE)
 85 | #endif
 86 |   
 87 |   deallocate(buf1,buf2)
 88 | 
 89 | 
 90 |   allocate( in(xsize(1),xsize(2),xsize(3)))   ! x-pencil input
 91 |   allocate(out(zsize(1),zsize(2),zsize(3)))   ! z-pencil output
 92 |   allocate(wk2(ysize(1),ysize(2),ysize(3)))   ! y-pencil intermediate
 93 | 
 94 |   ! 1D temp buffer
 95 |   nmax = max(xsize(1),max(ysize(2),zsize(3)))
 96 |   allocate (buf1(nmax))
 97 |   allocate (buf2(nmax))
 98 | 
 99 |   t1 = MPI_WTIME()
100 | 
101 |   do m=1,NFFT
102 | 
103 |      do k=1,xsize(3)
104 |         do j=1,xsize(2)
105 |            do i=1,xsize(1)
106 |               tmp1 = real(xstart(1)+i-1, mytype) / real(nx, mytype) &
107 |                    * real(xstart(2)+j-1, mytype) / real(ny, mytype) &
108 |                    * real(xstart(3)+k-1, mytype) / real(nz, mytype) &
109 |                    * real(m, mytype) / real(NFFT, mytype)
110 |               in(i,j,k) = cmplx(tmp1, 0._mytype, mytype)
111 |            end do
112 |         end do
113 |      end do
114 | 
115 |      ! This shows how to perform 3D FFT by using the FFTW basic interface.
116 |      ! Copy data to/from 1D buffers and loop through all 1D FFTs.
117 | 
118 |      ! 1D FFT in X
119 |      do k=1,xsize(3)
120 |         do j=1,xsize(2)
121 |            do i=1,xsize(1)
122 |               buf1(i) = in(i,j,k)
123 |            end do
124 | #ifdef DOUBLE_PREC
125 |            call dfftw_execute_dft(x_plan_f, buf1, buf2)
126 | #else
127 |            call sfftw_execute_dft(x_plan_f, buf1, buf2)
128 | #endif
129 |            do i=1,xsize(1)
130 |               in(i,j,k) = buf2(i)
131 |            end do
132 |         end do
133 |      end do
134 | 
135 |      ! ===== Swap X --> Y =====
136 |      call transpose_x_to_y(in,wk2)
137 | 
138 |      ! ===== 1D FFTs in Y =====
139 |      do k=1,ysize(3)
140 |         do i=1,ysize(1)
141 |            do j=1,ysize(2)
142 |               buf1(j) = wk2(i,j,k)
143 |            end do
144 | #ifdef DOUBLE_PREC
145 |            call dfftw_execute_dft(y_plan_f, buf1, buf2)
146 | #else
147 |            call sfftw_execute_dft(y_plan_f, buf1, buf2)
148 | #endif
149 |            do j=1,ysize(2)
150 |               wk2(i,j,k) = buf2(j)
151 |            end do
152 |         end do
153 |      end do
154 | 
155 |      ! ===== Swap Y --> Z =====
156 |      call transpose_y_to_z(wk2,out)
157 | 
158 |      ! ===== 1D FFTs in Z =====
159 |      do j=1,zsize(2)
160 |         do i=1,zsize(1)
161 |            do k=1,zsize(3)
162 |               buf1(k) = out(i,j,k)
163 |            end do
164 | #ifdef DOUBLE_PREC
165 |            call dfftw_execute_dft(z_plan_f, buf1, buf2)
166 | #else
167 |            call sfftw_execute_dft(z_plan_f, buf1, buf2)
168 | #endif
169 |            do k=1,zsize(3)
170 |               out(i,j,k) = buf2(k)
171 |            end do
172 |         end do
173 |      end do
174 | 
175 |      if (nrank==0) write(*,*) 'TEST ', m, out(1:2,1:2,1:2)
176 | 
177 |   end do ! NFFT
178 | 
179 |   t2 = MPI_WTIME() - t1
180 |   call MPI_ALLREDUCE(t2,t1,1,MPI_DOUBLE_PRECISION,MPI_SUM, &
181 |        MPI_COMM_WORLD,ierror)
182 |   t1 = t1 / real(nproc, mytype)
183 | 
184 |   if (nrank==0) then
185 |      write(*,*) 'Average Forward FFT Time(sec): ', t1
186 |   end if
187 | 
188 |   ! clean up
189 | #ifdef DOUBLE_PREC
190 |   call dfftw_destroy_plan(x_plan_f)
191 |   call dfftw_destroy_plan(x_plan_b)
192 |   call dfftw_destroy_plan(y_plan_f)
193 |   call dfftw_destroy_plan(y_plan_b)
194 |   call dfftw_destroy_plan(z_plan_f)
195 |   call dfftw_destroy_plan(z_plan_b)
196 | #else
197 |   call sfftw_destroy_plan(x_plan_f)
198 |   call sfftw_destroy_plan(x_plan_b)
199 |   call sfftw_destroy_plan(y_plan_f)
200 |   call sfftw_destroy_plan(y_plan_b)
201 |   call sfftw_destroy_plan(z_plan_f)
202 |   call sfftw_destroy_plan(z_plan_b)
203 | #endif
204 | 
205 |   call decomp_2d_finalize 
206 |   call MPI_FINALIZE(ierror)
207 |   deallocate(in,out,wk2,buf1,buf2)
208 |   
209 | 
210 | end program blocking
211 | 


--------------------------------------------------------------------------------
/examples/p3dfft/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | P3DFFT_HOME=$(HOME)/software/build/p3dfft-2.7.9-dimsc
 4 | FFTW3_HOME=$(HOME)/software/build/fftw-3.3
 5 | 
 6 | INCLUDE = -I../../include -I$(P3DFFT_HOME)/include
 7 | LIBS = -L../../lib -l2decomp_fft -L$(P3DFFT_HOME)/lib -lp3dfft $(LIBFFT) -L$(FFTW3_HOME)/lib -lfftw3
 8 | 
 9 | OBJ = p3dfft.o
10 | 
11 | p3dfft: $(OBJ)
12 | 	$(F90) -o $@ $(OBJ) $(LIBS)
13 | 
14 | clean:
15 | 	rm -f *.o p3dfft
16 | 
17 | %.o : %.f90
18 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
19 | 


--------------------------------------------------------------------------------
/examples/p3dfft/README.md:
--------------------------------------------------------------------------------
 1 | p3dfft
 2 | ------
 3 | 
 4 | This test program performs the following tasks:
 5 | 
 6 | * It crosschecks 2DECOMP&FFT results against P3DFFT results.
 7 | * It compares the performance of the two codes. 
 8 | 
 9 | **How to set up this test?**
10 | 
11 | Due to external dependency, this test has to be built separately after building 2DECOMP&FFT.
12 | 
13 | First, download P3DFFT version 2.7.9, the most recent 2.x release. Install P3DFFT at a directory denoted as `$P3DFFT_HOME` and set this path properly in the `Makefile`. To build P3DFFT :
14 | ```
15 | FC=mpif90 CC=mpicc LDFLAGS="-lm" ./configure --prefix=$HOME/software/build/p3dfft-2.7.9-dimsc \
16 | --enable-gnu --enable-fftw --with-fftw=$HOME/software/build/fftw-3.3.9 --enable-openmpi --enable-dimsc
17 | make
18 | make install
19 | ```
20 | This instruction is on a workstation with gcc 8.4.1 and OpenMPI 4.0.5. Adapt this accordlingly. Note that P3DFFT is built with `-DDIMS_C` flag (enabling same decomposition as 2DECOMP&FFT for a fair comparison). P3DFFT needs to link to *libm* but somehow its build system doesn't handle this correctly, thus the added `LDFLAGS` setting being required.
21 | 
22 | Both P3DFFT and 2DECOMP&FFT are built in double precision. P3DFFT is built against FFTW (provide its path in `Makefile`); 2DECOMP&FFT can use any FFT engine.
23 | 
24 | **What to expect:**
25 | 
26 | * Results from P3DFFT and 2DECOMP&FFT should be almost identical, even when different FFT engines are used.
27 | * Each library should recover its input after one forward and one backward transform with errors reported close to machine accuracy.
28 | 


--------------------------------------------------------------------------------
/examples/tecplot_view/2decomp_decomp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/examples/tecplot_view/2decomp_decomp.png


--------------------------------------------------------------------------------
/examples/tecplot_view/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft
 5 | 
 6 | OBJ = tecplot_view.o
 7 | 
 8 | tecplot_view: $(OBJ)
 9 | 	$(F90) -o $@ $(OBJ) $(LIBS)
10 | 
11 | clean:
12 | 	rm -f *.o tecplot_view
13 | 
14 | %.o : %.f90
15 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
16 | 


--------------------------------------------------------------------------------
/examples/tecplot_view/README:
--------------------------------------------------------------------------------
 1 | tecplot_view
 2 | ------------
 3 | 
 4 | This program generates data to visualise the 2D decomposition options.
 5 | 
 6 | To run: use 12 MPI processes.
 7 | 
 8 | What to expect: several data files are written to disk. These are in the 
 9 | standard format that can be read by Tecplot (a visualisation package popular 
10 | in the CFD community).
11 | 
12 | There is also an interactive web page at http://www.2decomp.org/decomp.php 
13 | to demonstrate the data distribution in a 2D decomposition.
14 | 


--------------------------------------------------------------------------------
/examples/tecplot_view/tecplot_view.f90:
--------------------------------------------------------------------------------
  1 | program tecplot_view
  2 |   
  3 |   use decomp_2d
  4 |   use decomp_2d_io
  5 |   use MPI
  6 | 
  7 |   implicit none
  8 | 
  9 |   integer, parameter :: nx=17, ny=13, nz=11
 10 |   integer, parameter :: p_row=4, p_col=3
 11 | 
 12 |   real(mytype), dimension(nx,ny,nz) :: data1
 13 |   
 14 |   integer, dimension(3) :: lstart, lend, lsize
 15 | 
 16 |   integer :: i,j,k, m, ierror
 17 |   
 18 |   call MPI_INIT(ierror)
 19 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
 20 | 
 21 |   ! a copy of global data saved on every rank
 22 |   m = 1
 23 |   do k=1,nz
 24 |      do j=1,ny
 25 |         do i=1,nx
 26 |            data1(i,j,k) = real(m, mytype)
 27 |            m = m+1
 28 |         end do
 29 |      end do
 30 |   end do
 31 | 
 32 |   ! master rank generated a Tecplot view of the global data 
 33 |   if (nrank==0) then
 34 |      open(10,file='data0.dat',form='formatted')
 35 |      write(10,*) 'TITLE="Tecplot Output"'
 36 |      write(10,*) 'VARIABLES= "X" "Y" "Z" "VAR"'
 37 |      write(10,*) 'ZONE F=POINT I=',nx,' J=',ny,' ,K=',nz
 38 |      do k=1,nz
 39 |         do j=1,ny
 40 |            do i=1,nx
 41 |               write(10,*) i,j,k, data1(i,j,k)
 42 |            end do
 43 |         end do
 44 |      end do
 45 |      close(10)
 46 |   end if
 47 |   
 48 |   ! Generate Tecplot views of the decompositions
 49 |   ! -------------------------------------------- 
 50 |   ! For each pencil orientation there are two ways decomposing: 
 51 |   ! p_row*p_col or p_col*p_row. One set is used in 2DECOMP and is 
 52 |   ! described by the library's global variables. The other set is
 53 |   ! generated here for visualisation.
 54 |   
 55 |   ! (/ 1,2,3 /)
 56 |   call tecplot(nx, ny, nz, data1, xstart, xend, xsize, 'data1.dat')
 57 |   ! (/ 2,1,3 /)
 58 |   call tecplot(nx, ny, nz, data1, ystart, yend, ysize, 'data2.dat')
 59 |   ! (/ 2,3,1 /)
 60 |   call tecplot(nx, ny, nz, data1, zstart, zend, zsize, 'data3b.dat')
 61 | 
 62 |   call partition(nx, ny, nz, (/ 1,3,2 /), lstart, lend, lsize)
 63 |   call tecplot(nx, ny, nz, data1, lstart, lend, lsize, 'data1b.dat')
 64 |   call partition(nx, ny, nz, (/ 3,1,2 /), lstart, lend, lsize)
 65 |   call tecplot(nx, ny, nz, data1, lstart, lend, lsize, 'data2b.dat')
 66 |   call partition(nx, ny, nz, (/ 3,2,1 /), lstart, lend, lsize)
 67 |   call tecplot(nx, ny, nz, data1, lstart, lend, lsize, 'data3.dat')
 68 | 
 69 |   call decomp_2d_finalize
 70 |   call MPI_FINALIZE(ierror)
 71 | 
 72 | end program tecplot_view
 73 | 
 74 | 
 75 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 76 | ! Generate Tecplot files to visualise the 2D decompositions
 77 | !   - each rank corresponds to a Tecplot 'zone'
 78 | !   - rank 0 handles all I/O
 79 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 80 | 
 81 | subroutine tecplot(nx, ny, nz, data1, lstart, lend, lsize, filename)
 82 | 
 83 |   use decomp_2d
 84 |   use MPI
 85 | 
 86 |   implicit none
 87 | 
 88 |   integer, intent(IN) :: nx ,ny ,nz
 89 |   real(mytype), dimension(nx,ny,nz), intent(IN) :: data1
 90 |   integer, dimension(3), intent(IN) :: lstart, lend, lsize
 91 |   character(len=*), intent(IN) :: filename
 92 | 
 93 |   real(mytype), dimension(nx,ny,nz) :: data1b
 94 |   real(mytype), allocatable, dimension(:,:,:) :: local_data, rbuf
 95 |   integer, dimension(9) :: sbuf1, rbuf1
 96 |   integer, dimension(MPI_STATUS_SIZE) :: status
 97 |   character(len=7) :: tempstr
 98 |   integer :: i,j,k, m, ierror
 99 | 
100 |   ! data1 holds the first copy of global data, generated locally
101 |   ! data1b holds a second copy of global data, collected via communication
102 | 
103 |   ! each rank holds its local data
104 |   allocate (local_data(lstart(1):lend(1),lstart(2):lend(2), &
105 |        lstart(3):lend(3)))
106 |   do k=lstart(3),lend(3)
107 |      do j=lstart(2),lend(2)
108 |         do i=lstart(1),lend(1)
109 |            local_data(i,j,k) = data1(i,j,k)
110 |         end do
111 |      end do
112 |   end do
113 | 
114 |   if (nrank==0) then
115 | 
116 |      ! master writes file header, collect data from each slave process,
117 |      ! and write as seraprate Tecplot zones
118 |      open(10,file=filename,form='formatted')
119 |      write(10,*) 'TITLE="Tecplot Output"'
120 |      write(10,*) 'VARIABLES= "X" "Y" "Z" "VAR"'
121 |      write(10,*) 'ZONE F=POINT T="Rank 00" I=',lsize(1),' J=',lsize(2), &
122 |           ' ,K=',lsize(3)
123 |      do k=lstart(3),lend(3)
124 |         do j=lstart(2),lend(2)
125 |            do i=lstart(1),lend(1)
126 |               write(10,*) i,j,k, local_data(i,j,k)
127 |               ! master copies its local data to the second global array
128 |               data1b(i,j,k)=local_data(i,j,k)
129 |            end do
130 |         end do
131 |      end do
132 | 
133 |      ! loop through all other ranks to receive data and write
134 |      do m=1,nproc-1
135 |         CALL MPI_RECV(rbuf1,9,MPI_INTEGER,m,m,MPI_COMM_WORLD, &
136 |              status,ierror)
137 |         write(tempstr,100)'Rank ',m
138 | 100     format(A,I2.2)
139 |         write(10,*) 'ZONE F=POINT T="', tempstr, '" I=',rbuf1(3), &
140 |              ' J=',rbuf1(6), ' ,K=',rbuf1(9)
141 |         allocate (rbuf(rbuf1(1):rbuf1(2),rbuf1(4):rbuf1(5), &
142 |              rbuf1(7):rbuf1(8)))
143 |         CALL MPI_RECV(rbuf,rbuf1(3)*rbuf1(6)*rbuf1(9),real_type,m, &
144 |              m+nproc,MPI_COMM_WORLD,status,ierror)
145 |         do k=rbuf1(7),rbuf1(8)
146 |            do j=rbuf1(4),rbuf1(5)
147 |               do i=rbuf1(1),rbuf1(2)
148 |                  write(10,*) i,j,k, rbuf(i,j,k)
149 |                  ! data received copied to global array
150 |                  data1b(i,j,k)=rbuf(i,j,k)
151 |               end do
152 |            end do
153 |         end do
154 |         deallocate(rbuf)
155 |      end do
156 | 
157 |      close (10)
158 | 
159 |      ! check if data set collected via communication is correct
160 |      do k=1,nz
161 |         do j=1,ny
162 |            do i=1,nx
163 |               if (abs(data1b(i,j,k)-data1(i,j,k)) > 1.0e-5) then
164 |                  stop "error"
165 |               end if
166 |            end do
167 |         end do
168 |      end do
169 |      
170 |   else
171 | 
172 |      ! slaves send data to the master
173 |      sbuf1(1) = lstart(1)
174 |      sbuf1(2) = lend(1)
175 |      sbuf1(3) = lsize(1)
176 |      sbuf1(4) = lstart(2)
177 |      sbuf1(5) = lend(2)
178 |      sbuf1(6) = lsize(2)
179 |      sbuf1(7) = lstart(3)
180 |      sbuf1(8) = lend(3)
181 |      sbuf1(9) = lsize(3)
182 |      CALL MPI_SEND(sbuf1,9,MPI_INTEGER,0,nrank,MPI_COMM_WORLD,ierror)
183 |      CALL MPI_SEND(local_data,lsize(1)*lsize(2)*lsize(3),real_type,0, &
184 |           nrank+nproc,MPI_COMM_WORLD,ierror)
185 | 
186 |   endif
187 | 
188 |   deallocate(local_data)
189 |   return
190 | 
191 | end subroutine tecplot
192 | 


--------------------------------------------------------------------------------
/examples/test2d/.gitignore:
--------------------------------------------------------------------------------
1 | test2d
2 | 


--------------------------------------------------------------------------------
/examples/test2d/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft
 5 | 
 6 | OBJ = test2d.o
 7 | 
 8 | test2d: $(OBJ)
 9 | 	$(F90) -o $@ $(OBJ) $(LIBS)
10 | 
11 | clean:
12 | 	rm -f *.o test2d u*.dat
13 | 
14 | %.o : %.f90
15 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
16 | 


--------------------------------------------------------------------------------
/examples/test2d/README:
--------------------------------------------------------------------------------
 1 | test2d
 2 | ------
 3 | 
 4 | This program is to validate the 2D pencil decomposition library. It transposes
 5 | a set of data into different storage formats so that all communication routines
 6 | are tested. The test code also demonstrates to use of the I/O library.
 7 | 
 8 | To run: use 12 MPI processes.
 9 | 
10 | What to expect: many files are written to disk. Regardless of the pencil-
11 | orientation of the distributed data, files written collectively by all MPI
12 | processes should contain identical information. For example, u1.dat, u2.dat,
13 | u3.dat, u2b.dat and u1b.dat should be all identical.
14 | 


--------------------------------------------------------------------------------
/examples/test2d/test2d.f90:
--------------------------------------------------------------------------------
  1 | program test2d
  2 |   
  3 |   use decomp_2d
  4 |   use decomp_2d_io
  5 | 
  6 |   implicit none
  7 | 
  8 |   integer, parameter :: nx=17, ny=13, nz=11
  9 |   integer, parameter :: p_row=4, p_col=3
 10 | 
 11 |   real(mytype), dimension(nx,ny,nz) :: data1
 12 |   
 13 |   real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
 14 |   
 15 |   integer :: i,j,k, m, ierror
 16 |   
 17 |   call MPI_INIT(ierror)
 18 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
 19 | 
 20 |   ! ***** global data *****
 21 |   m = 1
 22 |   do k=1,nz
 23 |      do j=1,ny
 24 |         do i=1,nx
 25 |            data1(i,j,k) = float(m)
 26 |            m = m+1
 27 |         end do
 28 |      end do
 29 |   end do
 30 | 
 31 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 32 |   ! Testing the swap routines
 33 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 34 |   
 35 |   !allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
 36 |   !allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
 37 |   !allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
 38 |   call alloc_x(u1, opt_global=.true.)
 39 |   call alloc_y(u2, opt_global=.true.)
 40 |   call alloc_z(u3, opt_global=.true.)
 41 |  
 42 |   ! original x-pensil based data 
 43 |   do k=xstart(3),xend(3)
 44 |     do j=xstart(2),xend(2)
 45 |       do i=xstart(1),xend(1)
 46 |         u1(i,j,k) = data1(i,j,k)
 47 |       end do
 48 |     end do
 49 |   end do
 50 | 
 51 | 10 format(15I5)
 52 | 
 53 |   if (nrank==0) then 
 54 |      write(*,*) 'Numbers held on Rank 0'
 55 |      write(*,*) ' '
 56 |      write(*,*) 'X-pencil'
 57 |      write(*,10) int(u1)
 58 |   end if
 59 | 
 60 |   call decomp_2d_write_one(1,u1,'u1.dat')
 61 | 
 62 |   !!!!!!!!!!!!!!!!!!!!!!!
 63 |   ! x-pensil ==> y-pensil
 64 |   call transpose_x_to_y(u1,u2)
 65 | 
 66 |   if (nrank==0) then
 67 |      write(*,*) ' '
 68 |      write(*,*) 'Y-pencil'
 69 |      write(*,10) int(u2)
 70 |   end if
 71 | 
 72 |   call decomp_2d_write_one(2,u2,'u2.dat')
 73 |   ! 'u1.dat' and 'u2.dat' should be identical byte-by-byte 
 74 | 
 75 |   ! also check the transposition this way
 76 |   do k=ystart(3),yend(3)
 77 |     do j=ystart(2),yend(2)
 78 |       do i=ystart(1),yend(1)
 79 |         if (abs(u2(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping x->y"
 80 |       end do
 81 |     end do
 82 |   end do
 83 | 
 84 |   !!!!!!!!!!!!!!!!!!!!!!!
 85 |   ! y-pensil ==> z-pensil
 86 |   call transpose_y_to_z(u2,u3)
 87 | 
 88 |   if (nrank==0) then
 89 |      write(*,*) ' '
 90 |      write(*,*) 'Z-pencil'
 91 |      write(*,10) int(u3)
 92 |   end if
 93 | 
 94 |   call decomp_2d_write_one(3,u3,'u3.dat')
 95 |   ! 'u1.dat','u2.dat' and 'u3.dat' should be identical
 96 | 
 97 |   do k=zstart(3),zend(3)
 98 |     do j=zstart(2),zend(2)
 99 |       do i=zstart(1),zend(1)
100 |         if (abs(u3(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping y->z"
101 |       end do
102 |     end do
103 |   end do
104 | 
105 |   !!!!!!!!!!!!!!!!!!!!!!!
106 |   ! z-pensil ==> y-pensil
107 |   call transpose_z_to_y(u3,u2)
108 |   call decomp_2d_write_one(2,u2,'u2b.dat')
109 | 
110 |   do k=ystart(3),yend(3)
111 |     do j=ystart(2),yend(2)
112 |       do i=ystart(1),yend(1)
113 |         if (abs(u2(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping z->y"
114 |       end do
115 |     end do
116 |   end do
117 | 
118 |   !!!!!!!!!!!!!!!!!!!!!!!
119 |   ! y-pensil ==> x-pensil
120 |   call transpose_y_to_x(u2,u1)
121 |   call decomp_2d_write_one(1,u1,'u1b.dat')
122 | 
123 |   do k=xstart(3),xend(3)
124 |     do j=xstart(2),xend(2)
125 |       do i=xstart(1),xend(1)
126 |         if (abs(u1(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping y->x"
127 |       end do
128 |     end do
129 |   end do
130 | 
131 |   call decomp_2d_finalize 
132 |   call MPI_FINALIZE(ierror)
133 |   deallocate(u1,u2,u3)
134 | 
135 | end program test2d
136 | 
137 | 


--------------------------------------------------------------------------------
/examples/timing/.gitignore:
--------------------------------------------------------------------------------
1 | timing
2 | 


--------------------------------------------------------------------------------
/examples/timing/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../src/Makefile.inc
 2 | 
 3 | INCLUDE = -I../../include
 4 | LIBS = -L../../lib -l2decomp_fft $(LIBFFT)
 5 | 
 6 | OBJ = timing.o
 7 | 
 8 | timing: $(OBJ)
 9 | 	$(F90) -o $@ $(OBJ) $(LIBS)
10 | 
11 | clean:
12 | 	rm -f *.o timing
13 | 
14 | %.o : %.f90
15 | 	$(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $<
16 | 


--------------------------------------------------------------------------------
/examples/timing/README:
--------------------------------------------------------------------------------
 1 | timing
 2 | ------
 3 | 
 4 | This program can be used to benchmark the performance of the FFT library. 
 5 | Timings for both c2c and r2c/c2r transforms are collected. Also the FFT
 6 | library is validated as the input data should be recovered to machine
 7 | accuracy after a forward and a backward transform and proper normalisation.
 8 | 
 9 | To run: set a proper problem size and number of MPI processes to use (these
10 | are hardware dependent).
11 | 
12 | What to expect:
13 | - The timing results
14 | - The error reported should be around machine accuracy (~ 10^-6 for single
15 |   precision and 10^-15 for double)
16 | 


--------------------------------------------------------------------------------
/examples/timing/timing.f90:
--------------------------------------------------------------------------------
  1 | program fft_timing
  2 | 
  3 |   use decomp_2d
  4 |   use decomp_2d_fft
  5 |   use MPI
  6 |   
  7 |   implicit none
  8 |   
  9 |   integer, parameter :: nx=17, ny=13, nz=11
 10 |   integer, parameter :: p_row=0, p_col=0
 11 |   
 12 |   integer, parameter :: NTEST = 10  ! repeat test this times
 13 |   
 14 |   complex(mytype), allocatable, dimension(:,:,:) :: in, out
 15 |   real(mytype), allocatable, dimension(:,:,:) :: in_r
 16 |   
 17 |   integer, dimension(3) :: fft_start, fft_end, fft_size
 18 |   
 19 |   real(mytype) :: dr,di, err, err_all, n1,flops
 20 |   integer :: ierror, i,j,k,m
 21 |   double precision :: t1, t2, t3 ,t4
 22 |   
 23 |   call MPI_INIT(ierror)
 24 |   call decomp_2d_init(nx,ny,nz,p_row,p_col)
 25 | 
 26 | 
 27 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 28 |   ! Test the c2c interface
 29 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 30 | 
 31 |   call decomp_2d_fft_init(PHYSICAL_IN_Z) ! non-default Z-pencil input
 32 | 
 33 |   !  input is Z-pencil data
 34 |   ! output is X-pencil data
 35 |   allocate (in(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
 36 |   allocate (out(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
 37 |   ! initilise input
 38 |   do k=zstart(3),zend(3)
 39 |      do j=zstart(2),zend(2)
 40 |         do i=zstart(1),zend(1)
 41 |            dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
 42 |                 /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
 43 |            di = dr
 44 |            in(i,j,k) = cmplx(dr,di,mytype)
 45 |         end do
 46 |      end do
 47 |   end do
 48 |   
 49 |   t2 = 0.0D0
 50 |   t4 = 0.0D0
 51 |   do m=1,NTEST
 52 |      
 53 |      ! forward FFT
 54 |      t1 = MPI_WTIME()
 55 |      call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
 56 |      t2 = t2 + MPI_WTIME() - t1
 57 | 
 58 |      ! inverse FFT
 59 |      t3 = MPI_WTIME()
 60 |      call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
 61 |      t4 = t4 + MPI_WTIME() - t3
 62 |   
 63 |      ! normalisation - note 2DECOMP&FFT doesn't normalise
 64 |      in = in / real(nx,mytype) / real(ny,mytype) /real(nz,mytype)
 65 | 
 66 |   end do
 67 |   
 68 |   call MPI_ALLREDUCE(t2,t1,1,MPI_DOUBLE_PRECISION,MPI_SUM, &
 69 |        MPI_COMM_WORLD,ierror)
 70 |   t1 = t1 / real(nproc,mytype)
 71 |   call MPI_ALLREDUCE(t4,t3,1,MPI_DOUBLE_PRECISION,MPI_SUM, &
 72 |        MPI_COMM_WORLD,ierror)
 73 |   t3 = t3 / real(nproc,mytype)
 74 |   
 75 |   ! checking accuracy
 76 |   err = 0.
 77 |   do k=zstart(3),zend(3)
 78 |      do j=zstart(2),zend(2)
 79 |         do i=zstart(1),zend(1)
 80 |            dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
 81 |                 /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
 82 |            di = dr
 83 |            dr = dr - real(in(i,j,k),mytype)
 84 |            di = di - aimag(in(i,j,k))
 85 |            err = err + sqrt(dr*dr + di*di)
 86 |         end do
 87 |      end do
 88 |   end do
 89 |   call MPI_ALLREDUCE(err,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror)
 90 |   err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype)
 91 | 
 92 |   if (nrank==0) then
 93 |      write(*,*) '===== c2c interface ====='
 94 |      write(*,*) 'error / mesh point: ', err_all 
 95 |      write(*,*) 'time (sec): ', t1,t3
 96 |      n1 = real(nx,mytype) * real(ny,mytype) * real(nz,mytype)
 97 |      n1 = n1 ** (1._mytype/3._mytype)
 98 |      ! 5n*log(n) flops per 1D FFT of size n using Cooley-Tukey algorithm
 99 |      flops = 5._mytype * n1 * log(n1) / log(2.0_mytype)
100 |      ! 3 sets of 1D FFTs for 3 directions, each having n^2 1D FFTs
101 |      flops = flops * 3._mytype * n1**2  
102 |      flops = 2._mytype * flops / ((t1+t3)/real(NTEST,mytype))
103 |      write(*,*) 'GFLOPS : ', flops / 1000._mytype**3
104 |   end if
105 |   
106 |   deallocate(in,out)
107 |   call decomp_2d_fft_finalize
108 | 
109 | 
110 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
111 |   ! Test the r2c/c2r interface
112 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
113 |   call decomp_2d_fft_init
114 |   
115 |   allocate (in_r(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
116 |   call decomp_2d_fft_get_size(fft_start,fft_end,fft_size)
117 |   allocate (out(fft_start(1):fft_end(1), &
118 |        fft_start(2):fft_end(2), &
119 |        fft_start(3):fft_end(3)))
120 |   
121 |   ! initilise input
122 |   do k=xstart(3),xend(3)
123 |      do j=xstart(2),xend(2)
124 |         do i=xstart(1),xend(1)
125 |            in_r(i,j,k) = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
126 |                 /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
127 |         end do
128 |      end do
129 |   end do
130 |   
131 |   t2 = 0.0D0
132 |   t4 = 0.0D0
133 |   do m=1,NTEST
134 |   
135 |      ! 3D r2c FFT
136 |      t1 = MPI_WTIME()
137 |      call decomp_2d_fft_3d(in_r, out)
138 |      t2 = t2 + MPI_WTIME() - t1
139 |   
140 |      ! 3D inverse FFT
141 |      t3 = MPI_WTIME()
142 |      call decomp_2d_fft_3d(out, in_r)
143 |      t4 = t4 + MPI_WTIME() - t3
144 |   
145 |      ! normalisation - note 2DECOMP&FFT doesn't normalise
146 |      do k=xstart(3),xend(3)
147 |         do j=xstart(2),xend(2)
148 |            do i=xstart(1),xend(1)
149 |               in_r(i,j,k) = in_r(i,j,k) &
150 |                    / (real(nx,mytype)*real(ny,mytype)*real(nz,mytype))
151 |            end do
152 |         end do
153 |      end do
154 | 
155 |   end do
156 |   
157 |   call MPI_ALLREDUCE(t2,t1,1,MPI_DOUBLE_PRECISION,MPI_SUM, &
158 |        MPI_COMM_WORLD,ierror)
159 |   t1 = t1 / real(nproc,mytype)
160 |   call MPI_ALLREDUCE(t4,t3,1,MPI_DOUBLE_PRECISION,MPI_SUM, &
161 |        MPI_COMM_WORLD,ierror)
162 |   t3 = t3 / real(nproc,mytype)
163 |   
164 |   ! checking accuracy
165 |   err = 0.
166 |   do k=xstart(3),xend(3)
167 |      do j=xstart(2),xend(2)
168 |         do i=xstart(1),xend(1)
169 |            dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
170 |                 /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
171 |            err = err + abs(in_r(i,j,k)-dr)
172 |         end do
173 |      end do
174 |   end do
175 |   call MPI_ALLREDUCE(err,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror)
176 |   err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype)
177 |   
178 |   if (nrank==0) then
179 |      write(*,*) '===== r2c/c2r interface ====='
180 |      write(*,*) 'error / mesh point: ', err_all
181 |      write(*,*) 'time (sec): ', t1,t3
182 |   end if
183 |   
184 |   deallocate(in_r,out)
185 |   call decomp_2d_fft_finalize
186 |   call decomp_2d_finalize
187 |   call MPI_FINALIZE(ierror)
188 | 
189 | end program fft_timing
190 | 
191 | 


--------------------------------------------------------------------------------
/include/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/lib/Makefile:
--------------------------------------------------------------------------------
 1 | VPATH=../src
 2 | 
 3 | LIBNAME:=lib2decomp_fft.a
 4 | 
 5 | all: lib
 6 | 
 7 | .PHONY: lib
 8 | 
 9 | lib:
10 | 	cd ../src ; $(MAKE) $@
11 | 
12 | clean:
13 | 	rm -f $(LIBNAME)
14 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | ########################################################################
 2 | # Makefile for building the 2DECOMP&FFT library
 3 | ########################################################################
 4 | 
 5 | # Normally, do not change anything here. Modify the platform-dependent
 6 | # Makefile.inc file instead
 7 | 
 8 | include Makefile.inc
 9 | 
10 | SRCS = decomp_2d.f90 io.f90 glassman.f90 fft_$(FFT).f90
11 | 
12 | # special treatment for shared-memory code
13 | ifneq (,$(findstring DSHM,$(OPTIONS)))
14 | SRCS := FreeIPC.f90 $(SRCS)
15 | OBJS =	$(SRCS:.f90=.o) alloc_shm.o FreeIPC_c.o
16 | else
17 | OBJS =	$(SRCS:.f90=.o)
18 | endif
19 | 
20 | # special treatment for Intel MKL (need to build MKL mod files)
21 | MKL_MOD=
22 | ifeq ($(FFT),mkl)
23 |    MKL_MOD=mkl_mod
24 | endif
25 | 
26 | # spcial treatment for CUDA
27 | ifeq ($(FFT),cufft)
28 |    OBJS += cuda_fft_1m.o
29 | endif
30 | 
31 | 
32 | all: lib examples
33 | 
34 | lib: includes lib2decomp_fft.a
35 | 	mv lib2decomp_fft.a ../lib
36 | 
37 | includes: lib2decomp_fft.a
38 | 	mv *.mod ../include
39 | 
40 | cuda_fft_1m.o: cuda_fft_1m.cu
41 | 	$(CUDA_PATH)/bin/nvcc -c $<
42 | 
43 | lib2decomp_fft.a: $(MKL_MOD) $(OBJS)
44 | 	ar qc $@ $(OBJS)
45 | 
46 | alloc_shm.o: alloc_shm.c
47 | 	$(CC) $(CFLAGS) -c $<
48 | 
49 | FreeIPC_c.o: FreeIPC_c.c
50 | 	$(CC) $(CFLAGS) -c $<
51 | 
52 | .PHONY: mkl_mod
53 | mkl_mod:
54 | 	echo Building Intel MKL mod files...
55 | 	$(IFORT) -c $(MKL_ROOT)/include/mkl_dfti.f90
56 | 
57 | .PHONY: examples
58 | examples: lib
59 | 
60 | .PHONY: test
61 | test: examples
62 | 
63 | .PHONY: clean
64 | clean:
65 | 	rm -f *.o *.mod lib*.a
66 | 
67 | .PHONY: realclean
68 | realclean: clean
69 | 	rm -f *~ \#*\#
70 | 
71 | %.o : %.f90
72 | 	$(F90) $(OPTIONS) $(F90FLAGS) -c $<
73 | 


--------------------------------------------------------------------------------
/src/Makefile.inc:
--------------------------------------------------------------------------------
  1 | # Configurations on an ordinary Linux PCs for development
  2 | #   using GNU compiler, OpenMPI and various FFT libraries
  3 | 
  4 | # 2DECOMP&FFT options
  5 | #====================
  6 | 
  7 | # Choose pre-processing options
  8 | #   -DDOUBLE_PREC   - use double-precision (default single)
  9 | #   -DEVEN          - for evenly distributed data, use ALLTOALL
 10 | #   -DSHM	        - enable system V shared-memory implementation
 11 | #   -DOVERWRITE     - allow FFT input to be overwritten (save memory)
 12 | #   -DT3PIO         - turn on LUSTRE IO optimisation code using T3PIO
 13 | OPTIONS=
 14 | 
 15 | # Choose one FFT engine, available options are:
 16 | #   acml       - AMD Core Math Library
 17 | #   cufft      - cuFFT, the CUDA Fast Fourier Transform library
 18 | #   ffte       - FFTE
 19 | #   fftpack5   - FFTPACK5
 20 | #   fftw3      - FFTW version 3.x
 21 | #   fftw3_f03  - FFTW 3.3-beta1 or later (with Fortran 2003 interface)
 22 | #   generic    - A general FFT algorithm (no 3rd-party library needed)
 23 | #   mkl        - Intel Math Kernel Library
 24 | FFT=generic
 25 | 
 26 | # Platform-dependent information - compiler, external library etc
 27 | #================================================================
 28 | 
 29 | # Inlcude path if necessary (change to your actual paths)
 30 | MKL_ROOT=/opt/intel/oneapi/mkl/latest
 31 | FFTW_PATH=$(HOME)/software/build/fftw-3.3.9
 32 | CUDA_PATH=/usr/local/cuda-11.1
 33 | ifeq ($(FFT),mkl)
 34 |   INC=-I$(MKL_ROOT)/include
 35 |   # Fortran compiler used to compile MKL mod files
 36 |   IFORT=gfortran
 37 | else ifeq ($(FFT),fftw3)
 38 |   INC=-I$(FFTW_PATH)/include
 39 | else ifeq ($(FFT),fftw3_f03)
 40 |   INC=-I$(FFTW_PATH)/include
 41 | else ifeq ($(FFT),cufft)
 42 |   INC=-I$(CUDA_PATH)/include
 43 | else
 44 |   INC=
 45 | endif
 46 | 
 47 | ifneq (,$(findstring DT3PIO,$(OPTIONS)))
 48 |   T3PIO_PATH=/opt/t3pio
 49 |   INC+= -I$(T3PIO_PATH)/include
 50 | endif
 51 | 
 52 | 
 53 | #----------------------- Fortran Compiler ----------------------------
 54 | F90=mpif90
 55 | 
 56 | # enable preprocessing
 57 | CPPFLAGS=-cpp
 58 | # enable Cray pointer support if needed
 59 | CRAYPTR=-fcray-pointer
 60 | # optimisation or debugging flags
 61 | #OPTIM=-g -fcheck=all
 62 | OPTIM=-O3
 63 | 
 64 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC)
 65 | LDFLAGS=$(OPTIM)
 66 | 
 67 | #--------------------------C Compiler---------------------------------
 68 | CC=mpicc
 69 | CFLAGS=-O3
 70 | 
 71 | #-----------------------External Library------------------------------
 72 | 
 73 | # For FFTW
 74 | LIB_FFTW3=-L$(FFTW_PATH)/lib -lfftw3f -lfftw3
 75 | 
 76 | # For ACML
 77 | # This assumes that 32-bit ACML installed at /opt/. Adjust properly.
 78 | # It helps to create a symbolic link such as 'acml -> acml4.4.0'
 79 | # and update the symbolic link when future ACML version is installed.
 80 | LIB_ACML=/opt/acml/gfortran32/lib/libacml.a -lrt
 81 | 
 82 | # For FFTPACK5
 83 | FFTPACK5_PATH=$(HOME)/software/fftpack5
 84 | LIB_FFTPACK5=$(FFTPACK5_PATH)/libfftpack5.a
 85 | 
 86 | # For Intel MKL
 87 | MKL_LIB_PATH= $(MKL_ROOT)/lib/intel64
 88 | LIB_MKL=-Wl,--start-group $(MKL_LIB_PATH)/libmkl_gf_lp64.a $(MKL_LIB_PATH)/libmkl_sequential.a $(MKL_LIB_PATH)/libmkl_core.a -Wl,--end-group -lpthread -lm -ldl
 89 | 
 90 | # For FFTE
 91 | FFTE_PATH=$(HOME)/software/ffte-4.1
 92 | LIB_FFTE=$(FFTE_PATH)/libffte.a
 93 | 
 94 | # For cuFFT
 95 | LIB_CUFFT=-L$(CUDA_PATH)/lib64 -lcudart -lcufft
 96 | 
 97 | ifeq ($(FFT),generic)
 98 |      LIBFFT=
 99 | else ifeq ($(FFT),acml)
100 |      LIBFFT=$(LIB_ACML)
101 | else ifeq ($(FFT),ffte)
102 |      LIBFFT=$(LIB_FFTE)
103 | else ifeq ($(FFT),fftpack5)
104 |      LIBFFT=$(LIB_FFTPACK5)
105 | else ifeq ($(FFT),fftw3)
106 |      LIBFFT=$(LIB_FFTW3)
107 | else ifeq ($(FFT),fftw3_f03)
108 |      LIBFFT=$(LIB_FFTW3)
109 | else ifeq ($(FFT),mkl)
110 |      LIBFFT=$(LIB_MKL)
111 | else ifeq ($(FFT),cufft)
112 |      LIBFFT=$(LIB_CUFFT)
113 | endif
114 | 


--------------------------------------------------------------------------------
/src/Makefile.inc.BlueGene:
--------------------------------------------------------------------------------
 1 | # Configurations for IBM BlueGene systems
 2 | #   using IBM XL compilers
 3 | 
 4 | # 2DECOMP&FFT options
 5 | #====================
 6 | 
 7 | # Choose pre-processing options
 8 | #   -DDOUBLE_PREC - use double-precision (default single)
 9 | #   -DEVEN        - for evenly distributed data, use ALLTOALL
10 | #   -DOVERWRITE   - allow FFT input to be overwritten (save memory)
11 | OPTION=-DDOUBLE_PREC -DOVERWRITE
12 | 
13 | # Choose one FFT engine, available options are:
14 | #   essl       - IBM Engineering and Scientific Subroutine Library
15 | #   fftw3      - FFTW version 3.x
16 | #   fftw3_f03  - FFTW 3.3-beta1 or later (with Fortran 2003 interface)
17 | #   generic    - A general FFT algorithm (no 3rd-party library needed)
18 | FFT=generic
19 | 
20 | # Platform-dependent information
21 | #===============================
22 | 
23 | # special syntax for IBM XL compiler's preprocessing
24 | # instead of using "-DTAG1 -DTAG2", XL needs "-WF,-DTAG1 -WF,-DTAG2"
25 | from:=-D
26 | to:=-WF,-D
27 | TMP=$(subst $(from),$(to),$(OPTION))
28 | OPTIONS=$(TMP)
29 | 
30 | # The path of the base BlueGene system software
31 | BGP_SYS=/bgsys/drivers/V1R4M1_460_2009-091110P/ppc/comm/xl
32 | 
33 | # Inlcude path
34 | INC=
35 | 
36 | #----------------------- Fortran Compiler ----------------------------
37 | F90=$(BGP_SYS)/bin/mpixlf90_r
38 | 
39 | # enable preprocessing
40 | CPPFLAGS=-qsuffix=cpp=f90
41 | 
42 | # enable Cray pointer support if needed
43 | CRAYPTR=
44 | 
45 | # optimisation or debugging flags
46 | OPTIM=-O3 -qarch=450d -qtune=450
47 | 
48 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC)
49 | LDFLAGS=$(OPTIM)
50 | 
51 | #--------------------------C Compiler---------------------------------
52 | CC=mpixlc_r
53 | CFLAGS=-O3
54 | 
55 | #-----------------------External Library------------------------------
56 | 
57 | # for FFTW
58 | LIB_FFTW3=-lfftw3f -lfftw3 -L  # supply path to FFTW3 here 
59 | 
60 | # for ESSL
61 | LIB_ESSL=-L$(BGP_SYS)/lib -L/opt/ibmmath/lib -lesslbg
62 | 
63 | ifeq ($(FFT),generic)
64 |      LIBFFT=
65 | else ifeq ($(FFT),fftw3)
66 |      LIBFFT=$(LIB_FFTW3)
67 | else ifeq ($(FFT),fftw3_f03)
68 |      LIBFFT=$(LIB_FFTW3)
69 | else ifeq ($(FFT),essl)
70 |      LIBFFT=$(LIB_ESSL)
71 | endif
72 | 


--------------------------------------------------------------------------------
/src/Makefile.inc.Cray_XE:
--------------------------------------------------------------------------------
 1 | # Configurations for Cray XT/XE systems
 2 | #   using PGI/PathScale/GNU/Cray compilers
 3 | 
 4 | # 2DECOMP&FFT options
 5 | #====================
 6 | 
 7 | # Choose pre-processing options
 8 | #   -DDOUBLE_PREC - use double-precision (default single)
 9 | #   -DEVEN        - for evenly distributed data, use ALLTOALL
10 | #   -DSHM	  - enable shared-memory implementation
11 | #   -DOVERWRITE   - allow FFT input to be overwritten (save memory)
12 | OPTIONS=-DDOUBLE_PREC -DOVERWRITE
13 | 
14 | # Choose one FFT engine, available options are:
15 | #   acml       - AMD Core Math Library
16 | #   fftw3      - FFTW version 3.x
17 | #   fftw3_f03  - FFTW 3.3-beta1 or later (with Fortran 2003 interface)
18 | #   generic    - A general FFT algorithm (no 3rd-party library needed)
19 | FFT=fftw3
20 | 
21 | # Platform-dependent information
22 | #===============================
23 | 
24 | # Choose compiler suite - valid options: PGI, PathScale, GNU, Cray, Intel
25 | COMPILER=PGI
26 | 
27 | # Inlcude path not set, relying on Cray's modules
28 | INC=
29 | 
30 | #----------------------- Fortran Compiler ----------------------------
31 | F90=ftn
32 | 
33 | # enable preprocessing
34 | ifeq ($(COMPILER),PGI)
35 |    CPPFLAGS=-Mpreprocess
36 | else ifeq ($(COMPILER),PathScale)
37 |    CPPFLAGS=-cpp
38 | else ifeq ($(COMPILER),GNU)
39 |    CPPFLAGS=-cpp
40 | else ifeq ($(COMPILER),Cray)
41 |    CPPFLAGS=-e Fm
42 | else ifeq ($(COMPILER),Intel)
43 |    CPPFLAGS=-fpp
44 | endif
45 | 
46 | # enable Cray pointer support if needed
47 | ifeq ($(COMPILER),GNU)
48 |    CRAYPTR=-fcray-pointer
49 | else
50 |    CRAYPTR=
51 | endif
52 | 
53 | # optimisation or debugging flags
54 | ifeq ($(COMPILER),PGI)
55 |    OPTIM=-O3
56 | else ifeq ($(COMPILER),PathScale)
57 |    OPTIM=-O3
58 | else ifeq ($(COMPILER),GNU)
59 |    #OPTIM=-g -fbounds-check
60 |    OPTIM=-O3
61 | else ifeq ($(COMPILER),Cray)
62 |    OPTIM=-O3
63 | else ifeq ($(COMPILER),Intel)
64 |    OPTIM=-O3     # no -fast as IPA cause problem
65 | endif
66 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC)
67 | LDFLAGS=$(OPTIM)
68 | 
69 | #--------------------------C Compiler---------------------------------
70 | CC=cc
71 | CFLAGS=-O3
72 | 
73 | #-----------------------External Library------------------------------
74 | 
75 | # Do not need any as this is all handled by the 'module' system
76 | 
77 | LIBFFT=
78 | 
79 | 


--------------------------------------------------------------------------------
/src/Makefile.inc.Fujitsu_SPARC64_VIIIfx:
--------------------------------------------------------------------------------
 1 | # Configurations for SPARC64 servers using Fujitsu compiler
 2 | #   configutation tested on a SPARC64 VIIIfx machine
 3 | 
 4 | # 2DECOMP&FFT options
 5 | #====================
 6 | 
 7 | # Choose pre-processing options
 8 | #   -DDOUBLE_PREC - use double-precision (default single)
 9 | #   -DEVEN        - for evenly distributed data, use ALLTOALL
10 | #   -DOVERWRITE   - allow FFT input to be overwritten (save memory)
11 | OPTIONS=-DDOUBLE_PREC -DOVERWRITE
12 | 
13 | # Choose one FFT engine, available options are:
14 | #   ffte       - FFTE
15 | #   fftw3      - FFTW version 3.x
16 | #   fftw3_f03  - FFTW 3.3-beta1 or later (with Fortran 2003 interface)
17 | #   generic    - A general FFT algorithm (no 3rd-party library needed)
18 | FFT=generic
19 | 
20 | # Platform-dependent information - compiler, external library etc
21 | #================================================================
22 | 
23 | # Inlcude path if necessary
24 | # Need to compile a copy of FFTW version 3.x using the same Fujitsu compiler
25 | FFTW_PATH=
26 | ifeq ($(FFT),fftw3)
27 |   INC=-I$(FFTW_PATH)/include
28 | else ifeq ($(FFT),fftw3_f03)
29 |   INC=-I$(FFTW_PATH)/include
30 | else
31 |   INC=
32 | endif
33 | 
34 | #----------------------- Fortran Compiler ----------------------------
35 | F90=mpifrtpx
36 | 
37 | # enable preprocessing
38 | CPPFLAGS=-Cpp
39 | # enable Cray pointer support if needed
40 | CRAYPTR=
41 | # optimisation or debugging flags
42 | #OPTIM=-g
43 | OPTIM=-Kfast
44 | 
45 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC)
46 | LDFLAGS=$(OPTIM)
47 | 
48 | #--------------------------C Compiler---------------------------------
49 | CC=mpifccpx
50 | CFLAGS=-Kfast
51 | 
52 | #-----------------------External Library------------------------------
53 | 
54 | # For FFTW
55 | LIB_FFTW3=-L$(FFTW_PATH)/lib -lfftw3f -lfftw3
56 | 
57 | # For FFTE
58 | FFTE_PATH=path/to/ffte-4.1  # compile FFTE using the same compiler
59 | LIB_FFTE=$(FFTE_PATH)/libffte.a
60 | 
61 | ifeq ($(FFT),generic)
62 |      LIBFFT=
63 | else ifeq ($(FFT),ffte)
64 |      LIBFFT=$(LIB_FFTE)
65 | else ifeq ($(FFT),fftw3)
66 |      LIBFFT=$(LIB_FFTW3)
67 | else ifeq ($(FFT),fftw3_f03)
68 |      LIBFFT=$(LIB_FFTW3)
69 | endif	
70 | 


--------------------------------------------------------------------------------
/src/acml_plan.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 | ! This file contains subroutines that generate ACML plans
 13 | ! for several types of 1D multiple FFTs.
 14 | 
 15 | ! Note most ACML plans can be shared by forward/backward transforms
 16 | 
 17 |   ! Return an ACML plan for multiple 1D c2c FFTs in X direction
 18 |   subroutine c2c_1m_x_plan(comm, decomp)
 19 | 
 20 |     implicit none
 21 |     complex(mytype), allocatable, dimension(:), intent(OUT) :: comm
 22 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
 23 | 
 24 |     complex(mytype), allocatable, dimension(:,:,:) :: dummy
 25 |     integer :: info
 26 | 
 27 | #ifdef DOUBLE_PREC
 28 |     allocate(comm(3*decomp%xsz(1)+100))
 29 | #else
 30 |     allocate(comm(5*decomp%xsz(1)+100))
 31 | #endif
 32 | 
 33 |     allocate(dummy(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)))
 34 | 
 35 | #ifdef DOUBLE_PREC
 36 |     call zfft1mx(plan_type,scale,.true.,decomp%xsz(2)*decomp%xsz(3), &
 37 |          decomp%xsz(1), dummy,1,decomp%xsz(1),dummy,1,decomp%xsz(1), &
 38 |          comm,info)
 39 | #else
 40 |     call cfft1mx(plan_type,scale,.true.,decomp%xsz(2)*decomp%xsz(3), &
 41 |          decomp%xsz(1), dummy,1,decomp%xsz(1),dummy,1,decomp%xsz(1), &
 42 |          comm,info)
 43 | #endif
 44 | 
 45 |     deallocate(dummy)
 46 | 
 47 |     return
 48 |   end subroutine c2c_1m_x_plan
 49 | 
 50 |   ! Return an ACML plan for multiple 1D c2c FFTs in Y direction
 51 |   subroutine c2c_1m_y_plan(comm, decomp)
 52 | 
 53 |     implicit none
 54 |     complex(mytype), allocatable, dimension(:), intent(OUT) :: comm
 55 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
 56 | 
 57 |     complex(mytype), allocatable, dimension(:,:,:) :: dummy
 58 |     integer :: info
 59 | 
 60 | #ifdef DOUBLE_PREC
 61 |     allocate(comm(3*decomp%ysz(2)+100))
 62 | #else
 63 |     allocate(comm(5*decomp%ysz(2)+100))
 64 | #endif
 65 | 
 66 |     allocate(dummy(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
 67 | 
 68 | #ifdef DOUBLE_PREC
 69 |     call zfft1mx(plan_type,scale,.true.,decomp%ysz(1),decomp%ysz(2), &
 70 |             dummy,decomp%ysz(1),1,dummy,decomp%ysz(1),1,comm,info)
 71 | #else
 72 |     call cfft1mx(plan_type,scale,.true.,decomp%ysz(1),decomp%ysz(2), &
 73 |             dummy,decomp%ysz(1),1,dummy,decomp%ysz(1),1,comm,info)
 74 | #endif
 75 | 
 76 |     deallocate(dummy)
 77 | 
 78 |     return
 79 |   end subroutine c2c_1m_y_plan
 80 | 
 81 |   ! Return an ACML plan for multiple 1D c2c FFTs in Z direction  
 82 |   subroutine c2c_1m_z_plan(comm, decomp)
 83 | 
 84 |     implicit none
 85 |     complex(mytype), allocatable, dimension(:), intent(OUT) :: comm
 86 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
 87 | 
 88 |     complex(mytype), allocatable, dimension(:,:,:) :: dummy
 89 |     integer :: info
 90 | 
 91 | #ifdef DOUBLE_PREC
 92 |     allocate(comm(3*decomp%zsz(3)+100))
 93 | #else
 94 |     allocate(comm(5*decomp%zsz(3)+100))
 95 | #endif
 96 | 
 97 |     allocate(dummy(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)))
 98 | 
 99 | #ifdef DOUBLE_PREC
100 |     call zfft1mx(plan_type,scale,.true.,decomp%zsz(1)*decomp%zsz(2), &
101 |          decomp%zsz(3),dummy,decomp%zsz(1)*decomp%zsz(2),1,dummy, &
102 |          decomp%zsz(1)*decomp%zsz(2),1,comm,info)
103 | #else
104 |     call cfft1mx(plan_type,scale,.true.,decomp%zsz(1)*decomp%zsz(2), &
105 |          decomp%zsz(3),dummy,decomp%zsz(1)*decomp%zsz(2),1,dummy, &
106 |          decomp%zsz(1)*decomp%zsz(2),1,comm,info)
107 | #endif
108 | 
109 |     deallocate(dummy)
110 | 
111 |     return
112 |   end subroutine c2c_1m_z_plan
113 | 
114 | 
115 |   ! Return an ACML plan for multiple 1D r2c FFTs in X direction
116 |   subroutine r2c_1m_x_plan(comm, decomp)
117 | 
118 |     implicit none
119 |     real(mytype), allocatable, dimension(:), intent(OUT) :: comm
120 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
121 | 
122 |     real(mytype), allocatable, dimension(:) :: dummy
123 |     integer :: info
124 | 
125 |     allocate(comm(3*decomp%xsz(1)+100))
126 | 
127 |     allocate(dummy(decomp%xsz(1)))
128 | 
129 | #ifdef DOUBLE_PREC
130 |     call dzfft(plan_type,decomp%xsz(1),dummy,comm,info)
131 | #else
132 |     call scfft(plan_type,decomp%xsz(1),dummy,comm,info)
133 | #endif
134 | 
135 |     deallocate(dummy)
136 | 
137 |     return
138 |   end subroutine r2c_1m_x_plan
139 | 
140 |   ! Return an ACML plan for multiple 1D c2r FFTs in X direction
141 |   subroutine c2r_1m_x_plan(comm, decomp)
142 | 
143 |     implicit none
144 |     real(mytype), allocatable, dimension(:), intent(OUT) :: comm
145 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
146 | 
147 |     real(mytype), allocatable, dimension(:) :: dummy
148 |     integer :: info
149 | 
150 |     allocate(comm(3*decomp%xsz(1)+100))
151 | 
152 |     allocate(dummy(decomp%xsz(1)))
153 | 
154 | #ifdef DOUBLE_PREC
155 |     call zdfft(plan_type,decomp%xsz(1),dummy,comm,info)
156 | #else
157 |     call csfft(plan_type,decomp%xsz(1),dummy,comm,info)
158 | #endif
159 | 
160 |     deallocate(dummy)
161 | 
162 |     return
163 |   end subroutine c2r_1m_x_plan
164 | 
165 |   ! Return an ACML plan for multiple 1D r2c FFTs in Z direction
166 |   subroutine r2c_1m_z_plan(comm, decomp)
167 | 
168 |     implicit none
169 |     real(mytype), allocatable, dimension(:), intent(OUT) :: comm
170 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
171 | 
172 |     real(mytype), allocatable, dimension(:) :: dummy
173 |     integer :: info
174 | 
175 |     allocate(comm(3*decomp%zsz(3)+100))
176 | 
177 |     allocate(dummy(decomp%zsz(3)))
178 | 
179 | #ifdef DOUBLE_PREC
180 |     call dzfft(plan_type,decomp%zsz(3),dummy,comm,info)
181 | #else
182 |     call scfft(plan_type,decomp%zsz(3),dummy,comm,info)
183 | #endif
184 | 
185 |     deallocate(dummy)
186 | 
187 |     return
188 |   end subroutine r2c_1m_z_plan
189 | 
190 |   ! Return an ACML plan for multiple 1D c2r FFTs in Z direction
191 |   subroutine c2r_1m_z_plan(comm, decomp)
192 | 
193 |     implicit none
194 |     real(mytype), allocatable, dimension(:), intent(OUT) :: comm
195 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
196 | 
197 |     real(mytype), allocatable, dimension(:) :: dummy
198 |     integer :: info
199 | 
200 |     allocate(comm(3*decomp%zsz(3)+100))
201 | 
202 |     allocate(dummy(decomp%zsz(3)))
203 | 
204 | #ifdef DOUBLE_PREC
205 |     call zdfft(plan_type,decomp%zsz(3),dummy,comm,info)
206 | #else
207 |     call csfft(plan_type,decomp%zsz(3),dummy,comm,info)
208 | #endif
209 | 
210 |     deallocate(dummy)
211 | 
212 |     return
213 |   end subroutine c2r_1m_z_plan
214 | 


--------------------------------------------------------------------------------
/src/alloc.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 13 |   ! Utility routine to help allocate 3D arrays
 14 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 15 | 
 16 |   ! X-pencil real arrays
 17 |   subroutine alloc_x_real(var, opt_decomp, opt_global)
 18 | 
 19 |     implicit none
 20 | 
 21 |     real(mytype), allocatable, dimension(:,:,:) :: var
 22 |     TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 23 |     logical, intent(IN), optional :: opt_global
 24 | 
 25 |     TYPE(DECOMP_INFO) :: decomp
 26 |     logical :: global
 27 |     integer :: alloc_stat, errorcode
 28 | 
 29 |     if (present(opt_decomp)) then
 30 |        decomp = opt_decomp
 31 |     else
 32 |        decomp = decomp_main
 33 |     end if
 34 | 
 35 |     if (present(opt_global)) then
 36 |        global = opt_global
 37 |     else
 38 |        global = .false.
 39 |     end if
 40 | 
 41 |     if (global) then
 42 |        allocate(var(decomp%xst(1):decomp%xen(1), &
 43 |             decomp%xst(2):decomp%xen(2), decomp%xst(3):decomp%xen(3)), &
 44 |             stat=alloc_stat)
 45 |     else
 46 |        allocate(var(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)), &
 47 |             stat=alloc_stat)
 48 |     end if
 49 |     
 50 |     if (alloc_stat /= 0) then
 51 |        errorcode = 8
 52 |        call decomp_2d_abort(errorcode, &
 53 |             'Memory allocation failed when creating new arrays')
 54 |     end if
 55 | 
 56 |     return
 57 |   end subroutine alloc_x_real
 58 | 
 59 | 
 60 |   ! X-pencil complex arrays
 61 |   subroutine alloc_x_complex(var, opt_decomp, opt_global)
 62 | 
 63 |     implicit none
 64 | 
 65 |     complex(mytype), allocatable, dimension(:,:,:) :: var
 66 |     TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 67 |     logical, intent(IN), optional :: opt_global
 68 | 
 69 |     TYPE(DECOMP_INFO) :: decomp
 70 |     logical :: global
 71 |     integer :: alloc_stat, errorcode
 72 | 
 73 |     if (present(opt_decomp)) then
 74 |        decomp = opt_decomp
 75 |     else
 76 |        decomp = decomp_main
 77 |     end if
 78 | 
 79 |     if (present(opt_global)) then
 80 |        global = opt_global
 81 |     else
 82 |        global = .false.
 83 |     end if
 84 | 
 85 |     if (global) then
 86 |        allocate(var(decomp%xst(1):decomp%xen(1), &
 87 |             decomp%xst(2):decomp%xen(2), decomp%xst(3):decomp%xen(3)), &
 88 |             stat=alloc_stat)
 89 |     else
 90 |        allocate(var(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)), &
 91 |             stat=alloc_stat)
 92 |     end if
 93 |     
 94 |     if (alloc_stat /= 0) then
 95 |        errorcode = 8
 96 |        call decomp_2d_abort(errorcode, &
 97 |             'Memory allocation failed when creating new arrays')
 98 |     end if
 99 | 
100 |     return
101 |   end subroutine alloc_x_complex
102 | 
103 | 
104 |   ! Y-pencil real arrays
105 |   subroutine alloc_y_real(var, opt_decomp, opt_global)
106 | 
107 |     implicit none
108 | 
109 |     real(mytype), allocatable, dimension(:,:,:) :: var
110 |     TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
111 |     logical, intent(IN), optional :: opt_global
112 | 
113 |     TYPE(DECOMP_INFO) :: decomp
114 |     logical :: global
115 |     integer :: alloc_stat, errorcode
116 | 
117 |     if (present(opt_decomp)) then
118 |        decomp = opt_decomp
119 |     else
120 |        decomp = decomp_main
121 |     end if
122 | 
123 |     if (present(opt_global)) then
124 |        global = opt_global
125 |     else
126 |        global = .false.
127 |     end if
128 | 
129 |     if (global) then
130 |        allocate(var(decomp%yst(1):decomp%yen(1), &
131 |             decomp%yst(2):decomp%yen(2), decomp%yst(3):decomp%yen(3)), &
132 |             stat=alloc_stat)
133 |     else
134 |        allocate(var(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)), &
135 |             stat=alloc_stat)
136 |     end if
137 |     
138 |     if (alloc_stat /= 0) then
139 |        errorcode = 8
140 |        call decomp_2d_abort(errorcode, &
141 |             'Memory allocation failed when creating new arrays')
142 |     end if
143 | 
144 |     return
145 |   end subroutine alloc_y_real
146 | 
147 | 
148 |   ! Y-pencil complex arrays
149 |   subroutine alloc_y_complex(var, opt_decomp, opt_global)
150 | 
151 |     implicit none
152 | 
153 |     complex(mytype), allocatable, dimension(:,:,:) :: var
154 |     TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
155 |     logical, intent(IN), optional :: opt_global
156 | 
157 |     TYPE(DECOMP_INFO) :: decomp
158 |     logical :: global
159 |     integer :: alloc_stat, errorcode
160 | 
161 |     if (present(opt_decomp)) then
162 |        decomp = opt_decomp
163 |     else
164 |        decomp = decomp_main
165 |     end if
166 | 
167 |     if (present(opt_global)) then
168 |        global = opt_global
169 |     else
170 |        global = .false.
171 |     end if
172 | 
173 |     if (global) then
174 |        allocate(var(decomp%yst(1):decomp%yen(1), &
175 |             decomp%yst(2):decomp%yen(2), decomp%yst(3):decomp%yen(3)), &
176 |             stat=alloc_stat)
177 |     else
178 |        allocate(var(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)), &
179 |             stat=alloc_stat)
180 |     end if
181 |     
182 |     if (alloc_stat /= 0) then
183 |        errorcode = 8
184 |        call decomp_2d_abort(errorcode, &
185 |             'Memory allocation failed when creating new arrays')
186 |     end if
187 | 
188 |     return
189 |   end subroutine alloc_y_complex
190 | 
191 | 
192 |   ! Z-pencil real arrays
193 |   subroutine alloc_z_real(var, opt_decomp, opt_global)
194 | 
195 |     implicit none
196 | 
197 |     real(mytype), allocatable, dimension(:,:,:) :: var
198 |     TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
199 |     logical, intent(IN), optional :: opt_global
200 | 
201 |     TYPE(DECOMP_INFO) :: decomp
202 |     logical :: global
203 |     integer :: alloc_stat, errorcode
204 | 
205 |     if (present(opt_decomp)) then
206 |        decomp = opt_decomp
207 |     else
208 |        decomp = decomp_main
209 |     end if
210 | 
211 |     if (present(opt_global)) then
212 |        global = opt_global
213 |     else
214 |        global = .false.
215 |     end if
216 | 
217 |     if (global) then
218 |        allocate(var(decomp%zst(1):decomp%zen(1), &
219 |             decomp%zst(2):decomp%zen(2), decomp%zst(3):decomp%zen(3)), &
220 |             stat=alloc_stat)
221 |     else
222 |        allocate(var(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)), &
223 |             stat=alloc_stat)
224 |     end if
225 |     
226 |     if (alloc_stat /= 0) then
227 |        errorcode = 8
228 |        call decomp_2d_abort(errorcode, &
229 |             'Memory allocation failed when creating new arrays')
230 |     end if
231 | 
232 |     return
233 |   end subroutine alloc_z_real
234 | 
235 | 
236 |   ! Z-pencil complex arrays
237 |   subroutine alloc_z_complex(var, opt_decomp, opt_global)
238 | 
239 |     implicit none
240 | 
241 |     complex(mytype), allocatable, dimension(:,:,:) :: var
242 |     TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
243 |     logical, intent(IN), optional :: opt_global
244 | 
245 |     TYPE(DECOMP_INFO) :: decomp
246 |     logical :: global
247 |     integer :: alloc_stat, errorcode
248 | 
249 |     if (present(opt_decomp)) then
250 |        decomp = opt_decomp
251 |     else
252 |        decomp = decomp_main
253 |     end if
254 | 
255 |     if (present(opt_global)) then
256 |        global = opt_global
257 |     else
258 |        global = .false.
259 |     end if
260 | 
261 |     if (global) then
262 |        allocate(var(decomp%zst(1):decomp%zen(1), &
263 |             decomp%zst(2):decomp%zen(2), decomp%zst(3):decomp%zen(3)), &
264 |             stat=alloc_stat)
265 |     else
266 |        allocate(var(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)), &
267 |             stat=alloc_stat)
268 |     end if
269 |     
270 |     if (alloc_stat /= 0) then
271 |        errorcode = 8
272 |        call decomp_2d_abort(errorcode, &
273 |             'Memory allocation failed when creating new arrays')
274 |     end if
275 | 
276 |     return
277 |   end subroutine alloc_z_complex
278 | 


--------------------------------------------------------------------------------
/src/alloc_shm.c:
--------------------------------------------------------------------------------
  1 | //=======================================================================
  2 | // This is part of the 2DECOMP&FFT library
  3 | // 
  4 | // 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | // decomposition. It also implements a highly scalable distributed
  6 | // three-dimensional Fast Fourier Transform (FFT).
  7 | //
  8 | // Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | //
 10 | //=======================================================================
 11 | 
 12 | // This is the shared-memory code using System V IPC API
 13 | 
 14 | /*
 15 |  This shared-memory code is kindly provided by David Tanqueray of Cray Inc.
 16 |  who also helped the author adapt it to use in 2DECOMP&FFT. His assistance
 17 |  is greatly appreciated.
 18 |  */
 19 | 
 20 | #include <stdio.h>
 21 | #include <stdlib.h>
 22 | #include <math.h>
 23 | #include <sys/types.h>
 24 | #include <sys/ipc.h>
 25 | #include <sys/shm.h>
 26 | #include <mpi.h>
 27 | 
 28 | #ifndef DBG
 29 | #define DBG
 30 | #endif
 31 | static int shm_debug=0;
 32 | 
 33 | float log2f(float); 
 34 | 
 35 | void set_shm_debug_();
 36 | void get_smp_map2_(MPI_Fint *comm, MPI_Fint *nnodes, MPI_Fint *my_node,
 37 |                   MPI_Fint *ncores, MPI_Fint *my_core, MPI_Fint *maxcor);
 38 | void alloc_shm_(MPI_Aint *ptr, MPI_Fint *nelem, MPI_Fint *type,
 39 |                 MPI_Fint *comm, MPI_Fint *ret);
 40 | void dealloc_shm_(MPI_Aint *ptr, MPI_Fint *comm);
 41 | 
 42 | void set_shm_debug_()
 43 | {
 44 |     shm_debug = 1;
 45 | }
 46 | 
 47 | void get_smp_map2_(MPI_Fint *comm, MPI_Fint *nnodes, MPI_Fint *my_node,
 48 |                   MPI_Fint *ncores, MPI_Fint *my_core, MPI_Fint *maxcor)
 49 | {
 50 |     MPI_Comm world;
 51 |     int err, pe, mype, npes, nnds, ncrs, maxc;
 52 |     int nlen, mynid, *nidlst, *nodlst;
 53 |     int i, n;
 54 |     char nodnam[MPI_MAX_PROCESSOR_NAME];
 55 |     char string[20];
 56 |     FILE *fp;
 57 | 
 58 |     MPI_Comm_rank(MPI_COMM_WORLD,&pe);
 59 | 
 60 |     world = MPI_Comm_f2c(*comm);
 61 |     MPI_Comm_rank(world, &mype);
 62 |     MPI_Comm_size(world, &npes);
 63 |     MPI_Get_processor_name(nodnam, &nlen);
 64 | #ifdef USE_NAME
 65 |     mynid = atoi(nodnam+3);
 66 | #else
 67 |     sprintf(string," pe %d /proc/cray_xt/nid",mype);
 68 |     if ((fp = fopen("/proc/cray_xt/nid", "r")) ==  NULL) {
 69 |           perror(string);
 70 |           exit(1);
 71 |     }
 72 |     fscanf(fp,"%i", &mynid);
 73 |     fclose(fp);
 74 | #endif
 75 | #ifdef DBG
 76 |     if (shm_debug) {
 77 |       fprintf(stderr," pe %d mype %d of %d, nodnam = %s (len = %d), node = %d\n",
 78 |              pe, mype, npes, nodnam, nlen, mynid);
 79 |       MPI_Barrier(world);
 80 |     }
 81 | #endif
 82 | 
 83 |     /* get list of nodeid for each pe */
 84 |     nidlst = malloc(npes*sizeof(int));
 85 |     MPI_Allgather(&mynid, 1, MPI_INT, nidlst, 1, MPI_INT, world);
 86 | 
 87 |     nodlst = malloc(npes*sizeof(int));
 88 |     nnds = ncrs = 0;
 89 |     for (i=0; i<npes; i++) {
 90 |       /* get my core id and no. of cores on my node */
 91 |       if (i == mype) *my_core = (MPI_Fint)(ncrs);
 92 |       if (nidlst[i] == mynid) ncrs++;
 93 | 
 94 |       /* get my node id and number of unique nodes */
 95 |       for (n=0; n<nnds; n++) {
 96 |         if (nodlst[n] == nidlst[i]) break;            /* node already in list */
 97 |       }
 98 |       if (nidlst[i] == mynid) *my_node = (MPI_Fint)(n); /* save my node index */
 99 |       if (n >= nnds) nodlst[nnds++] = nidlst[i];      /* add new node to list */
100 |     }
101 |     /* get max core counts over all nodes */
102 |     MPI_Allreduce(&ncrs, &maxc, 1, MPI_INT, MPI_MAX, world);
103 |     *nnodes = (MPI_Fint)(nnds);
104 |     *ncores = (MPI_Fint)(ncrs);
105 |     *maxcor = (MPI_Fint)(maxc);
106 | 
107 | #ifdef DBG
108 |     if (shm_debug) {
109 |       fprintf(stderr," pe %d nnodes=%d ncores=%d maxcor=%d\n",pe,nnds,ncrs,maxc);
110 |       for (n=0; n<nnds; n++) {
111 |         if (nodlst[n]==mynid && *my_core==0) {
112 |           fprintf(stderr," pe %d node %d (%s) of %d nodes, with %d cores\n",
113 |                  pe,n,nodnam,nnds,ncrs);
114 |         }
115 |         MPI_Barrier(world);
116 |       }
117 |       fprintf(stderr,"max core count = %d\n",maxc);
118 |     }
119 | #endif
120 |     free(nidlst);
121 |     free(nodlst);
122 | }
123 | 
124 | 
125 | /* Implementation using System V shared memory [shmget & shmat] */
126 | /* Because this uses 32-bit sizes, we allow for multiple segments */
127 | 
128 | #define BLKSIZE (1L<<30)
129 | 
130 | struct blktyp {
131 |     struct blktyp *next;
132 |     size_t size;
133 |     void *addr;
134 |     int shmid;
135 |     key_t key;
136 | };
137 | typedef struct blktyp Blktyp;
138 | 
139 | struct segtyp {
140 |     struct segtyp *next;
141 |     void *base;
142 |     Blktyp *blks;
143 | };
144 | typedef struct segtyp Segtyp;
145 | 
146 | Segtyp *seglst=NULL;
147 | 
148 | void alloc_shm_(MPI_Aint *ptr, MPI_Fint *nelem, MPI_Fint *type,
149 |                 MPI_Fint *comm, MPI_Fint *ret)
150 | {
151 |     MPI_Comm world;
152 |     MPI_Fint err;
153 |     int pe, np, mype, typsiz, msk;
154 |     size_t size, blksize;
155 |     char *shm, *shmaddr;
156 |     char string[20];
157 |     int shmid;
158 |     key_t key;
159 |     Segtyp *seg;
160 |     Blktyp *blk;
161 | #ifdef DBG
162 |     int npes,*plst,i;
163 |     char plist[1024],*s;
164 | #endif
165 | 
166 |     MPI_Comm_rank(MPI_COMM_WORLD, &pe);
167 |     MPI_Comm_size(MPI_COMM_WORLD, &np);
168 |     msk = ( 1 << ((int)ceilf(log2f((float)np))) ) - 1;
169 | 
170 |     world = MPI_Comm_f2c(*comm);
171 |     MPI_Comm_rank(world, &mype);
172 | #ifdef DBG
173 |     if (shm_debug) {
174 |       MPI_Comm_size(world, &npes);
175 |       plst = (int*)malloc(npes*sizeof(int));
176 |       MPI_Allgather(&pe,1,MPI_INT,plst,1,MPI_INT,world);
177 |       for (i=0,s=plist; i<npes; i++) s=s+sprintf(s,"%d,",plst[i]);
178 |       s[0]='\0'; if (s>plist) s[-1]='\0';
179 |       fprintf(stderr," pe %d al_shm: comm sz/rk=%d/%d pes=%s\n",pe,npes,mype,plist);
180 |     }
181 | #endif
182 |     MPI_Type_size(MPI_Type_f2c(*type), &typsiz);
183 |     size = (size_t)*nelem * (size_t)typsiz;
184 | 
185 |     err = 0;
186 |     /* setup structure to keep track of this segment and its blocks */
187 |     seg = (Segtyp *)malloc(sizeof(Segtyp));
188 |     seg->next = seglst;
189 |     seg->base = NULL;
190 |     seg->blks = NULL;
191 |     seglst = seg;
192 | 
193 |     shmaddr = NULL;
194 |     while (size) {
195 |       blksize = size<BLKSIZE ? size : BLKSIZE;
196 | 
197 |       if (!mype) key = rand()&~msk | pe&msk;
198 |       MPI_Bcast(&key, 1, MPI_INT, 0, world);
199 | 
200 |       sprintf(string," pe %d shmget",pe);
201 |       if ((shmid = shmget(key, blksize, IPC_CREAT | 0666)) < 0) {
202 |         perror(string);
203 |         if (*ret) err++; else exit(1);
204 |       }
205 | #ifdef DBG
206 |       if (shm_debug) fprintf(stderr," pe %d shmget: key = %d, shmid = %d\n",
207 |                             pe,key,shmid);
208 | #endif
209 | 
210 |       sprintf(string," pe %d shmat",pe);
211 |       if ((shm = shmat(shmid, shmaddr, 0)) == (char *) -1) {
212 |         perror(string);
213 |         err++;
214 |       }
215 | 
216 |       /* wait for all cores to attach block before ... */
217 |       MPI_Barrier(world);
218 | 
219 |       /* ... marking it for deletion */
220 |       sprintf(string," pe %d shmctl",pe);
221 |       if (shmctl(shmid, IPC_RMID, NULL) < 0) {
222 |         perror(string);
223 |         if (*ret) err++; else exit(1);
224 |       }
225 | #ifdef DBG
226 |       if (shm_debug) fprintf(stderr," pe %d shmat: shm = %lx, size = %d, limit = %lx\n",pe,shm,blksize,shm+blksize);
227 | #endif
228 |       if (seg->base == NULL) seg->base = shm;
229 | 
230 |       /* setup structure to record block details */
231 |       blk = malloc(sizeof(Blktyp));
232 |       blk->size = blksize;
233 |       blk->addr = shm;
234 |       blk->shmid = shmid;
235 |       blk->key   = key;
236 |       /* and add it to segment's list */
237 |       blk->next = seg->blks;
238 |       seg->blks = blk;
239 | 
240 |       shmaddr = shm + blksize;
241 |       size -= blksize;
242 |     }
243 | 
244 |     *ptr = (MPI_Aint)seg->base;
245 |     if (*ret) *ret = err;
246 |     else if (err) exit(1);
247 | }
248 | 
249 | void dealloc_shm_(MPI_Aint *ptr, MPI_Fint *comm)
250 | {
251 |     MPI_Comm world;
252 |     int pe, mype, err;
253 |     char string[20];
254 |     Segtyp *seg;
255 |     Segtyp **prev;
256 |     Blktyp *blk=NULL, *nblk;
257 |     void *shm;
258 | 
259 |     MPI_Comm_rank(MPI_COMM_WORLD, &pe);
260 | #ifdef DBG
261 |     if (shm_debug) fprintf(stderr," pe %d dealloc_shm: ptr=%lx\n",pe,*ptr);
262 | #endif
263 | 
264 |     world = MPI_Comm_f2c(*comm);
265 |     MPI_Comm_rank(world, &mype);
266 |     err = 0;
267 |     /* Find segment with specified start address and remove from list */
268 |     seg = seglst;
269 |     prev = &seglst;
270 |     while (seg) {
271 |       if (seg->base == (void*)*ptr) {
272 |         blk=seg->blks;
273 |         *prev = seg->next;
274 |         free(seg);
275 |         break;
276 |       }
277 |       prev = &seg->next;
278 |       seg = seg->next;
279 |     }
280 |     if (blk == NULL) {
281 |       fprintf(stderr," pe %d dealloc_shm: segment at address %lx not found\n",
282 |               pe, *ptr);
283 |     }
284 |     /* detach all blocks in this segment */
285 |     while (blk) {
286 |       shm = blk->addr;
287 |       sprintf(string," pe %d shmdt",pe);
288 |       if (shmdt((char*)shm) < 0) {
289 |         perror(string);
290 |         err++;
291 |       }
292 |       nblk = blk->next;
293 |       free(blk);
294 |       blk = nblk;
295 |     }
296 |     if (err) exit(1);
297 | }
298 | 
299 | 


--------------------------------------------------------------------------------
/src/cuda_fft_1m.cu:
--------------------------------------------------------------------------------
 1 | //=======================================================================
 2 | // This is part of the 2DECOMP&FFT library
 3 | // 
 4 | // 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
 5 | // decomposition. It also implements a highly scalable distributed
 6 | // three-dimensional Fast Fourier Transform (FFT).
 7 | //
 8 | // Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
 9 | //
10 | //=======================================================================
11 | 
12 | // This contains CUDA code that compute multiple 1D FFTs on NVidia GPU
13 | 
14 | #ifdef DOUBLE_PREC
15 | #define CUFFT_REAL_TYPE cufftDoubleReal
16 | #define CUFFT_COMPLEX_TYPE cufftDoubleComplex
17 | #define CUFFT_PLAN_TYPE_C2C CUFFT_Z2Z
18 | #define CUFFT_PLAN_TYPE_R2C CUFFT_D2Z
19 | #define CUFFT_PLAN_TYPE_C2R CUFFT_Z2D
20 | #define CUFFT_EXEC_TYPE_C2C cufftExecZ2Z
21 | #define CUFFT_EXEC_TYPE_R2C cufftExecD2Z
22 | #define CUFFT_EXEC_TYPE_C2R cufftExecZ2D
23 | #else
24 | #define CUFFT_REAL_TYPE cufftReal
25 | #define CUFFT_COMPLEX_TYPE cufftComplex
26 | #define CUFFT_PLAN_TYPE_C2C CUFFT_C2C
27 | #define CUFFT_PLAN_TYPE_R2C CUFFT_R2C
28 | #define CUFFT_PLAN_TYPE_C2R CUFFT_C2R
29 | #define CUFFT_EXEC_TYPE_C2C cufftExecC2C
30 | #define CUFFT_EXEC_TYPE_R2C cufftExecR2C
31 | #define CUFFT_EXEC_TYPE_C2R cufftExecC2R
32 | #endif
33 | 
34 | #include <stdio.h>
35 | #include <stdlib.h>
36 | #include "cufft.h"
37 | #include "cuda.h"
38 | 
39 | extern "C" void fft_1m_r2c_(int *nx, int *m, CUFFT_REAL_TYPE *h_a, CUFFT_COMPLEX_TYPE *h_b)
40 | {
41 |   unsigned long size1 = sizeof(CUFFT_REAL_TYPE) * (*nx) * (*m);
42 |   unsigned long size2 = sizeof(CUFFT_COMPLEX_TYPE) * (*nx/2+1) * (*m);
43 |   CUFFT_REAL_TYPE *d_ic = NULL;
44 |   CUFFT_COMPLEX_TYPE *d_oc = NULL; 
45 |   cufftHandle plan;
46 |   cudaMalloc((void **)&d_ic, size1);
47 |   cudaMalloc((void **)&d_oc, size2);
48 |   cudaMemcpy(d_ic, h_a, size1, cudaMemcpyHostToDevice);
49 |   int dims[1] = {*nx};
50 |   cufftPlanMany(&plan,1,dims,NULL,1,0,NULL,1,0,CUFFT_PLAN_TYPE_R2C,*m);
51 |   CUFFT_EXEC_TYPE_R2C(plan, d_ic, d_oc);
52 |   cudaMemcpy(h_b, d_oc, size2, cudaMemcpyDeviceToHost);
53 |   cudaFree(d_ic);
54 |   cudaFree(d_oc);
55 |   cufftDestroy(plan);
56 | }
57 | 
58 | 
59 | extern "C" void fft_1m_c2r_(int *nx, int *m, CUFFT_COMPLEX_TYPE *h_a, CUFFT_REAL_TYPE *h_b)
60 | {
61 |   unsigned long size1 = sizeof(CUFFT_COMPLEX_TYPE) * (*nx/2+1)*(*m);
62 |   unsigned long size2 = sizeof(CUFFT_REAL_TYPE) * (*nx)*(*m);
63 |   CUFFT_COMPLEX_TYPE *d_ic = NULL;
64 |   CUFFT_REAL_TYPE *d_oc = NULL; 
65 |   cufftHandle plan;
66 |   cudaMalloc((void **)&d_ic, size1);
67 |   cudaMalloc((void **)&d_oc, size2);
68 |   cudaMemcpy(d_ic, h_a, size1, cudaMemcpyHostToDevice);
69 |   int dims[1] = {*nx};
70 |   cufftPlanMany(&plan,1,dims,NULL,1,0,NULL,1,0,CUFFT_PLAN_TYPE_C2R,*m);
71 |   CUFFT_EXEC_TYPE_C2R(plan, d_ic, d_oc);
72 |   cudaMemcpy(h_b, d_oc, size2, cudaMemcpyDeviceToHost);
73 |   cudaFree(d_ic);
74 |   cudaFree(d_oc);
75 |   cufftDestroy(plan);
76 | }
77 | 
78 | 
79 | extern "C" void fft_1m_c2c_(int *nx, int *m, CUFFT_COMPLEX_TYPE *h_a, CUFFT_COMPLEX_TYPE *h_b, int *sign)
80 | {
81 |   unsigned long size1 = sizeof(CUFFT_COMPLEX_TYPE) * (*nx) * (*m);
82 |   CUFFT_COMPLEX_TYPE *d_ic = NULL;
83 |   CUFFT_COMPLEX_TYPE *d_oc = NULL; 
84 |   cufftHandle plan;
85 |   cudaMalloc((void **)&d_ic, size1);
86 |   cudaMalloc((void **)&d_oc, size1);
87 |   cudaMemcpy(d_ic, h_a, size1, cudaMemcpyHostToDevice);
88 |   int dims[1] = {*nx};
89 |   cufftPlanMany(&plan,1,dims,NULL,1,0,NULL,1,0,CUFFT_PLAN_TYPE_C2C,*m);
90 |   CUFFT_EXEC_TYPE_C2C(plan, d_ic, d_oc, *sign);
91 |   cudaMemcpy(h_b, d_oc, size1, cudaMemcpyDeviceToHost);
92 |   cudaFree(d_ic);
93 |   cudaFree(d_oc);
94 |   cufftDestroy(plan);
95 | }
96 | 


--------------------------------------------------------------------------------
/src/factor.f90:
--------------------------------------------------------------------------------
 1 | !=======================================================================
 2 | ! This is part of the 2DECOMP&FFT library
 3 | ! 
 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
 5 | ! decomposition. It also implements a highly scalable distributed
 6 | ! three-dimensional Fast Fourier Transform (FFT).
 7 | !
 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
 9 | !
10 | !=======================================================================
11 | 
12 | ! A few utility routines to find factors of integer numbers
13 | 
14 |   subroutine findfactor(num, factors, nfact)
15 |     
16 |     implicit none
17 | 
18 |     integer, intent(IN) :: num
19 |     integer, intent(OUT), dimension(*) :: factors
20 |     integer, intent(OUT) :: nfact
21 |     integer :: i, m
22 | 
23 |     ! find the factors <= sqrt(num)
24 |     m = int(sqrt(real(num)))
25 |     nfact = 1
26 |     do i=1,m
27 |        if (num/i*i == num) then
28 |           factors(nfact) = i
29 |           nfact = nfact + 1
30 |        end if
31 |     end do
32 |     nfact = nfact - 1
33 | 
34 |     ! derive those > sqrt(num)
35 |     if (factors(nfact)**2/=num) then
36 |        do i=nfact+1, 2*nfact
37 |           factors(i) = num / factors(2*nfact-i+1)
38 |        end do
39 |        nfact = nfact * 2
40 |     else
41 |        do i=nfact+1, 2*nfact-1
42 |           factors(i) = num / factors(2*nfact-i)
43 |        end do
44 |        nfact = nfact * 2 - 1
45 |     endif
46 |        
47 |     return
48 | 
49 |   end subroutine findfactor
50 | 
51 | 
52 |   subroutine primefactors(num, factors, nfact)
53 | 
54 |     implicit none
55 |   
56 |     integer, intent(IN) :: num
57 |     integer, intent(OUT), dimension(*) :: factors
58 |     integer, intent(INOUT) :: nfact
59 | 
60 |     integer :: i, n
61 |     
62 |     i = 2  
63 |     nfact = 1
64 |     n = num 
65 |     do
66 |        if (mod(n,i) == 0) then
67 |           factors(nfact) = i
68 |           nfact = nfact + 1
69 |           n = n / i
70 |        else
71 |           i = i + 1
72 |        end if
73 |        if (n == 1) then
74 |           nfact = nfact - 1
75 |           exit
76 |        end if
77 |     end do
78 |     
79 |     return
80 | 
81 |   end subroutine primefactors
82 |   
83 | 


--------------------------------------------------------------------------------
/src/fft_common.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 | ! This file contains common code shared by all FFT engines
 13 | 
 14 |   integer, parameter, public :: DECOMP_2D_FFT_FORWARD = -1
 15 |   integer, parameter, public :: DECOMP_2D_FFT_BACKWARD = 1
 16 |   
 17 |   ! Physical space data can be stored in either X-pencil or Z-pencil
 18 |   integer, parameter, public :: PHYSICAL_IN_X = 1
 19 |   integer, parameter, public :: PHYSICAL_IN_Z = 3 
 20 | 
 21 |   integer, save :: format                 ! input X-pencil or Z-pencil
 22 |   
 23 |   ! The libary can only be initialised once
 24 |   logical, save :: initialised = .false. 
 25 | 
 26 |   ! Global size of the FFT
 27 |   integer, save :: nx_fft, ny_fft, nz_fft
 28 | 
 29 |   ! 2D processor grid
 30 |   integer, save, dimension(2) :: dims
 31 | 
 32 |   ! Decomposition objects
 33 |   TYPE(DECOMP_INFO), save :: ph  ! physical space
 34 |   TYPE(DECOMP_INFO), save :: sp  ! spectral space
 35 | 
 36 |   ! Workspace to store the intermediate Y-pencil data
 37 |   ! *** TODO: investigate how to use only one workspace array
 38 |   complex(mytype), allocatable, dimension(:,:,:) :: wk2_c2c, wk2_r2c
 39 |   complex(mytype), allocatable, dimension(:,:,:) :: wk13
 40 | 
 41 |   public :: decomp_2d_fft_init, decomp_2d_fft_3d, &
 42 |        decomp_2d_fft_finalize, decomp_2d_fft_get_size
 43 |   
 44 |   ! Declare generic interfaces to handle different inputs
 45 |   
 46 |   interface decomp_2d_fft_init
 47 |      module procedure fft_init_noarg
 48 |      module procedure fft_init_arg
 49 |      module procedure fft_init_general
 50 |   end interface
 51 |   
 52 |   interface decomp_2d_fft_3d
 53 |      module procedure fft_3d_c2c
 54 |      module procedure fft_3d_r2c
 55 |      module procedure fft_3d_c2r
 56 |   end interface
 57 | 
 58 |   
 59 | contains
 60 |   
 61 |   
 62 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 63 |   ! Initialise the FFT module
 64 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 65 |   subroutine fft_init_noarg
 66 |     
 67 |     implicit none
 68 |     
 69 |     call fft_init_arg(PHYSICAL_IN_X)  ! default input is X-pencil data
 70 |     
 71 |     return
 72 |   end subroutine fft_init_noarg
 73 | 
 74 |   subroutine fft_init_arg(pencil)     ! allow to handle Z-pencil input
 75 | 
 76 |     implicit none
 77 | 
 78 |     integer, intent(IN) :: pencil
 79 | 
 80 |     call fft_init_general(pencil, nx_global, ny_global, nz_global)
 81 | 
 82 |     return
 83 |   end subroutine fft_init_arg
 84 | 
 85 |   ! Initialise the FFT library to perform arbitrary size transforms
 86 |   subroutine fft_init_general(pencil, nx, ny, nz)
 87 | 
 88 |     implicit none
 89 | 
 90 |     integer, intent(IN) :: pencil
 91 |     integer, intent(IN) :: nx, ny, nz
 92 | 
 93 |     logical, dimension(2) :: dummy_periods
 94 |     integer, dimension(2) :: dummy_coords
 95 |     integer :: status, errorcode, ierror
 96 | 
 97 |     if (initialised) then
 98 |        errorcode = 4
 99 |        call decomp_2d_abort(errorcode, &
100 |             'FFT library should only be initialised once')
101 |     end if
102 |     
103 |     format = pencil
104 |     nx_fft = nx
105 |     ny_fft = ny
106 |     nz_fft = nz
107 | 
108 |     ! determine the processor grid in use
109 |     call MPI_CART_GET(DECOMP_2D_COMM_CART_X, 2, &
110 |          dims, dummy_periods, dummy_coords, ierror)
111 | 
112 |     ! for c2r/r2c interface:
113 |     ! if in physical space, a real array is of size: nx*ny*nz
114 |     ! in spectral space, the complex array is of size:
115 |     !         (nx/2+1)*ny*nz, if PHYSICAL_IN_X
116 |     !      or nx*ny*(nz/2+1), if PHYSICAL_IN_Z
117 | 
118 |     call decomp_info_init(nx, ny, nz, ph)
119 |     if (format==PHYSICAL_IN_X) then
120 |        call decomp_info_init(nx/2+1, ny, nz, sp)
121 |     else if (format==PHYSICAL_IN_Z) then
122 |        call decomp_info_init(nx, ny, nz/2+1, sp)
123 |     end if
124 | 
125 |     allocate(wk2_c2c(ph%ysz(1),ph%ysz(2),ph%ysz(3)), STAT=status)
126 |     allocate(wk2_r2c(sp%ysz(1),sp%ysz(2),sp%ysz(3)), STAT=status)
127 |     if (format==PHYSICAL_IN_X) then
128 |        allocate(wk13(sp%xsz(1),sp%xsz(2),sp%xsz(3)), STAT=status)
129 |     else if (format==PHYSICAL_IN_Z) then
130 |        allocate(wk13(sp%zsz(1),sp%zsz(2),sp%zsz(3)), STAT=status)
131 |     end if
132 |     if (status /= 0) then
133 |        errorcode = 3
134 |        call decomp_2d_abort(errorcode, &
135 |             'Out of memory when initialising FFT')
136 |     end if
137 | 
138 |     call init_fft_engine
139 |     
140 |     initialised = .true.
141 |     
142 |     return
143 |   end subroutine fft_init_general
144 | 
145 |   
146 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
147 |   ! Final clean up
148 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
149 |   subroutine decomp_2d_fft_finalize
150 |     
151 |     implicit none
152 | 
153 |     call decomp_info_finalize(ph)
154 |     call decomp_info_finalize(sp)
155 | 
156 |     deallocate(wk2_c2c, wk2_r2c, wk13)
157 | 
158 |     call finalize_fft_engine
159 | 
160 |     initialised = .false.
161 | 
162 |     return
163 |   end subroutine decomp_2d_fft_finalize
164 | 
165 | 
166 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
167 |   ! Return the size, starting/ending index of the distributed array 
168 |   !  whose global size is (nx/2+1)*ny*nz, for defining data structures
169 |   !  in r2c and c2r interfaces
170 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
171 |   subroutine decomp_2d_fft_get_size(istart, iend, isize)
172 |     
173 |     implicit none
174 |     integer, dimension(3), intent(OUT) :: istart, iend, isize
175 |     
176 |     if (format==PHYSICAL_IN_X) then
177 |        istart = sp%zst
178 |        iend   = sp%zen
179 |        isize  = sp%zsz
180 |     else if (format==PHYSICAL_IN_Z) then
181 |        istart = sp%xst
182 |        iend   = sp%xen
183 |        isize  = sp%xsz
184 |     end if
185 |     
186 |     return
187 |   end subroutine decomp_2d_fft_get_size
188 | 


--------------------------------------------------------------------------------
/src/fft_common_3d.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 | ! This file contains 3D c2c/r2c/c2r transform subroutines which are
 13 | ! identical for several FFT engines 
 14 | 
 15 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 16 |   ! 3D FFT - complex to complex
 17 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 18 |   subroutine fft_3d_c2c(in, out, isign)
 19 |     
 20 |     implicit none
 21 |     
 22 |     complex(mytype), dimension(:,:,:), intent(INOUT) :: in
 23 |     complex(mytype), dimension(:,:,:), intent(OUT) :: out
 24 |     integer, intent(IN) :: isign
 25 | 
 26 | #ifndef OVERWRITE
 27 |     complex(mytype), allocatable, dimension(:,:,:) :: wk1
 28 | #endif
 29 | 
 30 |     if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
 31 |          format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
 32 |        
 33 |        ! ===== 1D FFTs in X =====
 34 | #ifdef OVERWRITE
 35 |        call c2c_1m_x(in,isign,ph)
 36 | #else
 37 |        allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
 38 |        wk1 = in
 39 |        call c2c_1m_x(wk1,isign,ph)
 40 | #endif
 41 | 
 42 |        ! ===== Swap X --> Y; 1D FFTs in Y =====
 43 | 
 44 |        if (dims(1)>1) then
 45 | #ifdef OVERWRITE
 46 |           call transpose_x_to_y(in,wk2_c2c,ph)
 47 | #else
 48 |           call transpose_x_to_y(wk1,wk2_c2c,ph)
 49 | #endif
 50 |           call c2c_1m_y(wk2_c2c,isign,ph)
 51 |        else
 52 | #ifdef OVERWRITE
 53 |           call c2c_1m_y(in,isign,ph)
 54 | #else
 55 |           call c2c_1m_y(wk1,isign,ph)
 56 | #endif
 57 |        end if
 58 | 
 59 |        ! ===== Swap Y --> Z; 1D FFTs in Z =====
 60 |        if (dims(1)>1) then
 61 |           call transpose_y_to_z(wk2_c2c,out,ph)
 62 |        else
 63 | #ifdef OVERWRITE
 64 |           call transpose_y_to_z(in,out,ph)
 65 | #else
 66 |           call transpose_y_to_z(wk1,out,ph)
 67 | #endif
 68 |        end if
 69 |        call c2c_1m_z(out,isign,ph)
 70 | 
 71 |     else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
 72 |          .OR. & 
 73 |          format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
 74 | 
 75 |        ! ===== 1D FFTs in Z =====
 76 | #ifdef OVERWRITE
 77 |        call c2c_1m_z(in,isign,ph)
 78 | #else
 79 |        allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
 80 |        wk1 = in
 81 |        call c2c_1m_z(wk1,isign,ph)
 82 | #endif
 83 | 
 84 |        ! ===== Swap Z --> Y; 1D FFTs in Y =====
 85 |        if (dims(1)>1) then
 86 | #ifdef OVERWRITE
 87 |           call transpose_z_to_y(in,wk2_c2c,ph)
 88 | #else
 89 |           call transpose_z_to_y(wk1,wk2_c2c,ph)
 90 | #endif
 91 |           call c2c_1m_y(wk2_c2c,isign,ph)
 92 |        else  ! out==wk2_c2c if 1D decomposition
 93 | #ifdef OVERWRITE
 94 |           call transpose_z_to_y(in,out,ph)
 95 | #else
 96 |           call transpose_z_to_y(wk1,out,ph)
 97 | #endif
 98 |           call c2c_1m_y(out,isign,ph)
 99 |        end if
100 | 
101 |        ! ===== Swap Y --> X; 1D FFTs in X =====
102 |        if (dims(1)>1) then
103 |           call transpose_y_to_x(wk2_c2c,out,ph)
104 |        end if
105 |        call c2c_1m_x(out,isign,ph)
106 |        
107 |     end if
108 | 
109 |     return
110 |   end subroutine fft_3d_c2c
111 | 
112 |   
113 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
114 |   ! 3D forward FFT - real to complex
115 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
116 |   subroutine fft_3d_r2c(in_r, out_c)
117 |     
118 |     implicit none
119 |     
120 |     real(mytype), dimension(:,:,:), intent(IN) :: in_r
121 |     complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
122 | 
123 |     if (format==PHYSICAL_IN_X) then
124 | 
125 |        ! ===== 1D FFTs in X =====
126 |        call r2c_1m_x(in_r,wk13)
127 | 
128 |        ! ===== Swap X --> Y; 1D FFTs in Y =====
129 |        if (dims(1)>1) then
130 |           call transpose_x_to_y(wk13,wk2_r2c,sp)
131 |           call c2c_1m_y(wk2_r2c,-1,sp)
132 |        else
133 |           call c2c_1m_y(wk13,-1,sp)
134 |        end if
135 | 
136 |        ! ===== Swap Y --> Z; 1D FFTs in Z =====
137 |        if (dims(1)>1) then
138 |           call transpose_y_to_z(wk2_r2c,out_c,sp)
139 |        else
140 |           call transpose_y_to_z(wk13,out_c,sp)
141 |        end if
142 |        call c2c_1m_z(out_c,-1,sp)
143 |                 
144 |     else if (format==PHYSICAL_IN_Z) then
145 | 
146 |        ! ===== 1D FFTs in Z =====
147 |        call r2c_1m_z(in_r,wk13)
148 | 
149 |        ! ===== Swap Z --> Y; 1D FFTs in Y =====
150 |        if (dims(1)>1) then
151 |           call transpose_z_to_y(wk13,wk2_r2c,sp)
152 |           call c2c_1m_y(wk2_r2c,-1,sp)
153 |        else  ! out_c==wk2_r2c if 1D decomposition
154 |           call transpose_z_to_y(wk13,out_c,sp)
155 |           call c2c_1m_y(out_c,-1,sp)
156 |        end if
157 | 
158 |        ! ===== Swap Y --> X; 1D FFTs in X =====
159 |        if (dims(1)>1) then
160 |           call transpose_y_to_x(wk2_r2c,out_c,sp)
161 |        end if
162 |        call c2c_1m_x(out_c,-1,sp)
163 | 
164 |     end if
165 |     
166 |     return
167 |   end subroutine fft_3d_r2c
168 |   
169 |   
170 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
171 |   ! 3D inverse FFT - complex to real
172 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
173 |   subroutine fft_3d_c2r(in_c, out_r)
174 |     
175 |     implicit none
176 |     
177 |     complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c
178 |     real(mytype), dimension(:,:,:), intent(OUT) :: out_r
179 | 
180 | #ifndef OVERWRITE
181 |     complex(mytype), allocatable, dimension(:,:,:) :: wk1
182 | #endif
183 | 
184 |     if (format==PHYSICAL_IN_X) then
185 | 
186 |        ! ===== 1D FFTs in Z =====
187 | #ifdef OVERWRITE
188 |        call c2c_1m_z(in_c,1,sp)       
189 | #else
190 |        allocate(wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
191 |        wk1 = in_c
192 |        call c2c_1m_z(wk1,1,sp)
193 | #endif
194 | 
195 |        ! ===== Swap Z --> Y; 1D FFTs in Y =====
196 | #ifdef OVERWRITE
197 |        call transpose_z_to_y(in_c,wk2_r2c,sp)
198 | #else
199 |        call transpose_z_to_y(wk1,wk2_r2c,sp)
200 | #endif
201 |        call c2c_1m_y(wk2_r2c,1,sp)
202 | 
203 |        ! ===== Swap Y --> X; 1D FFTs in X =====
204 |        if (dims(1)>1) then
205 |           call transpose_y_to_x(wk2_r2c,wk13,sp)
206 |           call c2r_1m_x(wk13,out_r)
207 |        else
208 |           call c2r_1m_x(wk2_r2c,out_r)
209 |        end if
210 | 
211 |     else if (format==PHYSICAL_IN_Z) then
212 | 
213 |        ! ===== 1D FFTs in X =====
214 | #ifdef OVERWRITE
215 |        call c2c_1m_x(in_c,1,sp)
216 | #else
217 |        allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
218 |        wk1 = in_c
219 |        call c2c_1m_x(wk1,1,sp)
220 | #endif
221 | 
222 |        ! ===== Swap X --> Y; 1D FFTs in Y =====
223 |        if (dims(1)>1) then
224 | #ifdef OVERWRITE
225 |           call transpose_x_to_y(in_c,wk2_r2c,sp)
226 | #else
227 |           call transpose_x_to_y(wk1,wk2_r2c,sp)
228 | #endif
229 |           call c2c_1m_y(wk2_r2c,1,sp)
230 |        else  ! in_c==wk2_r2c if 1D decomposition
231 | #ifdef OVERWRITE
232 |           call c2c_1m_y(in_c,1,sp)
233 | #else
234 |           call c2c_1m_y(wk1,1,sp)
235 | #endif
236 |        end if
237 | 
238 |        ! ===== Swap Y --> Z; 1D FFTs in Z =====
239 |        if (dims(1)>1) then
240 |           call transpose_y_to_z(wk2_r2c,wk13,sp)
241 |        else
242 | #ifdef OVERWRITE
243 |           call transpose_y_to_z(in_c,wk13,sp)
244 | #else
245 |           call transpose_y_to_z(wk1,wk13,sp)
246 | #endif
247 |        end if
248 |        call c2r_1m_z(wk13,out_r)
249 | 
250 |     end if
251 | 
252 |     return
253 |   end subroutine fft_3d_c2r
254 | 


--------------------------------------------------------------------------------
/src/fft_generic.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 | ! This is the 'generic' implementation of the FFT library
 13 | 
 14 | module decomp_2d_fft
 15 |   
 16 |   use decomp_2d  ! 2D decomposition module
 17 |   use glassman
 18 |   
 19 |   implicit none
 20 |   
 21 |   private        ! Make everything private unless declared public
 22 | 
 23 |   ! engine-specific global variables
 24 |   complex(mytype), allocatable, dimension(:) :: buf, scratch
 25 | 
 26 |   ! common code used for all engines, including global variables, 
 27 |   ! generic interface definitions and several subroutines
 28 | #include "fft_common.f90"
 29 | 
 30 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 31 |   !  This routine performs one-time initialisations for the FFT engine
 32 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 33 |   subroutine init_fft_engine
 34 | 
 35 |     implicit none
 36 | 
 37 |     integer :: cbuf_size
 38 | 
 39 |     if (nrank==0) then
 40 |        write(*,*) ' '
 41 |        write(*,*) '***** Using the generic FFT engine *****'
 42 |        write(*,*) ' '
 43 |     end if
 44 | 
 45 |     cbuf_size = max(ph%xsz(1), ph%ysz(2))
 46 |     cbuf_size = max(cbuf_size, ph%zsz(3))
 47 |     allocate(buf(cbuf_size))
 48 |     allocate(scratch(cbuf_size))
 49 | 
 50 |     return
 51 |   end subroutine init_fft_engine
 52 | 
 53 | 
 54 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 55 |   !  This routine performs one-time finalisations for the FFT engine
 56 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 57 |   subroutine finalize_fft_engine
 58 | 
 59 |     implicit none
 60 | 
 61 |     deallocate(buf,scratch)
 62 | 
 63 |     return
 64 |   end subroutine finalize_fft_engine
 65 | 
 66 | 
 67 |   ! Following routines calculate multiple one-dimensional FFTs to form 
 68 |   ! the basis of three-dimensional FFTs.
 69 | 
 70 |   ! c2c transform, multiple 1D FFTs in x direction
 71 |   subroutine c2c_1m_x(inout, isign, decomp)
 72 | 
 73 |     implicit none
 74 | 
 75 |     complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
 76 |     integer, intent(IN) :: isign
 77 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
 78 | 
 79 |     integer :: i,j,k
 80 |     
 81 |     do k=1,decomp%xsz(3)
 82 |        do j=1,decomp%xsz(2)
 83 |           do i=1,decomp%xsz(1)
 84 |              buf(i) = inout(i,j,k)
 85 |           end do
 86 |           call spcfft(buf,decomp%xsz(1),isign,scratch)
 87 |           do i=1,decomp%xsz(1)
 88 |              inout(i,j,k) = buf(i)
 89 |           end do
 90 |        end do
 91 |     end do
 92 | 
 93 |     return
 94 | 
 95 |   end subroutine c2c_1m_x
 96 | 
 97 |   ! c2c transform, multiple 1D FFTs in y direction
 98 |   subroutine c2c_1m_y(inout, isign, decomp)
 99 | 
100 |     implicit none
101 | 
102 |     complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
103 |     integer, intent(IN) :: isign
104 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
105 | 
106 |     integer :: i,j,k
107 | 
108 |     do k=1,decomp%ysz(3)
109 |        do i=1,decomp%ysz(1)
110 |           do j=1,decomp%ysz(2)
111 |              buf(j) = inout(i,j,k)
112 |           end do
113 |           call spcfft(buf,decomp%ysz(2),isign,scratch)
114 |           do j=1,decomp%ysz(2)
115 |              inout(i,j,k) = buf(j)
116 |           end do
117 |        end do
118 |     end do
119 | 
120 |     return
121 | 
122 |   end subroutine c2c_1m_y
123 | 
124 |   ! c2c transform, multiple 1D FFTs in z direction
125 |   subroutine c2c_1m_z(inout, isign, decomp)
126 | 
127 |     implicit none
128 | 
129 |     complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
130 |     integer, intent(IN) :: isign
131 |     TYPE(DECOMP_INFO), intent(IN) :: decomp
132 | 
133 |     integer :: i,j,k
134 | 
135 |     do j=1,decomp%zsz(2)
136 |        do i=1,decomp%zsz(1)
137 |           do k=1,decomp%zsz(3)
138 |              buf(k) = inout(i,j,k)
139 |           end do
140 |           call spcfft(buf,decomp%zsz(3),isign,scratch)
141 |           do k=1,decomp%zsz(3)
142 |              inout(i,j,k) = buf(k)
143 |           end do
144 |        end do
145 |     end do
146 | 
147 |     return
148 | 
149 |   end subroutine c2c_1m_z
150 | 
151 |   ! r2c transform, multiple 1D FFTs in x direction
152 |   subroutine r2c_1m_x(input, output)
153 | 
154 |     implicit none
155 | 
156 |     real(mytype), dimension(:,:,:), intent(IN)  ::  input
157 |     complex(mytype), dimension(:,:,:), intent(OUT) :: output
158 | 
159 |     integer :: i,j,k, s1,s2,s3, d1
160 | 
161 |     s1 = size(input,1)
162 |     s2 = size(input,2)
163 |     s3 = size(input,3)
164 |     d1 = size(output,1)
165 | 
166 |     do k=1,s3
167 |        do j=1,s2
168 |           ! Glassman's FFT is c2c only, 
169 |           ! needing some pre- and post-processing for r2c
170 |           ! pack real input in complex storage
171 |           do i=1,s1
172 |              buf(i) = cmplx(input(i,j,k),0._mytype, kind=mytype)
173 |           end do
174 |           call spcfft(buf,s1,-1,scratch)
175 |           ! note d1 ~ s1/2+1
176 |           ! simply drop the redundant part of the complex output
177 |           do i=1,d1
178 |              output(i,j,k) = buf(i)
179 |           end do
180 |        end do
181 |     end do
182 | 
183 |     return
184 | 
185 |   end subroutine r2c_1m_x
186 | 
187 |   ! r2c transform, multiple 1D FFTs in z direction
188 |   subroutine r2c_1m_z(input, output)
189 | 
190 |     implicit none
191 | 
192 |     real(mytype), dimension(:,:,:), intent(IN)  ::  input
193 |     complex(mytype), dimension(:,:,:), intent(OUT) :: output
194 | 
195 |     integer :: i,j,k, s1,s2,s3, d3
196 | 
197 |     s1 = size(input,1)
198 |     s2 = size(input,2)
199 |     s3 = size(input,3)
200 |     d3 = size(output,3)
201 | 
202 |     do j=1,s2
203 |        do i=1,s1
204 |           ! Glassman's FFT is c2c only, 
205 |           ! needing some pre- and post-processing for r2c
206 |           ! pack real input in complex storage
207 |           do k=1,s3
208 |              buf(k) = cmplx(input(i,j,k),0._mytype, kind=mytype)
209 |           end do
210 |           call spcfft(buf,s3,-1,scratch)
211 |           ! note d3 ~ s3/2+1
212 |           ! simply drop the redundant part of the complex output
213 |           do k=1,d3
214 |              output(i,j,k) = buf(k)
215 |           end do
216 |        end do
217 |     end do
218 | 
219 |     return
220 | 
221 |   end subroutine r2c_1m_z
222 | 
223 |   ! c2r transform, multiple 1D FFTs in x direction
224 |   subroutine c2r_1m_x(input, output)
225 | 
226 |     implicit none
227 | 
228 |     complex(mytype), dimension(:,:,:), intent(IN)  ::  input
229 |     real(mytype), dimension(:,:,:), intent(OUT) :: output
230 | 
231 |     integer :: i,j,k, d1,d2,d3
232 | 
233 |     d1 = size(output,1)
234 |     d2 = size(output,2)
235 |     d3 = size(output,3)
236 | 
237 |     do k=1,d3
238 |        do j=1,d2
239 |           ! Glassman's FFT is c2c only, 
240 |           ! needing some pre- and post-processing for c2r
241 |           do i=1,d1/2+1
242 |              buf(i) = input(i,j,k)
243 |           end do
244 |           ! expanding to a full-size complex array
245 |           ! For odd N, the storage is:
246 |           !  1, 2, ...... N/2+1   integer division rounded down
247 |           !     N, ...... N/2+2   => a(i) is conjugate of a(N+2-i)
248 |           ! For even N, the storage is:
249 |           !  1, 2, ...... N/2  , N/2+1
250 |           !     N, ...... N/2+2  again a(i) conjugate of a(N+2-i)
251 |           do i=d1/2+2,d1
252 |              buf(i) =  conjg(buf(d1+2-i))
253 |           end do
254 |           call spcfft(buf,d1,1,scratch)
255 |           do i=1,d1
256 |              ! simply drop imaginary part
257 |              output(i,j,k) = real(buf(i), kind=mytype)
258 |           end do
259 |        end do
260 |     end do
261 | 
262 |     return
263 | 
264 |   end subroutine c2r_1m_x
265 | 
266 |   ! c2r transform, multiple 1D FFTs in z direction
267 |   subroutine c2r_1m_z(input, output)
268 | 
269 |     implicit none
270 | 
271 |     complex(mytype), dimension(:,:,:), intent(IN)  ::  input
272 |     real(mytype), dimension(:,:,:), intent(OUT) :: output
273 | 
274 |     integer :: i,j,k, d1,d2,d3
275 | 
276 |     d1 = size(output,1)
277 |     d2 = size(output,2)
278 |     d3 = size(output,3)
279 | 
280 |     do j=1,d2
281 |        do i=1,d1
282 |           do k=1,d3/2+1
283 |              buf(k) = input(i,j,k)
284 |           end do
285 |           do k=d3/2+2,d3
286 |              buf(k) =  conjg(buf(d3+2-k))
287 |           end do
288 |           call spcfft(buf,d3,1,scratch)
289 |           do k=1,d3
290 |              output(i,j,k) = real(buf(k), kind=mytype)
291 |           end do
292 |        end do
293 |     end do
294 | 
295 |     return
296 | 
297 |   end subroutine c2r_1m_z
298 | 
299 | 
300 | #include "fft_common_3d.f90"
301 | 
302 |   
303 | end module decomp_2d_fft
304 | 


--------------------------------------------------------------------------------
/src/glassman.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 | ! This module contains a few 'generic' FFT routines, making the 
 13 | ! 2DECOMP&FFT library not dependent on any external libraries
 14 | 
 15 | module glassman
 16 | 
 17 |   use decomp_2d, only : mytype
 18 | 
 19 |   implicit none
 20 | 
 21 | contains
 22 | 
 23 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 24 |   ! Following is a FFT implementation based on algorithm proposed by
 25 |   ! Glassman, a general FFT algorithm supporting arbitrary input length.
 26 |   !
 27 |   ! W. E. Ferguson, Jr., "A simple derivation of Glassman general-n fast
 28 |   ! Fourier transform," Comput. and Math. with Appls., vol. 8, no. 6, pp.
 29 |   ! 401-411, 1982.
 30 |   !
 31 |   ! Original implemtation online at http://www.jjj.de/fft/fftpage.html
 32 |   !
 33 |   ! Updated  
 34 |   !  -  to handle double-precision as well
 35 |   !  -  unnecessary scaling code removed
 36 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 37 | 
 38 |   SUBROUTINE SPCFFT(U,N,ISIGN,WORK)
 39 |     
 40 |     IMPLICIT NONE
 41 |     
 42 |     LOGICAL :: INU
 43 |     INTEGER :: A,B,C,N,I,ISIGN
 44 |     COMPLEX(mytype) :: U(*),WORK(*)
 45 |     
 46 |     A = 1
 47 |     B = N
 48 |     C = 1
 49 |     INU = .TRUE.
 50 |     
 51 |     DO WHILE ( B .GT. 1 )
 52 |        A = C * A
 53 |        C = 2
 54 |        DO WHILE ( MOD(B,C) .NE. 0 )
 55 |           C = C + 1
 56 |        END DO
 57 |        B = B / C
 58 |        IF ( INU ) THEN
 59 |           CALL SPCPFT (A,B,C,U,WORK,ISIGN)
 60 |        ELSE
 61 |           CALL SPCPFT (A,B,C,WORK,U,ISIGN)
 62 |        END IF
 63 |        INU = ( .NOT. INU )
 64 |     END DO
 65 |     
 66 |     IF ( .NOT. INU ) THEN
 67 |        DO I = 1, N
 68 |           U(I) = WORK(I)
 69 |        END DO
 70 |     END IF
 71 |     
 72 |     RETURN
 73 |   END SUBROUTINE SPCFFT
 74 |   
 75 |   
 76 |   SUBROUTINE SPCPFT( A, B, C, UIN, UOUT, ISIGN )
 77 |     
 78 |     IMPLICIT NONE
 79 |     
 80 |     INTEGER :: ISIGN,A,B,C,IA,IB,IC,JCR,JC
 81 |     
 82 |     DOUBLE PRECISION :: ANGLE
 83 |     
 84 |     COMPLEX(mytype) :: UIN(B,C,A),UOUT(B,A,C),DELTA,OMEGA,SUM
 85 |     
 86 |     ANGLE = 8.D0*DATAN(1.D0) / REAL( A * C, kind=mytype )
 87 |     OMEGA = CMPLX( 1.0, 0.0, kind=mytype )
 88 |     
 89 |     IF( ISIGN .EQ. 1 ) THEN
 90 |        DELTA = CMPLX( DCOS(ANGLE), DSIN(ANGLE), kind=mytype )
 91 |     ELSE
 92 |        DELTA = CMPLX( DCOS(ANGLE), -DSIN(ANGLE), kind=mytype )
 93 |     END IF
 94 |     
 95 |     DO IC = 1, C
 96 |        DO IA = 1, A
 97 |           DO IB = 1, B
 98 |              SUM = UIN( IB, C, IA )
 99 |              DO JCR = 2, C
100 |                 JC = C + 1 - JCR
101 |                 SUM = UIN( IB, JC, IA ) + OMEGA * SUM
102 |              END DO
103 |              UOUT( IB, IA, IC ) = SUM
104 |           END DO
105 |           OMEGA = DELTA * OMEGA
106 |        END DO
107 |     END DO
108 |     
109 |     RETURN
110 |   END SUBROUTINE SPCPFT
111 | 
112 | 
113 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
114 |   ! A 3D real-to-complex routine implemented using the 1D FFT above
115 |   !   Input:   nx*ny*nz real numbers
116 |   !   Output:  (nx/2+1)*ny*nz complex numbers
117 |   ! Just like big FFT libraries (such as FFTW) do
118 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
119 |   subroutine glassman_3d_r2c(in_r,nx,ny,nz,out_c)
120 | 
121 |     implicit none
122 |     
123 |     integer, intent(IN) :: nx,ny,nz
124 |     real(mytype), dimension(nx,ny,nz) :: in_r
125 |     complex(mytype), dimension(nx/2+1,ny,nz) :: out_c
126 |     
127 |     complex(mytype), allocatable, dimension(:) :: buf, scratch
128 |     integer :: maxsize, i,j,k
129 |     
130 |     maxsize = max(nx, max(ny,nz))
131 |     allocate(buf(maxsize))
132 |     allocate(scratch(maxsize))
133 |     
134 |     ! ===== 1D FFTs in X =====
135 |     do k=1,nz
136 |        do j=1,ny
137 |           ! Glassman's 1D FFT is c2c only, 
138 |           ! needing some pre- and post-processing for r2c
139 |           ! pack real input in complex storage
140 |           do i=1,nx
141 |              buf(i) = cmplx(in_r(i,j,k),0._mytype, kind=mytype)
142 |           end do
143 |           call spcfft(buf,nx,-1,scratch)
144 |           ! simply drop the redundant part of the complex output
145 |           do i=1,nx/2+1
146 |              out_c(i,j,k) = buf(i)
147 |           end do
148 |        end do
149 |     end do
150 |     
151 |     ! ===== 1D FFTs in Y =====
152 |     do k=1,nz
153 |        do i=1,nx/2+1
154 |           do j=1,ny
155 |              buf(j) = out_c(i,j,k)
156 |           end do
157 |           call spcfft(buf,ny,-1,scratch)
158 |           do j=1,ny
159 |              out_c(i,j,k) = buf(j)
160 |           end do
161 |        end do
162 |     end do
163 |     
164 |     ! ===== 1D FFTs in Z =====
165 |     do j=1,ny
166 |        do i=1,nx/2+1
167 |           do k=1,nz
168 |              buf(k) = out_c(i,j,k)
169 |           end do
170 |           call spcfft(buf,nz,-1,scratch)
171 |           do k=1,nz
172 |              out_c(i,j,k) = buf(k)
173 |           end do
174 |        end do
175 |     end do
176 |     
177 |     deallocate(buf,scratch)
178 |     
179 |     return
180 |   end subroutine glassman_3d_r2c
181 | 
182 | 
183 | end module glassman
184 | 
185 | 


--------------------------------------------------------------------------------
/src/halo.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 13 |   ! Halo cell support for neighbouring pencils to exchange data
 14 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 15 |   subroutine update_halo_real(in, out, level, opt_decomp, opt_global)
 16 | 
 17 |     implicit none
 18 | 
 19 |     integer, intent(IN) :: level      ! levels of halo cells required
 20 |     real(mytype), dimension(:,:,:), intent(IN) :: in    
 21 |     real(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
 22 |     TYPE(DECOMP_INFO), optional :: opt_decomp
 23 |     logical, optional :: opt_global
 24 | 
 25 |     TYPE(DECOMP_INFO) :: decomp
 26 |     logical :: global
 27 | 
 28 |     ! starting/ending index of array with halo cells
 29 |     integer :: xs, ys, zs, xe, ye, ze
 30 | 
 31 |     integer :: i, j, k, s1, s2, s3, ierror
 32 |     integer :: data_type
 33 | 
 34 |     integer :: icount, ilength, ijump 
 35 |     integer :: halo12, halo21, halo31, halo32                 
 36 |     integer, dimension(4) :: requests
 37 |     integer, dimension(MPI_STATUS_SIZE,4) :: status
 38 |     integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
 39 | 
 40 |     data_type = real_type
 41 | 
 42 | #include "halo_common.f90"
 43 | 
 44 |     return
 45 |   end subroutine update_halo_real
 46 | 
 47 | 
 48 |   subroutine update_halo_complex(in, out, level, opt_decomp, opt_global)
 49 | 
 50 |     implicit none
 51 | 
 52 |     integer, intent(IN) :: level      ! levels of halo cells required
 53 |     complex(mytype), dimension(:,:,:), intent(IN) :: in    
 54 |     complex(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
 55 |     TYPE(DECOMP_INFO), optional :: opt_decomp
 56 |     logical, optional :: opt_global
 57 | 
 58 |     TYPE(DECOMP_INFO) :: decomp
 59 |     logical :: global
 60 | 
 61 |     ! starting/ending index of array with halo cells
 62 |     integer :: xs, ys, zs, xe, ye, ze
 63 | 
 64 |     integer :: i, j, k, s1, s2, s3, ierror
 65 |     integer :: data_type
 66 | 
 67 |     integer :: icount, ilength, ijump 
 68 |     integer :: halo12, halo21, halo31, halo32                 
 69 |     integer, dimension(4) :: requests
 70 |     integer, dimension(MPI_STATUS_SIZE,4) :: status
 71 |     integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
 72 | 
 73 |     data_type = complex_type
 74 | 
 75 | #include "halo_common.f90"
 76 | 
 77 |     return
 78 |   end subroutine update_halo_complex
 79 | 
 80 | 
 81 | 
 82 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 83 |   ! To support halo-cell exchange:
 84 |   !   find the MPI ranks of neighbouring pencils
 85 |   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 86 |   subroutine init_neighbour
 87 | 
 88 |     integer :: ierror
 89 | 
 90 |     ! For X-pencil
 91 |     neighbour(1,1) = MPI_PROC_NULL               ! east
 92 |     neighbour(1,2) = MPI_PROC_NULL               ! west
 93 |     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 0, 1, &
 94 |          neighbour(1,4), neighbour(1,3), ierror) ! north & south
 95 |     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 1, 1, &
 96 |          neighbour(1,6), neighbour(1,5), ierror) ! top & bottom
 97 | 
 98 |     ! For Y-pencil
 99 |     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 0, 1, &
100 |          neighbour(2,2), neighbour(2,1), ierror) ! east & west
101 |     neighbour(2,3) = MPI_PROC_NULL               ! north
102 |     neighbour(2,4) = MPI_PROC_NULL               ! south
103 |     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 1, 1, &
104 |          neighbour(2,6), neighbour(2,5), ierror) ! top & bottom
105 | 
106 |     ! For Z-pencil
107 |     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 0, 1, &
108 |          neighbour(3,2), neighbour(3,1), ierror) ! east & west
109 |     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 1, 1, &
110 |          neighbour(3,4), neighbour(3,3), ierror) ! north & south
111 |     neighbour(3,5) = MPI_PROC_NULL               ! top
112 |     neighbour(3,6) = MPI_PROC_NULL               ! bottom
113 | 
114 |     return
115 |   end subroutine init_neighbour
116 | 


--------------------------------------------------------------------------------
/src/io_read_one.f90:
--------------------------------------------------------------------------------
 1 | !=======================================================================
 2 | ! This is part of the 2DECOMP&FFT library
 3 | ! 
 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
 5 | ! decomposition. It also implements a highly scalable distributed
 6 | ! three-dimensional Fast Fourier Transform (FFT).
 7 | !
 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
 9 | !
10 | !=======================================================================
11 | 
12 | ! This file contain common code to be included by subroutines 
13 | ! 'mpiio_read_one_...' in io.f90
14 | 
15 |     ! Using MPI-IO to write a distributed 3D array into a file
16 | 
17 |     if (present(opt_decomp)) then
18 |        decomp = opt_decomp
19 |     else
20 |        call get_decomp_info(decomp)
21 |     end if
22 |     
23 |     ! determine subarray parameters
24 |     sizes(1) = decomp%xsz(1)
25 |     sizes(2) = decomp%ysz(2)
26 |     sizes(3) = decomp%zsz(3)
27 |     
28 |     if (ipencil == 1) then
29 |        subsizes(1) = decomp%xsz(1)
30 |        subsizes(2) = decomp%xsz(2)
31 |        subsizes(3) = decomp%xsz(3)
32 |        starts(1) = decomp%xst(1)-1  ! 0-based index
33 |        starts(2) = decomp%xst(2)-1
34 |        starts(3) = decomp%xst(3)-1
35 |     else if (ipencil == 2) then
36 |        subsizes(1) = decomp%ysz(1)
37 |        subsizes(2) = decomp%ysz(2)
38 |        subsizes(3) = decomp%ysz(3)
39 |        starts(1) = decomp%yst(1)-1
40 |        starts(2) = decomp%yst(2)-1
41 |        starts(3) = decomp%yst(3)-1
42 |     else if (ipencil == 3) then
43 |        subsizes(1) = decomp%zsz(1)
44 |        subsizes(2) = decomp%zsz(2)
45 |        subsizes(3) = decomp%zsz(3)
46 |        starts(1) = decomp%zst(1)-1
47 |        starts(2) = decomp%zst(2)-1
48 |        starts(3) = decomp%zst(3)-1
49 |     endif
50 |     
51 |     call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
52 |          MPI_ORDER_FORTRAN, data_type, newtype, ierror)
53 |     call MPI_TYPE_COMMIT(newtype,ierror)
54 |     call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
55 |          MPI_MODE_RDONLY, MPI_INFO_NULL, &
56 |          fh, ierror)
57 |     disp = 0_MPI_OFFSET_KIND
58 |     call MPI_FILE_SET_VIEW(fh,disp,data_type, &
59 |          newtype,'native',MPI_INFO_NULL,ierror)
60 |     call MPI_FILE_READ_ALL(fh, var, &
61 |          subsizes(1)*subsizes(2)*subsizes(3), &
62 |          data_type, MPI_STATUS_IGNORE, ierror)
63 |     call MPI_FILE_CLOSE(fh,ierror)
64 |     call MPI_TYPE_FREE(newtype,ierror)
65 | 


--------------------------------------------------------------------------------
/src/io_read_var.f90:
--------------------------------------------------------------------------------
 1 | !=======================================================================
 2 | ! This is part of the 2DECOMP&FFT library
 3 | ! 
 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
 5 | ! decomposition. It also implements a highly scalable distributed
 6 | ! three-dimensional Fast Fourier Transform (FFT).
 7 | !
 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
 9 | !
10 | !=======================================================================
11 | 
12 | ! This file contain common code to be included by subroutines 
13 | ! 'read_var_...' in io.f90
14 | 
15 |   ! Using MPI-IO to read a distributed 3D variable from a file. File 
16 |   ! operations (open/close) need to be done in calling application. This 
17 |   ! allows multiple variables to be read from a single file. Together 
18 |   ! with the corresponding write operation, this is the perfect solution
19 |   ! for applications to perform restart/checkpointing.
20 | 
21 |     if (present(opt_decomp)) then
22 |        decomp = opt_decomp
23 |     else
24 |        call get_decomp_info(decomp)
25 |     end if
26 | 
27 |     ! Create file type and set file view
28 |     sizes(1) = decomp%xsz(1)
29 |     sizes(2) = decomp%ysz(2)
30 |     sizes(3) = decomp%zsz(3)
31 |     if (ipencil == 1) then
32 |        subsizes(1) = decomp%xsz(1)
33 |        subsizes(2) = decomp%xsz(2)
34 |        subsizes(3) = decomp%xsz(3)
35 |        starts(1) = decomp%xst(1)-1  ! 0-based index
36 |        starts(2) = decomp%xst(2)-1
37 |        starts(3) = decomp%xst(3)-1
38 |     else if (ipencil == 2) then
39 |        subsizes(1) = decomp%ysz(1)
40 |        subsizes(2) = decomp%ysz(2)
41 |        subsizes(3) = decomp%ysz(3)
42 |        starts(1) = decomp%yst(1)-1
43 |        starts(2) = decomp%yst(2)-1
44 |        starts(3) = decomp%yst(3)-1
45 |     else if (ipencil == 3) then
46 |        subsizes(1) = decomp%zsz(1)
47 |        subsizes(2) = decomp%zsz(2)
48 |        subsizes(3) = decomp%zsz(3)
49 |        starts(1) = decomp%zst(1)-1
50 |        starts(2) = decomp%zst(2)-1
51 |        starts(3) = decomp%zst(3)-1
52 |     endif
53 |     
54 |     call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
55 |          MPI_ORDER_FORTRAN, data_type, newtype, ierror)
56 |     call MPI_TYPE_COMMIT(newtype,ierror)
57 |     call MPI_FILE_SET_VIEW(fh,disp,data_type, &
58 |          newtype,'native',MPI_INFO_NULL,ierror)
59 |     call MPI_FILE_READ_ALL(fh, var, &
60 |          subsizes(1)*subsizes(2)*subsizes(3), &
61 |          data_type, MPI_STATUS_IGNORE, ierror)
62 |     call MPI_TYPE_FREE(newtype,ierror)
63 | 
64 |     ! update displacement for the next read operation
65 |     disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
66 |     if (data_type == complex_type) then
67 |        disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
68 |     end if
69 | 


--------------------------------------------------------------------------------
/src/io_write_every.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 | ! This file contain common code to be included by subroutines 
 13 | ! 'write_every_...' in io.f90
 14 | 
 15 |   ! To write every few points of a 3D array to a file
 16 | 
 17 |     ! work out the distribution parameters, which may be different from 
 18 |     ! the default distribution used by the decomposition library
 19 |     !  For exmample if nx=17 and p_row=4
 20 |     !    distribution is: 4 4 4 5
 21 | 
 22 |     ! If writing from the 1st element
 23 |     !  If saving every 3 points, then 5 points to be saved (17/3)
 24 |     !    default distribution would be 1 1 1 2
 25 |     !    However, 1st block (1-4) contains the 3rd point
 26 |     !             2nd block (5-8) contains the 6th point
 27 |     !             3rd block (9-12) contains the 9th and 12th point
 28 |     !             4th block (13-17) contains then 15th point
 29 |     !    giving a 1 1 2 1 distribution
 30 |     !    So cannot use the base decomposition library for such IO
 31 | 
 32 |     ! If writing from the n-th element (n=?skip)
 33 |     !  If saving every 3 points, then 6 points to be saved
 34 |     !    However, 1st block (1-4) contains the 1st & 4th point
 35 |     !             2nd block (5-8) contains the 7th point
 36 |     !             3rd block (9-12) contains the 10th point
 37 |     !             4th block (13-17) contains then 12th & 15th point
 38 |     !    giving a 1 2 2 1 distribution
 39 | 
 40 |     skip(1)=iskip
 41 |     skip(2)=jskip
 42 |     skip(3)=kskip
 43 | 
 44 |     do i=1,3
 45 |        if (from1) then
 46 |           xst(i) = (xstart(i)+skip(i)-1)/skip(i)
 47 |           if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xst(i)=xst(i)+1
 48 |           xen(i) = (xend(i)+skip(i)-1)/skip(i)
 49 |        else
 50 |           xst(i) = xstart(i)/skip(i)
 51 |           if (mod(xstart(i),skip(i))/=0) xst(i)=xst(i)+1
 52 |           xen(i) = xend(i)/skip(i)
 53 |        end if
 54 |        xsz(i) = xen(i)-xst(i)+1
 55 |     end do
 56 |        
 57 |     do i=1,3
 58 |        if (from1) then
 59 |           yst(i) = (ystart(i)+skip(i)-1)/skip(i)
 60 |           if (mod(ystart(i)+skip(i)-1,skip(i))/=0) yst(i)=yst(i)+1
 61 |           yen(i) = (yend(i)+skip(i)-1)/skip(i)
 62 |        else
 63 |           yst(i) = ystart(i)/skip(i)
 64 |           if (mod(ystart(i),skip(i))/=0) yst(i)=yst(i)+1
 65 |           yen(i) = yend(i)/skip(i)
 66 |        end if
 67 |        ysz(i) = yen(i)-yst(i)+1
 68 |     end do
 69 | 
 70 |     do i=1,3
 71 |        if (from1) then
 72 |           zst(i) = (zstart(i)+skip(i)-1)/skip(i)
 73 |           if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zst(i)=zst(i)+1
 74 |           zen(i) = (zend(i)+skip(i)-1)/skip(i)
 75 |        else
 76 |           zst(i) = zstart(i)/skip(i)
 77 |           if (mod(zstart(i),skip(i))/=0) zst(i)=zst(i)+1
 78 |           zen(i) = zend(i)/skip(i)
 79 |        end if
 80 |        zsz(i) = zen(i)-zst(i)+1
 81 |     end do
 82 | 
 83 |     ! if 'skip' value is large it is possible that some ranks do not 
 84 |     ! contain any points to be written. Subarray constructor requires 
 85 |     ! nonzero size so it is not possible to use MPI_COMM_WORLD for IO.
 86 |     ! Create a sub communicator for this...
 87 |     color = 1
 88 |     key = 0  ! rank order doesn't matter
 89 |     if (ipencil==1) then
 90 |        if (xsz(1)==0 .or. xsz(2)==0 .or. xsz(3)==0) then
 91 |           color = 2
 92 |        end if
 93 |     else if (ipencil==2) then
 94 |        if (ysz(1)==0 .or. ysz(2)==0 .or. ysz(3)==0) then
 95 |           color = 2
 96 |        end if
 97 |     else if (ipencil==3) then
 98 |        if (zsz(1)==0 .or. zsz(2)==0 .or. zsz(3)==0) then
 99 |           color = 2
100 |        end if
101 |     end if
102 |     call MPI_COMM_SPLIT(MPI_COMM_WORLD,color,key,newcomm,ierror)
103 | 
104 |     if (color==1) then ! only ranks in this group do IO collectively
105 |        
106 |        ! generate subarray information
107 |        sizes(1) = xsz(1)
108 |        sizes(2) = ysz(2)
109 |        sizes(3) = zsz(3)
110 |        if (ipencil==1) then
111 |           subsizes(1) = xsz(1)
112 |           subsizes(2) = xsz(2)
113 |           subsizes(3) = xsz(3)
114 |           starts(1) = xst(1)-1
115 |           starts(2) = xst(2)-1
116 |           starts(3) = xst(3)-1
117 |        else if (ipencil==2) then
118 |           subsizes(1) = ysz(1)
119 |           subsizes(2) = ysz(2)
120 |           subsizes(3) = ysz(3)
121 |           starts(1) = yst(1)-1
122 |           starts(2) = yst(2)-1
123 |           starts(3) = yst(3)-1
124 |        else if (ipencil==3) then
125 |           subsizes(1) = zsz(1)
126 |           subsizes(2) = zsz(2)
127 |           subsizes(3) = zsz(3)
128 |           starts(1) = zst(1)-1
129 |           starts(2) = zst(2)-1
130 |           starts(3) = zst(3)-1
131 |        end if
132 |        
133 |        ! copy data from original array
134 |        ! needs a copy of original array in global coordinate 
135 |        if (ipencil==1) then
136 |           allocate(wk(xst(1):xen(1),xst(2):xen(2),xst(3):xen(3)))
137 |           allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
138 |           wk2=var
139 |           if (from1) then
140 |              do k=xst(3),xen(3)
141 |                 do j=xst(2),xen(2)
142 |                    do i=xst(1),xen(1)
143 |                       wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
144 |                    end do
145 |                 end do
146 |              end do
147 |           else
148 |              do k=xst(3),xen(3)
149 |                 do j=xst(2),xen(2)
150 |                    do i=xst(1),xen(1)
151 |                       wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
152 |                    end do
153 |                 end do
154 |              end do
155 |           end if   
156 |        else if (ipencil==2) then
157 |           allocate(wk(yst(1):yen(1),yst(2):yen(2),yst(3):yen(3)))
158 |           allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
159 |           wk2=var
160 |           if (from1) then
161 |              do k=yst(3),yen(3)
162 |                 do j=yst(2),yen(2)
163 |                    do i=yst(1),yen(1)
164 |                       wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
165 |                    end do
166 |                 end do
167 |              end do
168 |           else
169 |              do k=yst(3),yen(3)
170 |                 do j=yst(2),yen(2)
171 |                    do i=yst(1),yen(1)
172 |                       wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
173 |                    end do
174 |                 end do
175 |              end do
176 |           end if
177 |        else if (ipencil==3) then
178 |           allocate(wk(zst(1):zen(1),zst(2):zen(2),zst(3):zen(3)))
179 |           allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
180 |           wk2=var
181 |           if (from1) then
182 |              do k=zst(3),zen(3)
183 |                 do j=zst(2),zen(2)
184 |                    do i=zst(1),zen(1)
185 |                       wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
186 |                    end do
187 |                 end do
188 |              end do
189 |           else
190 |              do k=zst(3),zen(3)
191 |                 do j=zst(2),zen(2)
192 |                    do i=zst(1),zen(1)
193 |                       wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
194 |                    end do
195 |                 end do
196 |              end do
197 |           end if
198 |        end if
199 |        deallocate(wk2)
200 | 
201 |        ! MPI-IO
202 |        call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
203 |             MPI_ORDER_FORTRAN, data_type, newtype, ierror)
204 |        call MPI_TYPE_COMMIT(newtype,ierror)
205 |        call MPI_FILE_OPEN(newcomm, filename, &
206 |             MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
207 |             fh, ierror)
208 |        filesize = 0_MPI_OFFSET_KIND
209 |        call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
210 |        disp = 0_MPI_OFFSET_KIND
211 |        call MPI_FILE_SET_VIEW(fh,disp,data_type, &
212 |             newtype,'native',MPI_INFO_NULL,ierror)
213 |        call MPI_FILE_WRITE_ALL(fh, wk, &
214 |             subsizes(1)*subsizes(2)*subsizes(3), &
215 |             data_type, MPI_STATUS_IGNORE, ierror)
216 |        call MPI_FILE_CLOSE(fh,ierror)
217 |        call MPI_TYPE_FREE(newtype,ierror)
218 | 
219 |        deallocate(wk)
220 | 
221 |     end if ! color==1
222 | 
223 |     call MPI_BARRIER(MPI_COMM_WORLD, ierror)
224 | 


--------------------------------------------------------------------------------
/src/io_write_one.f90:
--------------------------------------------------------------------------------
 1 | !=======================================================================
 2 | ! This is part of the 2DECOMP&FFT library
 3 | ! 
 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
 5 | ! decomposition. It also implements a highly scalable distributed
 6 | ! three-dimensional Fast Fourier Transform (FFT).
 7 | !
 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
 9 | !
10 | !=======================================================================
11 | 
12 | ! This file contain common code to be included by subroutines 
13 | ! 'mpiio_write_one_...' in io.f90
14 | 
15 |     ! Using MPI-IO to write a distributed 3D array into a file
16 | 
17 |     if (present(opt_decomp)) then
18 |        decomp = opt_decomp
19 |     else
20 |        call get_decomp_info(decomp)
21 |     end if
22 |     
23 |     ! determine subarray parameters
24 |     sizes(1) = decomp%xsz(1)
25 |     sizes(2) = decomp%ysz(2)
26 |     sizes(3) = decomp%zsz(3)
27 |     
28 |     if (ipencil == 1) then
29 |        subsizes(1) = decomp%xsz(1)
30 |        subsizes(2) = decomp%xsz(2)
31 |        subsizes(3) = decomp%xsz(3)
32 |        starts(1) = decomp%xst(1)-1  ! 0-based index
33 |        starts(2) = decomp%xst(2)-1
34 |        starts(3) = decomp%xst(3)-1
35 |     else if (ipencil == 2) then
36 |        subsizes(1) = decomp%ysz(1)
37 |        subsizes(2) = decomp%ysz(2)
38 |        subsizes(3) = decomp%ysz(3)
39 |        starts(1) = decomp%yst(1)-1
40 |        starts(2) = decomp%yst(2)-1
41 |        starts(3) = decomp%yst(3)-1
42 |     else if (ipencil == 3) then
43 |        subsizes(1) = decomp%zsz(1)
44 |        subsizes(2) = decomp%zsz(2)
45 |        subsizes(3) = decomp%zsz(3)
46 |        starts(1) = decomp%zst(1)-1
47 |        starts(2) = decomp%zst(2)-1
48 |        starts(3) = decomp%zst(3)-1
49 |     endif
50 | 
51 | #ifdef T3PIO
52 |     call MPI_INFO_CREATE(info, ierror)
53 |     gs = ceiling(real(sizes(1),mytype)*real(sizes(2),mytype)* &
54 |          real(sizes(3),mytype)/1024./1024.)
55 |     call t3pio_set_info(MPI_COMM_WORLD, info, "./", ierror, &
56 |          GLOBAL_SIZE=gs, factor=1)
57 | #endif
58 | 
59 |     call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
60 |          MPI_ORDER_FORTRAN, data_type, newtype, ierror)
61 |     call MPI_TYPE_COMMIT(newtype,ierror)
62 | #ifdef T3PIO
63 |     call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
64 |          MPI_MODE_CREATE+MPI_MODE_WRONLY, info, fh, ierror)
65 | #else
66 |     call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
67 |          MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
68 |          fh, ierror)
69 | #endif
70 |     filesize = 0_MPI_OFFSET_KIND
71 |     call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
72 |     disp = 0_MPI_OFFSET_KIND
73 |     call MPI_FILE_SET_VIEW(fh,disp,data_type, &
74 |          newtype,'native',MPI_INFO_NULL,ierror)
75 |     call MPI_FILE_WRITE_ALL(fh, var, &
76 |          subsizes(1)*subsizes(2)*subsizes(3), &
77 |          data_type, MPI_STATUS_IGNORE, ierror)
78 |     call MPI_FILE_CLOSE(fh,ierror)
79 |     call MPI_TYPE_FREE(newtype,ierror)
80 | #ifdef T3PIO
81 |     call MPI_INFO_FREE(info,ierror)
82 | #endif
83 | 


--------------------------------------------------------------------------------
/src/io_write_plane.f90:
--------------------------------------------------------------------------------
  1 | !=======================================================================
  2 | ! This is part of the 2DECOMP&FFT library
  3 | ! 
  4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
  5 | ! decomposition. It also implements a highly scalable distributed
  6 | ! three-dimensional Fast Fourier Transform (FFT).
  7 | !
  8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
  9 | !
 10 | !=======================================================================
 11 | 
 12 | ! This file contain common code to be included by subroutines 
 13 | ! 'mpiio_write_plane_3d_...' in io.f90
 14 | 
 15 |     ! It is much easier to implement if all mpi ranks participate I/O.
 16 |     ! Transpose the 3D data if necessary.
 17 | 
 18 |     if (present(opt_decomp)) then
 19 |        decomp = opt_decomp
 20 |     else
 21 |        call get_decomp_info(decomp)
 22 |     end if
 23 | 
 24 |     if (iplane==1) then
 25 |        allocate(wk(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)))
 26 |        if (ipencil==1) then
 27 |           wk = var
 28 |        else if (ipencil==2) then
 29 |           call transpose_y_to_x(var,wk,decomp)
 30 |        else if (ipencil==3) then
 31 |           allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
 32 |           call transpose_z_to_y(var,wk2,decomp)
 33 |           call transpose_y_to_x(wk2,wk,decomp)
 34 |           deallocate(wk2)
 35 |        end if
 36 |        allocate(wk2d(1,decomp%xsz(2),decomp%xsz(3)))
 37 |        do k=1,decomp%xsz(3)
 38 |           do j=1,decomp%xsz(2)
 39 |              wk2d(1,j,k)=wk(n,j,k)
 40 |           end do
 41 |        end do
 42 |        sizes(1) = 1
 43 |        sizes(2) = decomp%ysz(2)
 44 |        sizes(3) = decomp%zsz(3)
 45 |        subsizes(1) = 1
 46 |        subsizes(2) = decomp%xsz(2)
 47 |        subsizes(3) = decomp%xsz(3)
 48 |        starts(1) = 0
 49 |        starts(2) = decomp%xst(2)-1
 50 |        starts(3) = decomp%xst(3)-1
 51 | 
 52 |     else if (iplane==2) then
 53 |        allocate(wk(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
 54 |        if (ipencil==1) then
 55 |           call transpose_x_to_y(var,wk,decomp)
 56 |        else if (ipencil==2) then
 57 |           wk = var
 58 |        else if (ipencil==3) then
 59 |           call transpose_z_to_y(var,wk,decomp)
 60 |        end if
 61 |        allocate(wk2d(decomp%ysz(1),1,decomp%ysz(3)))
 62 |        do k=1,decomp%ysz(3)
 63 |           do i=1,decomp%ysz(1)
 64 |              wk2d(i,1,k)=wk(i,n,k)
 65 |           end do
 66 |        end do
 67 |        sizes(1) = decomp%xsz(1)
 68 |        sizes(2) = 1
 69 |        sizes(3) = decomp%zsz(3)
 70 |        subsizes(1) = decomp%ysz(1)
 71 |        subsizes(2) = 1
 72 |        subsizes(3) = decomp%ysz(3)
 73 |        starts(1) = decomp%yst(1)-1
 74 |        starts(2) = 0
 75 |        starts(3) = decomp%yst(3)-1
 76 | 
 77 |     else if (iplane==3) then
 78 |        allocate(wk(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)))
 79 |        if (ipencil==1) then
 80 |           allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
 81 |           call transpose_x_to_y(var,wk2,decomp)
 82 |           call transpose_y_to_z(wk2,wk,decomp)
 83 |           deallocate(wk2)
 84 |        else if (ipencil==2) then
 85 |           call transpose_y_to_z(var,wk,decomp)
 86 |        else if (ipencil==3) then
 87 |           wk = var
 88 |        end if
 89 |        allocate(wk2d(decomp%zsz(1),decomp%zsz(2),1))
 90 |        do j=1,decomp%zsz(2)
 91 |           do i=1,decomp%zsz(1) 
 92 |              wk2d(i,j,1)=wk(i,j,n)
 93 |           end do
 94 |        end do
 95 |        sizes(1) = decomp%xsz(1)
 96 |        sizes(2) = decomp%ysz(2)
 97 |        sizes(3) = 1
 98 |        subsizes(1) = decomp%zsz(1)
 99 |        subsizes(2) = decomp%zsz(2)
100 |        subsizes(3) = 1
101 |        starts(1) = decomp%zst(1)-1
102 |        starts(2) = decomp%zst(2)-1
103 |        starts(3) = 0
104 |     end if
105 | 
106 |     call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
107 |          MPI_ORDER_FORTRAN, data_type, newtype, ierror)
108 |     call MPI_TYPE_COMMIT(newtype,ierror)
109 |     call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
110 |          MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
111 |          fh, ierror)
112 |     filesize = 0_MPI_OFFSET_KIND
113 |     call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
114 |     disp = 0_MPI_OFFSET_KIND
115 |     call MPI_FILE_SET_VIEW(fh,disp,data_type, &
116 |          newtype,'native',MPI_INFO_NULL,ierror)
117 |     call MPI_FILE_WRITE_ALL(fh, wk2d, &
118 |          subsizes(1)*subsizes(2)*subsizes(3), &
119 |          data_type, MPI_STATUS_IGNORE, ierror)
120 |     call MPI_FILE_CLOSE(fh,ierror)
121 |     call MPI_TYPE_FREE(newtype,ierror)
122 |     
123 |     deallocate(wk,wk2d)
124 | 


--------------------------------------------------------------------------------
/src/io_write_var.f90:
--------------------------------------------------------------------------------
 1 | !=======================================================================
 2 | ! This is part of the 2DECOMP&FFT library
 3 | ! 
 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
 5 | ! decomposition. It also implements a highly scalable distributed
 6 | ! three-dimensional Fast Fourier Transform (FFT).
 7 | !
 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
 9 | !
10 | !=======================================================================
11 | 
12 | ! This file contain common code to be included by subroutines 
13 | ! 'write_var_...' in io.f90
14 | 
15 |   ! Using MPI-IO to write a distributed 3D variable to a file. File 
16 |   ! operations (open/close) need to be done in calling application. This
17 |   ! allows multiple variables to be written to a single file. Together 
18 |   ! with the corresponding read operation, this is the perfect solution
19 |   ! for applications to perform restart/checkpointing.
20 | 
21 |     if (present(opt_decomp)) then
22 |        decomp = opt_decomp
23 |     else
24 |        call get_decomp_info(decomp)
25 |     end if
26 | 
27 |     ! Create file type and set file view
28 |     sizes(1) = decomp%xsz(1)
29 |     sizes(2) = decomp%ysz(2)
30 |     sizes(3) = decomp%zsz(3)
31 |     if (ipencil == 1) then
32 |        subsizes(1) = decomp%xsz(1)
33 |        subsizes(2) = decomp%xsz(2)
34 |        subsizes(3) = decomp%xsz(3)
35 |        starts(1) = decomp%xst(1)-1  ! 0-based index
36 |        starts(2) = decomp%xst(2)-1
37 |        starts(3) = decomp%xst(3)-1
38 |     else if (ipencil == 2) then
39 |        subsizes(1) = decomp%ysz(1)
40 |        subsizes(2) = decomp%ysz(2)
41 |        subsizes(3) = decomp%ysz(3)
42 |        starts(1) = decomp%yst(1)-1
43 |        starts(2) = decomp%yst(2)-1
44 |        starts(3) = decomp%yst(3)-1
45 |     else if (ipencil == 3) then
46 |        subsizes(1) = decomp%zsz(1)
47 |        subsizes(2) = decomp%zsz(2)
48 |        subsizes(3) = decomp%zsz(3)
49 |        starts(1) = decomp%zst(1)-1
50 |        starts(2) = decomp%zst(2)-1
51 |        starts(3) = decomp%zst(3)-1
52 |     endif
53 |     
54 |     call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
55 |          MPI_ORDER_FORTRAN, data_type, newtype, ierror)
56 |     call MPI_TYPE_COMMIT(newtype,ierror)
57 |     call MPI_FILE_SET_VIEW(fh,disp,data_type, &
58 |          newtype,'native',MPI_INFO_NULL,ierror)
59 |     call MPI_FILE_WRITE_ALL(fh, var, &
60 |          subsizes(1)*subsizes(2)*subsizes(3), &
61 |          data_type, MPI_STATUS_IGNORE, ierror)
62 |     call MPI_TYPE_FREE(newtype,ierror)
63 | 
64 |     ! update displacement for the next write operation
65 |     disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
66 |     if (data_type == complex_type) then
67 |        disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
68 |     end if
69 | 


--------------------------------------------------------------------------------