├── .gitignore ├── Makefile ├── README.md ├── doc ├── 1d_decomp.md ├── api_decomposition.md ├── api_fft.md ├── api_halo.md ├── api_io.md ├── api_nonblocking.md ├── decomposition.md ├── dstar.md ├── hector.md ├── images │ ├── 1d_decomp.png │ ├── 2d_decomp.png │ ├── Brachos.png │ ├── compact.png │ ├── decomp-17-13-11-p_col-1.png │ ├── decomp-17-13-11-p_row-1.png │ ├── dstar-flame.png │ ├── fft_bgp.png │ ├── fft_hector_2a.png │ ├── fractal-grids.png │ ├── incompact3d-strong.png │ ├── incompact3d-weak.png │ ├── io_model-1.png │ ├── io_model-2.png │ ├── p3dfft_hector_phase1.png │ ├── shm1.png │ ├── shm2.png │ ├── vort-fractal.png │ └── yes.png ├── incompact3d.md ├── jugene.md ├── overview.md ├── p3dfft.md ├── papers │ └── 09C-Anton-Paper.pdf ├── samples.md ├── shared_memory.md └── vortex.md ├── examples ├── Makefile ├── README.md ├── fft_test_c2c │ ├── .gitignore │ ├── Makefile │ ├── README │ ├── c06fxfe.r │ └── fft_test_c2c.f90 ├── fft_test_r2c │ ├── .gitignore │ ├── Makefile │ ├── README │ └── fft_test_r2c.f90 ├── halo_test │ ├── .gitignore │ ├── Makefile │ ├── README │ └── halo_test.f90 ├── io_test │ ├── .gitignore │ ├── Makefile │ ├── README │ ├── io_bench.f90 │ ├── io_plane_test.f90 │ ├── io_read.f90 │ ├── io_test.f90 │ ├── io_var_test.f90 │ └── run_test.sh ├── non_blocking │ ├── Makefile │ ├── README.md │ ├── blocking.f90 │ └── non_blocking.f90 ├── p3dfft │ ├── Makefile │ ├── README.md │ └── p3dfft.f90 ├── tecplot_view │ ├── 2decomp_decomp.png │ ├── Makefile │ ├── README │ └── tecplot_view.f90 ├── test2d │ ├── .gitignore │ ├── Makefile │ ├── README │ └── test2d.f90 └── timing │ ├── .gitignore │ ├── Makefile │ ├── README │ └── timing.f90 ├── include └── .gitignore ├── lib └── Makefile └── src ├── Makefile ├── Makefile.inc ├── Makefile.inc.BlueGene ├── Makefile.inc.Cray_XE ├── Makefile.inc.Fujitsu_SPARC64_VIIIfx ├── acml_plan.f90 ├── alloc.f90 ├── alloc_shm.c ├── cuda_fft_1m.cu ├── decomp_2d.f90 ├── factor.f90 ├── fft_acml.f90 ├── fft_common.f90 ├── 
fft_common_3d.f90 ├── fft_cufft.f90 ├── fft_essl.f90 ├── fft_ffte.f90 ├── fft_fftpack5.f90 ├── fft_fftw3.f90 ├── fft_fftw3_f03.f90 ├── fft_generic.f90 ├── fft_mkl.f90 ├── glassman.f90 ├── halo.f90 ├── halo_common.f90 ├── io.f90 ├── io_read_one.f90 ├── io_read_var.f90 ├── io_write_every.f90 ├── io_write_one.f90 ├── io_write_plane.f90 ├── io_write_var.f90 ├── transpose_x_to_y.f90 ├── transpose_y_to_x.f90 ├── transpose_y_to_z.f90 └── transpose_z_to_y.f90 /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2DECOMP_DIR=$(CURDIR) 2 | 3 | .PHONY: lib examples clean install_dir 4 | 5 | all: lib basic_test 6 | 7 | lib: 8 | cd lib; $(MAKE) $@ 9 | 10 | examples: 11 | cd $@ ; $(MAKE) $@ 12 | 13 | basic_test: examples 14 | @echo "Basic Test target is examples" 15 | 16 | clean: 17 | cd src; $(MAKE) $@ 18 | cd lib; $(MAKE) $@ 19 | cd include; rm -f *.mod 20 | cd examples; $(MAKE) $@ 21 | 22 | install_dir: 23 | mkdir -p $(DESTDIR)$(prefix) 24 | mkdir -p $(DESTDIR)$(prefix)/include 25 | mkdir -p $(DESTDIR)$(prefix)/lib 26 | mkdir -p $(DESTDIR)$(prefix)/doc 27 | 28 | install: all install_dir 29 | cp $(2DECOMP_DIR)/include/*.mod $(DESTDIR)$(prefix)/include 30 | cp $(2DECOMP_DIR)/lib/lib*.a $(DESTDIR)$(prefix)/lib 31 | cp $(2DECOMP_DIR)/README $(DESTDIR)$(prefix)/README_2DECOMP 32 | cp $(2DECOMP_DIR)/doc/* $(DESTDIR)$(prefix)/doc 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2DECOMP&FFT 2 | 3 | 2DECOMP&FFT is a library for 2D pencil decomposition and highly scalable distributed 3D Fast Fourier Transforms 4 | 5 | #### Table of Contents 6 | 7 | - [Overview](doc/overview.md) 8 | - Software 9 | - 
[Download](doc/download.md) 10 | - [Installation](doc/installation.md) 11 | - [Domain decomposition strategies](doc/decomposition.md) 12 | - [Fast Fourier Transform (FFT) review](doc/fft.md) 13 | - APIs 14 | - [2D pencil decomposition APIs](doc/api_decomposition.md) 15 | - [FFT APIs](doc/api_fft.md) 16 | - [Halo cell support](doc/api_halo.md) 17 | - [Parallel I/O](doc/api_io.md) 18 | - [Non-blocking communication](doc/api_nonblocking.md) 19 | - Performance benchmarks 20 | - [2DECOMP&FFT vs. P3DFFT](doc/p3dfft.md) 21 | - [HECToR](doc/hector.md) 22 | - [JUGENE](doc/jugene.md) 23 | - Applications and case studies 24 | - [Sample applications](doc/samples.md) 25 | - [Case study - Vortex generation using FFT](doc/vortex.md) 26 | - [Incompact3D - a CFD application for turbulence research](doc/incompact3d.md) 27 | - [DSTAR - a CFD application for studies of turbulence, aeroacoustics, combustion and multiphase flow](doc/dstar.md) 28 | - Miscellaneous technical subjects 29 | - [Interactive decomposition map](https://monet.nag.co.uk/2decomp/decomp_map.php) 30 | - [Using the 1D slab decompostion mode](doc/1d_decomp.md) 31 | - [Shared-memory optimisation](doc/shared_memory.md) 32 | - [Process grid](doc/pgrid.md) 33 | - [Padded all-to-all optimisation](doc/padded_alltoall.md) 34 | - [Precision guidelines](doc/precision.md) 35 | - [Memory comsumption](doc/memory.md) 36 | 37 | #### Software License 38 | 39 | Copyright © 2011-2021, The Numerical Algorithms Group (NAG) 40 | All rights reserved. 41 | 42 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 43 | 44 | - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
45 | - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 46 | - Neither the name of the copyright owner nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 47 | 48 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /doc/1d_decomp.md: -------------------------------------------------------------------------------- 1 | ## Using the 1D Decomposition Mode 2 | 3 | While 2DECOMP&FFT implements a general-purpose 2D pencil decomposition library, 1D slab decomposition remains an attractive option for certain applications. 4 | 5 | - For small-to-medium size applications, which are unlikely to hit the constraint imposed by the decomposition strategy, having simpler and more efficient implementations is often preferable. 6 | - There are applications with algorithms that can not be easily split to multiple 1D operations (for example a Poisson solver using 1D FFT in a homogeneous direction and a 2D multigrid solver in the two remaining directions). 
7 | - For applications with multiple levels of parallelism, it may be more convenient to use 1D decomposition for the coarse-grain level data distribution. Then on each slab fine-grain parallelism can be applied (e.g. using OpenMP on shared-memory node). 8 | 9 |

10 |
11 | Figure 1: 2D domain decomposition example using a Prow*Pcol=4*3 processor grid: (a) X-pencil; (b) Y-pencil; (c) Z-pencil. 12 |

13 | 14 | Fig.1 shows an arbitrary 3D domain partitioned using a 2D processor grid of Prow=4 by Pcol=3. Clearly 1D decomposition is just a special case of 2D decomposition with either Prow=1 or Pcol=1. In both cases, the communication algorithms can be simplified significantly. 15 | 16 |

17 | 18 |
19 | Figure 2: 1D slab decomposition of the same domain as in Figure 1
Left: Prow=1; Right: Pcol=1.
20 |

21 | 22 | If Prow=1, state (a) and (b) are identical, as shown in Fig.2 (left); similarly, for Pcol=1, state (b) and (c) are identical, shown in Fig.2 (right). So the 1D decomposition can be defined as either slabs in Y and Z or slabs in X and Y. The former is often preferred as better cache efficiency may be achieved by always keeping the X direction in local memory. 23 | 24 | When using the 2DECOMP&FFT library with 1D decomposition, half of the global transpositions can be dropped, resulting in more efficient code. This optimisation was introduced in version 1.1 of 2DECOMP&FFT. 25 | 26 | Finally, note that one can also rely on this arrangement to perform large distributed 2D simulations. For example one option is to define the 2D data sets in an X-Y plane by setting nz=1 and Pcol=1 (arrays are still to be declared as 3D to satisfy the programming interface of the library). -------------------------------------------------------------------------------- /doc/api_fft.md: -------------------------------------------------------------------------------- 1 | ## API for Parallel Three-dimensional FFTs 2 | 3 | #### Initialisation 4 | 5 | To use the FFT programming interface, first of all, one additional Fortran module has to be used: 6 | ``` 7 | use decomp_2d_fft 8 | ``` 9 | 10 | The FFT interface is built on top of the 2D decomposition library which, naturally, needs to be initialised first: 11 | ``` 12 | call decomp_2d_init(nx, ny, nz, P_row, P_col) 13 | ``` 14 | where *nx\*ny\*nz* is the 3D domain size and *P_row \* P_col* is the 2D processor grid. 15 | 16 | Next one needs to initialise the FFT interface by: 17 | ``` 18 | call decomp_2d_fft_init 19 | ``` 20 | 21 | The initialisation routine handles planing for the underlying FFT engine (if supported) and defines global data structures (such as temporary work spaces) for the computations. By default, it assumes that physical-space data is distributed in X-pencil format. 
The corresponding spectral-space data is stored in transposed Z-pencil format after the FFT. To give applications more flexibility, the library also supports the opposite direction, if an optional parameter is passed to the initialisation routine: 22 | ``` 23 | call decomp_2d_fft_init(PHYSICAL_IN_Z) 24 | ``` 25 | 26 | Physical-space data in Y-pencil is not an option as it would require additional expensive transpositions which does not make economical sense. There is a third and the most flexible form of the initialisation routine: 27 | ``` 28 | call decomp_2d_fft_init(pencil, n1, n2, n3) 29 | ``` 30 | It allows applications to initialise FFT computations using an arbitrary problem size *n1\*n2\*n3*, which can be different from the main domain size *nx\*ny\*nz*. 31 | 32 | #### Complex-to-complex Transforms 33 | 34 | The library supports three-dimensional FFTs whose data is distributed as 2D pencils and stored in ordinary ijk-ordered 3D arrays across processors. For complex-to-complex (c2c) FFTs, the user interface is: 35 | ``` 36 | call decomp_2d_fft_3d(in, out, direction) 37 | ``` 38 | where direction can be either `DECOMP_2D_FFT_FORWARD` (-1) for forward transforms, or `DECOMP_2D_FFT_BACKWARD` (1) for backward transforms. The input array `in` and output array `out` are both complex and have to be either a X-pencil/Z-pencil combination or vice versa, depending on the direction of FFT and how the FFT interface is initialised earlier (`PHYSICAL_IN_X`, the optional default, or `PHYSICAL_IN_Z`). 39 | 40 | #### Real-to-complex & Complex-to-Real Transforms 41 | 42 | While the c2c interface is already in the simplest possible form, for r2c and c2r transforms, the 3D FFT interface can be used in a more compact form: 43 | ``` 44 | call decomp_2d_fft_3d(in, out) 45 | ``` 46 | Here if `in` is a real array and `out` a complex array, then a forward FFT is implied. Similarly a backward FFT is computed if `in` is a complex array and `out` a real array. 
47 | 48 | When real input is involved, the corresponding complex output satisfies so-called ***Hermitian redundancy*** - i.e. some output values are complex conjugates of others. Taking advantage of this, FFT algorithms can normally compute r2c and c2r transforms twice as fast as c2c transforms while only using about half of the memory. Unfortunately, the price to pay is that application's data structures have to become slightly more complex. For a 3D real input data set of size nx*ny*nz, the complex output can be held in an array of size *(nx/2+1)\*ny\*nz*, with the first dimension being cut roughly in half1. Applications can either rely on the advanced interface described in the [decomposition API](api_decomposition.md), or use the following utility routine to distribute the complex output as 2D pencils: 49 | ``` 50 | call decomp_2d_fft_get_size(start,end,size) 51 | ``` 52 | 53 | Here all three arguments are 1D array of three elements, returning to the caller the starting index, ending index and size of the sub-domain held by the current processor - information very similar to the *start/end/size* variables defined in the main decomposition library. 54 | 55 | Note that the complex output arrays obtained from X-pencil and Z-pencil input do not contain identical information (see the output of the fft_test_r2c [sample application](samples.md)). However, if 'Hermitian redundancy' is taken into account, no physical information is lost and the real input can be fully recovered through the corresponding inverse FFT from either complex array. 56 | 57 | Also note that 2DECOMP&FFT does not scale the transforms. So a forward transform followed by a backward transform will not recover the input unless applications normalise the results by the sizes of the transforms. 
58 | 59 | #### Finalisation 60 | 61 | Finally, to release the memory used by the FFT interface: 62 | ``` 63 | call decomp_2d_fft_finalize 64 | ``` 65 | 66 | It is possible to re-initialise the FFT interface in the same application at a later stage after it has been finalised, if this becomes necessary. 67 | 68 | To obtain first-hand experience on the FFT interface, users are advised to examine the [sample applications](samples.md) distributed with the library. 69 | 70 |
71 | 72 | 1The storage is for Fortran. In C/C++, the last dimension has to be cut in half due to different memory pattern. For Z-pencil input, the complex output is of size *nx\*ny\*(nz/2+1)* instead. Also note that the integer division is rounded down. -------------------------------------------------------------------------------- /doc/api_halo.md: -------------------------------------------------------------------------------- 1 | ## API for Halo-cell Support 2 | 3 | While most of the communications using the 2D decomposition are via the global transposition calls, it may become necessary for neighbouring blocks to exchange data explicitly. One such scenario is in CFD applications performing large-eddy simulations (LES). While most spatial derivatives are computed using the implicit formulation to achieve a high order of accuracy, some derivatives may be evaluated quickly using local stencils and explicit formulae, such as those used by sub-grid scale models (a model by definition does not require higher-order of accuracy). 4 | 5 | The halo-cell support API provides data structures and nearest-neighbour communication routines that support explicit message passing between neighbouring pencils. As with the rest of the 2DECOMP&FFT library, the API is designed to be very user-friendly: 6 | 7 | ``` 8 | call update_halo(var, var_halo, level) 9 | ``` 10 | Here the first parameter `var`, a 3D input array, contains the normal pencil-distributed data as defined by the decomposition library. After invoking the routine, the second parameter `var_halo`, an output, returns all original data plus halo data from the neighbouring processes. One can imagine that pencils are now fatter and overlap with the neighbouring pencils. The third parameter `level` defines how many layers of overlapping is required. `var_halo` should be defined from the calling routine as either a 3D allocatable array or pointer. Its memory space will be calculated and allocated by the library. 
When the routine returns, `var_halo` can be referenced by the calling program using the normal *i,j,k* indices. 11 | 12 | As with the rest of the 2DECOMP&FFT library, a more general form of the routine is available (implemented using Fortran optional arguments): 13 | ``` 14 | call update_halo(var, var_halo, level, opt_decomp, opt_global) 15 | ``` 16 | This supports halo-cell communications among pencils with arbitrary global sizes, as described by `opt_decomp`, the decomposition object. The last optional parameter `opt_global` is required (to be set to `.true.`) if global coordinate is used to define the pencils, i.e. the input array `var` is defined using the *start/end* variables rather than the *size* variables. This ensures the coordinate systems used by `var` and `var_halo` are consistent. 17 | 18 | To demonstrate the use of this API, here is an example that computes spatial derivatives: 19 | 20 | ``` 21 | ! to calculate dv/dy, assume that variables are stored in X-pencil 22 | 23 | real, allocatable, dimension(:,:,:) :: v, v_halo, dvdy 24 | 25 | allocate(v(xsize(1), xsize(2), xsize(3))) 26 | allocate(dvdy(xsize(1), xsize(2), xsize(3))) 27 | 28 | call update_halo(v,v_halo,level=1) 29 | 30 | ! compute derivatives 31 | do k=1,xsize(3) 32 | do j=1,xsize(2) 33 | do i=1,xsize(1) 34 | dvdy(i,j,k) = (v_halo(i,j+1,k)-v_halo(i,j-1,k)) / dy 35 | end do 36 | end do 37 | end do 38 | ``` 39 | 40 | As seen, the variables are stored in X-pencil and derivatives are to be evaluated over distributed data along Y direction using a central finite difference scheme. This is the perfect situation to use the halo-cell support API. Using global transpositions would be unnecessarily too expensive for this type of local/explicit calculations. After the call to `update_halo`, it is safe to refer to the *j+1* and *j-1* indices on array `v_halo` in order to compute the derivatives. 
41 | 42 | Note that for the pencils bordering the computational domain, it is up to the application to handle the physical boundary conditions. The library does support periodic conditions, i.e. for processes near the boundary of the computational domain, a call to the update_halo routine will fill the halo cells of one side with values from the other side of the domain, when periodic conditions are required. To specify periodic conditions, one needs to initialise the decomposition library with additional information: 43 | ``` 44 | call decomp_2d_init(nx, ny, nz, P_row, P_col, periodic_bc) 45 | ``` 46 | The extra parameter `periodic_bc` is a 1D array containing 3 logical values that specify which dimensions should be periodic. This parameter is optional and is only used with the halo-cell API. The domain decomposition should otherwise behave exactly as normal. 47 | 48 | Like the rest of 2DECOMP&FFT, the halo-cell support API is implemented in a black-box fashion. The library internally handles the communications between neighbouring blocks using the standard MPI non-blocking point-to-point communications. -------------------------------------------------------------------------------- /doc/api_nonblocking.md: -------------------------------------------------------------------------------- 1 | ## Non-blocking API for Overlap of Communication and Computation 2 | 3 | Transpose-based parallelisation is inherently communication intensive. For large-scale applications, it is not unusual that communication accounts for more than half of the total cost. Application performance may be significantly improved if algorithms can be redesigned to allow overlap of communication and computation. From version 1.4, 2DECOMP&FFT provides a low-level communication API to facilitate such effort. 4 | 5 | The API is based on ideas of non-blocking MPI collectives (such as MPI_IALLTOALL and MPI_IALLTOALLV) introduced in MPI version 3. 
6 | 7 | [Old users of 2DECOMP&FFT may recall the use of the third-party library libNBC, which implemented the non-blocking MPI collectives using existing MPI 1 functions, to support such features. Using third-party libraries is no longer necessary.] 8 | 9 | ### The API 10 | 11 | Each of the four transposition routines in the base [decomposition library](api_decomposition.md) contains three key elements: an algorithm to pack the MPI send buffers, the MPI_ALLTOALL(V) communication, and an algorithm to unpack the MPI receive buffers. When the non-blocking version of the MPI_ALLTOALL(V) is used, these routines are broken into smaller routines. For example, when transposing from X pencils to Y pencils, the blocking version of the communication routine is: 12 | ``` 13 | call transpose_x_to_y(in, out, decomp) 14 | ``` 15 | The corresponding non-blocking routines are: 16 | ``` 17 | call transpose_x_to_y_start(handle, in, out, sbuf, rbuf, decomp) 18 | call transpose_x_to_y_wait(handle, in, out, sbuf, rbuf, decomp) 19 | ``` 20 | 
21 | 22 | There are similar *start/wait* routines defined to all other transposition routines. 23 | 24 | These routines are useful on systems with dedicated networking hardware to process the communication stack. On systems without such hardware, one has to call `MPI_TEST` explicitly from the user thread to progress the non-blocking communication. A utility routine is provided for this purpose: 25 | ``` 26 | call transpose_test(handle) 27 | ``` 28 | This needs to be called from time to time from the computational part of application, in order to progress the communication identified by `handle`. Of course, the practical difficulty is where and how frequently this should be called, a matter that is entirely application dependent. 29 | 30 | Currently, the author is not aware of any stable and high-quality software implementation that progresses all-to-all type of communication asynchronously2. 31 | 32 | #### A Sample Application 33 | 34 | To demonstrate the use of this API, a sample application (non_blocking) is provided to compute multiple independent FFTs, using both the blocking and non-blocking versions of the communication library. The idea of overlapping the communication of one 3D FFT and the computation of another, as described by Kandalla et al.[1], is implemented. The algorithm's pseudo-code looks like: 35 | ``` 36 | 1D FFT in X for V_1 37 | call transpose_x_to_y for V_1 (blocking) 38 | 1D FFT in Y for V_1 39 | call transpose_y_z_start for V_1 40 | do k=2,N 41 | 1D FFT in X for V_k 42 | call transpose_x_to_y for V_k (blocking) 43 | 1D FFT in Y for V_k 44 | call transpose_y_to_z_start for V_k 45 | call transpose_y_to_z_wait for V_(k-1) 46 | 1D FFT in Z for V_(k-1) 47 | end do 48 | call transpose_y_to_z_wait for V_N to complete 49 | 1D FFT in Z for V_N 50 | ``` 51 | 52 | This algorithm compute multiple independent 3D FFTs on dataset *Vk (k=1,N)*. 
As can be seen, the Y=>Z transpose for dataset *k* and the computation of 1D FFT in Z for dataset *k-1* are overlapped. Note that in the sample application the computations are done using loops of 1D FFTs, rather than with FFTW's advanced interface that allows multiple 1D FFTs to be done in one go. This design is to allow `MPI_TEST` calls to be inserted to progress the communication. 53 | 54 | It is up to the application developers to identify opportunities in their algorithms that may benefit from this non-blocking API. 55 | 56 | #### References 57 | 58 | [1] K. Kandalla, H. Subramoni, K. Tomko, D. Pekurovsky, S. Sur and D.K. Panda, "High-performance and scalable non-blocking all-to-all with collective offload on InfiniBand clusters: a study with parallel 3D FFT", *Computer Science - Research and Development*, vol. 26(3-4):237-246, 2011. 59 | 60 | 61 | --- 62 | 63 | 1The blocking version also needs to define send/recv buffers. But because there is only one communication at any time, the buffers are temporarily allocated as required by the library, or for performance reason defined globally and shared by multiple communication calls. 64 | 65 | 2There are *asynchronous progress control* in Intel MPI library. However, the only supported non-blocking collective calls are *Ibcast*, *Ireduce* and *Iallreduce*. -------------------------------------------------------------------------------- /doc/decomposition.md: -------------------------------------------------------------------------------- 1 | ## Domain Decomposition Strategies 2 | 3 | The discussions here apply to many applications based on three-dimensional Cartesian meshes (or to be exact, having a Cartesian topology), and in particular those using spatially implicit numerical schemes. 
For example, a compact finite difference scheme often results in solving a tridiagonal linear system when evaluating spatial derivatives or doing spatial interpolations; a spectral code often involves performing a Fast Fourier Transform along a global mesh line. 4 | 5 | There are two approaches to performing such computations on distributed-memory systems. One can either develop distributed algorithms (such as a parallel tridiagonal solver or a parallel FFT algorithm working on distributed data), or one can at runtime redistribute (transpose) data among processors in order to apply serial algorithms in local memory. The second approach is often preferred due to its simplicity: existing serial algorithms (hopefully already optimised for a single CPU) remain unchanged; porting serial code can be straight-forward as much of the original code logic still holds, and the only major addition is the data transposition procedures. 6 | 7 | #### 1D Slab Decomposition 8 | 9 | In early days, many applications implemented the above idea using 1D domain decomposition (also known as slab decomposition). In Fig.1, a 3D domain is arbitrarily chosen to be decomposed in Y and X directions. It can be seen that in state (a), any computations in the X-Z planes can be done in local memories while data along a Y mesh-line is distributed. When it is necessary to calculate along Y mesh-lines (say to evaluate Y-derivatives, or to perform 1D FFTs along Y), one can redistribute the data among processors to reach state (b), in which any computation in Y becomes 'local'. If using standard MPI library, switching between state (a) and (b) can be achieved using the MPI_ALLTOALL(V) routines. 10 | 11 |

12 |
13 | Figure 1. 1D domain decomposition example using 4 processors: (a) decomposed in Y direction; (b) decomposed in X direction. 14 |

15 | 16 | A 1D decomposition, while quite simple, has some limitations, especially for large-scale applications. Given a cubic mesh of size N^3 , one obvious constraint is that the maximum number of processors Nproc that can be used in a 1D decomposition is N as each slab has to contain at least one plane of data. For a cubic mesh with 1 billion points (which is very large but becomes increasingly common in CFD applications, such as those for fundamental turbulence studies), the constraint is Nproc<=1000. This is a serious limitation as most supercomputers today have at least tens of thousands of cores. Large applications are also likely to hit the memory limit when each processor handles too much workload. 17 | 18 | #### 2D Pencil Decomposition 19 | 20 | A 2D pencil decomposition (also known as a 'drawer' or 'block' decomposition) is a natural extension to 1D decompositions. Fig.2 shows that the same 3D domain as in Fig.1 can be partitioned in two dimensions. States (a), (b) and (c) are referred to as X-pencil, Y-pencil and Z-pencil arrangements, respectively. While a 1D decomposition algorithm swaps between two states, in a 2D decomposition one needs to traverse 3 different states using 4 global transpositions ((a) =>(b) => (c) => (b) => (a)). 21 | 22 |

23 |
24 | Figure 2: 2D domain decomposition example using a 4*3 processor grid: (a) X-pencil; (b) Y-pencil; (c) Z-pencil. 25 |

26 | 27 | An interactive view of the 2D pencil decomposition can be found from this [web application](https://monet.nag.co.uk/2decomp/decomp_map.php). 28 | 29 | Again MPI_ALLTOALL(V) can be used to realise the transpositions. However it is significantly more complex than the 1D case. There are two separate communicator groups. For a Prow*Pcol processor grid: Prow groups of Pcol processors need to exchange data among themselves for (a) <=> (b) ; Pcol groups of Prow processors need to exchange data among themselves for (b) <=> (c). For example, the red, green and blue processes in state (b) and (c) occupy exactly the same physical domain. 30 | 31 | On one hand, the proper implementation of the communication routines can be quite tricky. For example the communications are very sensitive to the orientations of pencils and their associated memory patterns. The packing and unpacking of memory buffers for the MPI library calls must be handled with great care for efficiency. These are pure software engineering topics, which are almost certainly irrelevant to the scientific researches conducted by the applications. 32 | 33 | On the other hand, although the idea of 2D decomposition has long been established, its adoption in real applications was not essential until recently, when ordinary researchers can realistically expect to regularly use thousands of cores on major supercomputers, therefore hitting the limitation imposed by 1D decomposition. 34 | 35 | These motivated the author to create the 2DECOMP&FFT library - a general-purpose domain decomposition library that can be reused by many applications - to handle these technical issues properly and to hide most software-engineering details from application developers who can concentrate on their scientific studies. 
36 | 37 | -------------------------------------------------------------------------------- /doc/dstar.md: -------------------------------------------------------------------------------- 1 | ## DSTAR - Direct Simulation of Turbulence And Reaction 2 | 3 | DSTAR is a high-order code for **D**irect **S**imulation of **T**urbulence **A**nd **R**eaction, initially developed by Professor Kai Luo (Southampton University) and extended by co-workers over the past 20 years. It solves the complete Navier-Stokes equations as well as conservation equations for energy and chemical species. Modules for both direct numerical simulation (DNS) and large eddy simulation (LES) have been developed for high-fidelity simulation of turbulence, aeroacoustics, turbulent combustion, multiphase turbulent flow and combustion. DSTAR incorporates highly accurate numerical techniques such as 6th-order spatial discretisation, non-reflecting boundary conditions and low-storage Runge-Kutta explicit time-advancement. 4 | 5 | Parallel algorithms include MPI and mixed MPI/OpenMP. Parallel operations can be performed in 1D or 2D decomposition as supported by the 2DECOMP&FFT library. In this case, only the decomposition API is required. The code solves the fluid problem in compressible form and there is no tricky Poisson problem involved. The code was run successfully using 6144 cores in pure MPI mode and 18432 cores in hybrid mode on HECToR. 6 | 7 | A typical scientific application is shown below: 8 | 9 |

10 |
11 | Large eddy simulation of a turbulent diffusion flame interacting with evaporating water droplets. 12 | 13 |

14 | 15 | The mathematical framework of DSTAR is described in detail in: 16 | 17 | - K. H. Luo, "Combustion effects on turbulence in a partially premixed supersonic diffusion flame", *Combustion and Flame*, vol. 119(4):417-435, 1999. 18 | - J. Xia, K. H. Luo and S. Kumar, "Large-Eddy Simulation of Interactions Between a Reacting Jet and Evaporating Droplets", *Flow Turbulence and Combustion*, vol. 80(1):133-153, 2008. 19 | - J. Xia and K. H. Luo, "Conditional statistics of inert droplet effects on turbulent combustion in reacting mixing layers", *Combustion Theory and Modelling*, vol. 13(5):901-920, 2009. 20 | 21 | Details of the parallelisation were reported at the 2011 Cray User Group conference. 22 | 23 | - L. Anton, N. Li and K. H. Luo, "A study of scalability performance for hybrid mode computation and asynchronous MPI transpose operation in DSTAR", *Cray User Group 2011 conference*, Fairbanks, 2011. [PDF](papers/09C-Anton-Paper.pdf) -------------------------------------------------------------------------------- /doc/images/1d_decomp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/1d_decomp.png -------------------------------------------------------------------------------- /doc/images/2d_decomp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/2d_decomp.png -------------------------------------------------------------------------------- /doc/images/Brachos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/Brachos.png 
-------------------------------------------------------------------------------- /doc/images/compact.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/compact.png -------------------------------------------------------------------------------- /doc/images/decomp-17-13-11-p_col-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/decomp-17-13-11-p_col-1.png -------------------------------------------------------------------------------- /doc/images/decomp-17-13-11-p_row-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/decomp-17-13-11-p_row-1.png -------------------------------------------------------------------------------- /doc/images/dstar-flame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/dstar-flame.png -------------------------------------------------------------------------------- /doc/images/fft_bgp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/fft_bgp.png -------------------------------------------------------------------------------- /doc/images/fft_hector_2a.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/fft_hector_2a.png -------------------------------------------------------------------------------- /doc/images/fractal-grids.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/fractal-grids.png -------------------------------------------------------------------------------- /doc/images/incompact3d-strong.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/incompact3d-strong.png -------------------------------------------------------------------------------- /doc/images/incompact3d-weak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/incompact3d-weak.png -------------------------------------------------------------------------------- /doc/images/io_model-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/io_model-1.png -------------------------------------------------------------------------------- /doc/images/io_model-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/io_model-2.png -------------------------------------------------------------------------------- /doc/images/p3dfft_hector_phase1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/p3dfft_hector_phase1.png -------------------------------------------------------------------------------- /doc/images/shm1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/shm1.png -------------------------------------------------------------------------------- /doc/images/shm2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/shm2.png -------------------------------------------------------------------------------- /doc/images/vort-fractal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/vort-fractal.png -------------------------------------------------------------------------------- /doc/images/yes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/images/yes.png -------------------------------------------------------------------------------- /doc/jugene.md: -------------------------------------------------------------------------------- 1 | ## Benchmarks on JUGENE 2 | 3 | This set of benchmarks was performed in May 2010 on JUGENE, the big IBM Blue Gene/P system at Jülich Supercomputing Centre in Germany. The system ranked world No. 4 at that time, with a Linpack capability of 825.5 TFLOPs. 
4 | 5 | The work was made possible with the assistance of high performance computing resources (Tier-0) provided by PRACE. 2DECOMP&FFT was ported onto the Blue Gene/P. One major improvement achieved was the implementation of the FFT interface using ESSL, a high-performance math library native to IBM systems. The FFT interface was then benchmarked on problem sizes up to 8192^3 using up to 131072 cores. 6 | 7 |

8 |
9 | Scaling of the FFT interface on Blue Gene/P JUGENE. 10 |

11 | 12 | As seen, the code scales extremely well on the system for all problem sizes. The apparent super-linear scaling for the 1024^3 case is understood to be related to the Torus network configurations that favour larger jobs. 13 | -------------------------------------------------------------------------------- /doc/overview.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | ### Introduction 4 | 5 | The 2DECOMP&FFT library is a software framework in Fortran to build large-scale parallel applications. It is designed for applications using three-dimensional structured mesh and spatially implicit numerical algorithms. At the foundation it implements a general-purpose 2D pencil decomposition for data distribution on distributed-memory platforms. On top, it provides a highly scalable and efficient interface to perform three-dimensional distributed FFTs. The library is optimised for supercomputers and scales well to hundreds of thousands of cores. It relies on MPI but provides a user-friendly programming interface that hides communication details from application developers. 6 | 7 | ### Features 8 | 9 | Here is a list of 2DECOMP&FFT's main features: 10 | 11 | * General-purpose 2D pencil decomposition module to support building large-scale parallel applications on distributed memory systems. 12 | * Highly scalable and efficient distributed Fast Fourier Transform module, supporting three dimensional FFTs (both complex-to-complex and real-to-complex/complex-to-real). 13 | * Halo-cell support allowing explicit message passing between neighbouring blocks. 14 | * Parallel I/O module to support the handling of large data sets. 15 | * Shared-memory optimisation on the communication code for multi-core systems. 16 | 17 | 2DECOMP&FFT distinguishes itself from many other popular distributed FFT libraries by exposing its communication APIs upon which many other parallel algorithms can be built. 
18 | 19 | 2DECOMP&FFT is designed to be: 20 | 21 | * **Scalable** - The library and applications built upon it are known to scale to o(10^5) cores on major supercomputers. 22 | * **Flexible** - Software framework to support building higher-level libraries and many types of applications. 23 | * **User-friendly** - Black-box implementation and very clean application programming interface hiding most communication details from applications. 24 | * **Portable** - Code tested on many major supercomputing architectures. The FFT library interfaces with almost every popular external FFT implementations. 25 | 26 | ### History 27 | 28 | This software package was originally derived from several projects funded under the HECToR Distributed Computational Science and Engineering (dCSE) programme operated by NAG Ltd. HECToR - a UK Research Councils' high end computing service - served as the UK's national supercomputer for open science between 2008 and 2014. 29 | 30 | The active development of this library completed in 2012. It has been in production use in many research applications since then. The code quality appears to be very good with almost no major bugs reported over the years. Its performance remains very competitive as reported by a [recent study](https://www.icl.utk.edu/files/publications/2021/icl-utk-1490-2021.pdf). 31 | 32 | Since August 2021, this project is hosted in NAG's official GitHub account to facilitate future development and maintenance. 33 | 34 | ### Citation 35 | 36 | If you wish to cite this work, you are recommended to use the following paper: 37 | 38 | * N. Li and S. Laizet, "2DECOMP&FFT – A highly scalable 2D decomposition library and FFT interface", Cray User Group 2010 conference, Edinburgh, 2010. -------------------------------------------------------------------------------- /doc/p3dfft.md: -------------------------------------------------------------------------------- 1 | ## 2DECOMP&FFT vs. 
P3DFFT 2 | 3 | P3DFFT is probably the most well-known open-source distributed FFT library. The project was initiated at San Diego Supercomputer Center at UCSD by Dmitry Pekurovsky. It is highly efficient and it has been widely adopted by scientists doing large-scale simulations, such as high-resolution turbulence simulations. 4 | 5 | P3DFFT was actually ported onto HECToR (my development system) at the early stage of the 2DECOMP&FFT project. Fig. 1 shows its good scaling on the old hardware (back in early 2009, the system was a Cray XT4 using dual-core AMD Opteron processors and Cray SeaStar interconnect). 6 | 7 |

8 |
9 | Figure 1. P3DFFT scaling on Cray XT4 HECToR. 10 |

11 | 12 | What motivated the author to develop a new and somewhat competing library were the following: 13 | - P3DFFT is an FFT-only package. It is not designed as a general-purpose 2D decomposition library and its communication routines are not designed to be user callable. 2DECOMP&FFT provides a general-purpose decomposition library to support the building of a variety of applications (the applications do not necessarily need to use FFT). 14 | - P3DFFT appears to be targeting applications using spectral method and only performs real-to-complex and complex-to-real transforms. 2DECOMP&FFT is also able to support complex-to-complex transforms. **Note that the new generation of P3DFFT library (dubbed P3DFFT++ or P3DFFT v.3) is a generalization of the concept of P3DFFT and does support complex-to-complex transforms.** 15 | - The separation of communication layer and the FFT layer in 2DECOMP&FFT makes it possible to build additional libraries (such as transforms using Chebyshev or Jacobian basis functions, or a general-purpose PDE solver). It is also easier to implement advanced software features (such as the shared-memory implementation) where only the low-level communication code needs to be updated. 16 | 17 | #### Performance Comparison 18 | 19 | The parallel performance of 2DECOMP&FFT and P3DFFT has been studied in great detail in a [MSc thesis by E. Brachos at University of Edinburgh](https://static.epcc.ed.ac.uk/dissertations/hpc-msc/2010-2011/EvangelosBrachos.pdf). Fig. 2 shows a set of benchmark on r2c/c2r transforms of size 256^3. The MPI interface of FFTW 3.3 was also examined, although it can only run in 1D slab decomposition mode. 20 | 21 |

22 |
23 | Figure 2. Speedup of 2DECOMP&FFT, P3DFFT and FFTW 3.3's MPI interface. 24 |

25 | 26 | The performance difference between 2DECOMP&FFT and P3DFFT is often shown to be marginal, although the best 2D processor grid to achieve the optimal performance can be very different due to the different internal architecture of the two libraries. 27 | 28 | The scalability and the absolute performance of both 2DECOMP&FFT and P3DFFT are better than FFTW 3.3 running in MPI mode. FFTW is, however, much more efficient in OpenMP mode. This suggests that a hybrid implementation may be the future direction of 2DECOMP&FFT. -------------------------------------------------------------------------------- /doc/papers/09C-Anton-Paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/doc/papers/09C-Anton-Paper.pdf -------------------------------------------------------------------------------- /doc/samples.md: -------------------------------------------------------------------------------- 1 | ## Sample Applications 2 | 3 | A list of sample applications are distributed with 2DECOMP&FFT package to validate the library and to demonstrate the proper use of it. 4 | 5 | - **test2d** - This application is to test the base 2D pencil decomposition module. It arranges to transpose data among the three pencil orientations and validate the result against a copy of the global data held on each process. It also demonstrates the use of the parallel I/O library - regardless of how the global data is distributed (X-pencil, Y-pencil or Z-pencil), when processes write to files collectively using the I/O library the resulting files should be identical. 6 | - **fft_test_c2c** - This is a simple application to validate the complex-to-complex FFT programming interface. Its input is taken from the example program of NAG library routine C06FXF (also for c2c FFTs). Its output should match the C06FXF output exactly. 
7 | - **fft_test_r2c** - This is to test the FFT library's real-to-complex and complex-to-real interface. It generates some random input, computes a serial 3D r2c transform on rank 0 to generate reference data. It then computes two sets of transforms on distributed data, with the input distributed in X-pencil and Z-pencil, respectively. In both cases, a r2c transform is computed first and its result on rank 0 printed out (which should contain a subset of numbers found in the reference serial transform output). An inverse c2r transform is then performed to recover the input to machine accuracy (system dependent, but somewhere around 10^-6 for single precision and 10^-15 for double precision). 8 | - **timing** - This application can be used to benchmark the FFT library performance when porting it to a new system. It performs both c2c and r2c/c2r benchmarks, collects timing information and validates the results. 9 | - **halo_test** - This application demonstrates the use of the halo-cell support API. It calculates the divergence of a random field using an explicit 3-stencil finite difference method. The parallel program relies on two different communication methods: (1) the global transposition routines; (2) the halo-cell support API. Both methods should return exactly the same results. Of course the halo-cell method is more efficient for such a stencil-based calculation. 10 | - **io_test** - A collection of sample applications testing the I/O APIs thoroughly. 11 | - **tecplot_view** - This application was used to generate the visualisation of the 2D decomposition, as frequently shown in the documentation. The output is in the format of Tecplot, a popular visualisation tool mainly used by the CFD community. Data from each process is written as a zone. 12 | - **p3dfft** - This application uses 2DECOMP&FFT and P3DFFT side-by-side to perform some FFTs. It was used to validate and benchmark 2DECOMP&FFT against its famous counterpart. 
P3DFFT has to be built separately in order to use this test. 13 | - **non_blocking** - This contains sample applications to compute multi-variable FFTs using both the blocking and non-blocking versions of the communication library. This demonstrates how to use the non_blocking APIs to overlap communication and computation. 14 | 15 | Please consult the README files associated with these sample applications for more detail. -------------------------------------------------------------------------------- /doc/shared_memory.md: -------------------------------------------------------------------------------- 1 | ## Shared-memory Programming using System V IPC 2 | 3 | Most modern supercomputers are equipped with multi-core processors and cores on the same node often share local memory. There are various programming models which can take advantage of this architecture, including the popular hybrid MPI/OpenMP model. In the context of this project, shared-memory programming is used to improve the efficiency of the communication code. 4 | 5 | For all-to-all type of communication in which each MPI rank has to send/receive messages to/from all other MPI ranks, traffic from cores on the same physical node competes for their network interface. Even if the network bandwidth is sufficient, the performance is likely to be affected by network latency when too many small messages are passed within the system. One solution is to create shared send/recv buffers on each SMP node. Then only leaders of the nodes participate in MPI_ALLTOALL(V), resulting in fewer but larger messages, hopefully improving the communication performance. The interconnects of supercomputers are often optimised for handling a small number of large messages. 6 | 7 | This feature has been implemented within the communication library as a black box. It can be activated by users at compile time by using the '-DSHM' flag. 
The shared-memory code uses the System V Inter-Process Communication (IPC) API which is widely supported on many variants of UNIX. 8 | 9 | 2DECOMP&FFT has two independent shared-memory implementations (they validate each other): 10 | 11 | - The first version is based on code supplied by David Tanqueray of Cray Inc., who initially applied this idea to several molecular dynamics applications. This code accesses platform-dependent information1 in order to establish the share-memory configurations (such as which MPI rank belongs to which node). It has been tested on Cray hardware only. 12 | - The second version is based on the open-source package FreeIPC, created by Ian Bush, a former NAG colleague. FreeIPC is basically a Fortran wrapper for the System V IPC API and it provides a system-independent way to gather shared-memory information. This makes it possible to write more portable shared-memory code. 13 | 14 | Fig. 1 below demonstrates the typical benefit of shared-memory programming. The data was collected on HECToR phase 2a system (Cray XT4 with quad-core AMD Opteron processors) from a series of simulations using 256 MPI ranks over a range of problem sizes. When the problem size is small (so is the message size), the communication routines were called more times so that the total amount of data moving within the system remains a constant. It can be seen that when the problem size is smaller, the overhead of setting up communications is much higher and the shared-memory code can improve communication efficiency by up to 30%. As the problem size increases, the benefit of using shared-memory code becomes smaller. For large message size (> 32Kb in this example), the shared-memory code is actually slower due to the extra memory copying operations required to assemble/disassemble the shared-memory buffers. 15 | 16 |

17 |
18 | Figure 1: Typical shared-memory code performance. 19 | 20 |

21 | 22 | The HECToR upgrade to phase 2b (world's first production Cray XE6) presented a unique opportunity to demonstrate the benefit of shared-memory programming in real applications. The 24-core nodes were introduced to HECToR several months before the arrival of new Gemini interconnect. During the transitional period, communication intensive applications often produced more network traffic than the old SeaStar interconnect could handle. Fig.2 shows the benchmark of 2DECOMP&FFT's FFT interface with a 2592^3 problem size2. With the slow SeaStar interconnect, the scaling was poor when using more than few thousands cores. However, switching on the shared-memory code significantly improved the application performance (by as far as 40%) and parallel efficiency of more than 90% was observed through out the scale. The new Gemini interconnect offered significant improvement in terms of both network bandwidth and latency. As a result, significant performance gain was to be expected for communication intensive codes. The FFT benchmark was almost twice as fast in some cases. However, the shared-memory code on Gemini (not shown in the figure) offered absolutely no benefit when the network was fast enough to handle all the messages efficiently. 23 | 24 |

25 |
26 | Figure 2: Parallel FFT performance: SeaStar (with and without shared-memory) vs. Gemini. 27 | 28 |

29 | 30 | --- 31 | 32 | 1On Cray XT/XE systems, this is done by checking the /proc file system of the computing nodes. 33 | 34 | 2The problem size of 2592 was chosen intentionally because it is divisible by 6 multiple times, which helped achieve better load balance on the system using 24-core node (containing two Magny-Cours processors, each with two six-core dies). -------------------------------------------------------------------------------- /examples/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test2d fft_test_c2c fft_test_r2c timing halo_test io_test 2 | 3 | # Just build the examples 4 | examples: test2d fft_test_c2c fft_test_r2c timing halo_test io_test 5 | @echo "Built the examples" 6 | 7 | test2d: 8 | cd test2d; $(MAKE) $@ 9 | fft_test_c2c: 10 | cd fft_test_c2c; $(MAKE) $@ 11 | fft_test_r2c: 12 | cd fft_test_r2c; $(MAKE) $@ 13 | timing: 14 | cd timing; $(MAKE) $@ 15 | halo_test: 16 | cd halo_test; $(MAKE) $@ 17 | io_test: 18 | cd io_test; $(MAKE) $@ 19 | 20 | # test all the examples (individual Makefiles should take care of updating) 21 | basic_test: 22 | cd test2d; $(MAKE) $@ 23 | cd fft_test_c2c; $(MAKE) $@ 24 | cd fft_test_r2c; $(MAKE) $@ 25 | cd timing; $(MAKE) $@ 26 | cd halo_test; $(MAKE) $@ 27 | cd io_test; $(MAKE) $@ 28 | 29 | clean: 30 | cd test2d; $(MAKE) $@ 31 | cd fft_test_c2c; $(MAKE) $@ 32 | cd fft_test_r2c; $(MAKE) $@ 33 | cd timing; $(MAKE) $@ 34 | cd halo_test; $(MAKE) $@ 35 | cd io_test; $(MAKE) $@ 36 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | * test2d - to test the base 2D pencil decomposition module 5 | 6 | * fft_test_c2c - to test the complex-to-complex FFTs 7 | 8 | * fft_test_r2c - to test the real-to-complex/complex-to-real FFTs 9 | 10 | * timing - to benchmark the FFT library 11 | 12 | * halo_test - to 
test the halo-cell exchange code 13 | 14 | * io_test - to test various IO functions 15 | 16 | * p3dfft - to crosscheck the library against P3DFFT 17 | 18 | * non_blocking - to test the idea of overlap communication and computation 19 | 20 | * tecplot_view - to generate Tecplot visualisation of the decomposition 21 | 22 | 23 | Some examples may require external libraries to be built first. Refer to the README files for each example for details. 24 | -------------------------------------------------------------------------------- /examples/fft_test_c2c/.gitignore: -------------------------------------------------------------------------------- 1 | fft_test_c2c 2 | -------------------------------------------------------------------------------- /examples/fft_test_c2c/Makefile: -------------------------------------------------------------------------------- 1 | include ../../src/Makefile.inc 2 | 3 | INCLUDE = -I../../include 4 | LIBS = -L../../lib -l2decomp_fft $(LIBFFT) 5 | 6 | OBJ = fft_test_c2c.o 7 | 8 | fft_test_c2c: $(OBJ) 9 | $(F90) -o $@ $(OBJ) $(LIBS) 10 | 11 | clean: 12 | rm -f *.o fft_test_c2c 13 | 14 | %.o : %.f90 15 | $(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $< 16 | -------------------------------------------------------------------------------- /examples/fft_test_c2c/README: -------------------------------------------------------------------------------- 1 | fft_test_c2c 2 | ------------ 3 | 4 | This example demonstrates the use of the FFT c2c interface. The test uses the 5 | input of NAG routine 'c06fxf' (also for c2c transform) and attempts to 6 | reproduce the output. 7 | 8 | To run: use 4 MPI processes. 9 | 10 | What to expect: the output should match what is in 'c06fxfe.r'. 
11 | -------------------------------------------------------------------------------- /examples/fft_test_c2c/c06fxfe.r: -------------------------------------------------------------------------------- 1 | C06FXF Example Program Results 2 | 3 | Original data values 4 | 5 | z(i,j,k) for i = 1 6 | 7 | Real 1.000 0.999 0.987 0.936 8 | Imag 0.000 -0.040 -0.159 -0.352 9 | 10 | Real 0.994 0.989 0.963 0.891 11 | Imag -0.111 -0.151 -0.268 -0.454 12 | 13 | Real 0.903 0.885 0.823 0.694 14 | Imag -0.430 -0.466 -0.568 -0.720 15 | 16 | z(i,j,k) for i = 2 17 | 18 | Real 0.500 0.499 0.487 0.436 19 | Imag 0.500 0.040 0.159 0.352 20 | 21 | Real 0.494 0.489 0.463 0.391 22 | Imag 0.111 0.151 0.268 0.454 23 | 24 | Real 0.403 0.385 0.323 0.194 25 | Imag 0.430 0.466 0.568 0.720 26 | 27 | Components of discrete Fourier transform 28 | 29 | z(i,j,k) for i = 1 30 | 31 | Real 3.292 0.051 0.113 0.051 32 | Imag 0.102 -0.042 0.102 0.246 33 | 34 | Real 0.143 0.016 -0.024 -0.050 35 | Imag -0.086 0.153 0.127 0.086 36 | 37 | Real 0.143 -0.050 -0.024 0.016 38 | Imag 0.290 0.118 0.077 0.051 39 | 40 | z(i,j,k) for i = 2 41 | 42 | Real 1.225 0.355 0.000 -0.355 43 | Imag -1.620 0.083 0.162 0.083 44 | 45 | Real 0.424 0.020 0.013 -0.007 46 | Imag 0.320 -0.115 -0.091 -0.080 47 | 48 | Real -0.424 0.007 -0.013 -0.020 49 | Imag 0.320 -0.080 -0.091 -0.115 50 | 51 | Original sequence as restored by inverse transform 52 | 53 | z(i,j,k) for i = 1 54 | 55 | Real 1.000 0.999 0.987 0.936 56 | Imag 0.000 -0.040 -0.159 -0.352 57 | 58 | Real 0.994 0.989 0.963 0.891 59 | Imag -0.111 -0.151 -0.268 -0.454 60 | 61 | Real 0.903 0.885 0.823 0.694 62 | Imag -0.430 -0.466 -0.568 -0.720 63 | 64 | z(i,j,k) for i = 2 65 | 66 | Real 0.500 0.499 0.487 0.436 67 | Imag 0.500 0.040 0.159 0.352 68 | 69 | Real 0.494 0.489 0.463 0.391 70 | Imag 0.111 0.151 0.268 0.454 71 | 72 | Real 0.403 0.385 0.323 0.194 73 | Imag 0.430 0.466 0.568 0.720 74 | -------------------------------------------------------------------------------- 
/examples/fft_test_c2c/fft_test_c2c.f90: -------------------------------------------------------------------------------- 1 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 2 | ! Main test program for the FFT interface 3 | ! - use input data from a NAG FFT library for validation 4 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 5 | 6 | program fft_test_c2c 7 | 8 | use decomp_2d 9 | use decomp_2d_fft 10 | 11 | implicit none 12 | 13 | integer, parameter :: nx=2, ny=3, nz=4 14 | integer, parameter :: p_row=2, p_col=2 15 | 16 | complex(mytype), allocatable, dimension(:,:,:) :: in, out 17 | 18 | complex(mytype), dimension(nx,ny,nz) :: in1, out1 19 | integer :: ierror, i,j,k 20 | 21 | interface 22 | subroutine assemble_global(ndir,local,global,nx,ny,nz) 23 | use decomp_2d 24 | integer, intent(IN) :: ndir 25 | integer, intent(IN) :: nx,ny,nz 26 | complex(mytype), dimension(:,:,:), intent(IN) :: local 27 | complex(mytype), dimension(nx,ny,nz), intent(OUT) :: global 28 | end subroutine assemble_global 29 | end interface 30 | 31 | call MPI_INIT(ierror) 32 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 33 | call decomp_2d_fft_init 34 | 35 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 36 | ! (1) Testing the complex-to-complex interface (c2c) 37 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 38 | 39 | ! input is X-pencil data 40 | ! output is Z-pencil data 41 | allocate (in(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3))) 42 | allocate (out(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3))) 43 | 44 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 45 | ! Following is the testing input for NAG library C06FXF 46 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
47 | in1(1,1,1) = (1.000, 0.000) 48 | in1(1,1,2) = (0.999, -0.040) 49 | in1(1,1,3) = (0.987, -0.159) 50 | in1(1,1,4) = (0.936, -0.352) 51 | in1(1,2,1) = (0.994, -0.111) 52 | in1(1,2,2) = (0.989, -0.151) 53 | in1(1,2,3) = (0.963, -0.268) 54 | in1(1,2,4) = (0.891, -0.454) 55 | in1(1,3,1) = (0.903, -0.430) 56 | in1(1,3,2) = (0.885, -0.466) 57 | in1(1,3,3) = (0.823, -0.568) 58 | in1(1,3,4) = (0.694, -0.720) 59 | in1(2,1,1) = (0.500, 0.500) 60 | in1(2,1,2) = (0.499, 0.040) 61 | in1(2,1,3) = (0.487, 0.159) 62 | in1(2,1,4) = (0.436, 0.352) 63 | in1(2,2,1) = (0.494, 0.111) 64 | in1(2,2,2) = (0.489, 0.151) 65 | in1(2,2,3) = (0.463, 0.268) 66 | in1(2,2,4) = (0.391, 0.454) 67 | in1(2,3,1) = (0.403, 0.430) 68 | in1(2,3,2) = (0.385, 0.466) 69 | in1(2,3,3) = (0.323, 0.568) 70 | in1(2,3,4) = (0.194, 0.720) 71 | 72 | ! each processor gets its local portion of global data 73 | do k=xstart(3),xend(3) 74 | do j=xstart(2),xend(2) 75 | do i=xstart(1),xend(1) 76 | in(i,j,k) = in1(i,j,k) 77 | end do 78 | end do 79 | end do 80 | 81 | ! write out input, to match the format of NAG example result file 82 | if (nrank==0) then 83 | write(*,*) 'C06FXF Example Program Results' 84 | write(*,*) '' 85 | write(*,*) 'Original data values' 86 | write(*,*) '' 87 | call print_global(in1,nx,ny,nz) 88 | end if 89 | 90 | ! ===== 3D forward FFT ===== 91 | call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD) 92 | 93 | ! normalisation - note FFTW doesn't normalise 94 | do k=zstart(3),zend(3) 95 | do j=zstart(2),zend(2) 96 | do i=zstart(1),zend(1) 97 | out(i,j,k) = out(i,j,k) / sqrt(real(nx*ny*nz)) 98 | end do 99 | end do 100 | end do 101 | 102 | call assemble_global(3,out,out1,nx,ny,nz) 103 | 104 | ! write out forward FFT result 105 | if (nrank==0) then 106 | write(*,*) 'Components of discrete Fourier transform' 107 | write(*,*) '' 108 | call print_global(out1,nx,ny,nz) 109 | end if 110 | 111 | ! ===== 3D inverse FFT ===== 112 | call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD) 113 | 114 | ! 
normalisation - note FFTW doesn't normalise 115 | do k=xstart(3),xend(3) 116 | do j=xstart(2),xend(2) 117 | do i=xstart(1),xend(1) 118 | in(i,j,k) = in(i,j,k) / sqrt(real(nx*ny*nz)) 119 | end do 120 | end do 121 | end do 122 | 123 | call assemble_global(1,in,in1,nx,ny,nz) 124 | 125 | ! write out inverse FFT result 126 | if (nrank==0) then 127 | write(*,*) 'Original sequence as restored by inverse transform' 128 | write(*,*) '' 129 | call print_global(in1,nx,ny,nz) 130 | end if 131 | 132 | call decomp_2d_fft_finalize 133 | call decomp_2d_finalize 134 | call MPI_FINALIZE(ierror) 135 | 136 | end program fft_test_c2c 137 | 138 | 139 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 140 | ! Collect data from each processor and assemble into a global array 141 | ! at the master rank 142 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 143 | subroutine assemble_global(ndir,local,global,nx,ny,nz) 144 | 145 | use decomp_2d 146 | use MPI 147 | 148 | implicit none 149 | 150 | integer, intent(IN) :: ndir ! 1 = X-pencil; 3 = Z-pencil 151 | integer, intent(IN) :: nx,ny,nz 152 | complex(mytype), dimension(:,:,:), intent(IN) :: local 153 | complex(mytype), dimension(nx,ny,nz), intent(OUT) :: global 154 | 155 | complex(mytype), allocatable, dimension(:,:,:) :: rbuf 156 | integer, dimension(9) :: sbuf1, rbuf1 157 | 158 | integer :: ierror, i,j,k,m, i1,i2,j1,j2,k1,k2, count 159 | integer, dimension(MPI_STATUS_SIZE) :: status 160 | 161 | if (nrank==0) then 162 | ! master writes its own data to a global array 163 | if (ndir==3) then ! Z-pencil 164 | i1 = zstart(1) 165 | i2 = zend(1) 166 | j1 = zstart(2) 167 | j2 = zend(2) 168 | k1 = zstart(3) 169 | k2 = zend(3) 170 | else if (ndir==1) then ! X-pencil 171 | i1 = xstart(1) 172 | i2 = xend(1) 173 | j1 = xstart(2) 174 | j2 = xend(2) 175 | k1 = xstart(3) 176 | k2 = xend(3) 177 | end if 178 | do k=k1,k2 179 | do j=j1,j2 180 | do i=i1,i2 181 | ! 'local' is assumbed shape array 182 | ! 
but it is OK as starting index for rank 0 always 1 183 | global(i,j,k)=local(i,j,k) 184 | end do 185 | end do 186 | end do 187 | ! then loop through all other ranks to collect data 188 | do m=1,nproc-1 189 | CALL MPI_RECV(rbuf1,9,MPI_INTEGER,m,m,MPI_COMM_WORLD, & 190 | status,ierror) 191 | allocate(rbuf(rbuf1(1):rbuf1(2),rbuf1(4):rbuf1(5), & 192 | rbuf1(7):rbuf1(8))) 193 | CALL MPI_RECV(rbuf,rbuf1(3)*rbuf1(6)*rbuf1(9),complex_type,m, & 194 | m+nproc,MPI_COMM_WORLD,status,ierror) 195 | do k=rbuf1(7),rbuf1(8) 196 | do j=rbuf1(4),rbuf1(5) 197 | do i=rbuf1(1),rbuf1(2) 198 | global(i,j,k)=rbuf(i,j,k) 199 | end do 200 | end do 201 | end do 202 | deallocate(rbuf) 203 | end do 204 | else 205 | ! slaves send data to master 206 | if (ndir==3) then ! Z-pencil 207 | sbuf1(1) = zstart(1) 208 | sbuf1(2) = zend(1) 209 | sbuf1(3) = zsize(1) 210 | sbuf1(4) = zstart(2) 211 | sbuf1(5) = zend(2) 212 | sbuf1(6) = zsize(2) 213 | sbuf1(7) = zstart(3) 214 | sbuf1(8) = zend(3) 215 | sbuf1(9) = zsize(3) 216 | count = zsize(1)*zsize(2)*zsize(3) 217 | else if (ndir==1) then ! X-pencil 218 | sbuf1(1) = xstart(1) 219 | sbuf1(2) = xend(1) 220 | sbuf1(3) = xsize(1) 221 | sbuf1(4) = xstart(2) 222 | sbuf1(5) = xend(2) 223 | sbuf1(6) = xsize(2) 224 | sbuf1(7) = xstart(3) 225 | sbuf1(8) = xend(3) 226 | sbuf1(9) = xsize(3) 227 | count = xsize(1)*xsize(2)*xsize(3) 228 | end if 229 | ! send partition information 230 | CALL MPI_SEND(sbuf1,9,MPI_INTEGER,0,nrank,MPI_COMM_WORLD,ierror) 231 | ! send data array 232 | CALL MPI_SEND(local,count,complex_type,0, & 233 | nrank+nproc,MPI_COMM_WORLD,ierror) 234 | end if 235 | 236 | return 237 | end subroutine assemble_global 238 | 239 | 240 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 241 | ! Print out a global data array using special format that matches 242 | ! NAG library C06FXF Example Program Results for validation purpose 243 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
244 | subroutine print_global(data,nx,ny,nz) 245 | 246 | use decomp_2d 247 | 248 | implicit none 249 | 250 | integer, intent(IN) :: nx,ny,nz 251 | complex(mytype), dimension(nx,ny,nz), intent(IN) :: data 252 | 253 | integer :: i,j,k 254 | 255 | do i=1,nx 256 | write(*,10) i 257 | write(*,*) '' 258 | do j=1,ny 259 | write(*,20) (real(data(i,j,k)),k=1,nz) 260 | write(*,21) (aimag(data(i,j,k)),k=1,nz) 261 | write(*,*) '' 262 | end do 263 | end do 264 | 10 format(1x,'z(i,j,k) for i =', I6) 265 | 20 format(1x,'Real ', 4F10.3) 266 | 21 format(1x,'Imag ', 4F10.3) 267 | 268 | return 269 | end subroutine print_global 270 | 271 | -------------------------------------------------------------------------------- /examples/fft_test_r2c/.gitignore: -------------------------------------------------------------------------------- 1 | fft_test_r2c 2 | -------------------------------------------------------------------------------- /examples/fft_test_r2c/Makefile: -------------------------------------------------------------------------------- 1 | include ../../src/Makefile.inc 2 | 3 | INCLUDE = -I../../include 4 | LIBS = -L../../lib -l2decomp_fft $(LIBFFT) 5 | 6 | OBJ = fft_test_r2c.o 7 | 8 | fft_test_r2c: $(OBJ) 9 | $(F90) -o $@ $(OBJ) $(LIBS) 10 | 11 | clean: 12 | rm -f *.o fft_test_r2c 13 | 14 | %.o : %.f90 15 | $(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $< 16 | -------------------------------------------------------------------------------- /examples/fft_test_r2c/README: -------------------------------------------------------------------------------- 1 | fft_test_r2c 2 | ------------ 3 | 4 | This example demonstrates the use of the FFT r2c/c2r interface. It generates 5 | random input and computes a serial 3D r2c transform on rank 0 to generate 6 | reference results. It then performs parallel computations of the same transform 7 | on distributed data. There are two separate tests, with input data distributed 8 | in X-pencil and Z-pencil, respectively. 
In each test, a r2c transform is 9 | performed first and its results on rank 0 printed out. Then an inverse c2r 10 | transform is followed which should recover the input to machine accuracy. 11 | 12 | To run: use 4 MPI processes. 13 | 14 | What to expect: 15 | - The output from the distributed computations should contain a subset of 16 | numbers as in the serial output. 17 | - The error reported should be around machine accuracy (~ 10^-6 for single 18 | precision and 10^-15 for double) 19 | 20 | -------------------------------------------------------------------------------- /examples/fft_test_r2c/fft_test_r2c.f90: -------------------------------------------------------------------------------- 1 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 2 | ! Main test program for the FFT r2c/c2r interface 3 | ! also demonstrate the use of the IO library 4 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 5 | 6 | program fft_test_r2c 7 | 8 | use decomp_2d 9 | use decomp_2d_fft 10 | use glassman 11 | use decomp_2d_io 12 | 13 | use MPI 14 | 15 | implicit none 16 | !include "fftw3.f" 17 | 18 | integer, parameter :: nx=4, ny=2, nz=3 19 | integer, parameter :: p_row=2, p_col=2 20 | 21 | real(mytype), allocatable, dimension(:,:,:) :: in, in2 22 | complex(mytype), allocatable, dimension(:,:,:) :: out 23 | 24 | integer, dimension(3) :: fft_start, fft_end, fft_size 25 | 26 | real(mytype), dimension(nx,ny,nz) :: in_global, in_g2, in_g3 27 | complex(mytype), dimension(nx/2+1,ny,nz) :: out_global 28 | 29 | integer (kind=MPI_OFFSET_KIND) :: filesize, disp 30 | 31 | real(mytype) :: err 32 | !integer*8 :: plan 33 | integer :: fh, ierror, i,j,k, n,iol 34 | 35 | call MPI_INIT(ierror) 36 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 37 | call decomp_2d_fft_init 38 | 39 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 40 | ! 
Compute a small problem all on rank 0 as reference 41 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 42 | call random_number(in_global) 43 | 44 | if (nrank==0) then 45 | write(*,*) '*** Reference serial computation on rank 0 only' 46 | write(*,*) ' global real input' 47 | do i=1,nx 48 | write(*,20) ((in_global(i,j,k),j=1,ny),k=1,nz) 49 | end do 50 | 51 | ! Using a 3D FFT routine supplied by this library 52 | call glassman_3d_r2c(in_global,nx,ny,nz,out_global) 53 | 54 | ! If using FFTW library: 55 | ! - uncomment the FFTW include file & plan above 56 | ! - uncomment the follwing function calls 57 | ! - change names to dfftw... for double precision 58 | !call sfftw_plan_dft_r2c_3d(plan,nx,ny,nz, & 59 | ! in_global,out_global,FFTW_ESTIMATE) 60 | !call sfftw_execute_dft_r2c(plan,in_global,out_global) 61 | 62 | write(*,*) ' global complex output' 63 | do i=1,nx/2+1 64 | write(*,10) ((out_global(i,j,k),j=1,ny),k=1,nz) 65 | end do 66 | end if 67 | 10 format(1x,6(:,'(',F5.2,',',F5.2,')')) 68 | 20 format(1x,6F5.2) 69 | 70 | ! File for testing IO 71 | call MPI_FILE_OPEN(MPI_COMM_WORLD, 'fftdata', & 72 | MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, & 73 | fh, ierror) 74 | filesize = 0_MPI_OFFSET_KIND 75 | call MPI_FILE_SET_SIZE(fh,filesize,ierror) ! guarantee overwriting 76 | disp = 0_MPI_OFFSET_KIND 77 | 78 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 79 | ! Test the real-to-complex interface (r2c) 80 | 81 | ! input is X-pencil real data whose global size is nx*ny*nz 82 | ! output is Z-pencil complex data whose global size is (nx/2+1)*ny*nz 83 | allocate (in(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3))) 84 | 85 | call decomp_2d_fft_get_size(fft_start,fft_end,fft_size) 86 | allocate (out(fft_start(1):fft_end(1), & 87 | fft_start(2):fft_end(2), & 88 | fft_start(3):fft_end(3))) 89 | 90 | ! 
each processor gets its local portion of global data 91 | do k=xstart(3),xend(3) 92 | do j=xstart(2),xend(2) 93 | do i=xstart(1),xend(1) 94 | in(i,j,k) = in_global(i,j,k) 95 | end do 96 | end do 97 | end do 98 | 99 | ! write input to file 100 | call decomp_2d_write_var(fh,disp,1,in) 101 | 102 | if (nrank==0) then 103 | write(*,*) ' ' 104 | write(*,*) '*** Distributed computation (X-pencil input)' 105 | write(*,*) ' real input held by rank 0:' 106 | write(*,20) in 107 | end if 108 | 109 | ! compute r2c transform 110 | call decomp_2d_fft_3d(in,out) 111 | 112 | if (nrank==0) then 113 | write(*,*) ' - after forward transform' 114 | write(*,*) ' complex output held by rank 0:' 115 | write(*,10) out 116 | end if 117 | 118 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 119 | ! Test the complex-to-real interface (c2r) 120 | 121 | allocate (in2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3))) 122 | 123 | ! compute c2r transform 124 | call decomp_2d_fft_3d(out,in2) 125 | 126 | ! normalisation 127 | in2 = in2 / real(nx) / real(ny) / real(nz) 128 | 129 | ! write the data recovered by inverse FFT to file 130 | call decomp_2d_write_var(fh,disp,1,in2) 131 | 132 | if (nrank==0) then 133 | write(*,*) ' - after backward transform and normalisation' 134 | write(*,*) ' real output held by rank 0:' 135 | write(*,20) in2 136 | end if 137 | 138 | deallocate(in,in2,out) 139 | call decomp_2d_fft_finalize 140 | 141 | call MPI_FILE_CLOSE(fh,ierror) 142 | 143 | ! check on rank 0 if input data is properly recovered 144 | ! 
this also tests the IO routines 145 | if (nrank==0) then 146 | in_g2(1,1,1) = real(0., mytype) 147 | inquire(iolength=iol) in_g2(1,1,1) 148 | OPEN(10, FILE='fftdata', FORM='unformatted', & 149 | ACCESS='DIRECT', RECL=iol) 150 | n=1 151 | do k=1,nz 152 | do j=1,ny 153 | do i=1,nx 154 | read(10,rec=n) in_g2(i,j,k) 155 | n=n+1 156 | end do 157 | end do 158 | end do 159 | do k=1,nz 160 | do j=1,ny 161 | do i=1,nx 162 | read(10,rec=n) in_g3(i,j,k) 163 | n=n+1 164 | end do 165 | end do 166 | end do 167 | err = 0._mytype 168 | do k=1,nz 169 | do j=1,ny 170 | do i=1,nx 171 | err = err + (in_g2(i,j,k)-in_g3(i,j,k))**2 172 | end do 173 | end do 174 | end do 175 | err = err / real(nx,mytype) / real(ny,mytype) / real(nz,mytype) 176 | write(*,*) ' error / mesh point: ', sqrt(err) 177 | end if 178 | 179 | 180 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 181 | ! Repeat the above but using Z-pencil input 182 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 183 | 184 | call decomp_2d_fft_init(PHYSICAL_IN_Z) 185 | 186 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 187 | ! Test the real-to-complex interface (r2c) 188 | 189 | allocate (in(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3))) 190 | call decomp_2d_fft_get_size(fft_start,fft_end,fft_size) 191 | allocate (out(fft_start(1):fft_end(1), & 192 | fft_start(2):fft_end(2), & 193 | fft_start(3):fft_end(3))) 194 | 195 | ! each processor gets its local portion of global data 196 | do k=zstart(3),zend(3) 197 | do j=zstart(2),zend(2) 198 | do i=zstart(1),zend(1) 199 | in(i,j,k) = in_global(i,j,k) 200 | end do 201 | end do 202 | end do 203 | if (nrank==0) then 204 | write(*,*) ' ' 205 | write(*,*) '*** Distributed computation (Z-pencil input)' 206 | write(*,*) ' real input held by rank 0:' 207 | write(*,20) in 208 | end if 209 | 210 | ! 
compute r2c transform 211 | call decomp_2d_fft_3d(in,out) 212 | 213 | if (nrank==0) then 214 | write(*,*) ' - after forward transform' 215 | write(*,*) ' complex output held by rank 0:' 216 | write(*,10) out 217 | end if 218 | 219 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 220 | ! Test the complex-to-real interface (c2r) 221 | 222 | allocate (in2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3))) 223 | 224 | ! compute c2r transform 225 | call decomp_2d_fft_3d(out,in2) 226 | 227 | ! normalisation 228 | in2 = in2 / real(nx) / real(ny) / real(nz) 229 | 230 | if (nrank==0) then 231 | write(*,*) ' - after backward transform and normalisation' 232 | write(*,*) ' real output held by rank 0:' 233 | write(*,20) in2 234 | end if 235 | 236 | deallocate(in,in2,out) 237 | 238 | call decomp_2d_fft_finalize 239 | call decomp_2d_finalize 240 | call MPI_FINALIZE(ierror) 241 | 242 | end program fft_test_r2c 243 | -------------------------------------------------------------------------------- /examples/halo_test/.gitignore: -------------------------------------------------------------------------------- 1 | halo_test 2 | -------------------------------------------------------------------------------- /examples/halo_test/Makefile: -------------------------------------------------------------------------------- 1 | include ../../src/Makefile.inc 2 | 3 | INCLUDE = -I../../include 4 | LIBS = -L../../lib -l2decomp_fft 5 | 6 | OBJ = halo_test.o 7 | 8 | halo_test: $(OBJ) 9 | $(F90) -o $@ $(OBJ) $(LIBS) 10 | 11 | clean: 12 | rm -f *.o halo_test 13 | 14 | %.o : %.f90 15 | $(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $< 16 | -------------------------------------------------------------------------------- /examples/halo_test/README: -------------------------------------------------------------------------------- 1 | halo_test 2 | --------- 3 | 4 | This example demonstrates the use of the halo-cell support API. 
It calculates 5 | the divergence of an arbitrary field, which contains evaluation of spatial 6 | derivatives in all three dimensions. The calculation was first implemented via 7 | the global transposition routines, then via halo-cell exchanges. Identical 8 | results are to be expected regardless of the communication algorithm. The 9 | computation is based on an explicit finite difference method so clearly using 10 | the halo-cell support API is more efficient. 11 | 12 | To run: use 12 MPI processes. 13 | 14 | What to expect: the output using different communication algorithms should be 15 | exactly the same. 16 | -------------------------------------------------------------------------------- /examples/io_test/.gitignore: -------------------------------------------------------------------------------- 1 | io_test 2 | -------------------------------------------------------------------------------- /examples/io_test/Makefile: -------------------------------------------------------------------------------- 1 | include ../../src/Makefile.inc 2 | 3 | INCLUDE = -I../../include 4 | LIBS = -L../../lib -l2decomp_fft 5 | ifneq (,$(findstring DT3PIO,$(OPTIONS))) 6 | LIBS+= -L$(T3PIO_PATH)/lib -lt3pio 7 | endif 8 | 9 | all: io_test io_read io_var_test io_plane_test io_bench 10 | 11 | io_test: io_test.o 12 | $(F90) -o $@ $@.o $(LIBS) 13 | 14 | io_read: io_read.o 15 | $(F90) -o $@ $@.o $(LIBS) 16 | 17 | io_var_test: io_var_test.o 18 | $(F90) -o $@ $@.o $(LIBS) 19 | 20 | io_plane_test: io_plane_test.o 21 | $(F90) -o $@ $@.o $(LIBS) 22 | 23 | io_bench: io_bench.o 24 | $(F90) -o $@ $@.o $(LIBS) 25 | 26 | clean: 27 | rm -f *.o io_test io_read io_var_test io_plane_test io_bench 28 | 29 | realclean: clean 30 | rm -f *.dat io_var_data.* 31 | 32 | %.o : %.f90 33 | $(F90) $(INCLUDE) $(OPTIONS) $(ARG) $(F90FLAGS) -c $< 34 | -------------------------------------------------------------------------------- /examples/io_test/README: 
-------------------------------------------------------------------------------- 1 | io_test 2 | ------- 3 | 4 | This folder contains several applications to thoroughly test 2DECOMP&FFT's 5 | IO library. 6 | 7 | To run: If interactive run is possible on your system, adapt the script 8 | provided to run all the tests in one go. 9 | -------------------------------------------------------------------------------- /examples/io_test/io_bench.f90: -------------------------------------------------------------------------------- 1 | program io_bench 2 | 3 | use decomp_2d 4 | use decomp_2d_io 5 | use MPI 6 | 7 | implicit none 8 | 9 | integer, parameter :: nx=100, ny=100, nz=100 10 | integer, parameter :: p_row=4, p_col=4 11 | 12 | real(mytype), allocatable, dimension(:,:,:) :: u1 13 | 14 | double precision :: t1, t2 15 | integer :: ierror 16 | 17 | call MPI_INIT(ierror) 18 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 19 | 20 | allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3))) 21 | call random_number(u1) 22 | 23 | t1 = MPI_WTIME() 24 | call decomp_2d_write_one(1,u1,'io.dat') 25 | t2 = MPI_WTIME() 26 | 27 | if (nrank==0) write(*,*) 'I/O time: ', t2-t1 28 | 29 | call decomp_2d_finalize 30 | call MPI_FINALIZE(ierror) 31 | deallocate(u1) 32 | 33 | end program io_bench 34 | 35 | -------------------------------------------------------------------------------- /examples/io_test/io_plane_test.f90: -------------------------------------------------------------------------------- 1 | program io_plane_test 2 | 3 | use decomp_2d 4 | use decomp_2d_io 5 | 6 | implicit none 7 | 8 | integer, parameter :: nx=17, ny=13, nz=11 9 | integer, parameter :: p_row=4, p_col=3 10 | 11 | real(mytype), dimension(nx,ny,nz) :: data1 12 | real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3 13 | 14 | real(mytype), allocatable, dimension(:,:,:) :: work 15 | 16 | integer :: i,j,k, m, ierror, iol 17 | 18 | call MPI_INIT(ierror) 19 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 20 | 21 | ! 
***** global data ***** 22 | m = 1 23 | do k=1,nz 24 | do j=1,ny 25 | do i=1,nx 26 | data1(i,j,k) = real(m,mytype) 27 | m = m+1 28 | end do 29 | end do 30 | end do 31 | 32 | allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3))) 33 | allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3))) 34 | allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3))) 35 | 36 | ! original X-pencil based data 37 | do k=xstart(3),xend(3) 38 | do j=xstart(2),xend(2) 39 | do i=xstart(1),xend(1) 40 | u1(i,j,k) = data1(i,j,k) 41 | end do 42 | end do 43 | end do 44 | call decomp_2d_write_plane(1,u1,1,nx/2,'x_pencil-x_plane.dat') 45 | call decomp_2d_write_plane(1,u1,2,ny/2,'x_pencil-y_plane.dat') 46 | call decomp_2d_write_plane(1,u1,3,nz/2,'x_pencil-z_plane.dat') 47 | 48 | ! Y-pencil data 49 | call transpose_x_to_y(u1,u2) 50 | call decomp_2d_write_plane(2,u2,1,nx/2,'y_pencil-x_plane.dat') 51 | call decomp_2d_write_plane(2,u2,2,ny/2,'y_pencil-y_plane.dat') 52 | call decomp_2d_write_plane(2,u2,3,nz/2,'y_pencil-z_plane.dat') 53 | 54 | ! Z-pencil data 55 | call transpose_y_to_z(u2,u3) 56 | call decomp_2d_write_plane(3,u3,1,nx/2,'z_pencil-x_plane.dat') 57 | call decomp_2d_write_plane(3,u3,2,ny/2,'z_pencil-y_plane.dat') 58 | call decomp_2d_write_plane(3,u3,3,nz/2,'z_pencil-z_plane.dat') 59 | 60 | ! Attempt to read the files 61 | if (nrank==0) then 62 | inquire(iolength=iol) data1(1,1,1) 63 | 64 | ! X-plane 65 | allocate(work(1,ny,nz)) 66 | open(10, FILE='x_pencil-x_plane.dat', FORM='unformatted', & 67 | ACCESS='DIRECT', RECL=iol) 68 | m=1 69 | do k=1,nz 70 | do j=1,ny 71 | read(10,rec=m) work(1,j,k) 72 | m=m+1 73 | end do 74 | end do 75 | write(*,*) ' ' 76 | write(*,'(15I5)') int(work) 77 | close(10) 78 | deallocate(work) 79 | 80 | ! 
Y-plane 81 | allocate(work(nx,1,nz)) 82 | open(10, FILE='x_pencil-y_plane.dat', FORM='unformatted', & 83 | ACCESS='DIRECT', RECL=iol) 84 | m=1 85 | do k=1,nz 86 | do i=1,nx 87 | read(10,rec=m) work(i,1,k) 88 | m=m+1 89 | end do 90 | end do 91 | write(*,*) ' ' 92 | write(*,'(15I5)') int(work) 93 | close(10) 94 | deallocate(work) 95 | 96 | ! Z-plane 97 | allocate(work(nx,ny,1)) 98 | open(10, FILE='x_pencil-z_plane.dat', FORM='unformatted', & 99 | ACCESS='DIRECT', RECL=iol) 100 | m=1 101 | do j=1,ny 102 | do i=1,nx 103 | read(10,rec=m) work(i,j,1) 104 | m=m+1 105 | end do 106 | end do 107 | write(*,*) ' ' 108 | write(*,'(15I5)') int(work) 109 | close(10) 110 | deallocate(work) 111 | 112 | end if 113 | 114 | call decomp_2d_finalize 115 | call MPI_FINALIZE(ierror) 116 | deallocate(u1,u2,u3) 117 | 118 | end program io_plane_test 119 | -------------------------------------------------------------------------------- /examples/io_test/io_read.f90: -------------------------------------------------------------------------------- 1 | program io_read 2 | 3 | use decomp_2d 4 | use decomp_2d_io 5 | 6 | implicit none 7 | 8 | integer, parameter :: nx=17, ny=13, nz=11 9 | ! use different number of processes 10 | integer, parameter :: p_row=3, p_col=2 11 | 12 | #ifdef COMPLEX_TEST 13 | complex(mytype), dimension(nx,ny,nz) :: data1 14 | 15 | complex(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b 16 | #else 17 | real(mytype), dimension(nx,ny,nz) :: data1 18 | 19 | real(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b 20 | #endif 21 | 22 | real(mytype), parameter :: eps = 1.0E-7_mytype 23 | 24 | integer :: i,j,k, m, ierror 25 | 26 | call MPI_INIT(ierror) 27 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 28 | 29 | ! 
***** global data ***** 30 | m = 1 31 | do k=1,nz 32 | do j=1,ny 33 | do i=1,nx 34 | #ifdef COMPLEX_TEST 35 | data1(i,j,k) = cmplx(real(m,mytype), real(nx*ny*nz-m,mytype)) 36 | #else 37 | data1(i,j,k) = real(m,mytype) 38 | #endif 39 | m = m+1 40 | end do 41 | end do 42 | end do 43 | 44 | allocate(u1b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3))) 45 | allocate(u2b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3))) 46 | allocate(u3b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3))) 47 | 48 | ! read back to different arrays 49 | call decomp_2d_read_one(1,u1b,'u1.dat') 50 | call decomp_2d_read_one(2,u2b,'u2.dat') 51 | call decomp_2d_read_one(3,u3b,'u3.dat') 52 | 53 | ! Check against the global data array 54 | do k=xstart(3),xend(3) 55 | do j=xstart(2),xend(2) 56 | do i=xstart(1),xend(1) 57 | if (abs((data1(i,j,k)-u1b(i,j,k))) > eps) stop 4 58 | end do 59 | end do 60 | end do 61 | 62 | do k=ystart(3),yend(3) 63 | do j=ystart(2),yend(2) 64 | do i=ystart(1),yend(1) 65 | if (abs((data1(i,j,k)-u2b(i,j,k))) > eps) stop 5 66 | end do 67 | end do 68 | end do 69 | 70 | do k=zstart(3),zend(3) 71 | do j=zstart(2),zend(2) 72 | do i=zstart(1),zend(1) 73 | if (abs((data1(i,j,k)-u3b(i,j,k))) > eps) stop 6 74 | end do 75 | end do 76 | end do 77 | 78 | call decomp_2d_finalize 79 | call MPI_FINALIZE(ierror) 80 | deallocate(u1b,u2b,u3b) 81 | 82 | end program io_read 83 | -------------------------------------------------------------------------------- /examples/io_test/io_test.f90: -------------------------------------------------------------------------------- 1 | program io_test 2 | 3 | use decomp_2d 4 | use decomp_2d_io 5 | 6 | implicit none 7 | 8 | integer, parameter :: nx=17, ny=13, nz=11 9 | integer, parameter :: p_row=4, p_col=3 10 | 11 | #ifdef COMPLEX_TEST 12 | complex(mytype), dimension(nx,ny,nz) :: data1 13 | 14 | complex(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3 15 | complex(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b 16 | 
#else 17 | real(mytype), dimension(nx,ny,nz) :: data1 18 | 19 | real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3 20 | real(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b 21 | #endif 22 | 23 | real(mytype), parameter :: eps = 1.0E-7_mytype 24 | 25 | integer :: i,j,k, m, ierror 26 | 27 | call MPI_INIT(ierror) 28 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 29 | 30 | ! ***** global data ***** 31 | m = 1 32 | do k=1,nz 33 | do j=1,ny 34 | do i=1,nx 35 | #ifdef COMPLEX_TEST 36 | data1(i,j,k) = cmplx(real(m,mytype), real(nx*ny*nz-m,mytype)) 37 | #else 38 | data1(i,j,k) = real(m,mytype) 39 | #endif 40 | m = m+1 41 | end do 42 | end do 43 | end do 44 | 45 | allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3))) 46 | allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3))) 47 | allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3))) 48 | 49 | allocate(u1b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3))) 50 | allocate(u2b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3))) 51 | allocate(u3b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3))) 52 | 53 | ! original x-pensil based data 54 | do k=xstart(3),xend(3) 55 | do j=xstart(2),xend(2) 56 | do i=xstart(1),xend(1) 57 | u1(i,j,k) = data1(i,j,k) 58 | end do 59 | end do 60 | end do 61 | 62 | ! transpose 63 | call transpose_x_to_y(u1,u2) 64 | call transpose_y_to_z(u2,u3) 65 | 66 | ! write to disk 67 | call decomp_2d_write_one(1,u1,'u1.dat') 68 | call decomp_2d_write_one(2,u2,'u2.dat') 69 | call decomp_2d_write_one(3,u3,'u3.dat') 70 | 71 | ! read back to different arrays 72 | call decomp_2d_read_one(1,u1b,'u1.dat') 73 | call decomp_2d_read_one(2,u2b,'u2.dat') 74 | call decomp_2d_read_one(3,u3b,'u3.dat') 75 | 76 | ! 
#!/bin/sh
# Driver for the MPI-IO example programs.
# Runs the write/read round-trip tests, then checks that files written
# by MPI-IO are bit-identical regardless of the number of processes.
#
# Unlike the previous version (which always claimed success), every
# command's exit status is tracked and reflected in the final message
# and in this script's own exit code.

make
if [ $? -ne 0 ] ; then
    echo "================================================="
    echo "Failed to build the applications. Fix them first!"
    echo "================================================="
    exit 1
fi

# overall status: flipped to 1 as soon as any test command fails
status=0

# run CMD ARGS... : echo the command, execute it, record any failure
run () {
    echo "$@"
    "$@" || status=1
}

echo " "
echo "writing data files using MPI-IO..."
run mpirun -np 12 ./io_test

echo " "
echo "reading data files back (different number of processes)..."
run mpirun -np 6 ./io_read

# The files written by MPI-IO should be independent of the number of processes
echo " "
echo "*** testing write_var..."
run mpirun -np 20 ./io_var_test 5 4
run mpirun -np 12 ./io_var_test 4 3
run mpirun -np 6 ./io_var_test 3 2
run mpirun -np 2 ./io_var_test 2 1
run diff -s io_var_data.020 io_var_data.012
run diff -s io_var_data.020 io_var_data.006
run diff -s io_var_data.020 io_var_data.002

echo " "
echo "*** testing write_plane..."
run mpirun -np 12 ./io_plane_test
run diff -s x_pencil-x_plane.dat y_pencil-x_plane.dat
run diff -s x_pencil-x_plane.dat z_pencil-x_plane.dat
run diff -s x_pencil-y_plane.dat y_pencil-y_plane.dat
run diff -s x_pencil-y_plane.dat z_pencil-y_plane.dat
run diff -s x_pencil-z_plane.dat y_pencil-z_plane.dat
run diff -s x_pencil-z_plane.dat z_pencil-z_plane.dat

echo " "
if [ $status -eq 0 ] ; then
    echo "All tests PASSED"
else
    echo "Some tests FAILED - see output above"
fi
exit $status
non_blocking 2 | ------------ 3 | 4 | This test contains two sample applications to compute multiple independent FFTs. The first application `blocking.f90` uses the standard blocking version of MPI communication code to transpose the data among different stages of the computation. The second application `non_blocking.f90` performs the same computation using the non-blocking communication routines supplied by 2DECOMP&FFT. 5 | 6 | These two applications are using FFTW APIs directly. Please compile them separately using the `Makefile` in this directory after building 2DECOMP&FFT with the FFTW engine. 7 | 8 | Non-blocking collective communication is part of MPI 3 standard and it is now widely supported. Earlier users of 2DECOMP&FFT may remember the use of libNBC (http://www.unixer.de/research/nbcoll/libnbc/), a library implementing non-blocking MPI collectives (such as IALLTOALL) with existing MPI 1 functions. libNBC is now obsolete and reference to it has been removed from the source codes. 9 | 10 | To demonstrate the idea of overlap communication and computation, the 3D FFT is implemented using loops of 1D FFTs (without using the advanced interface of FFTW) so that MPI_TEST calls can be easily inserted in the computational part of the code. This is required because the communication has to be explicitly progressed when running on the same thread as the computation. 11 | 12 | The two applications should produce identical results. 13 | 14 | End users are responsible for identifying opportunities in their applications to overlap communication with computation. 15 | -------------------------------------------------------------------------------- /examples/non_blocking/blocking.f90: -------------------------------------------------------------------------------- 1 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 2 | ! This program computes multiple distributed 3D FFTs 3 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
4 | 5 | program blocking 6 | 7 | use decomp_2d 8 | use MPI 9 | 10 | implicit none 11 | 12 | include "fftw3.f" 13 | 14 | integer, parameter :: nx=16, ny=16, nz=16 15 | integer, parameter :: p_row=2, p_col=2 16 | integer, parameter :: NFFT=20 ! number of independent FFTs 17 | 18 | integer :: i,j,k, m, nmax, ierror 19 | real(mytype) :: tmp1, tmp2 20 | 21 | double precision :: t1, t2 22 | 23 | ! FFTW plans for the 1D forward/backward transforms 24 | integer*8, save :: x_plan_f, x_plan_b 25 | integer*8, save :: y_plan_f, y_plan_b 26 | integer*8, save :: z_plan_f, z_plan_b 27 | 28 | ! dummy array used for planning 29 | complex(mytype), allocatable, dimension(:) :: buf1, buf2 30 | 31 | ! input/output of the FFT 32 | complex(mytype), allocatable, dimension(:,:,:) :: in, out, wk2 33 | 34 | 35 | call MPI_INIT(ierror) 36 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 37 | 38 | ! ===== planning 1D FFT in x ===== 39 | allocate(buf1(xsize(1)), buf2(xsize(1))) 40 | 41 | #ifdef DOUBLE_PREC 42 | call dfftw_plan_dft_1d(x_plan_f, xsize(1), buf1, buf2, & 43 | FFTW_FORWARD, FFTW_MEASURE) 44 | call dfftw_plan_dft_1d(x_plan_b, xsize(1), buf1, buf2, & 45 | FFTW_BACKWARD, FFTW_MEASURE) 46 | #else 47 | call sfftw_plan_dft_1d(x_plan_f, xsize(1), buf1, buf2, & 48 | FFTW_FORWARD, FFTW_MEASURE) 49 | call sfftw_plan_dft_1d(x_plan_b, xsize(1), buf1, buf2, & 50 | FFTW_BACKWARD, FFTW_MEASURE) 51 | #endif 52 | 53 | deallocate(buf1,buf2) 54 | 55 | ! ===== planning 1D FFT in Y ===== 56 | allocate(buf1(ysize(2)), buf2(ysize(2))) 57 | 58 | #ifdef DOUBLE_PREC 59 | call dfftw_plan_dft_1d(y_plan_f, ysize(2), buf1, buf2, & 60 | FFTW_FORWARD, FFTW_MEASURE) 61 | call dfftw_plan_dft_1d(y_plan_b, ysize(2), buf1, buf2, & 62 | FFTW_BACKWARD, FFTW_MEASURE) 63 | #else 64 | call sfftw_plan_dft_1d(y_plan_f, ysize(2), buf1, buf2, & 65 | FFTW_FORWARD, FFTW_MEASURE) 66 | call sfftw_plan_dft_1d(y_plan_b, ysize(2), buf1, buf2, & 67 | FFTW_BACKWARD, FFTW_MEASURE) 68 | #endif 69 | 70 | deallocate(buf1,buf2) 71 | 72 | ! 
===== planning 1D FFT in Z ===== 73 | allocate(buf1(zsize(3)), buf2(zsize(3))) 74 | 75 | #ifdef DOUBLE_PREC 76 | call dfftw_plan_dft_1d(z_plan_f, zsize(3), buf1, buf2, & 77 | FFTW_FORWARD, FFTW_MEASURE) 78 | call dfftw_plan_dft_1d(z_plan_b, zsize(3), buf1, buf2, & 79 | FFTW_BACKWARD, FFTW_MEASURE) 80 | #else 81 | call sfftw_plan_dft_1d(z_plan_f, zsize(3), buf1, buf2, & 82 | FFTW_FORWARD, FFTW_MEASURE) 83 | call sfftw_plan_dft_1d(z_plan_b, zsize(3), buf1, buf2, & 84 | FFTW_BACKWARD, FFTW_MEASURE) 85 | #endif 86 | 87 | deallocate(buf1,buf2) 88 | 89 | 90 | allocate( in(xsize(1),xsize(2),xsize(3))) ! x-pencil input 91 | allocate(out(zsize(1),zsize(2),zsize(3))) ! z-pencil output 92 | allocate(wk2(ysize(1),ysize(2),ysize(3))) ! y-pencil intermediate 93 | 94 | ! 1D temp buffer 95 | nmax = max(xsize(1),max(ysize(2),zsize(3))) 96 | allocate (buf1(nmax)) 97 | allocate (buf2(nmax)) 98 | 99 | t1 = MPI_WTIME() 100 | 101 | do m=1,NFFT 102 | 103 | do k=1,xsize(3) 104 | do j=1,xsize(2) 105 | do i=1,xsize(1) 106 | tmp1 = real(xstart(1)+i-1, mytype) / real(nx, mytype) & 107 | * real(xstart(2)+j-1, mytype) / real(ny, mytype) & 108 | * real(xstart(3)+k-1, mytype) / real(nz, mytype) & 109 | * real(m, mytype) / real(NFFT, mytype) 110 | in(i,j,k) = cmplx(tmp1, 0._mytype, mytype) 111 | end do 112 | end do 113 | end do 114 | 115 | ! This shows how to perform 3D FFT by using the FFTW basic interface. 116 | ! Copy data to/from 1D buffers and loop through all 1D FFTs. 117 | 118 | ! 1D FFT in X 119 | do k=1,xsize(3) 120 | do j=1,xsize(2) 121 | do i=1,xsize(1) 122 | buf1(i) = in(i,j,k) 123 | end do 124 | #ifdef DOUBLE_PREC 125 | call dfftw_execute_dft(x_plan_f, buf1, buf2) 126 | #else 127 | call sfftw_execute_dft(x_plan_f, buf1, buf2) 128 | #endif 129 | do i=1,xsize(1) 130 | in(i,j,k) = buf2(i) 131 | end do 132 | end do 133 | end do 134 | 135 | ! ===== Swap X --> Y ===== 136 | call transpose_x_to_y(in,wk2) 137 | 138 | ! 
===== 1D FFTs in Y ===== 139 | do k=1,ysize(3) 140 | do i=1,ysize(1) 141 | do j=1,ysize(2) 142 | buf1(j) = wk2(i,j,k) 143 | end do 144 | #ifdef DOUBLE_PREC 145 | call dfftw_execute_dft(y_plan_f, buf1, buf2) 146 | #else 147 | call sfftw_execute_dft(y_plan_f, buf1, buf2) 148 | #endif 149 | do j=1,ysize(2) 150 | wk2(i,j,k) = buf2(j) 151 | end do 152 | end do 153 | end do 154 | 155 | ! ===== Swap Y --> Z ===== 156 | call transpose_y_to_z(wk2,out) 157 | 158 | ! ===== 1D FFTs in Z ===== 159 | do j=1,zsize(2) 160 | do i=1,zsize(1) 161 | do k=1,zsize(3) 162 | buf1(k) = out(i,j,k) 163 | end do 164 | #ifdef DOUBLE_PREC 165 | call dfftw_execute_dft(z_plan_f, buf1, buf2) 166 | #else 167 | call sfftw_execute_dft(z_plan_f, buf1, buf2) 168 | #endif 169 | do k=1,zsize(3) 170 | out(i,j,k) = buf2(k) 171 | end do 172 | end do 173 | end do 174 | 175 | if (nrank==0) write(*,*) 'TEST ', m, out(1:2,1:2,1:2) 176 | 177 | end do ! NFFT 178 | 179 | t2 = MPI_WTIME() - t1 180 | call MPI_ALLREDUCE(t2,t1,1,MPI_DOUBLE_PRECISION,MPI_SUM, & 181 | MPI_COMM_WORLD,ierror) 182 | t1 = t1 / real(nproc, mytype) 183 | 184 | if (nrank==0) then 185 | write(*,*) 'Average Forward FFT Time(sec): ', t1 186 | end if 187 | 188 | ! 
clean up 189 | #ifdef DOUBLE_PREC 190 | call dfftw_destroy_plan(x_plan_f) 191 | call dfftw_destroy_plan(x_plan_b) 192 | call dfftw_destroy_plan(y_plan_f) 193 | call dfftw_destroy_plan(y_plan_b) 194 | call dfftw_destroy_plan(z_plan_f) 195 | call dfftw_destroy_plan(z_plan_b) 196 | #else 197 | call sfftw_destroy_plan(x_plan_f) 198 | call sfftw_destroy_plan(x_plan_b) 199 | call sfftw_destroy_plan(y_plan_f) 200 | call sfftw_destroy_plan(y_plan_b) 201 | call sfftw_destroy_plan(z_plan_f) 202 | call sfftw_destroy_plan(z_plan_b) 203 | #endif 204 | 205 | call decomp_2d_finalize 206 | call MPI_FINALIZE(ierror) 207 | deallocate(in,out,wk2,buf1,buf2) 208 | 209 | 210 | end program blocking 211 | -------------------------------------------------------------------------------- /examples/p3dfft/Makefile: -------------------------------------------------------------------------------- 1 | include ../../src/Makefile.inc 2 | 3 | P3DFFT_HOME=$(HOME)/software/build/p3dfft-2.7.9-dimsc 4 | FFTW3_HOME=$(HOME)/software/build/fftw-3.3 5 | 6 | INCLUDE = -I../../include -I$(P3DFFT_HOME)/include 7 | LIBS = -L../../lib -l2decomp_fft -L$(P3DFFT_HOME)/lib -lp3dfft $(LIBFFT) -L$(FFTW3_HOME)/lib -lfftw3 8 | 9 | OBJ = p3dfft.o 10 | 11 | p3dfft: $(OBJ) 12 | $(F90) -o $@ $(OBJ) $(LIBS) 13 | 14 | clean: 15 | rm -f *.o p3dfft 16 | 17 | %.o : %.f90 18 | $(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $< 19 | -------------------------------------------------------------------------------- /examples/p3dfft/README.md: -------------------------------------------------------------------------------- 1 | p3dfft 2 | ------ 3 | 4 | This test program performs the following tasks: 5 | 6 | * It crosschecks 2DECOMP&FFT results against P3DFFT results. 7 | * It compares the performance of the two codes. 8 | 9 | **How to set up this test?** 10 | 11 | Due to external dependency, this test has to be built separately after building 2DECOMP&FFT. 12 | 13 | First, download P3DFFT version 2.7.9, the most recent 2.x release. 
Install P3DFFT at a directory denoted as `$P3DFFT_HOME` and set this path properly in the `Makefile`. To build P3DFFT : 14 | ``` 15 | FC=mpif90 CC=mpicc LDFLAGS="-lm" ./configure --prefix=$HOME/software/build/p3dfft-2.7.9-dimsc \ 16 | --enable-gnu --enable-fftw --with-fftw=$HOME/software/build/fftw-3.3.9 --enable-openmpi --enable-dimsc 17 | make 18 | make install 19 | ``` 20 | This instruction is on a workstation with gcc 8.4.1 and OpenMPI 4.0.5. Adapt this accordlingly. Note that P3DFFT is built with `-DDIMS_C` flag (enabling same decomposition as 2DECOMP&FFT for a fair comparison). P3DFFT needs to link to *libm* but somehow its build system doesn't handle this correctly, thus the added `LDFLAGS` setting being required. 21 | 22 | Both P3DFFT and 2DECOMP&FFT are built in double precision. P3DFFT is built against FFTW (provide its path in `Makefile`); 2DECOMP&FFT can use any FFT engine. 23 | 24 | **What to expect:** 25 | 26 | * Results from P3DFFT and 2DECOMP&FFT should be almost identical, even when different FFT engines are used. 27 | * Each library should recover its input after one forward and one backward transform with errors reported close to machine accuracy. 
28 | -------------------------------------------------------------------------------- /examples/tecplot_view/2decomp_decomp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numericalalgorithmsgroup/2decomp_fft/a5a126bb4a02fdece45f65aad69cfba26a383499/examples/tecplot_view/2decomp_decomp.png -------------------------------------------------------------------------------- /examples/tecplot_view/Makefile: -------------------------------------------------------------------------------- 1 | include ../../src/Makefile.inc 2 | 3 | INCLUDE = -I../../include 4 | LIBS = -L../../lib -l2decomp_fft 5 | 6 | OBJ = tecplot_view.o 7 | 8 | tecplot_view: $(OBJ) 9 | $(F90) -o $@ $(OBJ) $(LIBS) 10 | 11 | clean: 12 | rm -f *.o tecplot_view 13 | 14 | %.o : %.f90 15 | $(F90) $(INCLUDE) $(OPTIONS) $(F90FLAGS) -c $< 16 | -------------------------------------------------------------------------------- /examples/tecplot_view/README: -------------------------------------------------------------------------------- 1 | tecplot_view 2 | ------------ 3 | 4 | This program generates data to visualise the 2D decomposition options. 5 | 6 | To run: use 12 MPI processes. 7 | 8 | What to expect: several data files are written to disk. These are in the 9 | standard format that can be read by Tecplot (a visualisation package popular 10 | in the CFD community). 11 | 12 | There is also an interactive web page at http://www.2decomp.org/decomp.php 13 | to demonstrate the data distribution in a 2D decomposition. 
14 | -------------------------------------------------------------------------------- /examples/tecplot_view/tecplot_view.f90: -------------------------------------------------------------------------------- 1 | program tecplot_view 2 | 3 | use decomp_2d 4 | use decomp_2d_io 5 | use MPI 6 | 7 | implicit none 8 | 9 | integer, parameter :: nx=17, ny=13, nz=11 10 | integer, parameter :: p_row=4, p_col=3 11 | 12 | real(mytype), dimension(nx,ny,nz) :: data1 13 | 14 | integer, dimension(3) :: lstart, lend, lsize 15 | 16 | integer :: i,j,k, m, ierror 17 | 18 | call MPI_INIT(ierror) 19 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 20 | 21 | ! a copy of global data saved on every rank 22 | m = 1 23 | do k=1,nz 24 | do j=1,ny 25 | do i=1,nx 26 | data1(i,j,k) = real(m, mytype) 27 | m = m+1 28 | end do 29 | end do 30 | end do 31 | 32 | ! master rank generated a Tecplot view of the global data 33 | if (nrank==0) then 34 | open(10,file='data0.dat',form='formatted') 35 | write(10,*) 'TITLE="Tecplot Output"' 36 | write(10,*) 'VARIABLES= "X" "Y" "Z" "VAR"' 37 | write(10,*) 'ZONE F=POINT I=',nx,' J=',ny,' ,K=',nz 38 | do k=1,nz 39 | do j=1,ny 40 | do i=1,nx 41 | write(10,*) i,j,k, data1(i,j,k) 42 | end do 43 | end do 44 | end do 45 | close(10) 46 | end if 47 | 48 | ! Generate Tecplot views of the decompositions 49 | ! -------------------------------------------- 50 | ! For each pencil orientation there are two ways decomposing: 51 | ! p_row*p_col or p_col*p_row. One set is used in 2DECOMP and is 52 | ! described by the library's global variables. The other set is 53 | ! generated here for visualisation. 54 | 55 | ! (/ 1,2,3 /) 56 | call tecplot(nx, ny, nz, data1, xstart, xend, xsize, 'data1.dat') 57 | ! (/ 2,1,3 /) 58 | call tecplot(nx, ny, nz, data1, ystart, yend, ysize, 'data2.dat') 59 | ! 
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Generate Tecplot files to visualise the 2D decompositions
! - each rank corresponds to a Tecplot 'zone'
! - rank 0 handles all I/O
!
! Gather protocol (per non-master rank m):
!   message 1 (tag m)       : 9 integers - start/end/size per dimension
!   message 2 (tag m+nproc) : the rank's local data block
! Rank 0 also rebuilds a second global copy (data1b) from the gathered
! pieces and validates it against the original field.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

subroutine tecplot(nx, ny, nz, data1, lstart, lend, lsize, filename)

  use decomp_2d
  use MPI

  implicit none

  integer, intent(IN) :: nx ,ny ,nz
  real(mytype), dimension(nx,ny,nz), intent(IN) :: data1
  integer, dimension(3), intent(IN) :: lstart, lend, lsize
  character(len=*), intent(IN) :: filename

  ! validation tolerance, kind-consistent with mytype
  ! (was a default-real 1.0e-5 literal compared against mytype data)
  real(mytype), parameter :: eps = 1.0e-5_mytype

  real(mytype), dimension(nx,ny,nz) :: data1b
  real(mytype), allocatable, dimension(:,:,:) :: local_data, rbuf
  integer, dimension(9) :: sbuf1, rbuf1
  integer, dimension(MPI_STATUS_SIZE) :: status
  character(len=7) :: tempstr      ! holds "Rank NN" zone labels
  integer :: i,j,k, m, ierror

  ! data1 holds the first copy of global data, generated locally
  ! data1b holds a second copy of global data, collected via communication

  ! each rank extracts its local portion of the global field
  allocate (local_data(lstart(1):lend(1),lstart(2):lend(2), &
       lstart(3):lend(3)))
  do k=lstart(3),lend(3)
     do j=lstart(2),lend(2)
        do i=lstart(1),lend(1)
           local_data(i,j,k) = data1(i,j,k)
        end do
     end do
  end do

  if (nrank==0) then

     ! master writes file header, collects data from each slave process,
     ! and writes each piece as a separate Tecplot zone
     open(10,file=filename,form='formatted')
     write(10,*) 'TITLE="Tecplot Output"'
     write(10,*) 'VARIABLES= "X" "Y" "Z" "VAR"'
     write(10,*) 'ZONE F=POINT T="Rank 00" I=',lsize(1),' J=',lsize(2), &
          ' ,K=',lsize(3)
     do k=lstart(3),lend(3)
        do j=lstart(2),lend(2)
           do i=lstart(1),lend(1)
              write(10,*) i,j,k, local_data(i,j,k)
              ! master copies its local data to the second global array
              data1b(i,j,k)=local_data(i,j,k)
           end do
        end do
     end do

     ! loop through all other ranks to receive data and write
     do m=1,nproc-1
        ! first the index metadata...
        CALL MPI_RECV(rbuf1,9,MPI_INTEGER,m,m,MPI_COMM_WORLD, &
             status,ierror)
        write(tempstr,100)'Rank ',m
100     format(A,I2.2)
        write(10,*) 'ZONE F=POINT T="', tempstr, '" I=',rbuf1(3), &
             ' J=',rbuf1(6), ' ,K=',rbuf1(9)
        ! ...then the data itself, into a buffer with the sender's
        ! global index bounds
        allocate (rbuf(rbuf1(1):rbuf1(2),rbuf1(4):rbuf1(5), &
             rbuf1(7):rbuf1(8)))
        CALL MPI_RECV(rbuf,rbuf1(3)*rbuf1(6)*rbuf1(9),real_type,m, &
             m+nproc,MPI_COMM_WORLD,status,ierror)
        do k=rbuf1(7),rbuf1(8)
           do j=rbuf1(4),rbuf1(5)
              do i=rbuf1(1),rbuf1(2)
                 write(10,*) i,j,k, rbuf(i,j,k)
                 ! data received copied to global array
                 data1b(i,j,k)=rbuf(i,j,k)
              end do
           end do
        end do
        deallocate(rbuf)
     end do

     close (10)

     ! check if data set collected via communication is correct
     do k=1,nz
        do j=1,ny
           do i=1,nx
              if (abs(data1b(i,j,k)-data1(i,j,k)) > eps) then
                 stop "error"
              end if
           end do
        end do
     end do

  else

     ! slaves send their index metadata then their data to the master
     sbuf1(1) = lstart(1)
     sbuf1(2) = lend(1)
     sbuf1(3) = lsize(1)
     sbuf1(4) = lstart(2)
     sbuf1(5) = lend(2)
     sbuf1(6) = lsize(2)
     sbuf1(7) = lstart(3)
     sbuf1(8) = lend(3)
     sbuf1(9) = lsize(3)
     CALL MPI_SEND(sbuf1,9,MPI_INTEGER,0,nrank,MPI_COMM_WORLD,ierror)
     CALL MPI_SEND(local_data,lsize(1)*lsize(2)*lsize(3),real_type,0, &
          nrank+nproc,MPI_COMM_WORLD,ierror)

  endif

  deallocate(local_data)
  return

end subroutine tecplot
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Validate the 2D pencil decomposition library: transpose a data set
! through all four swap routines (x->y, y->z, z->y, y->x), checking the
! result against a global reference field after each swap, and write
! each orientation to disk via the I/O library. Run with 12 processes.
! All files written collectively (u1.dat, u2.dat, u3.dat, u2b.dat,
! u1b.dat) should be byte-identical.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
program test2d

  use decomp_2d
  use decomp_2d_io

  implicit none

  integer, parameter :: nx=17, ny=13, nz=11
  integer, parameter :: p_row=4, p_col=3

  real(mytype), dimension(nx,ny,nz) :: data1

  real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3

  integer :: i,j,k, m, ierror

  call MPI_INIT(ierror)
  call decomp_2d_init(nx,ny,nz,p_row,p_col)

  ! ***** global data *****
  ! values are the integers 1..nx*ny*nz in natural ordering;
  ! real(m, mytype) replaces the nonstandard FLOAT intrinsic, which is
  ! single precision regardless of mytype
  m = 1
  do k=1,nz
     do j=1,ny
        do i=1,nx
           data1(i,j,k) = real(m, mytype)
           m = m+1
        end do
     end do
  end do

  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  ! Testing the swap routines
  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

  ! equivalent to explicit allocation with global index ranges:
  !allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
  !allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
  !allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
  call alloc_x(u1, opt_global=.true.)
  call alloc_y(u2, opt_global=.true.)
  call alloc_z(u3, opt_global=.true.)

  ! original x-pencil based data
  do k=xstart(3),xend(3)
     do j=xstart(2),xend(2)
        do i=xstart(1),xend(1)
           u1(i,j,k) = data1(i,j,k)
        end do
     end do
  end do

10 format(15I5)

  if (nrank==0) then
     write(*,*) 'Numbers held on Rank 0'
     write(*,*) ' '
     write(*,*) 'X-pencil'
     write(*,10) int(u1)
  end if

  call decomp_2d_write_one(1,u1,'u1.dat')

  !!!!!!!!!!!!!!!!!!!!!!!
  ! x-pencil ==> y-pencil
  call transpose_x_to_y(u1,u2)

  if (nrank==0) then
     write(*,*) ' '
     write(*,*) 'Y-pencil'
     write(*,10) int(u2)
  end if

  call decomp_2d_write_one(2,u2,'u2.dat')
  ! 'u1.dat' and 'u2.dat' should be identical byte-by-byte

  ! also check the transposition this way
  do k=ystart(3),yend(3)
     do j=ystart(2),yend(2)
        do i=ystart(1),yend(1)
           if (abs(u2(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping x->y"
        end do
     end do
  end do

  !!!!!!!!!!!!!!!!!!!!!!!
  ! y-pencil ==> z-pencil
  call transpose_y_to_z(u2,u3)

  if (nrank==0) then
     write(*,*) ' '
     write(*,*) 'Z-pencil'
     write(*,10) int(u3)
  end if

  call decomp_2d_write_one(3,u3,'u3.dat')
  ! 'u1.dat','u2.dat' and 'u3.dat' should be identical

  do k=zstart(3),zend(3)
     do j=zstart(2),zend(2)
        do i=zstart(1),zend(1)
           if (abs(u3(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping y->z"
        end do
     end do
  end do

  !!!!!!!!!!!!!!!!!!!!!!!
  ! z-pencil ==> y-pencil
  call transpose_z_to_y(u3,u2)
  call decomp_2d_write_one(2,u2,'u2b.dat')

  do k=ystart(3),yend(3)
     do j=ystart(2),yend(2)
        do i=ystart(1),yend(1)
           if (abs(u2(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping z->y"
        end do
     end do
  end do

  !!!!!!!!!!!!!!!!!!!!!!!
  ! y-pencil ==> x-pencil
  call transpose_y_to_x(u2,u1)
  call decomp_2d_write_one(1,u1,'u1b.dat')

  do k=xstart(3),xend(3)
     do j=xstart(2),xend(2)
        do i=xstart(1),xend(1)
           if (abs(u1(i,j,k)-data1(i,j,k)).gt.0) stop "error swaping y->x"
        end do
     end do
  end do

  call decomp_2d_finalize
  call MPI_FINALIZE(ierror)
  deallocate(u1,u2,u3)

end program test2d
11 | 12 | What to expect: 13 | - The timing results 14 | - The error reported should be around machine accuracy (~ 10^-6 for single 15 | precision and 10^-15 for double) 16 | -------------------------------------------------------------------------------- /examples/timing/timing.f90: -------------------------------------------------------------------------------- 1 | program fft_timing 2 | 3 | use decomp_2d 4 | use decomp_2d_fft 5 | use MPI 6 | 7 | implicit none 8 | 9 | integer, parameter :: nx=17, ny=13, nz=11 10 | integer, parameter :: p_row=0, p_col=0 11 | 12 | integer, parameter :: NTEST = 10 ! repeat test this times 13 | 14 | complex(mytype), allocatable, dimension(:,:,:) :: in, out 15 | real(mytype), allocatable, dimension(:,:,:) :: in_r 16 | 17 | integer, dimension(3) :: fft_start, fft_end, fft_size 18 | 19 | real(mytype) :: dr,di, err, err_all, n1,flops 20 | integer :: ierror, i,j,k,m 21 | double precision :: t1, t2, t3 ,t4 22 | 23 | call MPI_INIT(ierror) 24 | call decomp_2d_init(nx,ny,nz,p_row,p_col) 25 | 26 | 27 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 28 | ! Test the c2c interface 29 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 30 | 31 | call decomp_2d_fft_init(PHYSICAL_IN_Z) ! non-default Z-pencil input 32 | 33 | ! input is Z-pencil data 34 | ! output is X-pencil data 35 | allocate (in(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3))) 36 | allocate (out(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3))) 37 | ! initilise input 38 | do k=zstart(3),zend(3) 39 | do j=zstart(2),zend(2) 40 | do i=zstart(1),zend(1) 41 | dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) & 42 | /real(ny,mytype)*real(k,mytype)/real(nz,mytype) 43 | di = dr 44 | in(i,j,k) = cmplx(dr,di,mytype) 45 | end do 46 | end do 47 | end do 48 | 49 | t2 = 0.0D0 50 | t4 = 0.0D0 51 | do m=1,NTEST 52 | 53 | ! 
forward FFT 54 | t1 = MPI_WTIME() 55 | call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD) 56 | t2 = t2 + MPI_WTIME() - t1 57 | 58 | ! inverse FFT 59 | t3 = MPI_WTIME() 60 | call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD) 61 | t4 = t4 + MPI_WTIME() - t3 62 | 63 | ! normalisation - note 2DECOMP&FFT doesn't normalise 64 | in = in / real(nx,mytype) / real(ny,mytype) /real(nz,mytype) 65 | 66 | end do 67 | 68 | call MPI_ALLREDUCE(t2,t1,1,MPI_DOUBLE_PRECISION,MPI_SUM, & 69 | MPI_COMM_WORLD,ierror) 70 | t1 = t1 / real(nproc,mytype) 71 | call MPI_ALLREDUCE(t4,t3,1,MPI_DOUBLE_PRECISION,MPI_SUM, & 72 | MPI_COMM_WORLD,ierror) 73 | t3 = t3 / real(nproc,mytype) 74 | 75 | ! checking accuracy 76 | err = 0. 77 | do k=zstart(3),zend(3) 78 | do j=zstart(2),zend(2) 79 | do i=zstart(1),zend(1) 80 | dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) & 81 | /real(ny,mytype)*real(k,mytype)/real(nz,mytype) 82 | di = dr 83 | dr = dr - real(in(i,j,k),mytype) 84 | di = di - aimag(in(i,j,k)) 85 | err = err + sqrt(dr*dr + di*di) 86 | end do 87 | end do 88 | end do 89 | call MPI_ALLREDUCE(err,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror) 90 | err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype) 91 | 92 | if (nrank==0) then 93 | write(*,*) '===== c2c interface =====' 94 | write(*,*) 'error / mesh point: ', err_all 95 | write(*,*) 'time (sec): ', t1,t3 96 | n1 = real(nx,mytype) * real(ny,mytype) * real(nz,mytype) 97 | n1 = n1 ** (1._mytype/3._mytype) 98 | ! 5n*log(n) flops per 1D FFT of size n using Cooley-Tukey algorithm 99 | flops = 5._mytype * n1 * log(n1) / log(2.0_mytype) 100 | ! 3 sets of 1D FFTs for 3 directions, each having n^2 1D FFTs 101 | flops = flops * 3._mytype * n1**2 102 | flops = 2._mytype * flops / ((t1+t3)/real(NTEST,mytype)) 103 | write(*,*) 'GFLOPS : ', flops / 1000._mytype**3 104 | end if 105 | 106 | deallocate(in,out) 107 | call decomp_2d_fft_finalize 108 | 109 | 110 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
111 | ! Test the r2c/c2r interface 112 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 113 | call decomp_2d_fft_init 114 | 115 | allocate (in_r(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3))) 116 | call decomp_2d_fft_get_size(fft_start,fft_end,fft_size) 117 | allocate (out(fft_start(1):fft_end(1), & 118 | fft_start(2):fft_end(2), & 119 | fft_start(3):fft_end(3))) 120 | 121 | ! initilise input 122 | do k=xstart(3),xend(3) 123 | do j=xstart(2),xend(2) 124 | do i=xstart(1),xend(1) 125 | in_r(i,j,k) = real(i,mytype)/real(nx,mytype)*real(j,mytype) & 126 | /real(ny,mytype)*real(k,mytype)/real(nz,mytype) 127 | end do 128 | end do 129 | end do 130 | 131 | t2 = 0.0D0 132 | t4 = 0.0D0 133 | do m=1,NTEST 134 | 135 | ! 3D r2c FFT 136 | t1 = MPI_WTIME() 137 | call decomp_2d_fft_3d(in_r, out) 138 | t2 = t2 + MPI_WTIME() - t1 139 | 140 | ! 3D inverse FFT 141 | t3 = MPI_WTIME() 142 | call decomp_2d_fft_3d(out, in_r) 143 | t4 = t4 + MPI_WTIME() - t3 144 | 145 | ! normalisation - note 2DECOMP&FFT doesn't normalise 146 | do k=xstart(3),xend(3) 147 | do j=xstart(2),xend(2) 148 | do i=xstart(1),xend(1) 149 | in_r(i,j,k) = in_r(i,j,k) & 150 | / (real(nx,mytype)*real(ny,mytype)*real(nz,mytype)) 151 | end do 152 | end do 153 | end do 154 | 155 | end do 156 | 157 | call MPI_ALLREDUCE(t2,t1,1,MPI_DOUBLE_PRECISION,MPI_SUM, & 158 | MPI_COMM_WORLD,ierror) 159 | t1 = t1 / real(nproc,mytype) 160 | call MPI_ALLREDUCE(t4,t3,1,MPI_DOUBLE_PRECISION,MPI_SUM, & 161 | MPI_COMM_WORLD,ierror) 162 | t3 = t3 / real(nproc,mytype) 163 | 164 | ! checking accuracy 165 | err = 0. 
166 | do k=xstart(3),xend(3) 167 | do j=xstart(2),xend(2) 168 | do i=xstart(1),xend(1) 169 | dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) & 170 | /real(ny,mytype)*real(k,mytype)/real(nz,mytype) 171 | err = err + abs(in_r(i,j,k)-dr) 172 | end do 173 | end do 174 | end do 175 | call MPI_ALLREDUCE(err,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror) 176 | err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype) 177 | 178 | if (nrank==0) then 179 | write(*,*) '===== r2c/c2r interface =====' 180 | write(*,*) 'error / mesh point: ', err_all 181 | write(*,*) 'time (sec): ', t1,t3 182 | end if 183 | 184 | deallocate(in_r,out) 185 | call decomp_2d_fft_finalize 186 | call decomp_2d_finalize 187 | call MPI_FINALIZE(ierror) 188 | 189 | end program fft_timing 190 | 191 | -------------------------------------------------------------------------------- /include/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | VPATH=../src 2 | 3 | LIBNAME:=lib2decomp_fft.a 4 | 5 | all: lib 6 | 7 | .PHONY: lib 8 | 9 | lib: 10 | cd ../src ; $(MAKE) $@ 11 | 12 | clean: 13 | rm -f $(LIBNAME) 14 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # Makefile for building the 2DECOMP&FFT library 3 | ######################################################################## 4 | 5 | # Normally, do not change anything here. 
Modify the platform-dependent 6 | # Makefile.inc file instead 7 | 8 | include Makefile.inc 9 | 10 | SRCS = decomp_2d.f90 io.f90 glassman.f90 fft_$(FFT).f90 11 | 12 | # special treatment for shared-memory code 13 | ifneq (,$(findstring DSHM,$(OPTIONS))) 14 | SRCS := FreeIPC.f90 $(SRCS) 15 | OBJS = $(SRCS:.f90=.o) alloc_shm.o FreeIPC_c.o 16 | else 17 | OBJS = $(SRCS:.f90=.o) 18 | endif 19 | 20 | # special treatment for Intel MKL (need to build MKL mod files) 21 | MKL_MOD= 22 | ifeq ($(FFT),mkl) 23 | MKL_MOD=mkl_mod 24 | endif 25 | 26 | # special treatment for CUDA 27 | ifeq ($(FFT),cufft) 28 | OBJS += cuda_fft_1m.o 29 | endif 30 | 31 | 32 | all: lib examples 33 | 34 | lib: includes lib2decomp_fft.a 35 | mv lib2decomp_fft.a ../lib 36 | 37 | includes: lib2decomp_fft.a 38 | mv *.mod ../include 39 | 40 | cuda_fft_1m.o: cuda_fft_1m.cu 41 | $(CUDA_PATH)/bin/nvcc -c $< 42 | 43 | lib2decomp_fft.a: $(MKL_MOD) $(OBJS) 44 | ar qc $@ $(OBJS) 45 | 46 | alloc_shm.o: alloc_shm.c 47 | $(CC) $(CFLAGS) -c $< 48 | 49 | FreeIPC_c.o: FreeIPC_c.c 50 | $(CC) $(CFLAGS) -c $< 51 | 52 | .PHONY: mkl_mod 53 | mkl_mod: 54 | echo Building Intel MKL mod files... 
55 | $(IFORT) -c $(MKL_ROOT)/include/mkl_dfti.f90 56 | 57 | .PHONY: examples 58 | examples: lib 59 | 60 | .PHONY: test 61 | test: examples 62 | 63 | .PHONY: clean 64 | clean: 65 | rm -f *.o *.mod lib*.a 66 | 67 | .PHONY: realclean 68 | realclean: clean 69 | rm -f *~ \#*\# 70 | 71 | %.o : %.f90 72 | $(F90) $(OPTIONS) $(F90FLAGS) -c $< 73 | -------------------------------------------------------------------------------- /src/Makefile.inc: -------------------------------------------------------------------------------- 1 | # Configurations on an ordinary Linux PCs for development 2 | # using GNU compiler, OpenMPI and various FFT libraries 3 | 4 | # 2DECOMP&FFT options 5 | #==================== 6 | 7 | # Choose pre-processing options 8 | # -DDOUBLE_PREC - use double-precision (default single) 9 | # -DEVEN - for evenly distributed data, use ALLTOALL 10 | # -DSHM - enable system V shared-memory implementation 11 | # -DOVERWRITE - allow FFT input to be overwritten (save memory) 12 | # -DT3PIO - turn on LUSTRE IO optimisation code using T3PIO 13 | OPTIONS= 14 | 15 | # Choose one FFT engine, available options are: 16 | # acml - AMD Core Math Library 17 | # cufft - cuFFT, the CUDA Fast Fourier Transform library 18 | # ffte - FFTE 19 | # fftpack5 - FFTPACK5 20 | # fftw3 - FFTW version 3.x 21 | # fftw3_f03 - FFTW 3.3-beta1 or later (with Fortran 2003 interface) 22 | # generic - A general FFT algorithm (no 3rd-party library needed) 23 | # mkl - Intel Math Kernel Library 24 | FFT=generic 25 | 26 | # Platform-dependent information - compiler, external library etc 27 | #================================================================ 28 | 29 | # Inlcude path if necessary (change to your actual paths) 30 | MKL_ROOT=/opt/intel/oneapi/mkl/latest 31 | FFTW_PATH=$(HOME)/software/build/fftw-3.3.9 32 | CUDA_PATH=/usr/local/cuda-11.1 33 | ifeq ($(FFT),mkl) 34 | INC=-I$(MKL_ROOT)/include 35 | # Fortran compiler used to compile MKL mod files 36 | IFORT=gfortran 37 | else ifeq ($(FFT),fftw3) 
38 | INC=-I$(FFTW_PATH)/include 39 | else ifeq ($(FFT),fftw3_f03) 40 | INC=-I$(FFTW_PATH)/include 41 | else ifeq ($(FFT),cufft) 42 | INC=-I$(CUDA_PATH)/include 43 | else 44 | INC= 45 | endif 46 | 47 | ifneq (,$(findstring DT3PIO,$(OPTIONS))) 48 | T3PIO_PATH=/opt/t3pio 49 | INC+= -I$(T3PIO_PATH)/include 50 | endif 51 | 52 | 53 | #----------------------- Fortran Compiler ---------------------------- 54 | F90=mpif90 55 | 56 | # enable preprocessing 57 | CPPFLAGS=-cpp 58 | # enable Cray pointer support if needed 59 | CRAYPTR=-fcray-pointer 60 | # optimisation or debugging flags 61 | #OPTIM=-g -fcheck=all 62 | OPTIM=-O3 63 | 64 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC) 65 | LDFLAGS=$(OPTIM) 66 | 67 | #--------------------------C Compiler--------------------------------- 68 | CC=mpicc 69 | CFLAGS=-O3 70 | 71 | #-----------------------External Library------------------------------ 72 | 73 | # For FFTW 74 | LIB_FFTW3=-L$(FFTW_PATH)/lib -lfftw3f -lfftw3 75 | 76 | # For ACML 77 | # This assumes that 32-bit ACML installed at /opt/. Adjust properly. 78 | # It helps to create a symbolic link such as 'acml -> acml4.4.0' 79 | # and update the symbolic link when future ACML version is installed. 
80 | LIB_ACML=/opt/acml/gfortran32/lib/libacml.a -lrt 81 | 82 | # For FFTPACK5 83 | FFTPACK5_PATH=$(HOME)/software/fftpack5 84 | LIB_FFTPACK5=$(FFTPACK5_PATH)/libfftpack5.a 85 | 86 | # For Intel MKL 87 | MKL_LIB_PATH= $(MKL_ROOT)/lib/intel64 88 | LIB_MKL=-Wl,--start-group $(MKL_LIB_PATH)/libmkl_gf_lp64.a $(MKL_LIB_PATH)/libmkl_sequential.a $(MKL_LIB_PATH)/libmkl_core.a -Wl,--end-group -lpthread -lm -ldl 89 | 90 | # For FFTE 91 | FFTE_PATH=$(HOME)/software/ffte-4.1 92 | LIB_FFTE=$(FFTE_PATH)/libffte.a 93 | 94 | # For cuFFT 95 | LIB_CUFFT=-L$(CUDA_PATH)/lib64 -lcudart -lcufft 96 | 97 | ifeq ($(FFT),generic) 98 | LIBFFT= 99 | else ifeq ($(FFT),acml) 100 | LIBFFT=$(LIB_ACML) 101 | else ifeq ($(FFT),ffte) 102 | LIBFFT=$(LIB_FFTE) 103 | else ifeq ($(FFT),fftpack5) 104 | LIBFFT=$(LIB_FFTPACK5) 105 | else ifeq ($(FFT),fftw3) 106 | LIBFFT=$(LIB_FFTW3) 107 | else ifeq ($(FFT),fftw3_f03) 108 | LIBFFT=$(LIB_FFTW3) 109 | else ifeq ($(FFT),mkl) 110 | LIBFFT=$(LIB_MKL) 111 | else ifeq ($(FFT),cufft) 112 | LIBFFT=$(LIB_CUFFT) 113 | endif 114 | -------------------------------------------------------------------------------- /src/Makefile.inc.BlueGene: -------------------------------------------------------------------------------- 1 | # Configurations for IBM BlueGene systems 2 | # using IBM XL compilers 3 | 4 | # 2DECOMP&FFT options 5 | #==================== 6 | 7 | # Choose pre-processing options 8 | # -DDOUBLE_PREC - use double-precision (default single) 9 | # -DEVEN - for evenly distributed data, use ALLTOALL 10 | # -DOVERWRITE - allow FFT input to be overwritten (save memory) 11 | OPTION=-DDOUBLE_PREC -DOVERWRITE 12 | 13 | # Choose one FFT engine, available options are: 14 | # essl - IBM Engineering and Scientific Subroutine Library 15 | # fftw3 - FFTW version 3.x 16 | # fftw3_f03 - FFTW 3.3-beta1 or later (with Fortran 2003 interface) 17 | # generic - A general FFT algorithm (no 3rd-party library needed) 18 | FFT=generic 19 | 20 | # Platform-dependent information 21 | 
#=============================== 22 | 23 | # special syntax for IBM XL compiler's preprocessing 24 | # instead of using "-DTAG1 -DTAG2", XL needs "-WF,-DTAG1 -WF,-DTAG2" 25 | from:=-D 26 | to:=-WF,-D 27 | TMP=$(subst $(from),$(to),$(OPTION)) 28 | OPTIONS=$(TMP) 29 | 30 | # The path of the base BlueGene system software 31 | BGP_SYS=/bgsys/drivers/V1R4M1_460_2009-091110P/ppc/comm/xl 32 | 33 | # Inlcude path 34 | INC= 35 | 36 | #----------------------- Fortran Compiler ---------------------------- 37 | F90=$(BGP_SYS)/bin/mpixlf90_r 38 | 39 | # enable preprocessing 40 | CPPFLAGS=-qsuffix=cpp=f90 41 | 42 | # enable Cray pointer support if needed 43 | CRAYPTR= 44 | 45 | # optimisation or debugging flags 46 | OPTIM=-O3 -qarch=450d -qtune=450 47 | 48 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC) 49 | LDFLAGS=$(OPTIM) 50 | 51 | #--------------------------C Compiler--------------------------------- 52 | CC=mpixlc_r 53 | CFLAGS=-O3 54 | 55 | #-----------------------External Library------------------------------ 56 | 57 | # for FFTW 58 | LIB_FFTW3=-lfftw3f -lfftw3 -L # supply path to FFTW3 here 59 | 60 | # for ESSL 61 | LIB_ESSL=-L$(BGP_SYS)/lib -L/opt/ibmmath/lib -lesslbg 62 | 63 | ifeq ($(FFT),generic) 64 | LIBFFT= 65 | else ifeq ($(FFT),fftw3) 66 | LIBFFT=$(LIB_FFTW3) 67 | else ifeq ($(FFT),fftw3_f03) 68 | LIBFFT=$(LIB_FFTW3) 69 | else ifeq ($(FFT),essl) 70 | LIBFFT=$(LIB_ESSL) 71 | endif 72 | -------------------------------------------------------------------------------- /src/Makefile.inc.Cray_XE: -------------------------------------------------------------------------------- 1 | # Configurations for Cray XT/XE systems 2 | # using PGI/PathScale/GNU/Cray compilers 3 | 4 | # 2DECOMP&FFT options 5 | #==================== 6 | 7 | # Choose pre-processing options 8 | # -DDOUBLE_PREC - use double-precision (default single) 9 | # -DEVEN - for evenly distributed data, use ALLTOALL 10 | # -DSHM - enable shared-memory implementation 11 | # -DOVERWRITE - allow FFT input to be 
overwritten (save memory) 12 | OPTIONS=-DDOUBLE_PREC -DOVERWRITE 13 | 14 | # Choose one FFT engine, available options are: 15 | # acml - AMD Core Math Library 16 | # fftw3 - FFTW version 3.x 17 | # fftw3_f03 - FFTW 3.3-beta1 or later (with Fortran 2003 interface) 18 | # generic - A general FFT algorithm (no 3rd-party library needed) 19 | FFT=fftw3 20 | 21 | # Platform-dependent information 22 | #=============================== 23 | 24 | # Choose compiler suite - valid options: PGI, PathScale, GNU, Cray, Intel 25 | COMPILER=PGI 26 | 27 | # Inlcude path not set, relying on Cray's modules 28 | INC= 29 | 30 | #----------------------- Fortran Compiler ---------------------------- 31 | F90=ftn 32 | 33 | # enable preprocessing 34 | ifeq ($(COMPILER),PGI) 35 | CPPFLAGS=-Mpreprocess 36 | else ifeq ($(COMPILER),PathScale) 37 | CPPFLAGS=-cpp 38 | else ifeq ($(COMPILER),GNU) 39 | CPPFLAGS=-cpp 40 | else ifeq ($(COMPILER),Cray) 41 | CPPFLAGS=-e Fm 42 | else ifeq ($(COMPILER),Intel) 43 | CPPFLAGS=-fpp 44 | endif 45 | 46 | # enable Cray pointer support if needed 47 | ifeq ($(COMPILER),GNU) 48 | CRAYPTR=-fcray-pointer 49 | else 50 | CRAYPTR= 51 | endif 52 | 53 | # optimisation or debugging flags 54 | ifeq ($(COMPILER),PGI) 55 | OPTIM=-O3 56 | else ifeq ($(COMPILER),PathScale) 57 | OPTIM=-O3 58 | else ifeq ($(COMPILER),GNU) 59 | #OPTIM=-g -fbounds-check 60 | OPTIM=-O3 61 | else ifeq ($(COMPILER),Cray) 62 | OPTIM=-O3 63 | else ifeq ($(COMPILER),Intel) 64 | OPTIM=-O3 # no -fast as IPA cause problem 65 | endif 66 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC) 67 | LDFLAGS=$(OPTIM) 68 | 69 | #--------------------------C Compiler--------------------------------- 70 | CC=cc 71 | CFLAGS=-O3 72 | 73 | #-----------------------External Library------------------------------ 74 | 75 | # Do not need any as this is all handled by the 'module' system 76 | 77 | LIBFFT= 78 | 79 | -------------------------------------------------------------------------------- 
/src/Makefile.inc.Fujitsu_SPARC64_VIIIfx: -------------------------------------------------------------------------------- 1 | # Configurations for SPARC64 servers using Fujitsu compiler 2 | # configutation tested on a SPARC64 VIIIfx machine 3 | 4 | # 2DECOMP&FFT options 5 | #==================== 6 | 7 | # Choose pre-processing options 8 | # -DDOUBLE_PREC - use double-precision (default single) 9 | # -DEVEN - for evenly distributed data, use ALLTOALL 10 | # -DOVERWRITE - allow FFT input to be overwritten (save memory) 11 | OPTIONS=-DDOUBLE_PREC -DOVERWRITE 12 | 13 | # Choose one FFT engine, available options are: 14 | # ffte - FFTE 15 | # fftw3 - FFTW version 3.x 16 | # fftw3_f03 - FFTW 3.3-beta1 or later (with Fortran 2003 interface) 17 | # generic - A general FFT algorithm (no 3rd-party library needed) 18 | FFT=generic 19 | 20 | # Platform-dependent information - compiler, external library etc 21 | #================================================================ 22 | 23 | # Inlcude path if necessary 24 | # Need to compile a copy of FFTW version 3.x using the same Fujitsu compiler 25 | FFTW_PATH= 26 | ifeq ($(FFT),fftw3) 27 | INC=-I$(FFTW_PATH)/include 28 | else ifeq ($(FFT),fftw3_f03) 29 | INC=-I$(FFTW_PATH)/include 30 | else 31 | INC= 32 | endif 33 | 34 | #----------------------- Fortran Compiler ---------------------------- 35 | F90=mpifrtpx 36 | 37 | # enable preprocessing 38 | CPPFLAGS=-Cpp 39 | # enable Cray pointer support if needed 40 | CRAYPTR= 41 | # optimisation or debugging flags 42 | #OPTIM=-g 43 | OPTIM=-Kfast 44 | 45 | F90FLAGS=$(OPTIM) $(CRAYPTR) $(CPPFLAGS) $(INC) 46 | LDFLAGS=$(OPTIM) 47 | 48 | #--------------------------C Compiler--------------------------------- 49 | CC=mpifccpx 50 | CFLAGS=-Kfast 51 | 52 | #-----------------------External Library------------------------------ 53 | 54 | # For FFTW 55 | LIB_FFTW3=-L$(FFTW_PATH)/lib -lfftw3f -lfftw3 56 | 57 | # For FFTE 58 | FFTE_PATH=path/to/ffte-4.1 # compile FFTE using the same compiler 59 | 
LIB_FFTE=$(FFTE_PATH)/libffte.a 60 | 61 | ifeq ($(FFT),generic) 62 | LIBFFT= 63 | else ifeq ($(FFT),ffte) 64 | LIBFFT=$(LIB_FFTE) 65 | else ifeq ($(FFT),fftw3) 66 | LIBFFT=$(LIB_FFTW3) 67 | else ifeq ($(FFT),fftw3_f03) 68 | LIBFFT=$(LIB_FFTW3) 69 | endif 70 | -------------------------------------------------------------------------------- /src/acml_plan.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contains subroutines that generate ACML plans 13 | ! for several types of 1D multiple FFTs. 14 | 15 | ! Note most ACML plans can be shared by forward/backward transforms 16 | 17 | ! 
Return an ACML plan for multiple 1D c2c FFTs in X direction 18 | subroutine c2c_1m_x_plan(comm, decomp) 19 | 20 | implicit none 21 | complex(mytype), allocatable, dimension(:), intent(OUT) :: comm 22 | TYPE(DECOMP_INFO), intent(IN) :: decomp 23 | 24 | complex(mytype), allocatable, dimension(:,:,:) :: dummy 25 | integer :: info 26 | 27 | #ifdef DOUBLE_PREC 28 | allocate(comm(3*decomp%xsz(1)+100)) 29 | #else 30 | allocate(comm(5*decomp%xsz(1)+100)) 31 | #endif 32 | 33 | allocate(dummy(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3))) 34 | 35 | #ifdef DOUBLE_PREC 36 | call zfft1mx(plan_type,scale,.true.,decomp%xsz(2)*decomp%xsz(3), & 37 | decomp%xsz(1), dummy,1,decomp%xsz(1),dummy,1,decomp%xsz(1), & 38 | comm,info) 39 | #else 40 | call cfft1mx(plan_type,scale,.true.,decomp%xsz(2)*decomp%xsz(3), & 41 | decomp%xsz(1), dummy,1,decomp%xsz(1),dummy,1,decomp%xsz(1), & 42 | comm,info) 43 | #endif 44 | 45 | deallocate(dummy) 46 | 47 | return 48 | end subroutine c2c_1m_x_plan 49 | 50 | ! Return an ACML plan for multiple 1D c2c FFTs in Y direction 51 | subroutine c2c_1m_y_plan(comm, decomp) 52 | 53 | implicit none 54 | complex(mytype), allocatable, dimension(:), intent(OUT) :: comm 55 | TYPE(DECOMP_INFO), intent(IN) :: decomp 56 | 57 | complex(mytype), allocatable, dimension(:,:,:) :: dummy 58 | integer :: info 59 | 60 | #ifdef DOUBLE_PREC 61 | allocate(comm(3*decomp%ysz(2)+100)) 62 | #else 63 | allocate(comm(5*decomp%ysz(2)+100)) 64 | #endif 65 | 66 | allocate(dummy(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3))) 67 | 68 | #ifdef DOUBLE_PREC 69 | call zfft1mx(plan_type,scale,.true.,decomp%ysz(1),decomp%ysz(2), & 70 | dummy,decomp%ysz(1),1,dummy,decomp%ysz(1),1,comm,info) 71 | #else 72 | call cfft1mx(plan_type,scale,.true.,decomp%ysz(1),decomp%ysz(2), & 73 | dummy,decomp%ysz(1),1,dummy,decomp%ysz(1),1,comm,info) 74 | #endif 75 | 76 | deallocate(dummy) 77 | 78 | return 79 | end subroutine c2c_1m_y_plan 80 | 81 | ! 
Return an ACML plan for multiple 1D c2c FFTs in Z direction 82 | subroutine c2c_1m_z_plan(comm, decomp) 83 | 84 | implicit none 85 | complex(mytype), allocatable, dimension(:), intent(OUT) :: comm 86 | TYPE(DECOMP_INFO), intent(IN) :: decomp 87 | 88 | complex(mytype), allocatable, dimension(:,:,:) :: dummy 89 | integer :: info 90 | 91 | #ifdef DOUBLE_PREC 92 | allocate(comm(3*decomp%zsz(3)+100)) 93 | #else 94 | allocate(comm(5*decomp%zsz(3)+100)) 95 | #endif 96 | 97 | allocate(dummy(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3))) 98 | 99 | #ifdef DOUBLE_PREC 100 | call zfft1mx(plan_type,scale,.true.,decomp%zsz(1)*decomp%zsz(2), & 101 | decomp%zsz(3),dummy,decomp%zsz(1)*decomp%zsz(2),1,dummy, & 102 | decomp%zsz(1)*decomp%zsz(2),1,comm,info) 103 | #else 104 | call cfft1mx(plan_type,scale,.true.,decomp%zsz(1)*decomp%zsz(2), & 105 | decomp%zsz(3),dummy,decomp%zsz(1)*decomp%zsz(2),1,dummy, & 106 | decomp%zsz(1)*decomp%zsz(2),1,comm,info) 107 | #endif 108 | 109 | deallocate(dummy) 110 | 111 | return 112 | end subroutine c2c_1m_z_plan 113 | 114 | 115 | ! Return an ACML plan for multiple 1D r2c FFTs in X direction 116 | subroutine r2c_1m_x_plan(comm, decomp) 117 | 118 | implicit none 119 | real(mytype), allocatable, dimension(:), intent(OUT) :: comm 120 | TYPE(DECOMP_INFO), intent(IN) :: decomp 121 | 122 | real(mytype), allocatable, dimension(:) :: dummy 123 | integer :: info 124 | 125 | allocate(comm(3*decomp%xsz(1)+100)) 126 | 127 | allocate(dummy(decomp%xsz(1))) 128 | 129 | #ifdef DOUBLE_PREC 130 | call dzfft(plan_type,decomp%xsz(1),dummy,comm,info) 131 | #else 132 | call scfft(plan_type,decomp%xsz(1),dummy,comm,info) 133 | #endif 134 | 135 | deallocate(dummy) 136 | 137 | return 138 | end subroutine r2c_1m_x_plan 139 | 140 | ! 
Return an ACML plan for multiple 1D c2r FFTs in X direction 141 | subroutine c2r_1m_x_plan(comm, decomp) 142 | 143 | implicit none 144 | real(mytype), allocatable, dimension(:), intent(OUT) :: comm 145 | TYPE(DECOMP_INFO), intent(IN) :: decomp 146 | 147 | real(mytype), allocatable, dimension(:) :: dummy 148 | integer :: info 149 | 150 | allocate(comm(3*decomp%xsz(1)+100)) 151 | 152 | allocate(dummy(decomp%xsz(1))) 153 | 154 | #ifdef DOUBLE_PREC 155 | call zdfft(plan_type,decomp%xsz(1),dummy,comm,info) 156 | #else 157 | call csfft(plan_type,decomp%xsz(1),dummy,comm,info) 158 | #endif 159 | 160 | deallocate(dummy) 161 | 162 | return 163 | end subroutine c2r_1m_x_plan 164 | 165 | ! Return an ACML plan for multiple 1D r2c FFTs in Z direction 166 | subroutine r2c_1m_z_plan(comm, decomp) 167 | 168 | implicit none 169 | real(mytype), allocatable, dimension(:), intent(OUT) :: comm 170 | TYPE(DECOMP_INFO), intent(IN) :: decomp 171 | 172 | real(mytype), allocatable, dimension(:) :: dummy 173 | integer :: info 174 | 175 | allocate(comm(3*decomp%zsz(3)+100)) 176 | 177 | allocate(dummy(decomp%zsz(3))) 178 | 179 | #ifdef DOUBLE_PREC 180 | call dzfft(plan_type,decomp%zsz(3),dummy,comm,info) 181 | #else 182 | call scfft(plan_type,decomp%zsz(3),dummy,comm,info) 183 | #endif 184 | 185 | deallocate(dummy) 186 | 187 | return 188 | end subroutine r2c_1m_z_plan 189 | 190 | ! 
Return an ACML plan for multiple 1D c2r FFTs in Z direction 191 | subroutine c2r_1m_z_plan(comm, decomp) 192 | 193 | implicit none 194 | real(mytype), allocatable, dimension(:), intent(OUT) :: comm 195 | TYPE(DECOMP_INFO), intent(IN) :: decomp 196 | 197 | real(mytype), allocatable, dimension(:) :: dummy 198 | integer :: info 199 | 200 | allocate(comm(3*decomp%zsz(3)+100)) 201 | 202 | allocate(dummy(decomp%zsz(3))) 203 | 204 | #ifdef DOUBLE_PREC 205 | call zdfft(plan_type,decomp%zsz(3),dummy,comm,info) 206 | #else 207 | call csfft(plan_type,decomp%zsz(3),dummy,comm,info) 208 | #endif 209 | 210 | deallocate(dummy) 211 | 212 | return 213 | end subroutine c2r_1m_z_plan 214 | -------------------------------------------------------------------------------- /src/alloc.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 13 | ! Utility routine to help allocate 3D arrays 14 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 15 | 16 | ! 
X-pencil real arrays 17 | subroutine alloc_x_real(var, opt_decomp, opt_global) 18 | 19 | implicit none 20 | 21 | real(mytype), allocatable, dimension(:,:,:) :: var 22 | TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp 23 | logical, intent(IN), optional :: opt_global 24 | 25 | TYPE(DECOMP_INFO) :: decomp 26 | logical :: global 27 | integer :: alloc_stat, errorcode 28 | 29 | if (present(opt_decomp)) then 30 | decomp = opt_decomp 31 | else 32 | decomp = decomp_main 33 | end if 34 | 35 | if (present(opt_global)) then 36 | global = opt_global 37 | else 38 | global = .false. 39 | end if 40 | 41 | if (global) then 42 | allocate(var(decomp%xst(1):decomp%xen(1), & 43 | decomp%xst(2):decomp%xen(2), decomp%xst(3):decomp%xen(3)), & 44 | stat=alloc_stat) 45 | else 46 | allocate(var(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)), & 47 | stat=alloc_stat) 48 | end if 49 | 50 | if (alloc_stat /= 0) then 51 | errorcode = 8 52 | call decomp_2d_abort(errorcode, & 53 | 'Memory allocation failed when creating new arrays') 54 | end if 55 | 56 | return 57 | end subroutine alloc_x_real 58 | 59 | 60 | ! X-pencil complex arrays 61 | subroutine alloc_x_complex(var, opt_decomp, opt_global) 62 | 63 | implicit none 64 | 65 | complex(mytype), allocatable, dimension(:,:,:) :: var 66 | TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp 67 | logical, intent(IN), optional :: opt_global 68 | 69 | TYPE(DECOMP_INFO) :: decomp 70 | logical :: global 71 | integer :: alloc_stat, errorcode 72 | 73 | if (present(opt_decomp)) then 74 | decomp = opt_decomp 75 | else 76 | decomp = decomp_main 77 | end if 78 | 79 | if (present(opt_global)) then 80 | global = opt_global 81 | else 82 | global = .false. 
83 | end if 84 | 85 | if (global) then 86 | allocate(var(decomp%xst(1):decomp%xen(1), & 87 | decomp%xst(2):decomp%xen(2), decomp%xst(3):decomp%xen(3)), & 88 | stat=alloc_stat) 89 | else 90 | allocate(var(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)), & 91 | stat=alloc_stat) 92 | end if 93 | 94 | if (alloc_stat /= 0) then 95 | errorcode = 8 96 | call decomp_2d_abort(errorcode, & 97 | 'Memory allocation failed when creating new arrays') 98 | end if 99 | 100 | return 101 | end subroutine alloc_x_complex 102 | 103 | 104 | ! Y-pencil real arrays 105 | subroutine alloc_y_real(var, opt_decomp, opt_global) 106 | 107 | implicit none 108 | 109 | real(mytype), allocatable, dimension(:,:,:) :: var 110 | TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp 111 | logical, intent(IN), optional :: opt_global 112 | 113 | TYPE(DECOMP_INFO) :: decomp 114 | logical :: global 115 | integer :: alloc_stat, errorcode 116 | 117 | if (present(opt_decomp)) then 118 | decomp = opt_decomp 119 | else 120 | decomp = decomp_main 121 | end if 122 | 123 | if (present(opt_global)) then 124 | global = opt_global 125 | else 126 | global = .false. 127 | end if 128 | 129 | if (global) then 130 | allocate(var(decomp%yst(1):decomp%yen(1), & 131 | decomp%yst(2):decomp%yen(2), decomp%yst(3):decomp%yen(3)), & 132 | stat=alloc_stat) 133 | else 134 | allocate(var(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)), & 135 | stat=alloc_stat) 136 | end if 137 | 138 | if (alloc_stat /= 0) then 139 | errorcode = 8 140 | call decomp_2d_abort(errorcode, & 141 | 'Memory allocation failed when creating new arrays') 142 | end if 143 | 144 | return 145 | end subroutine alloc_y_real 146 | 147 | 148 | ! 
Y-pencil complex arrays 149 | subroutine alloc_y_complex(var, opt_decomp, opt_global) 150 | 151 | implicit none 152 | 153 | complex(mytype), allocatable, dimension(:,:,:) :: var 154 | TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp 155 | logical, intent(IN), optional :: opt_global 156 | 157 | TYPE(DECOMP_INFO) :: decomp 158 | logical :: global 159 | integer :: alloc_stat, errorcode 160 | 161 | if (present(opt_decomp)) then 162 | decomp = opt_decomp 163 | else 164 | decomp = decomp_main 165 | end if 166 | 167 | if (present(opt_global)) then 168 | global = opt_global 169 | else 170 | global = .false. 171 | end if 172 | 173 | if (global) then 174 | allocate(var(decomp%yst(1):decomp%yen(1), & 175 | decomp%yst(2):decomp%yen(2), decomp%yst(3):decomp%yen(3)), & 176 | stat=alloc_stat) 177 | else 178 | allocate(var(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)), & 179 | stat=alloc_stat) 180 | end if 181 | 182 | if (alloc_stat /= 0) then 183 | errorcode = 8 184 | call decomp_2d_abort(errorcode, & 185 | 'Memory allocation failed when creating new arrays') 186 | end if 187 | 188 | return 189 | end subroutine alloc_y_complex 190 | 191 | 192 | ! Z-pencil real arrays 193 | subroutine alloc_z_real(var, opt_decomp, opt_global) 194 | 195 | implicit none 196 | 197 | real(mytype), allocatable, dimension(:,:,:) :: var 198 | TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp 199 | logical, intent(IN), optional :: opt_global 200 | 201 | TYPE(DECOMP_INFO) :: decomp 202 | logical :: global 203 | integer :: alloc_stat, errorcode 204 | 205 | if (present(opt_decomp)) then 206 | decomp = opt_decomp 207 | else 208 | decomp = decomp_main 209 | end if 210 | 211 | if (present(opt_global)) then 212 | global = opt_global 213 | else 214 | global = .false. 
215 | end if 216 | 217 | if (global) then 218 | allocate(var(decomp%zst(1):decomp%zen(1), & 219 | decomp%zst(2):decomp%zen(2), decomp%zst(3):decomp%zen(3)), & 220 | stat=alloc_stat) 221 | else 222 | allocate(var(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)), & 223 | stat=alloc_stat) 224 | end if 225 | 226 | if (alloc_stat /= 0) then 227 | errorcode = 8 228 | call decomp_2d_abort(errorcode, & 229 | 'Memory allocation failed when creating new arrays') 230 | end if 231 | 232 | return 233 | end subroutine alloc_z_real 234 | 235 | 236 | ! Z-pencil complex arrays 237 | subroutine alloc_z_complex(var, opt_decomp, opt_global) 238 | 239 | implicit none 240 | 241 | complex(mytype), allocatable, dimension(:,:,:) :: var 242 | TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp 243 | logical, intent(IN), optional :: opt_global 244 | 245 | TYPE(DECOMP_INFO) :: decomp 246 | logical :: global 247 | integer :: alloc_stat, errorcode 248 | 249 | if (present(opt_decomp)) then 250 | decomp = opt_decomp 251 | else 252 | decomp = decomp_main 253 | end if 254 | 255 | if (present(opt_global)) then 256 | global = opt_global 257 | else 258 | global = .false. 
259 | end if 260 | 261 | if (global) then 262 | allocate(var(decomp%zst(1):decomp%zen(1), & 263 | decomp%zst(2):decomp%zen(2), decomp%zst(3):decomp%zen(3)), & 264 | stat=alloc_stat) 265 | else 266 | allocate(var(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)), & 267 | stat=alloc_stat) 268 | end if 269 | 270 | if (alloc_stat /= 0) then 271 | errorcode = 8 272 | call decomp_2d_abort(errorcode, & 273 | 'Memory allocation failed when creating new arrays') 274 | end if 275 | 276 | return 277 | end subroutine alloc_z_complex 278 | -------------------------------------------------------------------------------- /src/alloc_shm.c: -------------------------------------------------------------------------------- 1 | //======================================================================= 2 | // This is part of the 2DECOMP&FFT library 3 | // 4 | // 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | // decomposition. It also implements a highly scalable distributed 6 | // three-dimensional Fast Fourier Transform (FFT). 7 | // 8 | // Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | // 10 | //======================================================================= 11 | 12 | // This is the shared-memory code using System V IPC API 13 | 14 | /* 15 | This shared-memory code is kindly provided by David Tanqueray of Cray Inc. 16 | who also helped the author adapt it to use in 2DECOMP&FFT. His assistance 17 | is greatly appreciated. 
18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #ifndef DBG 29 | #define DBG 30 | #endif 31 | static int shm_debug=0; 32 | 33 | float log2f(float); 34 | 35 | void set_shm_debug_(); 36 | void get_smp_map2_(MPI_Fint *comm, MPI_Fint *nnodes, MPI_Fint *my_node, 37 | MPI_Fint *ncores, MPI_Fint *my_core, MPI_Fint *maxcor); 38 | void alloc_shm_(MPI_Aint *ptr, MPI_Fint *nelem, MPI_Fint *type, 39 | MPI_Fint *comm, MPI_Fint *ret); 40 | void dealloc_shm_(MPI_Aint *ptr, MPI_Fint *comm); 41 | 42 | void set_shm_debug_() 43 | { 44 | shm_debug = 1; 45 | } 46 | 47 | void get_smp_map2_(MPI_Fint *comm, MPI_Fint *nnodes, MPI_Fint *my_node, 48 | MPI_Fint *ncores, MPI_Fint *my_core, MPI_Fint *maxcor) 49 | { 50 | MPI_Comm world; 51 | int err, pe, mype, npes, nnds, ncrs, maxc; 52 | int nlen, mynid, *nidlst, *nodlst; 53 | int i, n; 54 | char nodnam[MPI_MAX_PROCESSOR_NAME]; 55 | char string[20]; 56 | FILE *fp; 57 | 58 | MPI_Comm_rank(MPI_COMM_WORLD,&pe); 59 | 60 | world = MPI_Comm_f2c(*comm); 61 | MPI_Comm_rank(world, &mype); 62 | MPI_Comm_size(world, &npes); 63 | MPI_Get_processor_name(nodnam, &nlen); 64 | #ifdef USE_NAME 65 | mynid = atoi(nodnam+3); 66 | #else 67 | sprintf(string," pe %d /proc/cray_xt/nid",mype); 68 | if ((fp = fopen("/proc/cray_xt/nid", "r")) == NULL) { 69 | perror(string); 70 | exit(1); 71 | } 72 | fscanf(fp,"%i", &mynid); 73 | fclose(fp); 74 | #endif 75 | #ifdef DBG 76 | if (shm_debug) { 77 | fprintf(stderr," pe %d mype %d of %d, nodnam = %s (len = %d), node = %d\n", 78 | pe, mype, npes, nodnam, nlen, mynid); 79 | MPI_Barrier(world); 80 | } 81 | #endif 82 | 83 | /* get list of nodeid for each pe */ 84 | nidlst = malloc(npes*sizeof(int)); 85 | MPI_Allgather(&mynid, 1, MPI_INT, nidlst, 1, MPI_INT, world); 86 | 87 | nodlst = malloc(npes*sizeof(int)); 88 | nnds = ncrs = 0; 89 | for (i=0; i= nnds) nodlst[nnds++] = nidlst[i]; /* add new node to list */ 100 | } 101 | /* get max core counts over all nodes 
*/ 102 | MPI_Allreduce(&ncrs, &maxc, 1, MPI_INT, MPI_MAX, world); 103 | *nnodes = (MPI_Fint)(nnds); 104 | *ncores = (MPI_Fint)(ncrs); 105 | *maxcor = (MPI_Fint)(maxc); 106 | 107 | #ifdef DBG 108 | if (shm_debug) { 109 | fprintf(stderr," pe %d nnodes=%d ncores=%d maxcor=%d\n",pe,nnds,ncrs,maxc); 110 | for (n=0; nplist) s[-1]='\0'; 179 | fprintf(stderr," pe %d al_shm: comm sz/rk=%d/%d pes=%s\n",pe,npes,mype,plist); 180 | } 181 | #endif 182 | MPI_Type_size(MPI_Type_f2c(*type), &typsiz); 183 | size = (size_t)*nelem * (size_t)typsiz; 184 | 185 | err = 0; 186 | /* setup structure to keep track of this segment and its blocks */ 187 | seg = (Segtyp *)malloc(sizeof(Segtyp)); 188 | seg->next = seglst; 189 | seg->base = NULL; 190 | seg->blks = NULL; 191 | seglst = seg; 192 | 193 | shmaddr = NULL; 194 | while (size) { 195 | blksize = sizebase == NULL) seg->base = shm; 229 | 230 | /* setup structure to record block details */ 231 | blk = malloc(sizeof(Blktyp)); 232 | blk->size = blksize; 233 | blk->addr = shm; 234 | blk->shmid = shmid; 235 | blk->key = key; 236 | /* and add it to segment's list */ 237 | blk->next = seg->blks; 238 | seg->blks = blk; 239 | 240 | shmaddr = shm + blksize; 241 | size -= blksize; 242 | } 243 | 244 | *ptr = (MPI_Aint)seg->base; 245 | if (*ret) *ret = err; 246 | else if (err) exit(1); 247 | } 248 | 249 | void dealloc_shm_(MPI_Aint *ptr, MPI_Fint *comm) 250 | { 251 | MPI_Comm world; 252 | int pe, mype, err; 253 | char string[20]; 254 | Segtyp *seg; 255 | Segtyp **prev; 256 | Blktyp *blk=NULL, *nblk; 257 | void *shm; 258 | 259 | MPI_Comm_rank(MPI_COMM_WORLD, &pe); 260 | #ifdef DBG 261 | if (shm_debug) fprintf(stderr," pe %d dealloc_shm: ptr=%lx\n",pe,*ptr); 262 | #endif 263 | 264 | world = MPI_Comm_f2c(*comm); 265 | MPI_Comm_rank(world, &mype); 266 | err = 0; 267 | /* Find segment with specified start address and remove from list */ 268 | seg = seglst; 269 | prev = &seglst; 270 | while (seg) { 271 | if (seg->base == (void*)*ptr) { 272 | blk=seg->blks; 
273 | *prev = seg->next; 274 | free(seg); 275 | break; 276 | } 277 | prev = &seg->next; 278 | seg = seg->next; 279 | } 280 | if (blk == NULL) { 281 | fprintf(stderr," pe %d dealloc_shm: segment at address %lx not found\n", 282 | pe, *ptr); 283 | } 284 | /* detach all blocks in this segment */ 285 | while (blk) { 286 | shm = blk->addr; 287 | sprintf(string," pe %d shmdt",pe); 288 | if (shmdt((char*)shm) < 0) { 289 | perror(string); 290 | err++; 291 | } 292 | nblk = blk->next; 293 | free(blk); 294 | blk = nblk; 295 | } 296 | if (err) exit(1); 297 | } 298 | 299 | -------------------------------------------------------------------------------- /src/cuda_fft_1m.cu: -------------------------------------------------------------------------------- 1 | //======================================================================= 2 | // This is part of the 2DECOMP&FFT library 3 | // 4 | // 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | // decomposition. It also implements a highly scalable distributed 6 | // three-dimensional Fast Fourier Transform (FFT). 
7 | // 8 | // Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | // 10 | //======================================================================= 11 | 12 | // This contains CUDA code that compute multiple 1D FFTs on NVidia GPU 13 | 14 | #ifdef DOUBLE_PREC 15 | #define CUFFT_REAL_TYPE cufftDoubleReal 16 | #define CUFFT_COMPLEX_TYPE cufftDoubleComplex 17 | #define CUFFT_PLAN_TYPE_C2C CUFFT_Z2Z 18 | #define CUFFT_PLAN_TYPE_R2C CUFFT_D2Z 19 | #define CUFFT_PLAN_TYPE_C2R CUFFT_Z2D 20 | #define CUFFT_EXEC_TYPE_C2C cufftExecZ2Z 21 | #define CUFFT_EXEC_TYPE_R2C cufftExecD2Z 22 | #define CUFFT_EXEC_TYPE_C2R cufftExecZ2D 23 | #else 24 | #define CUFFT_REAL_TYPE cufftReal 25 | #define CUFFT_COMPLEX_TYPE cufftComplex 26 | #define CUFFT_PLAN_TYPE_C2C CUFFT_C2C 27 | #define CUFFT_PLAN_TYPE_R2C CUFFT_R2C 28 | #define CUFFT_PLAN_TYPE_C2R CUFFT_C2R 29 | #define CUFFT_EXEC_TYPE_C2C cufftExecC2C 30 | #define CUFFT_EXEC_TYPE_R2C cufftExecR2C 31 | #define CUFFT_EXEC_TYPE_C2R cufftExecC2R 32 | #endif 33 | 34 | #include 35 | #include 36 | #include "cufft.h" 37 | #include "cuda.h" 38 | 39 | extern "C" void fft_1m_r2c_(int *nx, int *m, CUFFT_REAL_TYPE *h_a, CUFFT_COMPLEX_TYPE *h_b) 40 | { 41 | unsigned long size1 = sizeof(CUFFT_REAL_TYPE) * (*nx) * (*m); 42 | unsigned long size2 = sizeof(CUFFT_COMPLEX_TYPE) * (*nx/2+1) * (*m); 43 | CUFFT_REAL_TYPE *d_ic = NULL; 44 | CUFFT_COMPLEX_TYPE *d_oc = NULL; 45 | cufftHandle plan; 46 | cudaMalloc((void **)&d_ic, size1); 47 | cudaMalloc((void **)&d_oc, size2); 48 | cudaMemcpy(d_ic, h_a, size1, cudaMemcpyHostToDevice); 49 | int dims[1] = {*nx}; 50 | cufftPlanMany(&plan,1,dims,NULL,1,0,NULL,1,0,CUFFT_PLAN_TYPE_R2C,*m); 51 | CUFFT_EXEC_TYPE_R2C(plan, d_ic, d_oc); 52 | cudaMemcpy(h_b, d_oc, size2, cudaMemcpyDeviceToHost); 53 | cudaFree(d_ic); 54 | cudaFree(d_oc); 55 | cufftDestroy(plan); 56 | } 57 | 58 | 59 | extern "C" void fft_1m_c2r_(int *nx, int *m, CUFFT_COMPLEX_TYPE *h_a, CUFFT_REAL_TYPE *h_b) 60 | { 61 | unsigned long size1 
= sizeof(CUFFT_COMPLEX_TYPE) * (*nx/2+1)*(*m); 62 | unsigned long size2 = sizeof(CUFFT_REAL_TYPE) * (*nx)*(*m); 63 | CUFFT_COMPLEX_TYPE *d_ic = NULL; 64 | CUFFT_REAL_TYPE *d_oc = NULL; 65 | cufftHandle plan; 66 | cudaMalloc((void **)&d_ic, size1); 67 | cudaMalloc((void **)&d_oc, size2); 68 | cudaMemcpy(d_ic, h_a, size1, cudaMemcpyHostToDevice); 69 | int dims[1] = {*nx}; 70 | cufftPlanMany(&plan,1,dims,NULL,1,0,NULL,1,0,CUFFT_PLAN_TYPE_C2R,*m); 71 | CUFFT_EXEC_TYPE_C2R(plan, d_ic, d_oc); 72 | cudaMemcpy(h_b, d_oc, size2, cudaMemcpyDeviceToHost); 73 | cudaFree(d_ic); 74 | cudaFree(d_oc); 75 | cufftDestroy(plan); 76 | } 77 | 78 | 79 | extern "C" void fft_1m_c2c_(int *nx, int *m, CUFFT_COMPLEX_TYPE *h_a, CUFFT_COMPLEX_TYPE *h_b, int *sign) 80 | { 81 | unsigned long size1 = sizeof(CUFFT_COMPLEX_TYPE) * (*nx) * (*m); 82 | CUFFT_COMPLEX_TYPE *d_ic = NULL; 83 | CUFFT_COMPLEX_TYPE *d_oc = NULL; 84 | cufftHandle plan; 85 | cudaMalloc((void **)&d_ic, size1); 86 | cudaMalloc((void **)&d_oc, size1); 87 | cudaMemcpy(d_ic, h_a, size1, cudaMemcpyHostToDevice); 88 | int dims[1] = {*nx}; 89 | cufftPlanMany(&plan,1,dims,NULL,1,0,NULL,1,0,CUFFT_PLAN_TYPE_C2C,*m); 90 | CUFFT_EXEC_TYPE_C2C(plan, d_ic, d_oc, *sign); 91 | cudaMemcpy(h_b, d_oc, size1, cudaMemcpyDeviceToHost); 92 | cudaFree(d_ic); 93 | cudaFree(d_oc); 94 | cufftDestroy(plan); 95 | } 96 | -------------------------------------------------------------------------------- /src/factor.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 
10 | !======================================================================= 11 | 12 | ! A few utility routines to find factors of integer numbers 13 | 14 | subroutine findfactor(num, factors, nfact) 15 | 16 | implicit none 17 | 18 | integer, intent(IN) :: num 19 | integer, intent(OUT), dimension(*) :: factors 20 | integer, intent(OUT) :: nfact 21 | integer :: i, m 22 | 23 | ! find the factors <= sqrt(num) 24 | m = int(sqrt(real(num))) 25 | nfact = 1 26 | do i=1,m 27 | if (num/i*i == num) then 28 | factors(nfact) = i 29 | nfact = nfact + 1 30 | end if 31 | end do 32 | nfact = nfact - 1 33 | 34 | ! derive those > sqrt(num) 35 | if (factors(nfact)**2/=num) then 36 | do i=nfact+1, 2*nfact 37 | factors(i) = num / factors(2*nfact-i+1) 38 | end do 39 | nfact = nfact * 2 40 | else 41 | do i=nfact+1, 2*nfact-1 42 | factors(i) = num / factors(2*nfact-i) 43 | end do 44 | nfact = nfact * 2 - 1 45 | endif 46 | 47 | return 48 | 49 | end subroutine findfactor 50 | 51 | 52 | subroutine primefactors(num, factors, nfact) 53 | 54 | implicit none 55 | 56 | integer, intent(IN) :: num 57 | integer, intent(OUT), dimension(*) :: factors 58 | integer, intent(INOUT) :: nfact 59 | 60 | integer :: i, n 61 | 62 | i = 2 63 | nfact = 1 64 | n = num 65 | do 66 | if (mod(n,i) == 0) then 67 | factors(nfact) = i 68 | nfact = nfact + 1 69 | n = n / i 70 | else 71 | i = i + 1 72 | end if 73 | if (n == 1) then 74 | nfact = nfact - 1 75 | exit 76 | end if 77 | end do 78 | 79 | return 80 | 81 | end subroutine primefactors 82 | 83 | -------------------------------------------------------------------------------- /src/fft_common.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! 
three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contains common code shared by all FFT engines 13 | 14 | integer, parameter, public :: DECOMP_2D_FFT_FORWARD = -1 15 | integer, parameter, public :: DECOMP_2D_FFT_BACKWARD = 1 16 | 17 | ! Physical space data can be stored in either X-pencil or Z-pencil 18 | integer, parameter, public :: PHYSICAL_IN_X = 1 19 | integer, parameter, public :: PHYSICAL_IN_Z = 3 20 | 21 | integer, save :: format ! input X-pencil or Z-pencil 22 | 23 | ! The libary can only be initialised once 24 | logical, save :: initialised = .false. 25 | 26 | ! Global size of the FFT 27 | integer, save :: nx_fft, ny_fft, nz_fft 28 | 29 | ! 2D processor grid 30 | integer, save, dimension(2) :: dims 31 | 32 | ! Decomposition objects 33 | TYPE(DECOMP_INFO), save :: ph ! physical space 34 | TYPE(DECOMP_INFO), save :: sp ! spectral space 35 | 36 | ! Workspace to store the intermediate Y-pencil data 37 | ! *** TODO: investigate how to use only one workspace array 38 | complex(mytype), allocatable, dimension(:,:,:) :: wk2_c2c, wk2_r2c 39 | complex(mytype), allocatable, dimension(:,:,:) :: wk13 40 | 41 | public :: decomp_2d_fft_init, decomp_2d_fft_3d, & 42 | decomp_2d_fft_finalize, decomp_2d_fft_get_size 43 | 44 | ! Declare generic interfaces to handle different inputs 45 | 46 | interface decomp_2d_fft_init 47 | module procedure fft_init_noarg 48 | module procedure fft_init_arg 49 | module procedure fft_init_general 50 | end interface 51 | 52 | interface decomp_2d_fft_3d 53 | module procedure fft_3d_c2c 54 | module procedure fft_3d_r2c 55 | module procedure fft_3d_c2r 56 | end interface 57 | 58 | 59 | contains 60 | 61 | 62 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 63 | ! 
Initialise the FFT module 64 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 65 | subroutine fft_init_noarg 66 | 67 | implicit none 68 | 69 | call fft_init_arg(PHYSICAL_IN_X) ! default input is X-pencil data 70 | 71 | return 72 | end subroutine fft_init_noarg 73 | 74 | subroutine fft_init_arg(pencil) ! allow to handle Z-pencil input 75 | 76 | implicit none 77 | 78 | integer, intent(IN) :: pencil 79 | 80 | call fft_init_general(pencil, nx_global, ny_global, nz_global) 81 | 82 | return 83 | end subroutine fft_init_arg 84 | 85 | ! Initialise the FFT library to perform arbitrary size transforms 86 | subroutine fft_init_general(pencil, nx, ny, nz) 87 | 88 | implicit none 89 | 90 | integer, intent(IN) :: pencil 91 | integer, intent(IN) :: nx, ny, nz 92 | 93 | logical, dimension(2) :: dummy_periods 94 | integer, dimension(2) :: dummy_coords 95 | integer :: status, errorcode, ierror 96 | 97 | if (initialised) then 98 | errorcode = 4 99 | call decomp_2d_abort(errorcode, & 100 | 'FFT library should only be initialised once') 101 | end if 102 | 103 | format = pencil 104 | nx_fft = nx 105 | ny_fft = ny 106 | nz_fft = nz 107 | 108 | ! determine the processor grid in use 109 | call MPI_CART_GET(DECOMP_2D_COMM_CART_X, 2, & 110 | dims, dummy_periods, dummy_coords, ierror) 111 | 112 | ! for c2r/r2c interface: 113 | ! if in physical space, a real array is of size: nx*ny*nz 114 | ! in spectral space, the complex array is of size: 115 | ! (nx/2+1)*ny*nz, if PHYSICAL_IN_X 116 | ! 
or nx*ny*(nz/2+1), if PHYSICAL_IN_Z 117 | 118 | call decomp_info_init(nx, ny, nz, ph) 119 | if (format==PHYSICAL_IN_X) then 120 | call decomp_info_init(nx/2+1, ny, nz, sp) 121 | else if (format==PHYSICAL_IN_Z) then 122 | call decomp_info_init(nx, ny, nz/2+1, sp) 123 | end if 124 | 125 | allocate(wk2_c2c(ph%ysz(1),ph%ysz(2),ph%ysz(3)), STAT=status) 126 | allocate(wk2_r2c(sp%ysz(1),sp%ysz(2),sp%ysz(3)), STAT=status) 127 | if (format==PHYSICAL_IN_X) then 128 | allocate(wk13(sp%xsz(1),sp%xsz(2),sp%xsz(3)), STAT=status) 129 | else if (format==PHYSICAL_IN_Z) then 130 | allocate(wk13(sp%zsz(1),sp%zsz(2),sp%zsz(3)), STAT=status) 131 | end if 132 | if (status /= 0) then 133 | errorcode = 3 134 | call decomp_2d_abort(errorcode, & 135 | 'Out of memory when initialising FFT') 136 | end if 137 | 138 | call init_fft_engine 139 | 140 | initialised = .true. 141 | 142 | return 143 | end subroutine fft_init_general 144 | 145 | 146 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 147 | ! Final clean up 148 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 149 | subroutine decomp_2d_fft_finalize 150 | 151 | implicit none 152 | 153 | call decomp_info_finalize(ph) 154 | call decomp_info_finalize(sp) 155 | 156 | deallocate(wk2_c2c, wk2_r2c, wk13) 157 | 158 | call finalize_fft_engine 159 | 160 | initialised = .false. 161 | 162 | return 163 | end subroutine decomp_2d_fft_finalize 164 | 165 | 166 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 167 | ! Return the size, starting/ending index of the distributed array 168 | ! whose global size is (nx/2+1)*ny*nz, for defining data structures 169 | ! in r2c and c2r interfaces 170 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
171 | subroutine decomp_2d_fft_get_size(istart, iend, isize) 172 | 173 | implicit none 174 | integer, dimension(3), intent(OUT) :: istart, iend, isize 175 | 176 | if (format==PHYSICAL_IN_X) then 177 | istart = sp%zst 178 | iend = sp%zen 179 | isize = sp%zsz 180 | else if (format==PHYSICAL_IN_Z) then 181 | istart = sp%xst 182 | iend = sp%xen 183 | isize = sp%xsz 184 | end if 185 | 186 | return 187 | end subroutine decomp_2d_fft_get_size 188 | -------------------------------------------------------------------------------- /src/fft_common_3d.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contains 3D c2c/r2c/c2r transform subroutines which are 13 | ! identical for several FFT engines 14 | 15 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 16 | ! 3D FFT - complex to complex 17 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 18 | subroutine fft_3d_c2c(in, out, isign) 19 | 20 | implicit none 21 | 22 | complex(mytype), dimension(:,:,:), intent(INOUT) :: in 23 | complex(mytype), dimension(:,:,:), intent(OUT) :: out 24 | integer, intent(IN) :: isign 25 | 26 | #ifndef OVERWRITE 27 | complex(mytype), allocatable, dimension(:,:,:) :: wk1 28 | #endif 29 | 30 | if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR. & 31 | format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then 32 | 33 | ! 
===== 1D FFTs in X ===== 34 | #ifdef OVERWRITE 35 | call c2c_1m_x(in,isign,ph) 36 | #else 37 | allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3))) 38 | wk1 = in 39 | call c2c_1m_x(wk1,isign,ph) 40 | #endif 41 | 42 | ! ===== Swap X --> Y; 1D FFTs in Y ===== 43 | 44 | if (dims(1)>1) then 45 | #ifdef OVERWRITE 46 | call transpose_x_to_y(in,wk2_c2c,ph) 47 | #else 48 | call transpose_x_to_y(wk1,wk2_c2c,ph) 49 | #endif 50 | call c2c_1m_y(wk2_c2c,isign,ph) 51 | else 52 | #ifdef OVERWRITE 53 | call c2c_1m_y(in,isign,ph) 54 | #else 55 | call c2c_1m_y(wk1,isign,ph) 56 | #endif 57 | end if 58 | 59 | ! ===== Swap Y --> Z; 1D FFTs in Z ===== 60 | if (dims(1)>1) then 61 | call transpose_y_to_z(wk2_c2c,out,ph) 62 | else 63 | #ifdef OVERWRITE 64 | call transpose_y_to_z(in,out,ph) 65 | #else 66 | call transpose_y_to_z(wk1,out,ph) 67 | #endif 68 | end if 69 | call c2c_1m_z(out,isign,ph) 70 | 71 | else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD & 72 | .OR. & 73 | format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then 74 | 75 | ! ===== 1D FFTs in Z ===== 76 | #ifdef OVERWRITE 77 | call c2c_1m_z(in,isign,ph) 78 | #else 79 | allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3))) 80 | wk1 = in 81 | call c2c_1m_z(wk1,isign,ph) 82 | #endif 83 | 84 | ! ===== Swap Z --> Y; 1D FFTs in Y ===== 85 | if (dims(1)>1) then 86 | #ifdef OVERWRITE 87 | call transpose_z_to_y(in,wk2_c2c,ph) 88 | #else 89 | call transpose_z_to_y(wk1,wk2_c2c,ph) 90 | #endif 91 | call c2c_1m_y(wk2_c2c,isign,ph) 92 | else ! out==wk2_c2c if 1D decomposition 93 | #ifdef OVERWRITE 94 | call transpose_z_to_y(in,out,ph) 95 | #else 96 | call transpose_z_to_y(wk1,out,ph) 97 | #endif 98 | call c2c_1m_y(out,isign,ph) 99 | end if 100 | 101 | ! 
===== Swap Y --> X; 1D FFTs in X ===== 102 | if (dims(1)>1) then 103 | call transpose_y_to_x(wk2_c2c,out,ph) 104 | end if 105 | call c2c_1m_x(out,isign,ph) 106 | 107 | end if 108 | 109 | return 110 | end subroutine fft_3d_c2c 111 | 112 | 113 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 114 | ! 3D forward FFT - real to complex 115 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 116 | subroutine fft_3d_r2c(in_r, out_c) 117 | 118 | implicit none 119 | 120 | real(mytype), dimension(:,:,:), intent(IN) :: in_r 121 | complex(mytype), dimension(:,:,:), intent(OUT) :: out_c 122 | 123 | if (format==PHYSICAL_IN_X) then 124 | 125 | ! ===== 1D FFTs in X ===== 126 | call r2c_1m_x(in_r,wk13) 127 | 128 | ! ===== Swap X --> Y; 1D FFTs in Y ===== 129 | if (dims(1)>1) then 130 | call transpose_x_to_y(wk13,wk2_r2c,sp) 131 | call c2c_1m_y(wk2_r2c,-1,sp) 132 | else 133 | call c2c_1m_y(wk13,-1,sp) 134 | end if 135 | 136 | ! ===== Swap Y --> Z; 1D FFTs in Z ===== 137 | if (dims(1)>1) then 138 | call transpose_y_to_z(wk2_r2c,out_c,sp) 139 | else 140 | call transpose_y_to_z(wk13,out_c,sp) 141 | end if 142 | call c2c_1m_z(out_c,-1,sp) 143 | 144 | else if (format==PHYSICAL_IN_Z) then 145 | 146 | ! ===== 1D FFTs in Z ===== 147 | call r2c_1m_z(in_r,wk13) 148 | 149 | ! ===== Swap Z --> Y; 1D FFTs in Y ===== 150 | if (dims(1)>1) then 151 | call transpose_z_to_y(wk13,wk2_r2c,sp) 152 | call c2c_1m_y(wk2_r2c,-1,sp) 153 | else ! out_c==wk2_r2c if 1D decomposition 154 | call transpose_z_to_y(wk13,out_c,sp) 155 | call c2c_1m_y(out_c,-1,sp) 156 | end if 157 | 158 | ! ===== Swap Y --> X; 1D FFTs in X ===== 159 | if (dims(1)>1) then 160 | call transpose_y_to_x(wk2_r2c,out_c,sp) 161 | end if 162 | call c2c_1m_x(out_c,-1,sp) 163 | 164 | end if 165 | 166 | return 167 | end subroutine fft_3d_r2c 168 | 169 | 170 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 171 | ! 
3D inverse FFT - complex to real 172 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 173 | subroutine fft_3d_c2r(in_c, out_r) 174 | 175 | implicit none 176 | 177 | complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c 178 | real(mytype), dimension(:,:,:), intent(OUT) :: out_r 179 | 180 | #ifndef OVERWRITE 181 | complex(mytype), allocatable, dimension(:,:,:) :: wk1 182 | #endif 183 | 184 | if (format==PHYSICAL_IN_X) then 185 | 186 | ! ===== 1D FFTs in Z ===== 187 | #ifdef OVERWRITE 188 | call c2c_1m_z(in_c,1,sp) 189 | #else 190 | allocate(wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3))) 191 | wk1 = in_c 192 | call c2c_1m_z(wk1,1,sp) 193 | #endif 194 | 195 | ! ===== Swap Z --> Y; 1D FFTs in Y ===== 196 | #ifdef OVERWRITE 197 | call transpose_z_to_y(in_c,wk2_r2c,sp) 198 | #else 199 | call transpose_z_to_y(wk1,wk2_r2c,sp) 200 | #endif 201 | call c2c_1m_y(wk2_r2c,1,sp) 202 | 203 | ! ===== Swap Y --> X; 1D FFTs in X ===== 204 | if (dims(1)>1) then 205 | call transpose_y_to_x(wk2_r2c,wk13,sp) 206 | call c2r_1m_x(wk13,out_r) 207 | else 208 | call c2r_1m_x(wk2_r2c,out_r) 209 | end if 210 | 211 | else if (format==PHYSICAL_IN_Z) then 212 | 213 | ! ===== 1D FFTs in X ===== 214 | #ifdef OVERWRITE 215 | call c2c_1m_x(in_c,1,sp) 216 | #else 217 | allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3))) 218 | wk1 = in_c 219 | call c2c_1m_x(wk1,1,sp) 220 | #endif 221 | 222 | ! ===== Swap X --> Y; 1D FFTs in Y ===== 223 | if (dims(1)>1) then 224 | #ifdef OVERWRITE 225 | call transpose_x_to_y(in_c,wk2_r2c,sp) 226 | #else 227 | call transpose_x_to_y(wk1,wk2_r2c,sp) 228 | #endif 229 | call c2c_1m_y(wk2_r2c,1,sp) 230 | else ! in_c==wk2_r2c if 1D decomposition 231 | #ifdef OVERWRITE 232 | call c2c_1m_y(in_c,1,sp) 233 | #else 234 | call c2c_1m_y(wk1,1,sp) 235 | #endif 236 | end if 237 | 238 | ! 
===== Swap Y --> Z; 1D FFTs in Z ===== 239 | if (dims(1)>1) then 240 | call transpose_y_to_z(wk2_r2c,wk13,sp) 241 | else 242 | #ifdef OVERWRITE 243 | call transpose_y_to_z(in_c,wk13,sp) 244 | #else 245 | call transpose_y_to_z(wk1,wk13,sp) 246 | #endif 247 | end if 248 | call c2r_1m_z(wk13,out_r) 249 | 250 | end if 251 | 252 | return 253 | end subroutine fft_3d_c2r 254 | -------------------------------------------------------------------------------- /src/fft_generic.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This is the 'generic' implementation of the FFT library 13 | 14 | module decomp_2d_fft 15 | 16 | use decomp_2d ! 2D decomposition module 17 | use glassman 18 | 19 | implicit none 20 | 21 | private ! Make everything private unless declared public 22 | 23 | ! engine-specific global variables 24 | complex(mytype), allocatable, dimension(:) :: buf, scratch 25 | 26 | ! common code used for all engines, including global variables, 27 | ! generic interface definitions and several subroutines 28 | #include "fft_common.f90" 29 | 30 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 31 | ! This routine performs one-time initialisations for the FFT engine 32 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
33 | subroutine init_fft_engine 34 | 35 | implicit none 36 | 37 | integer :: cbuf_size 38 | 39 | if (nrank==0) then 40 | write(*,*) ' ' 41 | write(*,*) '***** Using the generic FFT engine *****' 42 | write(*,*) ' ' 43 | end if 44 | 45 | cbuf_size = max(ph%xsz(1), ph%ysz(2)) 46 | cbuf_size = max(cbuf_size, ph%zsz(3)) 47 | allocate(buf(cbuf_size)) 48 | allocate(scratch(cbuf_size)) 49 | 50 | return 51 | end subroutine init_fft_engine 52 | 53 | 54 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 55 | ! This routine performs one-time finalisations for the FFT engine 56 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 57 | subroutine finalize_fft_engine 58 | 59 | implicit none 60 | 61 | deallocate(buf,scratch) 62 | 63 | return 64 | end subroutine finalize_fft_engine 65 | 66 | 67 | ! Following routines calculate multiple one-dimensional FFTs to form 68 | ! the basis of three-dimensional FFTs. 69 | 70 | ! c2c transform, multiple 1D FFTs in x direction 71 | subroutine c2c_1m_x(inout, isign, decomp) 72 | 73 | implicit none 74 | 75 | complex(mytype), dimension(:,:,:), intent(INOUT) :: inout 76 | integer, intent(IN) :: isign 77 | TYPE(DECOMP_INFO), intent(IN) :: decomp 78 | 79 | integer :: i,j,k 80 | 81 | do k=1,decomp%xsz(3) 82 | do j=1,decomp%xsz(2) 83 | do i=1,decomp%xsz(1) 84 | buf(i) = inout(i,j,k) 85 | end do 86 | call spcfft(buf,decomp%xsz(1),isign,scratch) 87 | do i=1,decomp%xsz(1) 88 | inout(i,j,k) = buf(i) 89 | end do 90 | end do 91 | end do 92 | 93 | return 94 | 95 | end subroutine c2c_1m_x 96 | 97 | ! 
c2c transform, multiple 1D FFTs in y direction 98 | subroutine c2c_1m_y(inout, isign, decomp) 99 | 100 | implicit none 101 | 102 | complex(mytype), dimension(:,:,:), intent(INOUT) :: inout 103 | integer, intent(IN) :: isign 104 | TYPE(DECOMP_INFO), intent(IN) :: decomp 105 | 106 | integer :: i,j,k 107 | 108 | do k=1,decomp%ysz(3) 109 | do i=1,decomp%ysz(1) 110 | do j=1,decomp%ysz(2) 111 | buf(j) = inout(i,j,k) 112 | end do 113 | call spcfft(buf,decomp%ysz(2),isign,scratch) 114 | do j=1,decomp%ysz(2) 115 | inout(i,j,k) = buf(j) 116 | end do 117 | end do 118 | end do 119 | 120 | return 121 | 122 | end subroutine c2c_1m_y 123 | 124 | ! c2c transform, multiple 1D FFTs in z direction 125 | subroutine c2c_1m_z(inout, isign, decomp) 126 | 127 | implicit none 128 | 129 | complex(mytype), dimension(:,:,:), intent(INOUT) :: inout 130 | integer, intent(IN) :: isign 131 | TYPE(DECOMP_INFO), intent(IN) :: decomp 132 | 133 | integer :: i,j,k 134 | 135 | do j=1,decomp%zsz(2) 136 | do i=1,decomp%zsz(1) 137 | do k=1,decomp%zsz(3) 138 | buf(k) = inout(i,j,k) 139 | end do 140 | call spcfft(buf,decomp%zsz(3),isign,scratch) 141 | do k=1,decomp%zsz(3) 142 | inout(i,j,k) = buf(k) 143 | end do 144 | end do 145 | end do 146 | 147 | return 148 | 149 | end subroutine c2c_1m_z 150 | 151 | ! r2c transform, multiple 1D FFTs in x direction 152 | subroutine r2c_1m_x(input, output) 153 | 154 | implicit none 155 | 156 | real(mytype), dimension(:,:,:), intent(IN) :: input 157 | complex(mytype), dimension(:,:,:), intent(OUT) :: output 158 | 159 | integer :: i,j,k, s1,s2,s3, d1 160 | 161 | s1 = size(input,1) 162 | s2 = size(input,2) 163 | s3 = size(input,3) 164 | d1 = size(output,1) 165 | 166 | do k=1,s3 167 | do j=1,s2 168 | ! Glassman's FFT is c2c only, 169 | ! needing some pre- and post-processing for r2c 170 | ! pack real input in complex storage 171 | do i=1,s1 172 | buf(i) = cmplx(input(i,j,k),0._mytype, kind=mytype) 173 | end do 174 | call spcfft(buf,s1,-1,scratch) 175 | ! 
note d1 ~ s1/2+1 176 | ! simply drop the redundant part of the complex output 177 | do i=1,d1 178 | output(i,j,k) = buf(i) 179 | end do 180 | end do 181 | end do 182 | 183 | return 184 | 185 | end subroutine r2c_1m_x 186 | 187 | ! r2c transform, multiple 1D FFTs in z direction 188 | subroutine r2c_1m_z(input, output) 189 | 190 | implicit none 191 | 192 | real(mytype), dimension(:,:,:), intent(IN) :: input 193 | complex(mytype), dimension(:,:,:), intent(OUT) :: output 194 | 195 | integer :: i,j,k, s1,s2,s3, d3 196 | 197 | s1 = size(input,1) 198 | s2 = size(input,2) 199 | s3 = size(input,3) 200 | d3 = size(output,3) 201 | 202 | do j=1,s2 203 | do i=1,s1 204 | ! Glassman's FFT is c2c only, 205 | ! needing some pre- and post-processing for r2c 206 | ! pack real input in complex storage 207 | do k=1,s3 208 | buf(k) = cmplx(input(i,j,k),0._mytype, kind=mytype) 209 | end do 210 | call spcfft(buf,s3,-1,scratch) 211 | ! note d3 ~ s3/2+1 212 | ! simply drop the redundant part of the complex output 213 | do k=1,d3 214 | output(i,j,k) = buf(k) 215 | end do 216 | end do 217 | end do 218 | 219 | return 220 | 221 | end subroutine r2c_1m_z 222 | 223 | ! c2r transform, multiple 1D FFTs in x direction 224 | subroutine c2r_1m_x(input, output) 225 | 226 | implicit none 227 | 228 | complex(mytype), dimension(:,:,:), intent(IN) :: input 229 | real(mytype), dimension(:,:,:), intent(OUT) :: output 230 | 231 | integer :: i,j,k, d1,d2,d3 232 | 233 | d1 = size(output,1) 234 | d2 = size(output,2) 235 | d3 = size(output,3) 236 | 237 | do k=1,d3 238 | do j=1,d2 239 | ! Glassman's FFT is c2c only, 240 | ! needing some pre- and post-processing for c2r 241 | do i=1,d1/2+1 242 | buf(i) = input(i,j,k) 243 | end do 244 | ! expanding to a full-size complex array 245 | ! For odd N, the storage is: 246 | ! 1, 2, ...... N/2+1 integer division rounded down 247 | ! N, ...... N/2+2 => a(i) is conjugate of a(N+2-i) 248 | ! For even N, the storage is: 249 | ! 1, 2, ...... N/2 , N/2+1 250 | ! N, ...... 
N/2+2 again a(i) conjugate of a(N+2-i) 251 | do i=d1/2+2,d1 252 | buf(i) = conjg(buf(d1+2-i)) 253 | end do 254 | call spcfft(buf,d1,1,scratch) 255 | do i=1,d1 256 | ! simply drop imaginary part 257 | output(i,j,k) = real(buf(i), kind=mytype) 258 | end do 259 | end do 260 | end do 261 | 262 | return 263 | 264 | end subroutine c2r_1m_x 265 | 266 | ! c2r transform, multiple 1D FFTs in z direction 267 | subroutine c2r_1m_z(input, output) 268 | 269 | implicit none 270 | 271 | complex(mytype), dimension(:,:,:), intent(IN) :: input 272 | real(mytype), dimension(:,:,:), intent(OUT) :: output 273 | 274 | integer :: i,j,k, d1,d2,d3 275 | 276 | d1 = size(output,1) 277 | d2 = size(output,2) 278 | d3 = size(output,3) 279 | 280 | do j=1,d2 281 | do i=1,d1 282 | do k=1,d3/2+1 283 | buf(k) = input(i,j,k) 284 | end do 285 | do k=d3/2+2,d3 286 | buf(k) = conjg(buf(d3+2-k)) 287 | end do 288 | call spcfft(buf,d3,1,scratch) 289 | do k=1,d3 290 | output(i,j,k) = real(buf(k), kind=mytype) 291 | end do 292 | end do 293 | end do 294 | 295 | return 296 | 297 | end subroutine c2r_1m_z 298 | 299 | 300 | #include "fft_common_3d.f90" 301 | 302 | 303 | end module decomp_2d_fft 304 | -------------------------------------------------------------------------------- /src/glassman.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This module contains a few 'generic' FFT routines, making the 13 | ! 
2DECOMP&FFT library not dependent on any external libraries 14 | 15 | module glassman 16 | 17 | use decomp_2d, only : mytype 18 | 19 | implicit none 20 | 21 | contains 22 | 23 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 24 | ! Following is a FFT implementation based on algorithm proposed by 25 | ! Glassman, a general FFT algorithm supporting arbitrary input length. 26 | ! 27 | ! W. E. Ferguson, Jr., "A simple derivation of Glassman general-n fast 28 | ! Fourier transform," Comput. and Math. with Appls., vol. 8, no. 6, pp. 29 | ! 401-411, 1982. 30 | ! 31 | ! Original implemtation online at http://www.jjj.de/fft/fftpage.html 32 | ! 33 | ! Updated 34 | ! - to handle double-precision as well 35 | ! - unnecessary scaling code removed 36 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 37 | 38 | SUBROUTINE SPCFFT(U,N,ISIGN,WORK) 39 | 40 | IMPLICIT NONE 41 | 42 | LOGICAL :: INU 43 | INTEGER :: A,B,C,N,I,ISIGN 44 | COMPLEX(mytype) :: U(*),WORK(*) 45 | 46 | A = 1 47 | B = N 48 | C = 1 49 | INU = .TRUE. 50 | 51 | DO WHILE ( B .GT. 1 ) 52 | A = C * A 53 | C = 2 54 | DO WHILE ( MOD(B,C) .NE. 0 ) 55 | C = C + 1 56 | END DO 57 | B = B / C 58 | IF ( INU ) THEN 59 | CALL SPCPFT (A,B,C,U,WORK,ISIGN) 60 | ELSE 61 | CALL SPCPFT (A,B,C,WORK,U,ISIGN) 62 | END IF 63 | INU = ( .NOT. INU ) 64 | END DO 65 | 66 | IF ( .NOT. INU ) THEN 67 | DO I = 1, N 68 | U(I) = WORK(I) 69 | END DO 70 | END IF 71 | 72 | RETURN 73 | END SUBROUTINE SPCFFT 74 | 75 | 76 | SUBROUTINE SPCPFT( A, B, C, UIN, UOUT, ISIGN ) 77 | 78 | IMPLICIT NONE 79 | 80 | INTEGER :: ISIGN,A,B,C,IA,IB,IC,JCR,JC 81 | 82 | DOUBLE PRECISION :: ANGLE 83 | 84 | COMPLEX(mytype) :: UIN(B,C,A),UOUT(B,A,C),DELTA,OMEGA,SUM 85 | 86 | ANGLE = 8.D0*DATAN(1.D0) / REAL( A * C, kind=mytype ) 87 | OMEGA = CMPLX( 1.0, 0.0, kind=mytype ) 88 | 89 | IF( ISIGN .EQ. 
1 ) THEN 90 | DELTA = CMPLX( DCOS(ANGLE), DSIN(ANGLE), kind=mytype ) 91 | ELSE 92 | DELTA = CMPLX( DCOS(ANGLE), -DSIN(ANGLE), kind=mytype ) 93 | END IF 94 | 95 | DO IC = 1, C 96 | DO IA = 1, A 97 | DO IB = 1, B 98 | SUM = UIN( IB, C, IA ) 99 | DO JCR = 2, C 100 | JC = C + 1 - JCR 101 | SUM = UIN( IB, JC, IA ) + OMEGA * SUM 102 | END DO 103 | UOUT( IB, IA, IC ) = SUM 104 | END DO 105 | OMEGA = DELTA * OMEGA 106 | END DO 107 | END DO 108 | 109 | RETURN 110 | END SUBROUTINE SPCPFT 111 | 112 | 113 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 114 | ! A 3D real-to-complex routine implemented using the 1D FFT above 115 | ! Input: nx*ny*nz real numbers 116 | ! Output: (nx/2+1)*ny*nz complex numbers 117 | ! Just like big FFT libraries (such as FFTW) do 118 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 119 | subroutine glassman_3d_r2c(in_r,nx,ny,nz,out_c) 120 | 121 | implicit none 122 | 123 | integer, intent(IN) :: nx,ny,nz 124 | real(mytype), dimension(nx,ny,nz) :: in_r 125 | complex(mytype), dimension(nx/2+1,ny,nz) :: out_c 126 | 127 | complex(mytype), allocatable, dimension(:) :: buf, scratch 128 | integer :: maxsize, i,j,k 129 | 130 | maxsize = max(nx, max(ny,nz)) 131 | allocate(buf(maxsize)) 132 | allocate(scratch(maxsize)) 133 | 134 | ! ===== 1D FFTs in X ===== 135 | do k=1,nz 136 | do j=1,ny 137 | ! Glassman's 1D FFT is c2c only, 138 | ! needing some pre- and post-processing for r2c 139 | ! pack real input in complex storage 140 | do i=1,nx 141 | buf(i) = cmplx(in_r(i,j,k),0._mytype, kind=mytype) 142 | end do 143 | call spcfft(buf,nx,-1,scratch) 144 | ! simply drop the redundant part of the complex output 145 | do i=1,nx/2+1 146 | out_c(i,j,k) = buf(i) 147 | end do 148 | end do 149 | end do 150 | 151 | ! 
===== 1D FFTs in Y ===== 152 | do k=1,nz 153 | do i=1,nx/2+1 154 | do j=1,ny 155 | buf(j) = out_c(i,j,k) 156 | end do 157 | call spcfft(buf,ny,-1,scratch) 158 | do j=1,ny 159 | out_c(i,j,k) = buf(j) 160 | end do 161 | end do 162 | end do 163 | 164 | ! ===== 1D FFTs in Z ===== 165 | do j=1,ny 166 | do i=1,nx/2+1 167 | do k=1,nz 168 | buf(k) = out_c(i,j,k) 169 | end do 170 | call spcfft(buf,nz,-1,scratch) 171 | do k=1,nz 172 | out_c(i,j,k) = buf(k) 173 | end do 174 | end do 175 | end do 176 | 177 | deallocate(buf,scratch) 178 | 179 | return 180 | end subroutine glassman_3d_r2c 181 | 182 | 183 | end module glassman 184 | 185 | -------------------------------------------------------------------------------- /src/halo.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 13 | ! Halo cell support for neighbouring pencils to exchange data 14 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 15 | subroutine update_halo_real(in, out, level, opt_decomp, opt_global) 16 | 17 | implicit none 18 | 19 | integer, intent(IN) :: level ! levels of halo cells required 20 | real(mytype), dimension(:,:,:), intent(IN) :: in 21 | real(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out 22 | TYPE(DECOMP_INFO), optional :: opt_decomp 23 | logical, optional :: opt_global 24 | 25 | TYPE(DECOMP_INFO) :: decomp 26 | logical :: global 27 | 28 | ! 
starting/ending index of array with halo cells 29 | integer :: xs, ys, zs, xe, ye, ze 30 | 31 | integer :: i, j, k, s1, s2, s3, ierror 32 | integer :: data_type 33 | 34 | integer :: icount, ilength, ijump 35 | integer :: halo12, halo21, halo31, halo32 36 | integer, dimension(4) :: requests 37 | integer, dimension(MPI_STATUS_SIZE,4) :: status 38 | integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b 39 | 40 | data_type = real_type 41 | 42 | #include "halo_common.f90" 43 | 44 | return 45 | end subroutine update_halo_real 46 | 47 | 48 | subroutine update_halo_complex(in, out, level, opt_decomp, opt_global) 49 | 50 | implicit none 51 | 52 | integer, intent(IN) :: level ! levels of halo cells required 53 | complex(mytype), dimension(:,:,:), intent(IN) :: in 54 | complex(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out 55 | TYPE(DECOMP_INFO), optional :: opt_decomp 56 | logical, optional :: opt_global 57 | 58 | TYPE(DECOMP_INFO) :: decomp 59 | logical :: global 60 | 61 | ! starting/ending index of array with halo cells 62 | integer :: xs, ys, zs, xe, ye, ze 63 | 64 | integer :: i, j, k, s1, s2, s3, ierror 65 | integer :: data_type 66 | 67 | integer :: icount, ilength, ijump 68 | integer :: halo12, halo21, halo31, halo32 69 | integer, dimension(4) :: requests 70 | integer, dimension(MPI_STATUS_SIZE,4) :: status 71 | integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b 72 | 73 | data_type = complex_type 74 | 75 | #include "halo_common.f90" 76 | 77 | return 78 | end subroutine update_halo_complex 79 | 80 | 81 | 82 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 83 | ! To support halo-cell exchange: 84 | ! find the MPI ranks of neighbouring pencils 85 | !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 86 | subroutine init_neighbour 87 | 88 | integer :: ierror 89 | 90 | ! For X-pencil 91 | neighbour(1,1) = MPI_PROC_NULL ! east 92 | neighbour(1,2) = MPI_PROC_NULL ! 
west 93 | call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 0, 1, & 94 | neighbour(1,4), neighbour(1,3), ierror) ! north & south 95 | call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 1, 1, & 96 | neighbour(1,6), neighbour(1,5), ierror) ! top & bottom 97 | 98 | ! For Y-pencil 99 | call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 0, 1, & 100 | neighbour(2,2), neighbour(2,1), ierror) ! east & west 101 | neighbour(2,3) = MPI_PROC_NULL ! north 102 | neighbour(2,4) = MPI_PROC_NULL ! south 103 | call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 1, 1, & 104 | neighbour(2,6), neighbour(2,5), ierror) ! top & bottom 105 | 106 | ! For Z-pencil 107 | call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 0, 1, & 108 | neighbour(3,2), neighbour(3,1), ierror) ! east & west 109 | call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 1, 1, & 110 | neighbour(3,4), neighbour(3,3), ierror) ! north & south 111 | neighbour(3,5) = MPI_PROC_NULL ! top 112 | neighbour(3,6) = MPI_PROC_NULL ! bottom 113 | 114 | return 115 | end subroutine init_neighbour 116 | -------------------------------------------------------------------------------- /src/io_read_one.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contain common code to be included by subroutines 13 | ! 'mpiio_read_one_...' in io.f90 14 | 15 | ! Using MPI-IO to write a distributed 3D array into a file 16 | 17 | if (present(opt_decomp)) then 18 | decomp = opt_decomp 19 | else 20 | call get_decomp_info(decomp) 21 | end if 22 | 23 | ! 
determine subarray parameters 24 | sizes(1) = decomp%xsz(1) 25 | sizes(2) = decomp%ysz(2) 26 | sizes(3) = decomp%zsz(3) 27 | 28 | if (ipencil == 1) then 29 | subsizes(1) = decomp%xsz(1) 30 | subsizes(2) = decomp%xsz(2) 31 | subsizes(3) = decomp%xsz(3) 32 | starts(1) = decomp%xst(1)-1 ! 0-based index 33 | starts(2) = decomp%xst(2)-1 34 | starts(3) = decomp%xst(3)-1 35 | else if (ipencil == 2) then 36 | subsizes(1) = decomp%ysz(1) 37 | subsizes(2) = decomp%ysz(2) 38 | subsizes(3) = decomp%ysz(3) 39 | starts(1) = decomp%yst(1)-1 40 | starts(2) = decomp%yst(2)-1 41 | starts(3) = decomp%yst(3)-1 42 | else if (ipencil == 3) then 43 | subsizes(1) = decomp%zsz(1) 44 | subsizes(2) = decomp%zsz(2) 45 | subsizes(3) = decomp%zsz(3) 46 | starts(1) = decomp%zst(1)-1 47 | starts(2) = decomp%zst(2)-1 48 | starts(3) = decomp%zst(3)-1 49 | endif 50 | 51 | call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, & 52 | MPI_ORDER_FORTRAN, data_type, newtype, ierror) 53 | call MPI_TYPE_COMMIT(newtype,ierror) 54 | call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, & 55 | MPI_MODE_RDONLY, MPI_INFO_NULL, & 56 | fh, ierror) 57 | disp = 0_MPI_OFFSET_KIND 58 | call MPI_FILE_SET_VIEW(fh,disp,data_type, & 59 | newtype,'native',MPI_INFO_NULL,ierror) 60 | call MPI_FILE_READ_ALL(fh, var, & 61 | subsizes(1)*subsizes(2)*subsizes(3), & 62 | data_type, MPI_STATUS_IGNORE, ierror) 63 | call MPI_FILE_CLOSE(fh,ierror) 64 | call MPI_TYPE_FREE(newtype,ierror) 65 | -------------------------------------------------------------------------------- /src/io_read_var.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! 
Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contain common code to be included by subroutines 13 | ! 'read_var_...' in io.f90 14 | 15 | ! Using MPI-IO to read a distributed 3D variable from a file. File 16 | ! operations (open/close) need to be done in calling application. This 17 | ! allows multiple variables to be read from a single file. Together 18 | ! with the corresponding write operation, this is the perfect solution 19 | ! for applications to perform restart/checkpointing. 20 | 21 | if (present(opt_decomp)) then 22 | decomp = opt_decomp 23 | else 24 | call get_decomp_info(decomp) 25 | end if 26 | 27 | ! Create file type and set file view 28 | sizes(1) = decomp%xsz(1) 29 | sizes(2) = decomp%ysz(2) 30 | sizes(3) = decomp%zsz(3) 31 | if (ipencil == 1) then 32 | subsizes(1) = decomp%xsz(1) 33 | subsizes(2) = decomp%xsz(2) 34 | subsizes(3) = decomp%xsz(3) 35 | starts(1) = decomp%xst(1)-1 ! 
0-based index 36 | starts(2) = decomp%xst(2)-1 37 | starts(3) = decomp%xst(3)-1 38 | else if (ipencil == 2) then 39 | subsizes(1) = decomp%ysz(1) 40 | subsizes(2) = decomp%ysz(2) 41 | subsizes(3) = decomp%ysz(3) 42 | starts(1) = decomp%yst(1)-1 43 | starts(2) = decomp%yst(2)-1 44 | starts(3) = decomp%yst(3)-1 45 | else if (ipencil == 3) then 46 | subsizes(1) = decomp%zsz(1) 47 | subsizes(2) = decomp%zsz(2) 48 | subsizes(3) = decomp%zsz(3) 49 | starts(1) = decomp%zst(1)-1 50 | starts(2) = decomp%zst(2)-1 51 | starts(3) = decomp%zst(3)-1 52 | endif 53 | 54 | call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, & 55 | MPI_ORDER_FORTRAN, data_type, newtype, ierror) 56 | call MPI_TYPE_COMMIT(newtype,ierror) 57 | call MPI_FILE_SET_VIEW(fh,disp,data_type, & 58 | newtype,'native',MPI_INFO_NULL,ierror) 59 | call MPI_FILE_READ_ALL(fh, var, & 60 | subsizes(1)*subsizes(2)*subsizes(3), & 61 | data_type, MPI_STATUS_IGNORE, ierror) 62 | call MPI_TYPE_FREE(newtype,ierror) 63 | 64 | ! update displacement for the next read operation 65 | disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes 66 | if (data_type == complex_type) then 67 | disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes 68 | end if 69 | -------------------------------------------------------------------------------- /src/io_write_every.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contain common code to be included by subroutines 13 | ! 'write_every_...' 
in io.f90 14 | 15 | ! To write every few points of a 3D array to a file 16 | 17 | ! work out the distribution parameters, which may be different from 18 | ! the default distribution used by the decomposition library 19 | ! For exmample if nx=17 and p_row=4 20 | ! distribution is: 4 4 4 5 21 | 22 | ! If writing from the 1st element 23 | ! If saving every 3 points, then 5 points to be saved (17/3) 24 | ! default distribution would be 1 1 1 2 25 | ! However, 1st block (1-4) contains the 3rd point 26 | ! 2nd block (5-8) contains the 6th point 27 | ! 3rd block (9-12) contains the 9th and 12th point 28 | ! 4th block (13-17) contains then 15th point 29 | ! giving a 1 1 2 1 distribution 30 | ! So cannot use the base decomposition library for such IO 31 | 32 | ! If writing from the n-th element (n=?skip) 33 | ! If saving every 3 points, then 6 points to be saved 34 | ! However, 1st block (1-4) contains the 1st & 4th point 35 | ! 2nd block (5-8) contains the 7th point 36 | ! 3rd block (9-12) contains the 10th point 37 | ! 4th block (13-17) contains then 12th & 15th point 38 | ! 
giving a 1 2 2 1 distribution 39 | 40 | skip(1)=iskip 41 | skip(2)=jskip 42 | skip(3)=kskip 43 | 44 | do i=1,3 45 | if (from1) then 46 | xst(i) = (xstart(i)+skip(i)-1)/skip(i) 47 | if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xst(i)=xst(i)+1 48 | xen(i) = (xend(i)+skip(i)-1)/skip(i) 49 | else 50 | xst(i) = xstart(i)/skip(i) 51 | if (mod(xstart(i),skip(i))/=0) xst(i)=xst(i)+1 52 | xen(i) = xend(i)/skip(i) 53 | end if 54 | xsz(i) = xen(i)-xst(i)+1 55 | end do 56 | 57 | do i=1,3 58 | if (from1) then 59 | yst(i) = (ystart(i)+skip(i)-1)/skip(i) 60 | if (mod(ystart(i)+skip(i)-1,skip(i))/=0) yst(i)=yst(i)+1 61 | yen(i) = (yend(i)+skip(i)-1)/skip(i) 62 | else 63 | yst(i) = ystart(i)/skip(i) 64 | if (mod(ystart(i),skip(i))/=0) yst(i)=yst(i)+1 65 | yen(i) = yend(i)/skip(i) 66 | end if 67 | ysz(i) = yen(i)-yst(i)+1 68 | end do 69 | 70 | do i=1,3 71 | if (from1) then 72 | zst(i) = (zstart(i)+skip(i)-1)/skip(i) 73 | if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zst(i)=zst(i)+1 74 | zen(i) = (zend(i)+skip(i)-1)/skip(i) 75 | else 76 | zst(i) = zstart(i)/skip(i) 77 | if (mod(zstart(i),skip(i))/=0) zst(i)=zst(i)+1 78 | zen(i) = zend(i)/skip(i) 79 | end if 80 | zsz(i) = zen(i)-zst(i)+1 81 | end do 82 | 83 | ! if 'skip' value is large it is possible that some ranks do not 84 | ! contain any points to be written. Subarray constructor requires 85 | ! nonzero size so it is not possible to use MPI_COMM_WORLD for IO. 86 | ! Create a sub communicator for this... 87 | color = 1 88 | key = 0 ! rank order doesn't matter 89 | if (ipencil==1) then 90 | if (xsz(1)==0 .or. xsz(2)==0 .or. xsz(3)==0) then 91 | color = 2 92 | end if 93 | else if (ipencil==2) then 94 | if (ysz(1)==0 .or. ysz(2)==0 .or. ysz(3)==0) then 95 | color = 2 96 | end if 97 | else if (ipencil==3) then 98 | if (zsz(1)==0 .or. zsz(2)==0 .or. zsz(3)==0) then 99 | color = 2 100 | end if 101 | end if 102 | call MPI_COMM_SPLIT(MPI_COMM_WORLD,color,key,newcomm,ierror) 103 | 104 | if (color==1) then ! 
only ranks in this group do IO collectively 105 | 106 | ! generate subarray information 107 | sizes(1) = xsz(1) 108 | sizes(2) = ysz(2) 109 | sizes(3) = zsz(3) 110 | if (ipencil==1) then 111 | subsizes(1) = xsz(1) 112 | subsizes(2) = xsz(2) 113 | subsizes(3) = xsz(3) 114 | starts(1) = xst(1)-1 115 | starts(2) = xst(2)-1 116 | starts(3) = xst(3)-1 117 | else if (ipencil==2) then 118 | subsizes(1) = ysz(1) 119 | subsizes(2) = ysz(2) 120 | subsizes(3) = ysz(3) 121 | starts(1) = yst(1)-1 122 | starts(2) = yst(2)-1 123 | starts(3) = yst(3)-1 124 | else if (ipencil==3) then 125 | subsizes(1) = zsz(1) 126 | subsizes(2) = zsz(2) 127 | subsizes(3) = zsz(3) 128 | starts(1) = zst(1)-1 129 | starts(2) = zst(2)-1 130 | starts(3) = zst(3)-1 131 | end if 132 | 133 | ! copy data from original array 134 | ! needs a copy of original array in global coordinate 135 | if (ipencil==1) then 136 | allocate(wk(xst(1):xen(1),xst(2):xen(2),xst(3):xen(3))) 137 | allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3))) 138 | wk2=var 139 | if (from1) then 140 | do k=xst(3),xen(3) 141 | do j=xst(2),xen(2) 142 | do i=xst(1),xen(1) 143 | wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1) 144 | end do 145 | end do 146 | end do 147 | else 148 | do k=xst(3),xen(3) 149 | do j=xst(2),xen(2) 150 | do i=xst(1),xen(1) 151 | wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip) 152 | end do 153 | end do 154 | end do 155 | end if 156 | else if (ipencil==2) then 157 | allocate(wk(yst(1):yen(1),yst(2):yen(2),yst(3):yen(3))) 158 | allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3))) 159 | wk2=var 160 | if (from1) then 161 | do k=yst(3),yen(3) 162 | do j=yst(2),yen(2) 163 | do i=yst(1),yen(1) 164 | wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1) 165 | end do 166 | end do 167 | end do 168 | else 169 | do k=yst(3),yen(3) 170 | do j=yst(2),yen(2) 171 | do i=yst(1),yen(1) 172 | wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip) 173 | end do 174 | end do 175 | end do 176 | end if 177 | else 
if (ipencil==3) then 178 | allocate(wk(zst(1):zen(1),zst(2):zen(2),zst(3):zen(3))) 179 | allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3))) 180 | wk2=var 181 | if (from1) then 182 | do k=zst(3),zen(3) 183 | do j=zst(2),zen(2) 184 | do i=zst(1),zen(1) 185 | wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1) 186 | end do 187 | end do 188 | end do 189 | else 190 | do k=zst(3),zen(3) 191 | do j=zst(2),zen(2) 192 | do i=zst(1),zen(1) 193 | wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip) 194 | end do 195 | end do 196 | end do 197 | end if 198 | end if 199 | deallocate(wk2) 200 | 201 | ! MPI-IO 202 | call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, & 203 | MPI_ORDER_FORTRAN, data_type, newtype, ierror) 204 | call MPI_TYPE_COMMIT(newtype,ierror) 205 | call MPI_FILE_OPEN(newcomm, filename, & 206 | MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, & 207 | fh, ierror) 208 | filesize = 0_MPI_OFFSET_KIND 209 | call MPI_FILE_SET_SIZE(fh,filesize,ierror) ! guarantee overwriting 210 | disp = 0_MPI_OFFSET_KIND 211 | call MPI_FILE_SET_VIEW(fh,disp,data_type, & 212 | newtype,'native',MPI_INFO_NULL,ierror) 213 | call MPI_FILE_WRITE_ALL(fh, wk, & 214 | subsizes(1)*subsizes(2)*subsizes(3), & 215 | data_type, MPI_STATUS_IGNORE, ierror) 216 | call MPI_FILE_CLOSE(fh,ierror) 217 | call MPI_TYPE_FREE(newtype,ierror) 218 | 219 | deallocate(wk) 220 | 221 | end if ! color==1 222 | 223 | call MPI_BARRIER(MPI_COMM_WORLD, ierror) 224 | -------------------------------------------------------------------------------- /src/io_write_one.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! 
Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contain common code to be included by subroutines 13 | ! 'mpiio_write_one_...' in io.f90 14 | 15 | ! Using MPI-IO to write a distributed 3D array into a file 16 | 17 | if (present(opt_decomp)) then 18 | decomp = opt_decomp 19 | else 20 | call get_decomp_info(decomp) 21 | end if 22 | 23 | ! determine subarray parameters 24 | sizes(1) = decomp%xsz(1) 25 | sizes(2) = decomp%ysz(2) 26 | sizes(3) = decomp%zsz(3) 27 | 28 | if (ipencil == 1) then 29 | subsizes(1) = decomp%xsz(1) 30 | subsizes(2) = decomp%xsz(2) 31 | subsizes(3) = decomp%xsz(3) 32 | starts(1) = decomp%xst(1)-1 ! 0-based index 33 | starts(2) = decomp%xst(2)-1 34 | starts(3) = decomp%xst(3)-1 35 | else if (ipencil == 2) then 36 | subsizes(1) = decomp%ysz(1) 37 | subsizes(2) = decomp%ysz(2) 38 | subsizes(3) = decomp%ysz(3) 39 | starts(1) = decomp%yst(1)-1 40 | starts(2) = decomp%yst(2)-1 41 | starts(3) = decomp%yst(3)-1 42 | else if (ipencil == 3) then 43 | subsizes(1) = decomp%zsz(1) 44 | subsizes(2) = decomp%zsz(2) 45 | subsizes(3) = decomp%zsz(3) 46 | starts(1) = decomp%zst(1)-1 47 | starts(2) = decomp%zst(2)-1 48 | starts(3) = decomp%zst(3)-1 49 | endif 50 | 51 | #ifdef T3PIO 52 | call MPI_INFO_CREATE(info, ierror) 53 | gs = ceiling(real(sizes(1),mytype)*real(sizes(2),mytype)* & 54 | real(sizes(3),mytype)/1024./1024.) 
55 | call t3pio_set_info(MPI_COMM_WORLD, info, "./", ierror, & 56 | GLOBAL_SIZE=gs, factor=1) 57 | #endif 58 | 59 | call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, & 60 | MPI_ORDER_FORTRAN, data_type, newtype, ierror) 61 | call MPI_TYPE_COMMIT(newtype,ierror) 62 | #ifdef T3PIO 63 | call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, & 64 | MPI_MODE_CREATE+MPI_MODE_WRONLY, info, fh, ierror) 65 | #else 66 | call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, & 67 | MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, & 68 | fh, ierror) 69 | #endif 70 | filesize = 0_MPI_OFFSET_KIND 71 | call MPI_FILE_SET_SIZE(fh,filesize,ierror) ! guarantee overwriting 72 | disp = 0_MPI_OFFSET_KIND 73 | call MPI_FILE_SET_VIEW(fh,disp,data_type, & 74 | newtype,'native',MPI_INFO_NULL,ierror) 75 | call MPI_FILE_WRITE_ALL(fh, var, & 76 | subsizes(1)*subsizes(2)*subsizes(3), & 77 | data_type, MPI_STATUS_IGNORE, ierror) 78 | call MPI_FILE_CLOSE(fh,ierror) 79 | call MPI_TYPE_FREE(newtype,ierror) 80 | #ifdef T3PIO 81 | call MPI_INFO_FREE(info,ierror) 82 | #endif 83 | -------------------------------------------------------------------------------- /src/io_write_plane.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! This file contain common code to be included by subroutines 13 | ! 'mpiio_write_plane_3d_...' in io.f90 14 | 15 | ! It is much easier to implement if all mpi ranks participate I/O. 16 | ! Transpose the 3D data if necessary. 
17 | 18 | if (present(opt_decomp)) then 19 | decomp = opt_decomp 20 | else 21 | call get_decomp_info(decomp) 22 | end if 23 | 24 | if (iplane==1) then 25 | allocate(wk(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3))) 26 | if (ipencil==1) then 27 | wk = var 28 | else if (ipencil==2) then 29 | call transpose_y_to_x(var,wk,decomp) 30 | else if (ipencil==3) then 31 | allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3))) 32 | call transpose_z_to_y(var,wk2,decomp) 33 | call transpose_y_to_x(wk2,wk,decomp) 34 | deallocate(wk2) 35 | end if 36 | allocate(wk2d(1,decomp%xsz(2),decomp%xsz(3))) 37 | do k=1,decomp%xsz(3) 38 | do j=1,decomp%xsz(2) 39 | wk2d(1,j,k)=wk(n,j,k) 40 | end do 41 | end do 42 | sizes(1) = 1 43 | sizes(2) = decomp%ysz(2) 44 | sizes(3) = decomp%zsz(3) 45 | subsizes(1) = 1 46 | subsizes(2) = decomp%xsz(2) 47 | subsizes(3) = decomp%xsz(3) 48 | starts(1) = 0 49 | starts(2) = decomp%xst(2)-1 50 | starts(3) = decomp%xst(3)-1 51 | 52 | else if (iplane==2) then 53 | allocate(wk(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3))) 54 | if (ipencil==1) then 55 | call transpose_x_to_y(var,wk,decomp) 56 | else if (ipencil==2) then 57 | wk = var 58 | else if (ipencil==3) then 59 | call transpose_z_to_y(var,wk,decomp) 60 | end if 61 | allocate(wk2d(decomp%ysz(1),1,decomp%ysz(3))) 62 | do k=1,decomp%ysz(3) 63 | do i=1,decomp%ysz(1) 64 | wk2d(i,1,k)=wk(i,n,k) 65 | end do 66 | end do 67 | sizes(1) = decomp%xsz(1) 68 | sizes(2) = 1 69 | sizes(3) = decomp%zsz(3) 70 | subsizes(1) = decomp%ysz(1) 71 | subsizes(2) = 1 72 | subsizes(3) = decomp%ysz(3) 73 | starts(1) = decomp%yst(1)-1 74 | starts(2) = 0 75 | starts(3) = decomp%yst(3)-1 76 | 77 | else if (iplane==3) then 78 | allocate(wk(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3))) 79 | if (ipencil==1) then 80 | allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3))) 81 | call transpose_x_to_y(var,wk2,decomp) 82 | call transpose_y_to_z(wk2,wk,decomp) 83 | deallocate(wk2) 84 | else if (ipencil==2) then 85 | call 
transpose_y_to_z(var,wk,decomp) 86 | else if (ipencil==3) then 87 | wk = var 88 | end if 89 | allocate(wk2d(decomp%zsz(1),decomp%zsz(2),1)) 90 | do j=1,decomp%zsz(2) 91 | do i=1,decomp%zsz(1) 92 | wk2d(i,j,1)=wk(i,j,n) 93 | end do 94 | end do 95 | sizes(1) = decomp%xsz(1) 96 | sizes(2) = decomp%ysz(2) 97 | sizes(3) = 1 98 | subsizes(1) = decomp%zsz(1) 99 | subsizes(2) = decomp%zsz(2) 100 | subsizes(3) = 1 101 | starts(1) = decomp%zst(1)-1 102 | starts(2) = decomp%zst(2)-1 103 | starts(3) = 0 104 | end if 105 | 106 | call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, & 107 | MPI_ORDER_FORTRAN, data_type, newtype, ierror) 108 | call MPI_TYPE_COMMIT(newtype,ierror) 109 | call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, & 110 | MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, & 111 | fh, ierror) 112 | filesize = 0_MPI_OFFSET_KIND 113 | call MPI_FILE_SET_SIZE(fh,filesize,ierror) ! guarantee overwriting 114 | disp = 0_MPI_OFFSET_KIND 115 | call MPI_FILE_SET_VIEW(fh,disp,data_type, & 116 | newtype,'native',MPI_INFO_NULL,ierror) 117 | call MPI_FILE_WRITE_ALL(fh, wk2d, & 118 | subsizes(1)*subsizes(2)*subsizes(3), & 119 | data_type, MPI_STATUS_IGNORE, ierror) 120 | call MPI_FILE_CLOSE(fh,ierror) 121 | call MPI_TYPE_FREE(newtype,ierror) 122 | 123 | deallocate(wk,wk2d) 124 | -------------------------------------------------------------------------------- /src/io_write_var.f90: -------------------------------------------------------------------------------- 1 | !======================================================================= 2 | ! This is part of the 2DECOMP&FFT library 3 | ! 4 | ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 5 | ! decomposition. It also implements a highly scalable distributed 6 | ! three-dimensional Fast Fourier Transform (FFT). 7 | ! 8 | ! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG) 9 | ! 10 | !======================================================================= 11 | 12 | ! 
This file contain common code to be included by subroutines 13 | ! 'write_var_...' in io.f90 14 | 15 | ! Using MPI-IO to write a distributed 3D variable to a file. File 16 | ! operations (open/close) need to be done in calling application. This 17 | ! allows multiple variables to be written to a single file. Together 18 | ! with the corresponding read operation, this is the perfect solution 19 | ! for applications to perform restart/checkpointing. 20 | 21 | if (present(opt_decomp)) then 22 | decomp = opt_decomp 23 | else 24 | call get_decomp_info(decomp) 25 | end if 26 | 27 | ! Create file type and set file view 28 | sizes(1) = decomp%xsz(1) 29 | sizes(2) = decomp%ysz(2) 30 | sizes(3) = decomp%zsz(3) 31 | if (ipencil == 1) then 32 | subsizes(1) = decomp%xsz(1) 33 | subsizes(2) = decomp%xsz(2) 34 | subsizes(3) = decomp%xsz(3) 35 | starts(1) = decomp%xst(1)-1 ! 0-based index 36 | starts(2) = decomp%xst(2)-1 37 | starts(3) = decomp%xst(3)-1 38 | else if (ipencil == 2) then 39 | subsizes(1) = decomp%ysz(1) 40 | subsizes(2) = decomp%ysz(2) 41 | subsizes(3) = decomp%ysz(3) 42 | starts(1) = decomp%yst(1)-1 43 | starts(2) = decomp%yst(2)-1 44 | starts(3) = decomp%yst(3)-1 45 | else if (ipencil == 3) then 46 | subsizes(1) = decomp%zsz(1) 47 | subsizes(2) = decomp%zsz(2) 48 | subsizes(3) = decomp%zsz(3) 49 | starts(1) = decomp%zst(1)-1 50 | starts(2) = decomp%zst(2)-1 51 | starts(3) = decomp%zst(3)-1 52 | endif 53 | 54 | call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, & 55 | MPI_ORDER_FORTRAN, data_type, newtype, ierror) 56 | call MPI_TYPE_COMMIT(newtype,ierror) 57 | call MPI_FILE_SET_VIEW(fh,disp,data_type, & 58 | newtype,'native',MPI_INFO_NULL,ierror) 59 | call MPI_FILE_WRITE_ALL(fh, var, & 60 | subsizes(1)*subsizes(2)*subsizes(3), & 61 | data_type, MPI_STATUS_IGNORE, ierror) 62 | call MPI_TYPE_FREE(newtype,ierror) 63 | 64 | ! 
update displacement for the next write operation 65 | disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes 66 | if (data_type == complex_type) then 67 | disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes 68 | end if 69 | --------------------------------------------------------------------------------