├── .circleci └── config.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── conf └── conda │ ├── conda_build_config.yaml │ ├── meta.yaml │ └── run_test.sh ├── demo ├── spectral_dns_solver.py └── transforms_realdata.py ├── mpiFFT4py ├── __init__.py ├── cython │ ├── __init__.py │ └── maths.pyx ├── line.py ├── mpibase.py ├── pencil.py ├── serialFFT │ ├── __init__.py │ ├── numpy_fft.py │ └── pyfftw_fft.py └── slab.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests └── test_FFT.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | machine: true 5 | steps: 6 | - checkout 7 | 8 | - restore_cache: 9 | key: v2-miniconda-{{ .Branch }} 10 | 11 | - run: 12 | name: install miniconda 13 | command: | 14 | if [[ ! -d /home/circleci/miniconda ]]; then 15 | wget https://repo.continuum.io/miniconda/Miniconda3-4.5.1-Linux-x86_64.sh -O miniconda.sh && 16 | bash miniconda.sh -b -f -p /home/circleci/miniconda; 17 | else 18 | echo "Using cached miniconda"; 19 | fi 20 | source ~/miniconda/bin/activate root 21 | conda config --set always_yes yes 22 | conda config --add channels conda-forge 23 | conda config --add channels spectralDNS 24 | conda clean --lock 25 | conda install --yes --quiet conda-forge-ci-setup=1 26 | source run_conda_forge_build_setup 27 | 28 | - save_cache: 29 | key: v2-miniconda-{{ .Branch }} 30 | paths: 31 | - /home/circleci/miniconda 32 | 33 | - run: 34 | name: Build and test 35 | command: | 36 | source ~/miniconda/bin/activate root 37 | cd /home/circleci/project 38 | conda build --python 2.7 ./conf/conda 39 | conda build --python 3.6 ./conf/conda 40 | 41 | - run: 42 | name: Upload packages 43 | command: | 44 | source ~/miniconda/bin/activate root 45 | cd /home/circleci/project 46 | upload_or_check_non_existence ./conf/conda spectralDNS --channel main 47 | export CONDA_PY=36 48 | upload_or_check_non_existence ./conf/conda spectralDNS --channel main 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: generic 2 | os: osx 3 | osx_image: xcode7.3 4 | sudo: false 5 | env: 6 | matrix: 7 | - CONDA_PY=27 8 | - CONDA_PY=36 9 | global: 10 | - secure: "swxbq67k6ag2v7QjLGMtn72mROxDZ7d+c6X+BgR2YS1XT7l45T9+0Z/PTpCJg+9mmEH3YdlpnlzKjatz9xVNY04a7RljFMsNy/+5oiTOmno2IDq2fAPrUFvGAvdqsVgnc6+e+GUwaDL5n/AfDVOIb18tT4P2VRk3ooCsSILtQYvQWixLw5bx3BhTgAfXnmu7e+oaB+vCDXXjlFINlOvHZCBiVI9g0yXH0sW9gYsR2vsmIdxraChsq/+Q0wkaNUgUaiuHXNWcaZiiWleRYnYsktsNfT1nknkLrkPAtQTC5fYgXj6o9Sh+codcfYH95ztBm83rWzfWo2f+Ok1AtrRdG+CiApCFMQ6T4ZjonxEeZhopvY7+xNLXFoHcmnBdf0NM3wmCdwrzuzdHvpqRnozClTqG6Srvna7X4/WtDbKpF2yEHKdiBmaf8NRcGDpbJeyvnzlNz5HMESltvYUVatLzPTzzJplkvgMX3Ti8xcqYgwB1ayrClGFlpWM33MdzJiSSTptv3WYmhi7rV5xdpCc5pBTF5XLOtEB0dFGY60yQd9SWSxjFAMwo9808V6koiKX3D0Ogin8mQmvR2DqVhkBqfHFf36s38OfG/n1iV/Oednc9pfYP55T7ljKRsPUpavblCPizBfQnQEFivjaDlPGX3/bR0TV9F/pRSiJ84JMgKzs=" 11 | 12 | before_install: 13 | - brew remove --force $(brew list) 14 | - brew cleanup -s 15 | - rm -rf $(brew --cache) 16 | install: 17 | - | 18 | MINICONDA_URL="https://repo.continuum.io/miniconda" 19 | MINICONDA_FILE="Miniconda3-latest-MacOSX-x86_64.sh" 20 | curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}" 21 | bash $MINICONDA_FILE -b 22 | source /Users/travis/miniconda3/bin/activate root 23 | conda config --set show_channel_urls true 24 | conda config --add channels conda-forge 25 | conda install --yes --quiet conda-forge-ci-setup=1 26 | source run_conda_forge_build_setup 27 | script: 28 | - conda build conf/conda 29 | after_success: 30 | - export GIT_DESCRIBE_TAG=`git describe --tags | cut -d'-' -f 1` 31 | - upload_or_check_non_existence ./conf/conda spectralDNS --channel main || exit 1 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 
24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.py *.txt *.rst 2 | recursive-include mpiFFT4py *.py *.pyx 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VERSION=$(shell python3 -c "import mpiFFT4py; print(mpiFFT4py.__version__)") 2 | 3 | default: 4 | python setup.py build_ext -i 5 | 6 | pip: 7 | rm -f dist/* 8 | python setup.py sdist 9 | twine upload dist/* 10 | 11 | tag: 12 | git tag $(VERSION) 13 | git push --tags 14 | 15 | publish: tag pip 16 | 17 | clean: 18 | git clean mpiFFT4py -fx 19 | git clean tests -fx 20 | cd docs && make clean && cd .. 21 | @rm -rf *.egg-info/ build/ dist/ .eggs/ -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | mpiFFT4py 2 | --------- 3 | 4 | .. image:: https://travis-ci.org/spectralDNS/mpiFFT4py.svg?branch=master 5 | :target: https://travis-ci.org/spectralDNS/mpiFFT4py 6 | .. image:: https://circleci.com/gh/spectralDNS/mpiFFT4py/tree/master.svg?style=svg 7 | :target: https://circleci.com/gh/spectralDNS/mpiFFT4py/tree/master 8 | .. image:: https://zenodo.org/badge/51817237.svg 9 | :target: https://zenodo.org/badge/latestdoi/51817237 10 | 11 | Description 12 | ----------- 13 | mpiFFT4py performs FFTs in parallel in Python. It is developed to perform FFTs in parallel on a three-dimensional computational box (a structured grid), but there are also routines for doing the FFTs on a 2D mesh. It implements both the *slab* and the *pencil* decompositions. 14 | 15 | Installation 16 | ------------ 17 | mpiFFT4py requires *numpy* for basic array operations, `pyfftw <https://github.com/pyfftw/pyFFTW>`_ for efficient FFTs and `mpi4py <https://bitbucket.org/mpi4py/mpi4py>`_ for MPI communications. However, if *pyfftw* is not found, then the slower *numpy.fft* is used instead. `cython <http://cython.org>`_ is used to optimize a few routines. Install using regular python distutils:: 18 | 19 | python setup.py install --prefix="Path on the PYTHONPATH" 20 | 21 | To install in place do:: 22 | 23 | python setup.py build_ext --inplace 24 | 25 | To install using Anaconda, you may either compile it yourself (from the main directory) using:: 26 | 27 | conda config --add channels conda-forge 28 | conda build conf/conda 29 | conda install mpiFFT4py --use-local 30 | 31 | or use precompiled binaries from the `conda-forge <https://anaconda.org/conda-forge/mpifft4py>`_ or the `spectralDNS <https://anaconda.org/spectralDNS/mpifft4py>`_ channel on Anaconda cloud:: 32 | 33 | conda install -c conda-forge mpifft4py 34 | 35 | or:: 36 | 37 | conda config --add channels conda-forge 38 | conda install -c spectralDNS mpifft4py 39 | 40 | There are binaries compiled for both OSX and Linux, and for several versions of Python. Note that the spectralDNS channel contains bleeding-edge versions of the software, whereas conda-forge is more stable.
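Usage
-----
A minimal usage sketch (assuming *numpy*, *mpi4py* and, optionally, *pyfftw* are installed), distilled from the demo scripts in this repository. It sets up a slab-decomposed real-to-complex transform and checks a forward/inverse round trip::

    from mpi4py import MPI
    import numpy as np
    from mpiFFT4py.slab import R2C

    N = np.array([32, 32, 32], dtype=int)       # global mesh size
    L = np.array([2*np.pi, 2*np.pi, 2*np.pi])   # physical size of the box
    FFT = R2C(N, L, MPI.COMM_WORLD, "double")

    u = np.random.random(FFT.real_shape()).astype(FFT.float)
    u_hat = np.zeros(FFT.complex_shape(), dtype=FFT.complex)

    u_hat = FFT.fftn(u, u_hat)          # forward transform (real to complex)
    u_copy = np.zeros_like(u)
    u_copy = FFT.ifftn(u_hat, u_copy)   # inverse transform
    assert np.allclose(u, u_copy)

Run with, e.g., ``mpirun -np 4 python yourscript.py`` (the script name is just a placeholder); the demo folder contains complete examples.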
41 | 42 | Authors 43 | ------- 44 | mpiFFT4py is developed by 45 | 46 | * Mikael Mortensen 47 | 48 | Licence 49 | ------- 50 | mpiFFT4py is licensed under the GNU Lesser GPL (LGPL), version 3 or (at your option) any later version. mpiFFT4py is Copyright (2014-2016) by the authors. 51 | 52 | Contact 53 | ------- 54 | The latest version of this software can be obtained from 55 | 56 | https://github.com/spectralDNS/mpiFFT4py 57 | 58 | Please report bugs and other issues through the issue tracker at: 59 | 60 | https://github.com/spectralDNS/mpiFFT4py/issues 61 | -------------------------------------------------------------------------------- /conf/conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | numpy: 2 | - 1.15 3 | -------------------------------------------------------------------------------- /conf/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: mpifft4py 3 | version: "{{ GIT_DESCRIBE_TAG }}" 4 | 5 | source: 6 | git_url: ../../ 7 | 8 | build: 9 | number: 0 10 | script: "pip install ." 11 | 12 | requirements: 13 | build: 14 | - python 15 | - pip 16 | - cython 17 | - numpy 18 | 19 | run: 20 | - python 21 | - numpy 22 | - scipy 23 | - mpi4py 24 | - fftw 25 | - pyfftw 26 | 27 | test: 28 | source_files: 29 | - tests 30 | 31 | requires: 32 | - pytest 33 | -------------------------------------------------------------------------------- /conf/conda/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd tests 4 | 5 | export OMPI_MCA_plm=isolated 6 | export OMPI_MCA_btl_vader_single_copy_mechanism=none 7 | export OMPI_MCA_rmaps_base_oversubscribe=yes 8 | 9 | if [ "$(uname)" == "Darwin" ]; then 10 | mpirun -np 2 py.test -v 11 | fi 12 | 13 | if [ "$(uname)" == "Linux" ]; then 14 | mpirun -np 2 py.test -v 15 | fi 16 | # if [ "${CONDA_PY:0:1}" == "3" ]; then 17 | # mpirun -np 4 py.test 18 | # fi 19 | # 20 | # if [ "${CONDA_PY:0:1}" == "2" ]; then 21 | # mpirun -np 1 py.test 22 | # fi 23 | # 24 | -------------------------------------------------------------------------------- /demo/spectral_dns_solver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demo program that solves the Navier-Stokes equations in a triply 3 | periodic domain. The solution is initialized using the Taylor-Green 4 | vortex and evolved in time with a 4th-order Runge-Kutta method. 5 | 6 | Basically, we create an instance of the R2C class for performing 3D FFTs 7 | in parallel on a cube of size N points and physical size L. The mesh 8 | decomposition is performed by the FFT class using a slab decomposition. 9 | With slab decomposition the first index in real physical space is shared 10 | amongst the processors, whereas in wavenumber space the second index is shared.
11 | """ 12 | __author__ = "Mikael Mortensen " 13 | __date__ = "2016-04-07" 14 | __copyright__ = "Copyright (C) 2016 " + __author__ 15 | __license__ = "GNU Lesser GPL version 3 or any later version" 16 | 17 | from numpy import array, pi, empty, where, sin, cos, sum 18 | from mpi4py import MPI 19 | from mpiFFT4py import work_arrays 20 | from mpiFFT4py.slab import R2C 21 | from collections import defaultdict 22 | 23 | # Set viscosity, end time and time step 24 | nu = 0.000625 25 | T = 0.1 26 | dt = 0.01 27 | 28 | # Set global size of the computational box 29 | N = array([2**5, 2**5, 2**5], dtype=int) 30 | L = array([2*pi, 2*pi, 2*pi], dtype=float) 31 | 32 | FFT = R2C(N, L, MPI.COMM_WORLD, "double", planner_effort= 33 | defaultdict(lambda: 'FFTW_ESTIMATE', {'irfft2': 'FFTW_PATIENT'})) 34 | 35 | U = empty((3,) + FFT.real_shape()) # real_shape = (N[0]/comm.Get_size(), N[1], N[2]) 36 | U_hat = empty((3,) + FFT.complex_shape(), dtype=complex) # complex_shape = (N[0], N[1]//comm.Get_size(), N[2]/2+1) 37 | P = empty(FFT.real_shape()) 38 | P_hat = empty(FFT.complex_shape(), dtype=complex) 39 | U_hat0 = empty((3,) + FFT.complex_shape(), dtype=complex) 40 | U_hat1 = empty((3,) + FFT.complex_shape(), dtype=complex) 41 | dU = empty((3,) + FFT.complex_shape(), dtype=complex) 42 | work = work_arrays() 43 | X = FFT.get_local_mesh() 44 | K = FFT.get_local_wavenumbermesh(scaled=True) 45 | K2 = K[0]*K[0] + K[1]*K[1] + K[2]*K[2] 46 | K_over_K2 = empty((3,) + FFT.complex_shape()) 47 | for k in range(3): 48 | K_over_K2[k] = K[k].astype(float) / where(K2 == 0, 1, K2).astype(float) 49 | a = [1./6., 1./3., 1./3., 1./6.] 50 | b = [0.5, 0.5, 1.] 51 | dealias = '3/2-rule' # ('2/3-rule', None) 52 | 53 | def cross(x, y, z): 54 | """Cross product z = x X y""" 55 | z[0] = FFT.fftn(x[1]*y[2]-x[2]*y[1], z[0], dealias) 56 | z[1] = FFT.fftn(x[2]*y[0]-x[0]*y[2], z[1], dealias) 57 | z[2] = FFT.fftn(x[0]*y[1]-x[1]*y[0], z[2], dealias) 58 | return z 59 | 60 | def curl(x, z): 61 | z[2] = FFT.ifftn(1j*(K[0]*x[1]-K[1]*x[0]), z[2], dealias) 62 | z[1] = FFT.ifftn(1j*(K[2]*x[0]-K[0]*x[2]), z[1], dealias) 63 | z[0] = FFT.ifftn(1j*(K[1]*x[2]-K[2]*x[1]), z[0], dealias) 64 | return z 65 | 66 | def compute_rhs(rhs): 67 | U_dealiased = work[((3,) + FFT.work_shape(dealias), float, 0)] 68 | curl_dealiased = work[((3,) + FFT.work_shape(dealias), float, 1)] 69 | for i in range(3): 70 | U_dealiased[i] = FFT.ifftn(U_hat[i], U_dealiased[i], dealias) 71 | 72 | curl_dealiased = curl(U_hat, curl_dealiased) 73 | rhs = cross(U_dealiased, curl_dealiased, rhs) 74 | P_hat[:] = sum(rhs*K_over_K2, 0, out=P_hat) 75 | rhs -= P_hat*K 76 | rhs -= nu*K2*U_hat 77 | return rhs 78 | 79 | # Initialize a Taylor Green vortex 80 | U[0] = sin(X[0])*cos(X[1])*cos(X[2]) 81 | U[1] = -cos(X[0])*sin(X[1])*cos(X[2]) 82 | U[2] = 0 83 | for i in range(3): 84 | U_hat[i] = FFT.fftn(U[i], U_hat[i]) 85 | 86 | # Integrate using a 4th order Rung-Kutta method 87 | t = 0.0 88 | tstep = 0 89 | while t < T-1e-8: 90 | t += dt 91 | tstep += 1 92 | U_hat1[:] = U_hat0[:] = U_hat 93 | for rk in range(4): 94 | dU = compute_rhs(dU) 95 | if rk < 3: 96 | U_hat[:] = U_hat0 + b[rk]*dt*dU 97 | U_hat1[:] += a[rk]*dt*dU 98 | U_hat[:] = U_hat1[:] 99 | 100 | for i in range(3): 101 | U[i] = FFT.ifftn(U_hat[i], U[i]) 102 | 103 | k = FFT.comm.reduce(sum(U*U)/N[0]/N[1]/N[2]/2) 104 | if FFT.rank == 0: 105 | assert round(k - 0.124953117517, 7) == 0 106 | -------------------------------------------------------------------------------- /demo/transforms_realdata.py: 
-------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-03-09" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | from numpy import * 7 | from mpi4py import MPI 8 | #from mpiFFT4py.pencil import R2C 9 | from mpiFFT4py.slab import R2C 10 | from mpi4py_fft.mpifft import PFFT 11 | from time import time 12 | 13 | #assert MPI.COMM_WORLD.Get_size() >= 4 14 | 15 | # Set global size of the computational box 16 | M = 6 17 | N = array([2**M, 2**M, 2**M], dtype=int) 18 | L = array([2*pi, 2*pi, 2*pi], dtype=float) 19 | 20 | # Create an instance of the R2C class for performing 3D FFTs in parallel 21 | # on a cube of size N points and physical size L. The mesh decomposition is 22 | # performed by the FFT class using a slab decomposition. With slab decomposition 23 | # the first index in real physical space is shared amongst the processors, 24 | # whereas in wavenumber space the second index is shared. 25 | 26 | #FFT = R2C(N, L, MPI.COMM_WORLD, "double", None, alignment='X', communication='Alltoallw') 27 | FFT = R2C(N, L, MPI.COMM_WORLD, "double", communication='Alltoallw') 28 | fft = PFFT(MPI.COMM_WORLD, N, collapse=False, slab=2) 29 | 30 | U = random.random(FFT.real_shape()).astype(FFT.float) # real_shape = (N[0]//comm.Get_size(), N[1], N[2]) 31 | U_copy = zeros_like(U) 32 | U_hat = zeros(FFT.complex_shape(), dtype=FFT.complex) # complex_shape = (N[0], N[1]//comm.Get_size(), N[2]//2+1) 33 | 34 | # Perform forward FFT. Real transform in third direction, complex in first two 35 | U_hat = FFT.fftn(U, U_hat) 36 | 37 | # Perform inverse FFT. 38 | U_copy = FFT.ifftn(U_hat, U_copy) 39 | MPI.COMM_WORLD.barrier() 40 | t0 = time() 41 | U_hat = FFT.fftn(U, U_hat) 42 | U_copy = FFT.ifftn(U_hat, U_copy) 43 | print("mpiFFT4py ", time()-t0) 44 | ########### 45 | u = random.random(fft.forward.input_array.shape).astype(fft.forward.input_array.dtype) 46 | MPI.COMM_WORLD.barrier() 47 | t0 = time() 48 | u_hat = fft.forward(u) 49 | u_copy = fft.backward(u_hat) 50 | print("mpi4py-fft ", time()-t0) 51 | ######### 52 | 53 | tol = 1e-6 if FFT.float == float32 else 1e-10 54 | 55 | assert allclose(U, U_copy, tol, tol) 56 | assert allclose(u, u_copy, tol, tol) 57 | -------------------------------------------------------------------------------- /mpiFFT4py/__init__.py: -------------------------------------------------------------------------------- 1 | from .serialFFT import * 2 | from .slab import R2C as Slab_R2C 3 | from .pencil import R2C as Pencil_R2C 4 | from .line import R2C as Line_R2C 5 | from .mpibase import work_arrays, datatypes, empty, zeros 6 | from numpy.fft import fftfreq, rfftfreq 7 | 8 | __version__ = '1.1.2' 9 | -------------------------------------------------------------------------------- /mpiFFT4py/cython/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spectralDNS/mpiFFT4py/61ce6474771efff4e3b280b3f69f09611a2c1150/mpiFFT4py/cython/__init__.py -------------------------------------------------------------------------------- /mpiFFT4py/cython/maths.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False 2 | #cython: wraparound=False 3 | cimport numpy as np 4 | 5 | ctypedef fused complex_t: 6 | np.complex64_t 7 | np.complex128_t 8 | 9 | def dealias_filter(np.ndarray[complex_t, ndim=3] fu, 10 | np.ndarray[np.uint8_t, 
ndim=3] dealias): 11 | cdef unsigned int i, j, k 12 | cdef np.uint8_t uu 13 | for i in xrange(dealias.shape[0]): 14 | for j in xrange(dealias.shape[1]): 15 | for k in xrange(dealias.shape[2]): 16 | uu = dealias[i, j, k] 17 | fu[i, j, k].real *= uu 18 | fu[i, j, k].imag *= uu 19 | return fu 20 | 21 | def transpose_Uc(np.ndarray[complex_t, ndim=3] Uc_hatT, 22 | np.ndarray[complex_t, ndim=4] U_mpi, 23 | int num_processes, int Np0, int Np1, int Nf): 24 | cdef unsigned int i, j, k, l, kk 25 | for i in xrange(num_processes): 26 | for j in xrange(Np0): 27 | for k in xrange(i*Np1, (i+1)*Np1): 28 | kk = k-i*Np1 29 | for l in xrange(Nf): 30 | Uc_hatT[j, k, l] = U_mpi[i, j, kk, l] 31 | return Uc_hatT 32 | 33 | def transpose_Umpi(np.ndarray[complex_t, ndim=4] U_mpi, 34 | np.ndarray[complex_t, ndim=3] Uc_hatT, 35 | int num_processes, int Np, int Nf): 36 | cdef unsigned int i,j,k,l,kk 37 | for i in xrange(num_processes): 38 | for j in xrange(Np): 39 | for kk in xrange(Np): 40 | k = kk+i*Np 41 | for l in xrange(Nf): 42 | U_mpi[i,j,kk,l] = Uc_hatT[j,k,l] 43 | return U_mpi 44 | 45 | #for i in xrange(num_processes): 46 | #for j in xrange(Np): 47 | #for k in xrange(i*Np, (i+1)*Np): 48 | #kk = k-i*Np 49 | #for l in xrange(Nf): 50 | #U_mpi[i,j,kk,l] = Uc_hatT[j,k,l] 51 | #return U_mpi 52 | 53 | #def copy_to_padded(np.ndarray[complex_t, ndim=3] fu, 54 | #np.ndarray[complex_t, ndim=3] fp, 55 | #np.ndarray[int, ndim=1] N, int axis=0): 56 | #if axis == 0: 57 | #fp[:N[0]/2] = fu[:N[0]/2] 58 | #fp[-N[0]/2:] = fu[N[0]/2:] 59 | #elif axis == 1: 60 | #fp[:, :N[1]/2] = fu[:, :N[1]/2] 61 | #fp[:, -N[1]/2:] = fu[:, N[1]/2:] 62 | #elif axis == 2: 63 | #fp[:, :, :(N[2]/2+1)] = fu[:] 64 | #return fp 65 | 66 | #def copy_to_padded_c(np.ndarray[complex_t, ndim=3] fu, 67 | #np.ndarray[complex_t, ndim=3] fp, 68 | #np.ndarray[int, ndim=1] N, int axis=0): 69 | #if axis == 0: 70 | #fp[:N[0]] = fu[:N[0]] 71 | #elif axis == 1: 72 | #fp[:, :N[1]/2] = fu[:, :N[1]/2] 73 | #fp[:, -N[1]/2:] = fu[:, N[1]/2:] 74 | #elif axis == 2: 75 | #fp[:, :, :(N[2]/2+1)] = fu[:] 76 | #return fp 77 | 78 | -------------------------------------------------------------------------------- /mpiFFT4py/line.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | __author__ = "Mikael Mortensen " 3 | __date__ = "2016-02-16" 4 | __copyright__ = "Copyright (C) 2016 " + __author__ 5 | __license__ = "GNU Lesser GPL version 3 or any later version" 6 | 7 | from .serialFFT import * 8 | import numpy as np 9 | from .mpibase import work_arrays, datatypes, zeros, empty 10 | from numpy.fft import fftfreq, rfftfreq 11 | from collections import defaultdict 12 | from mpi4py import MPI 13 | 14 | def transpose_x(U_send, Uc_hatT, num_processes): 15 | sx = U_send.shape 16 | sy = Uc_hatT.shape 17 | U_send[:] = np.rollaxis(Uc_hatT[:,:-1].reshape(sy[0], num_processes, sx[2]), 1) 18 | return U_send 19 | 20 | def transpose_y(Uc_hatT, U_recv, num_processes): 21 | sx = Uc_hatT.shape 22 | sy = U_recv.shape 23 | Uc_hatT[:, :-1] = np.rollaxis(U_recv.reshape(num_processes, sx[0], sy[1]), 1).reshape((sx[0], sx[1]-1)) 24 | return Uc_hatT 25 | 26 | def swap_Nq(fft_y, fu, fft_x, N): 27 | f = fu[:, 0].copy() 28 | fft_x[0] = f[0].real 29 | fft_x[1:N//2] = 0.5*(f[1:N//2] + np.conj(f[:N//2:-1])) 30 | fft_x[N//2] = f[N//2].real 31 | fu[:N//2+1, 0] = fft_x[:N//2+1] 32 | fu[N//2+1:, 0] = np.conj(fft_x[(N//2-1):0:-1]) 33 | 34 | fft_y[0] = f[0].imag 35 | fft_y[1:N//2] = -0.5*1j*(f[1:N//2] - np.conj(f[:N//2:-1])) 36 | fft_y[N//2] = 
f[N//2].imag 37 | 38 | fft_y[N//2+1:] = np.conj(fft_y[(N//2-1):0:-1]) 39 | return fft_y 40 | 41 | class R2C(object): 42 | """Class for performing FFT in 2D using MPI 43 | 44 | Slab decomposition 45 | 46 | Args: 47 | N - NumPy array([Nx, Ny]) Number of nodes for the real mesh 48 | L - NumPy array([Lx, Ly]) The actual size of the real mesh 49 | comm - The MPI communicator object 50 | precision - "single" or "double" 51 | padsize - For performing transforms with padding 52 | 53 | """ 54 | 55 | def __init__(self, N, L, comm, precision, padsize=1.5, threads=1, 56 | planner_effort=defaultdict(lambda : "FFTW_MEASURE")): 57 | self.N = N # The global size of the problem 58 | self.L = L 59 | assert len(L) == 2 60 | assert len(N) == 2 61 | self.comm = comm 62 | self.float, self.complex, self.mpitype = float, complex, mpitype = datatypes(precision) 63 | self.num_processes = comm.Get_size() 64 | self.rank = comm.Get_rank() 65 | self.padsize = padsize 66 | self.threads = threads 67 | self.planner_effort = planner_effort 68 | # Each cpu gets ownership of Np indices 69 | self.Np = N // self.num_processes 70 | self.Nf = N[1]//2+1 71 | self.Npf = self.Np[1]//2+1 if self.rank+1 == self.num_processes else self.Np[1]//2 72 | self.Nfp = int(padsize*self.N[1]/2+1) 73 | self.ks = (fftfreq(N[0])*N[0]).astype(int) 74 | self.dealias = zeros(0) 75 | self.work_arrays = work_arrays() 76 | 77 | def real_shape(self): 78 | """The local shape of the real data""" 79 | return (self.Np[0], self.N[1]) 80 | 81 | def complex_shape(self): 82 | """The local shape of the complex data""" 83 | return (self.N[0], self.Npf) 84 | 85 | def global_complex_shape(self): 86 | """The local shape of the complex data""" 87 | return (self.N[0], self.Nf) 88 | 89 | def global_real_shape(self): 90 | """The local shape of the complex data""" 91 | return (self.N[0], self.N[1]) 92 | 93 | def real_local_slice(self, padsize=1): 94 | return (slice(int(padsize*self.rank*self.Np[0]), 95 | int(padsize*(self.rank+1)*self.Np[0]), 1), 96 | slice(0, int(padsize*self.N[1]))) 97 | 98 | def complex_local_slice(self): 99 | return (slice(0, self.N[0]), 100 | slice(self.rank*self.Np[1]//2, self.rank*self.Np[1]//2+self.Npf, 1)) 101 | 102 | def get_N(self): 103 | return self.N 104 | 105 | def get_local_mesh(self): 106 | # Create the mesh 107 | X = np.mgrid[self.rank*self.Np[0]:(self.rank+1)*self.Np[0], :self.N[1]].astype(self.float) 108 | X[0] *= self.L[0]/self.N[0] 109 | X[1] *= self.L[1]/self.N[1] 110 | return X 111 | 112 | def get_local_wavenumbermesh(self, scaled=True, broadcast=False, 113 | eliminate_highest_freq=False): 114 | kx = fftfreq(self.N[0], 1./self.N[0]) 115 | ky = rfftfreq(self.N[1], 1./self.N[1]) 116 | if eliminate_highest_freq: 117 | for i, k in enumerate((kx, ky)): 118 | if self.N[i] % 2 == 0: 119 | k[self.N[i]//2] = 0 120 | 121 | Ks = np.meshgrid(kx, ky[self.rank*self.Np[1]//2:(self.rank*self.Np[1]//2+self.Npf)], indexing='ij', sparse=True) 122 | if scaled is True: 123 | Lp = 2*np.pi/self.L 124 | Ks[0] *= Lp[0] 125 | Ks[1] *= Lp[1] 126 | K = Ks 127 | if broadcast is True: 128 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks] 129 | return K 130 | 131 | def get_dealias_filter(self): 132 | """Filter for dealiasing nonlinear convection""" 133 | K = self.get_local_wavenumbermesh() 134 | kmax = 2./3.*(self.N//2+1) 135 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1]), dtype=np.uint8) 136 | return dealias 137 | 138 | def global_complex_shape_padded(self): 139 | """Global size of problem in complex wavenumber space""" 140 | return 
(int(self.padsize*self.N[0]), int(self.padsize*self.N[1]/2+1)) 141 | 142 | def real_shape_padded(self): 143 | """The local shape of the real data""" 144 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1])) 145 | 146 | def complex_padded_xy(self): 147 | """The local shape of the real data""" 148 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]/2+1)) 149 | 150 | def complex_shape_padded_01(self): 151 | """The local shape of the real data""" 152 | return (int(self.padsize*self.Np[0]), self.Nf) 153 | 154 | def complex_padded_x(self): 155 | """Padding in x-direction""" 156 | return (int(self.padsize*self.N[0]), self.Npf) 157 | 158 | def work_shape(self, dealias): 159 | """Shape of work arrays used in convection with dealiasing. Different shape whether or not padding is involved""" 160 | if dealias == '3/2-rule': 161 | return self.real_shape_padded() 162 | 163 | else: 164 | return self.real_shape() 165 | 166 | def copy_to_padded_x(self, fu, fp): 167 | fp[:self.N[0]//2] = fu[:self.N[0]//2] 168 | fp[-(self.N[0]//2):] = fu[self.N[0]//2:] 169 | return fp 170 | 171 | def copy_to_padded_y(self, fu, fp): 172 | fp[:, :self.Nf] = fu[:] 173 | return fp 174 | 175 | def copy_from_padded_y(self, fp, fu): 176 | fu[:] = fp[:, :self.Nf] 177 | return fu 178 | 179 | def fft2(self, u, fu, dealias=None): 180 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 181 | 182 | if self.num_processes == 1: 183 | if not dealias == '3/2-rule': 184 | fu = rfft2(u, fu, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['rfft2']) 185 | 186 | else: 187 | fu_padded = self.work_arrays[(self.global_complex_shape_padded(), self.complex, 0)] 188 | fu_padded = rfft2(u/self.padsize**2, fu_padded, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['rfft2']) 189 | fu[:] = fu_padded[self.ks, :self.Nf] 190 | 191 | return fu 192 | 193 | if not dealias == '3/2-rule': 194 | 195 | # Work arrays 196 | Uc_hatT = self.work_arrays[((self.Np[0], self.Nf), self.complex, 0)] 197 | U_send = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1]//2), self.complex, 0)] 198 | U_sendr = U_send.reshape((self.N[0], self.Np[1]//2)) 199 | Uc = self.work_arrays[((self.N[0], self.Np[1]//2), self.complex, 0)] 200 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)] 201 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)] 202 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)] 203 | 204 | # Transform in y-direction 205 | Uc_hatT = rfft(u, Uc_hatT, axis=1, threads=self.threads, planner_effort=self.planner_effort['rfft']) 206 | Uc_hatT[:, 0] += 1j*Uc_hatT[:, -1] 207 | 208 | U_send = transpose_x(U_send, Uc_hatT, self.num_processes) 209 | 210 | # Communicate all values 211 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 212 | 213 | Uc = fft(U_sendr, Uc, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft']) 214 | fu[:, :self.Np[1]//2] = Uc 215 | 216 | # Handle Nyquist frequency 217 | if self.rank == 0: 218 | fft_y = swap_Nq(fft_y, fu, fft_x, self.N[0]) 219 | self.comm.Send([fft_y, self.mpitype], dest=self.num_processes-1, tag=77) 220 | 221 | elif self.rank == self.num_processes-1: 222 | self.comm.Recv([fft_y, self.mpitype], source=0, tag=77) 223 | fu[:, -1] = fft_y 224 | 225 | else: 226 | # Work arrays 227 | U_send = self.work_arrays[((self.num_processes, int(self.padsize*self.Np[0]), self.Np[1]//2), self.complex, 0)] 228 | U_sendr = U_send.reshape((int(self.padsize*self.N[0]), self.Np[1]//2)) 229 | fu_padded_xy = 
self.work_arrays[(self.complex_padded_xy(), self.complex, 0)] 230 | fu_padded_xy2 = self.work_arrays[(self.complex_shape_padded_01(), self.complex, 0)] 231 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)] 232 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)] 233 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)] 234 | 235 | # Transform in y-direction 236 | fu_padded_xy = rfft(u/self.padsize, fu_padded_xy, axis=1, threads=self.threads, planner_effort=self.planner_effort['rfft']) 237 | fu_padded_xy2 = self.copy_from_padded_y(fu_padded_xy, fu_padded_xy2) 238 | fu_padded_xy2[:, 0] += 1j*fu_padded_xy2[:, -1] 239 | 240 | U_send = transpose_x(U_send, fu_padded_xy2, self.num_processes) 241 | 242 | # Communicate all values 243 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 244 | 245 | U_sendr = fft(U_sendr/self.padsize, U_sendr, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft']) 246 | 247 | fu.fill(0) 248 | fu[:self.N[0]//2+1, :self.Np[1]//2] = U_sendr[:self.N[0]//2+1] 249 | fu[self.N[0]//2:, :self.Np[1]//2] += U_sendr[-self.N[0]//2:] 250 | 251 | # Handle Nyquist frequency 252 | if self.rank == 0: 253 | fft_y = swap_Nq(fft_y, fu, fft_x, self.N[0]) 254 | self.comm.Send([fft_y, self.mpitype], dest=self.num_processes-1, tag=77) 255 | 256 | elif self.rank == self.num_processes-1: 257 | self.comm.Recv([fft_y, self.mpitype], source=0, tag=77) 258 | fu[:, -1] = fft_y 259 | 260 | return fu 261 | 262 | def ifft2(self, fu, u, dealias=None): 263 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 264 | 265 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 266 | self.dealias = self.get_dealias_filter() 267 | 268 | fu_ = fu 269 | if dealias == '2/3-rule': 270 | fu_ = self.work_arrays[(fu, 0, False)] 271 | fu_[:] = fu 272 | fu_ *= self.dealias 273 | 274 | if self.num_processes == 1: 275 | if not dealias == '3/2-rule': 276 | u = irfft2(fu_, u, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['irfft2']) 277 | 278 | else: 279 | fu_padded = self.work_arrays[(self.global_complex_shape_padded(), self.complex, 0)] 280 | fu_padded[self.ks, :self.Nf] = fu[:] 281 | u = irfft2(fu_padded*self.padsize**2, u, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['irfft2']) 282 | 283 | return u 284 | 285 | if not dealias == '3/2-rule': 286 | # Get some work arrays 287 | Uc_hat = self.work_arrays[((self.N[0], self.Npf), self.complex, 0)] 288 | Uc_hatT = self.work_arrays[((self.Np[0], self.Nf), self.complex, 0)] 289 | U_send = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1]//2), self.complex, 0)] 290 | U_sendr = U_send.reshape((self.N[0], self.Np[1]//2)) 291 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)] 292 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)] 293 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)] 294 | 295 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft']) 296 | U_sendr[:] = Uc_hat[:, :self.Np[1]//2] 297 | 298 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 299 | 300 | Uc_hatT = transpose_y(Uc_hatT, U_sendr, self.num_processes) 301 | 302 | if self.rank == self.num_processes-1: 303 | fft_y[:] = Uc_hat[:, -1] 304 | 305 | self.comm.Scatter(fft_y, plane_recv, root=self.num_processes-1) 306 | Uc_hatT[:, -1] = plane_recv 307 | 308 | u = irfft(Uc_hatT, u, axis=1, threads=self.threads, planner_effort=self.planner_effort['irfft']) 309 | 310 | else: 311 | U_send = 
self.work_arrays[((self.num_processes, int(self.padsize*self.Np[0]), self.Np[1]//2), self.complex, 0)] 312 | U_sendr = U_send.reshape((int(self.padsize*self.N[0]), self.Np[1]//2)) 313 | Uc_hatT = self.work_arrays[((int(self.padsize*self.Np[0]), self.Nf), self.complex, 0)] 314 | fu_padded_x = self.work_arrays[(self.complex_padded_x(), self.complex, 0)] 315 | fu_padded_x2= self.work_arrays[(self.complex_padded_x(), self.complex, 1)] 316 | fu_padded_xy = self.work_arrays[(self.complex_padded_xy(), self.complex, 0)] 317 | fft_y = self.work_arrays[((int(self.padsize*self.N[0]),), self.complex, 0)] 318 | fft_x = self.work_arrays[((int(self.padsize*self.N[0]),), self.complex, 1)] 319 | plane_recv = self.work_arrays[((int(self.padsize*self.Np[0]),), self.complex, 2)] 320 | 321 | fu_padded_x2 = self.copy_to_padded_x(fu, fu_padded_x2) 322 | fu_padded_x = ifft(fu_padded_x2, fu_padded_x, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft']) 323 | 324 | U_sendr[:] = fu_padded_x[:, :self.Np[1]//2] 325 | 326 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 327 | 328 | Uc_hatT = transpose_y(Uc_hatT, U_sendr, self.num_processes) 329 | 330 | if self.rank == self.num_processes-1: 331 | fft_y[:] = fu_padded_x[:, -1] 332 | 333 | self.comm.Scatter(fft_y, plane_recv, root=self.num_processes-1) 334 | Uc_hatT[:, -1] = plane_recv 335 | 336 | fu_padded_xy = self.copy_to_padded_y(Uc_hatT, fu_padded_xy) 337 | 338 | u = irfft(fu_padded_xy*self.padsize**2, u, axis=1, threads=self.threads, planner_effort=self.planner_effort['irfft']) 339 | 340 | return u 341 | -------------------------------------------------------------------------------- /mpiFFT4py/mpibase.py: -------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-04-14" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | import numpy as np 7 | from mpi4py import MPI 8 | import collections 9 | 10 | # Possible way to give numpy arrays attributes... 
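# (Added summary, for orientation: the subclass sketch below is kept only as a
# commented-out reference. The module as it stands simply aliases
# Empty, Zeros = np.empty, np.zeros and, when pyfftw can be imported, provides
# empty()/zeros() wrappers around pyfftw.empty_aligned/pyfftw.zeros_aligned so
# that work arrays get the byte alignment (n=16 by default) that FFTW prefers.)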
11 | #class Empty(np.ndarray): 12 | #"""Numpy empty array with additional info dictionary to hold attributes 13 | #""" 14 | #def __new__(subtype, shape, dtype=np.float, info={}): 15 | #obj = np.ndarray.__new__(subtype, shape, dtype) 16 | #obj.info = info 17 | #return obj 18 | 19 | #def __array_finalize__(self, obj): 20 | #if obj is None: return 21 | #self.info = getattr(obj, 'info', {}) 22 | 23 | #class Zeros(np.ndarray): 24 | #"""Numpy zeros array with additional info dictionary to hold attributes 25 | #""" 26 | #def __new__(subtype, shape, dtype=float, info={}): 27 | #obj = np.ndarray.__new__(subtype, shape, dtype) 28 | #obj.fill(0) 29 | #obj.info = info 30 | #return obj 31 | 32 | #def __array_finalize__(self, obj): 33 | #if obj is None: return 34 | #self.info = getattr(obj, 'info', {}) 35 | 36 | Empty, Zeros = np.empty, np.zeros 37 | 38 | try: 39 | import pyfftw 40 | def empty(N, dtype=np.float, bytes=16): 41 | return pyfftw.empty_aligned(N, dtype=dtype, n=bytes) 42 | 43 | def zeros(N, dtype=np.float, bytes=16): 44 | return pyfftw.zeros_aligned(N, dtype=dtype, n=bytes) 45 | 46 | except ImportError: 47 | def empty(N, dtype=np.float, bytes=None): 48 | return Empty(N, dtype=dtype) 49 | 50 | def zeros(N, dtype=np.float, bytes=None): 51 | return Zeros(N, dtype=dtype) 52 | 53 | class work_array_dict(dict): 54 | """Dictionary of work arrays indexed by their shape, type and an indicator i.""" 55 | def __missing__(self, key): 56 | shape, dtype, i = key 57 | a = zeros(shape, dtype=dtype) 58 | self[key] = a 59 | return self[key] 60 | 61 | class work_arrays(collections.MutableMapping): 62 | """A dictionary to hold numpy work arrays. 63 | 64 | The dictionary allows two types of keys for the same item. 65 | 66 | keys: 67 | - (shape, dtype, index (, fillzero)), where shape is tuple, dtype is np.dtype and 68 | index an integer 69 | - (ndarray, index (, fillzero)), where ndarray is a numpy array and index is 70 | an integer 71 | fillzero is an optional bool that determines 72 | whether the array is initialised to zero 73 | 74 | Usage: 75 | To create two real work arrays of shape (3,3), do: 76 | - work = work_arrays() 77 | - a = work[((3,3), np.float, 0)] 78 | - b = work[(a, 1)] 79 | 80 | Returns: 81 | Numpy array of given shape. The array is by default initialised to zero, but this 82 | can be overridden using the fillzero argument.
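        For illustration (an added example, following the key formats above), a
        fourth key entry set to False returns the same cached array without
        zero-filling it first:
        - c = work[((3, 3), np.float, 0, False)]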
83 | 84 | """ 85 | 86 | def __init__(self): 87 | self.store = work_array_dict() 88 | self.fillzero = True 89 | 90 | def __getitem__(self, key): 91 | val = self.store[self.__keytransform__(key)] 92 | if self.fillzero is True: val.fill(0) 93 | return val 94 | 95 | def __setitem__(self, key, value): 96 | self.store[self.__keytransform__(key)] = value 97 | 98 | def __delitem__(self, key): 99 | del self.store[self.__keytransform__(key)] 100 | 101 | def __iter__(self): 102 | return iter(self.store) 103 | 104 | def __len__(self): 105 | return len(self.store) 106 | 107 | def values(self): 108 | raise TypeError('Work arrays not iterable') 109 | 110 | def __keytransform__(self, key): 111 | if isinstance(key[0], np.ndarray): 112 | shape = key[0].shape 113 | dtype = key[0].dtype 114 | i = key[1] 115 | zero = True if len(key) == 2 else key[2] 116 | 117 | elif isinstance(key[0], tuple): 118 | if len(key) == 3: 119 | shape, dtype, i = key 120 | zero = True 121 | 122 | elif len(key) == 4: 123 | shape, dtype, i, zero = key 124 | 125 | else: 126 | raise TypeError("Wrong type of key for work array") 127 | 128 | assert isinstance(zero, bool) 129 | assert isinstance(i, int) 130 | self.fillzero = zero 131 | return (shape, np.dtype(dtype), i) 132 | 133 | def datatypes(precision): 134 | """Return datatypes associated with precision.""" 135 | assert precision in ("single", "double") 136 | return {"single": (np.float32, np.complex64, MPI.C_FLOAT_COMPLEX), 137 | "double": (np.float64, np.complex128, MPI.C_DOUBLE_COMPLEX)}[precision] 138 | -------------------------------------------------------------------------------- /mpiFFT4py/pencil.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | """Pencil decomposition 3 | 4 | This module contains classes for performing FFTs with pencil decomposition 5 | of three-dimensional data structures data[Nx,Ny,Nz], where (Nx, Ny, Nz) is 6 | the shape of the input data. With slab decomposition only one of these three 7 | indices is shared, leading to local datastructures on each processor 8 | with shape data[Nx/P, Ny, Nz], where P is the total number of processors. 9 | With pencil, two of the input arrays indices are shared, leading to local 10 | data of shape (Nx/P1, Ny/P2, Nz), i.e., pencils aligned in the z-direction. 11 | 12 | The final transformed data can be aligned in either the y-direction or 13 | the x-direction. 14 | 15 | classes: 16 | R2CX - For real to complex transforms. Final alignment in x-direction 17 | Args: 18 | N - NumPy array([Nx, Ny, Nz]) setting the dimensions of the real mesh 19 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain 20 | comm - The MPI communicator object 21 | precision - "single" or "double" 22 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 23 | padsize - The size of padding, if padding is used in transforms 24 | threads - Number of threads used by FFTs 25 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", 26 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 27 | 28 | R2CY - For real to complex transforms. 
Final alignment in y-direction 29 | Args: 30 | N - NumPy array([Nx, Ny, Nz]) number of nodes for the real mesh 31 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain 32 | comm - The MPI communicator object 33 | precision - "single" or "double" 34 | P1 - Decomposition along first dimension 35 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 36 | padsize - The size of padding, if padding is used in transforms 37 | threads - Number of threads used by FFTs 38 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE", 39 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 40 | 41 | function: 42 | R2C 43 | 44 | Args: 45 | N - NumPy array([Nx, Ny, Nz]) number of nodes for the real mesh 46 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain 47 | comm - The MPI communicator object 48 | precision - "single" or "double" 49 | P1 - Decomposition along first dimension 50 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 51 | padsize - The size of padding, if padding is used in transforms 52 | threads - Number of threads used by FFTs 53 | alignment - Final alignment, ('X' or 'Y') 54 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE", 55 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 56 | 57 | """ 58 | __author__ = "Mikael Mortensen " 59 | __date__ = "2016-02-16" 60 | __copyright__ = "Copyright (C) 2016 " + __author__ 61 | __license__ = "GNU Lesser GPL version 3 or any later version" 62 | 63 | from .serialFFT import * 64 | import numpy as np 65 | from .mpibase import work_arrays, datatypes 66 | from .cython.maths import dealias_filter 67 | from numpy.fft import fftfreq, rfftfreq 68 | from collections import defaultdict 69 | from mpi4py import MPI 70 | 71 | #__all__ = ['R2C'] 72 | 73 | # Using Lisandro Dalcin's code for Alltoallw. 74 | # Note that _subsize and _distribution are modified for a mesh of power two. 
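# Added illustration of the assumption made here: for the shapes passed in below
# the remainder r = N % size is either 0 (power-of-two mesh) or 1 (the extra
# point from the real-to-complex direction, Q = N[2]//2 + 1), and that single
# extra point goes to the last rank. For example, _subsize(33, 4, rank) gives
# 8 for ranks 0-2 and 9 for rank 3, consistent with
# _distribution(33, 4) -> (8, 0), (8, 8), (8, 16), (9, 24).
# The generic variants (commented out) would spread any remainder over the
# first ranks instead.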
75 | 76 | def _subsize(N, size, rank): 77 | return N // size + ((N % size) * (rank == size -1)) 78 | #return N // size + (N % size > rank) # Generic 79 | 80 | def _distribution(N, size): 81 | q = N // size 82 | r = N % size 83 | n = s = i = 0 84 | while i < size: 85 | n = q 86 | s = q * i 87 | if r == 1 and i+1 == size: 88 | n += 1 89 | yield n, s 90 | i += 1 91 | 92 | # Generic 93 | #def _distribution2(N, size): 94 | #q = N // size 95 | #r = N % size 96 | #n = s = i = 0 97 | #while i < size: 98 | #n = q 99 | #s = q * i 100 | #if i < r: 101 | #n += 1 102 | #s += i 103 | #else: 104 | #s += r 105 | #yield n, s 106 | #i += 1 107 | 108 | 109 | def transform_Uc_xz(Uc_hat_x, Uc_hat_z, P1): 110 | sz = Uc_hat_z.shape 111 | sx = Uc_hat_x.shape 112 | Uc_hat_x[:] = np.rollaxis(Uc_hat_z[:,:,:-1].reshape((sz[0], sz[1], P1, sx[2])), 2).reshape(sx) 113 | return Uc_hat_x 114 | 115 | def transform_Uc_zx(Uc_hat_z, Uc_hat_xr, P1): 116 | sz = Uc_hat_z.shape 117 | sx = Uc_hat_xr.shape 118 | Uc_hat_z[:, :, :-1] = np.rollaxis(Uc_hat_xr.reshape((P1, sz[0], sz[1], sx[2])), 0, 3).reshape((sz[0], sz[1], sz[2]-1)) 119 | return Uc_hat_z 120 | 121 | def transform_Uc_xy(Uc_hat_x, Uc_hat_y, P): 122 | sy = Uc_hat_y.shape 123 | sx = Uc_hat_x.shape 124 | Uc_hat_x[:] = np.rollaxis(Uc_hat_y.reshape((sy[0], P, sx[1], sx[2])), 1).reshape(sx) 125 | return Uc_hat_x 126 | 127 | def transform_Uc_yx(Uc_hat_y, Uc_hat_x, P): 128 | sy = Uc_hat_y.shape 129 | sx = Uc_hat_x.shape 130 | Uc_hat_y[:] = np.rollaxis(Uc_hat_x.reshape((P, sx[0]//P, sx[1], sx[2])), 1).reshape(sy) 131 | return Uc_hat_y 132 | 133 | def transform_Uc_yz(Uc_hat_y, Uc_hat_z, P): 134 | sz = Uc_hat_z.shape 135 | sy = Uc_hat_y.shape 136 | Uc_hat_y[:] = np.rollaxis(Uc_hat_z[:,:,:-1].reshape((sz[0], sz[1], P, sy[2])), 1, 3).reshape(sy) 137 | return Uc_hat_y 138 | 139 | def transform_Uc_zy(Uc_hat_z, Uc_hat_y, P): 140 | sz = Uc_hat_z.shape 141 | sy = Uc_hat_y.shape 142 | Uc_hat_z[:, :, :-1] = np.rollaxis(Uc_hat_y.reshape((sy[0], P, sz[1], sy[2])), 1, 3).reshape((sz[0], sz[1], sz[2]-1)) 143 | return Uc_hat_z 144 | 145 | class R2CY(object): 146 | """Class for performing FFT in 3D using MPI 147 | 148 | Pencil decomposition 149 | 150 | Args: 151 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh 152 | L - NumPy array([Lx, Ly, Lz]) The actual size of the computational domain 153 | comm - The MPI communicator object 154 | precision - "single" or "double" 155 | P1 - Decomposition along first dimension 156 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 157 | padsize - The size of padding, if padding is used in transforms 158 | threads - Number of threads used by FFTs 159 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 160 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 161 | 162 | This version has the final complex data aligned in the y-direction, in agreement 163 | with the paper in CPC (http://arxiv.org/pdf/1602.03638v1.pdf) 164 | 165 | """ 166 | 167 | def __init__(self, N, L, comm, precision, P1=None, communication='Alltoallw', padsize=1.5, threads=1, 168 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 169 | self.N = N 170 | assert len(L) == 3 171 | assert len(N) == 3 172 | self.Nf = N[2]//2+1 # Number of independent complex wavenumbers in z-direction 173 | self.comm = comm 174 | self.float, self.complex, self.mpitype = float, complex, mpitype = datatypes(precision) 175 | self.num_processes = comm.Get_size() 176 | assert self.num_processes > 1 177 | 
self.L = L.astype(float) 178 | self.dealias = np.zeros(0) 179 | self.communication = communication 180 | self.padsize = padsize 181 | self.threads = threads 182 | self.planner_effort = planner_effort 183 | self.rank = comm.Get_rank() 184 | if P1 is None: 185 | P1, P2 = MPI.Compute_dims(self.num_processes, 2) 186 | self.P1, self.P2 = P1, P2 187 | else: 188 | self.P1 = P1 189 | self.P2 = P2 = self.num_processes // P1 190 | self.N1 = N // P1 191 | self.N2 = N // P2 192 | self.comm0 = comm.Split(self.rank/P1) 193 | self.comm1 = comm.Split(self.rank%P1) 194 | self.comm0_rank = self.comm0.Get_rank() 195 | self.comm1_rank = self.comm1.Get_rank() 196 | self.work_arrays = work_arrays() 197 | self.N1f = self.N1[2]//2 if self.comm0_rank < self.P1-1 else self.N1[2]//2+1 198 | if self.communication == 'AlltoallN': 199 | self.N1f = self.N1[2]//2 200 | 201 | if not (self.num_processes % 2 == 0 or self.num_processes == 1): 202 | raise IOError("Number of cpus must be even") 203 | 204 | if (P1 % 2 != 0) or (P2 % 2 != 0): 205 | raise IOError("Number of cpus in each direction must be even power of 2") 206 | 207 | self._subarrays1A = [] 208 | self._subarrays1B = [] 209 | self._subarrays2A = [] 210 | self._subarrays2B = [] 211 | self._subarrays1A_pad = [] 212 | self._subarrays1B_pad = [] 213 | self._subarrays2A_pad = [] 214 | self._subarrays2B_pad = [] 215 | self._counts_displs1 = None 216 | self._counts_displs2 = None 217 | 218 | def get_subarrays(self, padsize=1): 219 | datatype = MPI._typedict[np.dtype(self.complex).char] 220 | M, N, Q = self.N[0], self.N[1], self.Nf 221 | m = _subsize(M, self.P2, self.comm1_rank) 222 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank) 223 | q = _subsize(Q, self.P1, self.comm0_rank) 224 | _subarrays1A = [ 225 | datatype.Create_subarray([m,int(padsize*N),q], [m,l,q], [0,s,0]).Commit() 226 | for l, s in _distribution(int(padsize*N), self.P2) 227 | ] 228 | _subarrays1B = [ 229 | datatype.Create_subarray([M,n,q], [l,n,q], [s,0,0]).Commit() 230 | for l, s in _distribution(M, self.P2) 231 | ] 232 | _counts_displs1 = ([1] * self.P2, [0] * self.P2) 233 | 234 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank) 235 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank) 236 | q = _subsize(Q, self.P1, self.comm0_rank) 237 | _subarrays2A = [ 238 | datatype.Create_subarray([int(padsize*M),n,q], [l,n,q], [s,0,0]).Commit() 239 | for l, s in _distribution(int(padsize*M), self.P1) 240 | ] 241 | _subarrays2B = [ 242 | datatype.Create_subarray([m,n,Q], [m,n,l], [0,0,s]).Commit() 243 | for l, s in _distribution(Q, self.P1) 244 | ] 245 | _counts_displs2 = ([1] * self.P1, [0] * self.P1) 246 | return _subarrays1A, _subarrays1B, _subarrays2A, _subarrays2B, _counts_displs1, _counts_displs2 247 | 248 | def real_shape(self): 249 | """The local shape of the real data""" 250 | return (self.N1[0], self.N2[1], self.N[2]) 251 | 252 | def complex_shape(self): 253 | """The local shape of the complex data""" 254 | return (self.N2[0], self.N[1], self.N1f) 255 | 256 | def complex_shape_T(self): 257 | """The local transposed shape of the complex data""" 258 | return (self.Np[0], self.N[1], self.Nf) 259 | 260 | def complex_shape_I(self): 261 | """A local intermediate shape of the complex data""" 262 | return (self.Np[0], self.num_processes, self.Np[1], self.Nf) 263 | 264 | def real_shape_padded(self): 265 | return (int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), int(self.padsize*self.N[2])) 266 | 267 | def work_shape(self, dealias): 268 | """Shape of work arrays used in convection with 
dealiasing. Different shape whether or not padding is involved""" 269 | if dealias == '3/2-rule': 270 | return self.real_shape_padded() 271 | 272 | else: 273 | return self.real_shape() 274 | 275 | def real_local_slice(self, padsize=1): 276 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane 277 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane 278 | return (slice(int(padsize * xzrank * self.N1[0]), int(padsize * (xzrank+1) * self.N1[0]), 1), 279 | slice(int(padsize * xyrank * self.N2[1]), int(padsize * (xyrank+1) * self.N2[1]), 1), 280 | slice(0, int(padsize*self.N[2]))) 281 | 282 | def complex_local_slice(self): 283 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane 284 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane 285 | return (slice(xyrank*self.N2[0], (xyrank+1)*self.N2[0], 1), 286 | slice(0, self.N[1]), 287 | slice(xzrank*self.N1[2]//2, xzrank*self.N1[2]//2 + self.N1f, 1)) 288 | 289 | def complex_local_wavenumbers(self): 290 | s = self.complex_local_slice() 291 | return (fftfreq(self.N[0], 1./self.N[0]).astype(int)[s[0]], 292 | fftfreq(self.N[1], 1./self.N[1]).astype(int), 293 | rfftfreq(self.N[2], 1./self.N[2]).astype(int)[s[2]]) 294 | 295 | def get_P(self): 296 | return self.P1, self.P2 297 | 298 | def get_local_mesh(self): 299 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane 300 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane 301 | 302 | # Create the physical mesh 303 | x1 = slice(xzrank * self.N1[0], (xzrank+1) * self.N1[0], 1) 304 | x2 = slice(xyrank * self.N2[1], (xyrank+1) * self.N2[1], 1) 305 | X = np.ogrid[x1, x2, :self.N[2]] 306 | 307 | X[0] = (X[0]*self.L[0]/self.N[0]).astype(self.float) 308 | X[1] = (X[1]*self.L[1]/self.N[1]).astype(self.float) 309 | X[2] = (X[2]*self.L[2]/self.N[2]).astype(self.float) 310 | X = [np.broadcast_to(x, self.real_shape()) for x in X] 311 | return X 312 | 313 | def get_local_wavenumbermesh(self, scaled=False, broadcast=False, 314 | eliminate_highest_freq=False): 315 | """Returns (scaled) local decomposed wavenumbermesh 316 | 317 | If scaled is True, then the wavenumbermesh is scaled with physical mesh 318 | size. 
This takes care of mapping the physical domain to a computational 319 | cube of size (2pi)**3 320 | 321 | 322 | """ 323 | s = self.complex_local_slice() 324 | kx = fftfreq(self.N[0], 1./self.N[0]).astype(int) 325 | ky = fftfreq(self.N[1], 1./self.N[1]).astype(int) 326 | kz = rfftfreq(self.N[2], 1./self.N[2]).astype(int) 327 | if eliminate_highest_freq: 328 | for i, k in enumerate((kx, ky, kz)): 329 | if self.N[i] % 2 == 0: 330 | k[self.N[i]//2] = 0 331 | kx = kx[s[0]] 332 | kz = kz[s[2]] 333 | Ks = np.meshgrid(kx, ky, kz, indexing='ij', sparse=True) 334 | if scaled is True: 335 | Lp = 2*np.pi/self.L 336 | for i in range(3): 337 | Ks[i] = (Ks[i]*Lp[i]).astype(self.float) 338 | K = Ks 339 | if broadcast is True: 340 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks] 341 | return K 342 | 343 | def get_dealias_filter(self): 344 | """Filter for dealiasing nonlinear convection""" 345 | K = self.get_local_wavenumbermesh() 346 | kmax = 2./3.*(self.N//2+1) 347 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1])* 348 | (abs(K[2]) < kmax[2]), dtype=np.uint8) 349 | return dealias 350 | 351 | def copy_to_padded_x(self, fu, fp): 352 | fp[:self.N[0]//2] = fu[:self.N[0]//2] 353 | fp[-(self.N[0]//2):] = fu[self.N[0]//2:] 354 | return fp 355 | 356 | def copy_to_padded_y(self, fu, fp): 357 | fp[:, :self.N[1]//2] = fu[:, :self.N[1]//2] 358 | fp[:, -(self.N[1]//2):] = fu[:, self.N[1]//2:] 359 | return fp 360 | 361 | def copy_to_padded_z(self, fu, fp): 362 | fp[:, :, :self.Nf] = fu[:] 363 | return fp 364 | 365 | def copy_from_padded_z(self, fp, fu): 366 | fu[:] = fp[:, :, :self.Nf] 367 | return fu 368 | 369 | def copy_from_padded_x(self, fp, fu): 370 | fu.fill(0) 371 | fu[:self.N[0]//2+1] = fp[:self.N[0]//2+1] 372 | fu[self.N[0]//2:] += fp[-self.N[0]//2:] 373 | return fu 374 | 375 | def copy_from_padded_y(self, fp, fu): 376 | fu.fill(0) 377 | fu[:, :self.N[1]//2+1] = fp[:, :self.N[1]//2+1] 378 | fu[:, self.N[1]//2:] += fp[:, -self.N[1]//2:] 379 | return fu 380 | 381 | def global_complex_shape(self, padsize=1.0): 382 | """Global size of problem in complex wavenumber space""" 383 | return (int(padsize*self.N[0]), int(padsize*self.N[1]), 384 | int(padsize*self.N[2]//2+1)) 385 | 386 | def ifftn(self, fu, u, dealias=None): 387 | """ifft in three directions using mpi. 
388 | Need to do ifft in reversed order of fft 389 | """ 390 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 391 | 392 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 393 | self.dealias = self.get_dealias_filter() 394 | 395 | # Strip off self 396 | N, N1, N2, Nf, N1f = self.N, self.N1, self.N2, self.Nf, self.N1f 397 | 398 | if not dealias == '3/2-rule': 399 | 400 | fu_ = fu 401 | if dealias == '2/3-rule': 402 | fu_ = self.work_arrays[(fu, 0, False)] 403 | fu_[:] = fu 404 | fu_ = dealias_filter(fu_, self.dealias) 405 | #fu_ *= self.dealias 406 | 407 | Uc_hat_y = self.work_arrays[((N2[0], N[1], N1f), self.complex, 0, False)] 408 | Uc_hat_z = self.work_arrays[((N1[0], N2[1], Nf), self.complex, 0, False)] 409 | 410 | if self.communication == 'AlltoallN': 411 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0, False)] 412 | 413 | # Do first owned direction 414 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads, 415 | planner_effort=self.planner_effort['ifft']) 416 | 417 | # Transform to x all but k=N//2 (the neglected Nyquist mode) 418 | Uc_hat_x[:] = transform_Uc_xy(Uc_hat_x, Uc_hat_y, self.P2) 419 | 420 | # Communicate in xz-plane and do fft in x-direction 421 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 422 | Uc_hat_x[:] = ifft(Uc_hat_x, axis=0, threads=self.threads, 423 | planner_effort=self.planner_effort['ifft']) 424 | 425 | # Communicate and transform in xy-plane 426 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 427 | Uc_hat_z[:] = transform_Uc_zx(Uc_hat_z, Uc_hat_x, self.P1) 428 | 429 | # Do fft for z-direction 430 | Uc_hat_z[:, :, -1] = 0 431 | u[:] = irfft(Uc_hat_z, overwrite_input=True, axis=2, threads=self.threads, 432 | planner_effort=self.planner_effort['irfft']) 433 | 434 | elif self.communication == 'Alltoall': 435 | # Additional work arrays 436 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0, False)] 437 | Uc_hat_xp = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0, False)] 438 | xy_plane = self.work_arrays[((N[0], N2[1]), self.complex, 0, False)] 439 | xy_recv = self.work_arrays[((N1[0], N2[1]), self.complex, 0, False)] 440 | 441 | # Do first owned direction 442 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads, 443 | planner_effort=self.planner_effort['ifft']) 444 | 445 | # Transform to x 446 | Uc_hat_xp = transform_Uc_xy(Uc_hat_xp, Uc_hat_y, self.P2) 447 | 448 | ###### In-place 449 | ## Communicate in xz-plane and do fft in x-direction 450 | #self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_xp, self.mpitype]) 451 | #Uc_hat_xp[:] = ifft(Uc_hat_xp, axis=0, threads=self.threads, 452 | #planner_effort=self.planner_effort['ifft']) 453 | 454 | #Uc_hat_x[:] = Uc_hat_xp[:, :, :self.N1[2]//2] 455 | 456 | ## Communicate and transform in xy-plane all but k=N//2 457 | #self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 458 | 459 | ####### Not in-place 460 | # Communicate in xz-plane and do fft in x-direction 461 | Uc_hat_xp2 = self.work_arrays[((N[0], N2[1], N1f), self.complex, 1, False)] 462 | self.comm1.Alltoall([Uc_hat_xp, self.mpitype], [Uc_hat_xp2, self.mpitype]) 463 | Uc_hat_xp = ifft(Uc_hat_xp2, Uc_hat_xp, axis=0, threads=self.threads, 464 | planner_effort=self.planner_effort['ifft']) 465 | 466 | Uc_hat_x2 = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 1, False)] 467 | Uc_hat_x2[:] = Uc_hat_xp[:, :, :N1[2]//2] 468 | 469 | # Communicate and transform in xy-plane all but k=N//2 470 | self.comm0.Alltoall([Uc_hat_x2, self.mpitype], [Uc_hat_x, 
self.mpitype]) 471 | ######################### 472 | 473 | Uc_hat_z[:] = transform_Uc_zx(Uc_hat_z, Uc_hat_x, self.P1) 474 | 475 | xy_plane[:] = Uc_hat_xp[:, :, -1] 476 | self.comm0.Scatter(xy_plane, xy_recv, root=self.P1-1) 477 | Uc_hat_z[:, :, -1] = xy_recv 478 | 479 | # Do ifft for z-direction 480 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 481 | planner_effort=self.planner_effort['irfft']) 482 | 483 | elif self.communication == 'Alltoallw': 484 | if len(self._subarrays1A) == 0: 485 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 486 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 487 | 488 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0, False)] 489 | 490 | # Do first owned direction 491 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads, 492 | planner_effort=self.planner_effort['ifft']) 493 | 494 | self.comm1.Alltoallw( 495 | [Uc_hat_y, self._counts_displs1, self._subarrays1A], 496 | [Uc_hat_x, self._counts_displs1, self._subarrays1B]) 497 | 498 | Uc_hat_x[:] = ifft(Uc_hat_x, axis=0, threads=self.threads, 499 | planner_effort=self.planner_effort['ifft']) 500 | 501 | self.comm0.Alltoallw( 502 | [Uc_hat_x, self._counts_displs2, self._subarrays2A], 503 | [Uc_hat_z, self._counts_displs2, self._subarrays2B]) 504 | 505 | # Do fft for z-direction 506 | u[:] = irfft(Uc_hat_z, overwrite_input=True, axis=2, threads=self.threads, 507 | planner_effort=self.planner_effort['irfft']) 508 | 509 | return u 510 | 511 | else: # padded 512 | 513 | padsize = self.padsize 514 | Uc_pad_hat_y = self.work_arrays[((N2[0], int(padsize*N[1]), N1f), self.complex, 0)] 515 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)] 516 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)] 517 | 518 | if self.communication == 'AlltoallN': 519 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 520 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 521 | 522 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y) 523 | 524 | # Do first owned direction 525 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads, 526 | planner_effort=self.planner_effort['ifft']) 527 | 528 | # Transform to x all but k=N//2 (the neglected Nyquist mode) 529 | Uc_pad_hat_x = transform_Uc_xy(Uc_pad_hat_x, Uc_pad_hat_y, self.P2) 530 | 531 | # Communicate in xz-plane 532 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 533 | 534 | # Pad and do fft in x-direction 535 | Uc_pad_hat_xy = self.copy_to_padded_x(Uc_pad_hat_x, Uc_pad_hat_xy) 536 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=0, threads=self.threads, 537 | planner_effort=self.planner_effort['ifft']) 538 | 539 | # Communicate in xy-plane 540 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 541 | 542 | # Transform 543 | Uc_pad_hat_z[:] = transform_Uc_zx(Uc_pad_hat_z, Uc_pad_hat_xy, self.P1) 544 | Uc_pad_hat_z[:, :, -1] = 0 545 | 546 | # Pad in z-dir 547 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 548 | 549 | # Do ifft for z-direction 550 | u = irfft(Uc_pad_hat_z2, u, axis=2, threads=self.threads, 551 | planner_effort=self.planner_effort['irfft']) 552 | 553 | elif self.communication == 'Alltoall': 554 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 555 | Uc_pad_hat_xy = 
self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 556 | Uc_pad_hat_xr2 = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 557 | Uc_pad_hat_xy3 = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)] 558 | xy2_pad_plane = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1])), self.complex, 0)] 559 | xy2_pad_recv = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1])), self.complex, 1)] 560 | 561 | # Pad in y-direction 562 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y) 563 | 564 | # Transform first owned direction 565 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads, 566 | planner_effort=self.planner_effort['ifft']) 567 | 568 | # Transpose datastructure to x 569 | Uc_pad_hat_xr2[:] = transform_Uc_xy(Uc_pad_hat_xr2, Uc_pad_hat_y, self.P2) 570 | 571 | # Communicate in xz-plane and do fft in x-direction 572 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xr2, self.mpitype]) 573 | 574 | # Pad and do fft in x-direction 575 | Uc_pad_hat_xy3 = self.copy_to_padded_x(Uc_pad_hat_xr2, Uc_pad_hat_xy3) 576 | Uc_pad_hat_xy3[:] = ifft(Uc_pad_hat_xy3, axis=0, threads=self.threads, 577 | planner_effort=self.planner_effort['ifft']) 578 | 579 | Uc_pad_hat_xy[:] = Uc_pad_hat_xy3[:, :, :N1[2]//2] 580 | 581 | # Communicate and transform in xy-plane all but k=N//2 582 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 583 | 584 | Uc_pad_hat_z[:] = transform_Uc_zx(Uc_pad_hat_z, Uc_pad_hat_xy, self.P1) 585 | 586 | xy2_pad_plane[:] = Uc_pad_hat_xy3[:, :, -1] 587 | self.comm0.Scatter(xy2_pad_plane, xy2_pad_recv, root=self.P1-1) 588 | Uc_pad_hat_z[:, :, -1] = xy2_pad_recv 589 | 590 | # Pad in z-dir 591 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 592 | 593 | # Do ifft for z-direction 594 | u = irfft(Uc_pad_hat_z2, u, axis=2, overwrite_input=True, threads=self.threads, 595 | planner_effort=self.planner_effort['irfft']) 596 | 597 | elif self.communication == 'Alltoallw': 598 | if len(self._subarrays1A_pad) == 0: 599 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 600 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 601 | 602 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 603 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)] 604 | 605 | # Pad in y-direction 606 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y) 607 | 608 | # Transform first owned direction 609 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads, 610 | planner_effort=self.planner_effort['ifft']) 611 | 612 | self.comm1.Alltoallw( 613 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1A_pad], 614 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1B_pad]) 615 | 616 | # Pad and do fft in x-direction 617 | Uc_pad_hat_xy = self.copy_to_padded_x(Uc_pad_hat_x, Uc_pad_hat_xy) 618 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=0, threads=self.threads, 619 | planner_effort=self.planner_effort['ifft']) 620 | 621 | self.comm0.Alltoallw( 622 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad], 623 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad]) 624 | 625 | # Pad in z-dir 626 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 627 | 628 | # Do fft for z-direction 629 | u = irfft(Uc_pad_hat_z2, u, overwrite_input=True, axis=2, 
threads=self.threads, 630 | planner_effort=self.planner_effort['irfft']) 631 | 632 | return u 633 | 634 | def fftn(self, u, fu, dealias=None): 635 | """fft in three directions using mpi.""" 636 | 637 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 638 | 639 | # Strip off self 640 | N, N1, N2, Nf, N1f = self.N, self.N1, self.N2, self.Nf, self.N1f 641 | 642 | if not dealias == '3/2-rule': 643 | 644 | Uc_hat_y = self.work_arrays[((N2[0], N[1], N1f), self.complex, 0)] 645 | Uc_hat_z = self.work_arrays[((N1[0], N2[1], Nf), self.complex, 0)] 646 | 647 | if self.communication == 'AlltoallN': 648 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0)] 649 | 650 | # Do fft in z direction on owned data 651 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 652 | planner_effort=self.planner_effort['rfft']) 653 | 654 | # Transform to x direction neglecting k=N//2 (Nyquist) 655 | Uc_hat_x = transform_Uc_xz(Uc_hat_x, Uc_hat_z, self.P1) 656 | 657 | # Communicate and do fft in x-direction 658 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 659 | Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads, 660 | planner_effort=self.planner_effort['fft']) 661 | 662 | # Communicate and transform to final y-direction 663 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 664 | Uc_hat_y[:] = transform_Uc_yx(Uc_hat_y, Uc_hat_x, self.P2) 665 | 666 | # Do fft for last direction 667 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads, 668 | planner_effort=self.planner_effort['fft']) 669 | 670 | elif self.communication == 'Alltoall': 671 | 672 | # Additional work arrays 673 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0)] 674 | Uc_hat_xr2= self.work_arrays[((N[0], N2[1], N1f), self.complex, 1)] 675 | xy_plane = self.work_arrays[((N[0], N2[1]), self.complex, 0)] 676 | xy_plane2 = self.work_arrays[((N[0]//2+1, N2[1]), self.complex, 0)] 677 | xy_recv = self.work_arrays[((N1[0], N2[1]), self.complex, 0)] 678 | 679 | # Do fft in z direction on owned data 680 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 681 | planner_effort=self.planner_effort['rfft']) 682 | 683 | # Move real part of Nyquist to k=0 684 | Uc_hat_z[:, :, 0] += 1j*Uc_hat_z[:, :, -1] 685 | 686 | # Transform to x direction neglecting k=N//2 (Nyquist) 687 | Uc_hat_x = transform_Uc_xz(Uc_hat_x, Uc_hat_z, self.P1) 688 | 689 | # In-place 690 | # Communicate and do fft in x-direction 691 | #self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 692 | #Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads, 693 | #planner_effort=self.planner_effort['fft']) 694 | 695 | # Not in-place 696 | Uc_hat_x2 = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 2, False)] 697 | self.comm0.Alltoall([Uc_hat_x, self.mpitype], [Uc_hat_x2, self.mpitype]) 698 | Uc_hat_x = fft(Uc_hat_x2, Uc_hat_x, axis=0, threads=self.threads, 699 | planner_effort=self.planner_effort['fft']) 700 | ################ 701 | 702 | Uc_hat_xr2[:, :, :N1[2]//2] = Uc_hat_x[:] 703 | 704 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 705 | if self.comm0_rank == 0: 706 | M = N[0] 707 | xy_plane[:] = Uc_hat_x[:, :, 0] 708 | xy_plane2[:] = np.vstack((xy_plane[0].real, 0.5*(xy_plane[1:M//2]+np.conj(xy_plane[:M//2:-1])), xy_plane[M//2].real)) 709 | Uc_hat_xr2[:, :, 0] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 710 | xy_plane2[:] = np.vstack((xy_plane[0].imag, -0.5*1j*(xy_plane[1:M//2]-np.conj(xy_plane[:M//2:-1])), xy_plane[M//2].imag)) 711 | xy_plane[:] = 
np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 712 | self.comm0.Send([xy_plane, self.mpitype], dest=self.P1-1, tag=77) 713 | 714 | if self.comm0_rank == self.P1-1: 715 | self.comm0.Recv([xy_plane, self.mpitype], source=0, tag=77) 716 | Uc_hat_xr2[:, :, -1] = xy_plane 717 | 718 | # Communicate and transform to final y-direction 719 | #self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_xr2, self.mpitype]) 720 | #Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_xr2, self.P2) 721 | # Not in-place 722 | Uc_hat_xr3 = self.work_arrays[((N[0], N2[1], N1f), self.complex, 3)] 723 | self.comm1.Alltoall([Uc_hat_xr2, self.mpitype], [Uc_hat_xr3, self.mpitype]) 724 | Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_xr3, self.P2) 725 | 726 | # Do fft for last direction 727 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads, 728 | planner_effort=self.planner_effort['fft']) 729 | 730 | elif self.communication == 'Alltoallw': 731 | if len(self._subarrays1A) == 0: 732 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 733 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 734 | 735 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0)] 736 | 737 | # Do fft in z direction on owned data 738 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 739 | planner_effort=self.planner_effort['rfft']) 740 | 741 | self.comm0.Alltoallw( 742 | [Uc_hat_z, self._counts_displs2, self._subarrays2B], 743 | [Uc_hat_x, self._counts_displs2, self._subarrays2A]) 744 | 745 | Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads, 746 | planner_effort=self.planner_effort['fft']) 747 | 748 | self.comm1.Alltoallw( 749 | [Uc_hat_x, self._counts_displs1, self._subarrays1B], 750 | [Uc_hat_y, self._counts_displs1, self._subarrays1A]) 751 | 752 | # Do fft for last direction 753 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads, 754 | planner_effort=self.planner_effort['fft']) 755 | 756 | return fu 757 | 758 | else: # padded 759 | 760 | assert u.shape == self.real_shape_padded() 761 | 762 | padsize = self.padsize 763 | Uc_pad_hat_y = self.work_arrays[((N2[0], int(padsize*N[1]), N1f), self.complex, 0)] 764 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)] 765 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)] 766 | 767 | if self.communication == 'AlltoallN': 768 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 769 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 770 | 771 | # Do fft in z direction on owned data 772 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 773 | planner_effort=self.planner_effort['rfft']) 774 | 775 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 776 | 777 | # Transform to x direction neglecting k=N//2 (Nyquist) 778 | Uc_pad_hat_xy = transform_Uc_xz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P1) 779 | 780 | # Communicate and do fft in x-direction 781 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 782 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads, 783 | planner_effort=self.planner_effort['fft']) 784 | 785 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x) 786 | 787 | # Communicate and transform to final y-direction 788 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 789 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_x, 
self.P2) 790 | 791 | # Do fft for last direction 792 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads, 793 | planner_effort=self.planner_effort['fft']) 794 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu) 795 | fu /= padsize**3 796 | 797 | elif self.communication == 'Alltoall': 798 | 799 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 800 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 801 | xy_pad_plane = self.work_arrays[((N[0], int(padsize*N2[1])), self.complex, 0)] 802 | xy_pad_plane2= self.work_arrays[((N[0]//2+1, int(padsize*N2[1])), self.complex, 0)] 803 | Uc_pad_hat_xr2 = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 804 | 805 | # Do fft in z direction on owned data 806 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 807 | planner_effort=self.planner_effort['rfft']) 808 | 809 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 810 | 811 | # Move real part of Nyquist to k=0 812 | Uc_pad_hat_z[:, :, 0] += 1j*Uc_pad_hat_z[:, :, -1] 813 | 814 | # Transform to x direction neglecting k=N//2 (Nyquist) 815 | Uc_pad_hat_xy[:] = transform_Uc_xz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P1) 816 | 817 | # Communicate and do fft in x-direction 818 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 819 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads, 820 | planner_effort=self.planner_effort['fft']) 821 | 822 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x) 823 | 824 | Uc_pad_hat_xr2[:, :, :N1[2]//2] = Uc_pad_hat_x[:] 825 | 826 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 827 | if self.comm0_rank == 0: 828 | N = self.N[0] 829 | xy_pad_plane[:] = Uc_pad_hat_x[:, :, 0] 830 | xy_pad_plane2[:] = np.vstack((xy_pad_plane[0].real, 0.5*(xy_pad_plane[1:N//2]+np.conj(xy_pad_plane[:N//2:-1])), xy_pad_plane[N//2].real)) 831 | Uc_pad_hat_xr2[:, :, 0] = np.vstack((xy_pad_plane2, np.conj(xy_pad_plane2[(N//2-1):0:-1]))) 832 | xy_pad_plane2[:] = np.vstack((xy_pad_plane[0].imag, -0.5*1j*(xy_pad_plane[1:N//2]-np.conj(xy_pad_plane[:N//2:-1])), xy_pad_plane[N//2].imag)) 833 | xy_pad_plane[:] = np.vstack((xy_pad_plane2, np.conj(xy_pad_plane2[(N//2-1):0:-1]))) 834 | self.comm0.Send([xy_pad_plane, self.mpitype], dest=self.P1-1, tag=77) 835 | 836 | if self.comm0_rank == self.P1-1: 837 | self.comm0.Recv([xy_pad_plane, self.mpitype], source=0, tag=77) 838 | Uc_pad_hat_xr2[:, :, -1] = xy_pad_plane 839 | 840 | # Communicate and transform to final y-direction 841 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xr2, self.mpitype]) 842 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_xr2, self.P2) 843 | 844 | # Do fft for last direction 845 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads, 846 | planner_effort=self.planner_effort['fft']) 847 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu) 848 | fu /= padsize**3 849 | 850 | elif self.communication == 'Alltoallw': 851 | if len(self._subarrays1A_pad) == 0: 852 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 853 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 854 | 855 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)] 856 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 857 | 858 | # Do fft in z direction on owned data 859 | 
Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 860 | planner_effort=self.planner_effort['rfft']) 861 | 862 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 863 | 864 | self.comm0.Alltoallw( 865 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad], 866 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad]) 867 | 868 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads, 869 | planner_effort=self.planner_effort['fft']) 870 | 871 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x) 872 | 873 | self.comm1.Alltoallw( 874 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1B_pad], 875 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1A_pad]) 876 | 877 | # Do fft for last direction 878 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads, 879 | planner_effort=self.planner_effort['fft']) 880 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu) 881 | fu /= padsize**3 882 | 883 | return fu 884 | 885 | class R2CX(R2CY): 886 | """Class for performing FFT in 3D using MPI 887 | 888 | Pencil decomposition 889 | 890 | Args: 891 | N - NumPy array([Nx, Ny, Nz]) setting the dimensions of the real mesh 892 | L - NumPy array([Lx, Ly, Lz]) setting the actual size of the computational domain 893 | MPI - The MPI object (from mpi4py import MPI) 894 | precision - "single" or "double" 895 | communication - Communication scheme. ('AlltoallN', 'Alltoall' or 'Alltoallw') 896 | padsize - The size of padding, if padding is used in transforms 897 | threads - Number of threads used by FFTs 898 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 899 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 900 | 901 | This version has the final complex data aligned in the x-direction 902 | """ 903 | def __init__(self, N, L, comm, precision, P1=None, communication='Alltoall', 904 | padsize=1.5, threads=1, 905 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 906 | R2CY.__init__(self, N, L, comm, precision, P1=P1, communication=communication, 907 | padsize=padsize, threads=threads, planner_effort=planner_effort) 908 | self.N2f = self.N2[2]//2 if self.comm1_rank < self.P2-1 else self.N2[2]//2+1 909 | if self.communication == 'AlltoallN': 910 | self.N2f = self.N2[2]//2 911 | if self.communication == 'Alltoallw': 912 | q = _subsize(self.Nf, self.P2, self.comm1_rank) 913 | self.N2f = q 914 | 915 | def real_shape(self): 916 | """The local shape of the real data""" 917 | return (self.N1[0], self.N2[1], self.N[2]) 918 | 919 | def complex_shape(self): 920 | """The local shape of the complex data""" 921 | return (self.N[0], self.N1[1], self.N2f) 922 | 923 | def complex_shape_T(self): 924 | """The local transposed shape of the complex data""" 925 | return (self.Np[0], self.N[1], self.Nf) 926 | 927 | def complex_shape_I(self): 928 | """A local intermediate shape of the complex data""" 929 | return (self.Np[0], self.num_processes, self.Np[1], self.Nf) 930 | 931 | def real_local_slice(self, padsize=1): 932 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 933 | yzrank = self.comm1.Get_rank() # Local rank in xy-plane 934 | return (slice(int(padsize * xyrank * self.N1[0]), int(padsize * (xyrank+1) * self.N1[0]), 1), 935 | slice(int(padsize * yzrank * self.N2[1]), int(padsize * (yzrank+1) * self.N2[1]), 1), 936 | slice(0, int(padsize * self.N[2]))) 937 | 938 | def complex_local_slice(self): 939 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 
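        # The x-aligned pencils keep the full x-range locally in spectral space;
        # y is split over comm0 in blocks of N1[1], and z is split over comm1,
        # with self.N2f absorbing the extra Nyquist wavenumber that some of the
        # communication schemes place on the last rank.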
940 | yzrank = self.comm1.Get_rank() # Local rank in yz-plane 941 | return (slice(0, self.N[0]), 942 | slice(xyrank*self.N1[1], (xyrank+1)*self.N1[1], 1), 943 | slice(yzrank*self.N2[2]//2, yzrank*self.N2[2]//2 + self.N2f, 1)) 944 | 945 | def get_local_mesh(self): 946 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 947 | yzrank = self.comm1.Get_rank() # Local rank in xy-plane 948 | 949 | # Create the physical mesh 950 | x1 = slice(xyrank * self.N1[0], (xyrank+1) * self.N1[0], 1) 951 | x2 = slice(yzrank * self.N2[1], (yzrank+1) * self.N2[1], 1) 952 | X = np.mgrid[x1, x2, :self.N[2]].astype(self.float) 953 | X[0] *= self.L[0]/self.N[0] 954 | X[1] *= self.L[1]/self.N[1] 955 | X[2] *= self.L[2]/self.N[2] 956 | return X 957 | 958 | def get_local_wavenumbermesh(self): 959 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 960 | yzrank = self.comm1.Get_rank() # Local rank in yz-plane 961 | 962 | # Set wavenumbers in grid 963 | kx = fftfreq(self.N[0], 1./self.N[0]).astype(int) 964 | ky = fftfreq(self.N[1], 1./self.N[1]).astype(int) 965 | kz = fftfreq(self.N[2], 1./self.N[2]).astype(int) 966 | k2 = slice(xyrank*self.N1[1], (xyrank+1)*self.N1[1], 1) 967 | k1 = slice(yzrank*self.N2[2]//2, (yzrank+1)*self.N2[2]//2, 1) 968 | K = np.array(np.meshgrid(kx, ky[k2], kz[k1], indexing='ij'), dtype=self.float) 969 | return K 970 | 971 | def get_subarrays(self, padsize=1): 972 | datatype = MPI._typedict[np.dtype(self.complex).char] 973 | M, N, Q = self.N[0], self.N[1], self.Nf 974 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank) 975 | n = _subsize(N, self.P1, self.comm0_rank) 976 | q = _subsize(Q, self.P2, self.comm1_rank) 977 | _subarrays1A = [ 978 | datatype.Create_subarray([int(padsize*M),n,q], [l,n,q], [s,0,0]).Commit() 979 | for l, s in _distribution(int(padsize*M), self.P1) 980 | ] 981 | _subarrays1B = [ 982 | datatype.Create_subarray([m,N,q], [m,l,q], [0,s,0]).Commit() 983 | for l, s in _distribution(N, self.P1) 984 | ] 985 | _counts_displs1 = ([1] * self.P1, [0] * self.P1) 986 | 987 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank) 988 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank) 989 | q = _subsize(Q, self.P2, self.comm1_rank) 990 | _subarrays2A = [ 991 | datatype.Create_subarray([m,int(padsize*N),q], [m,l,q], [0,s,0]).Commit() 992 | for l, s in _distribution(int(padsize*N), self.P2) 993 | ] 994 | _subarrays2B = [ 995 | datatype.Create_subarray([m,n,Q], [m,n,l], [0,0,s]).Commit() 996 | for l, s in _distribution(Q, self.P2) 997 | ] 998 | _counts_displs2 = ([1] * self.P2, [0] * self.P2) 999 | return _subarrays1A, _subarrays1B, _subarrays2A, _subarrays2B, _counts_displs1, _counts_displs2 1000 | 1001 | def ifftn(self, fu, u, dealias=None): 1002 | """ifft in three directions using mpi 1003 | 1004 | Need to do ifft in reversed order of fft 1005 | """ 1006 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 1007 | 1008 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 1009 | self.dealias = self.get_dealias_filter() 1010 | 1011 | if not dealias == '3/2-rule': 1012 | 1013 | fu_ = fu 1014 | if dealias == '2/3-rule': 1015 | fu_ = self.work_arrays[(fu, 0, False)] 1016 | fu_[:] = fu 1017 | fu_ = dealias_filter(fu_, self.dealias) 1018 | #fu_ *= self.dealias 1019 | 1020 | # Intermediate work arrays required for transform 1021 | Uc_hat_z = self.work_arrays[((self.N1[0], self.N2[1], self.Nf), self.complex, 0)] 1022 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)] 1023 | 1024 | if self.communication == 'AlltoallN': 1025 | 
Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1026 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1027 | 1028 | # Do first owned direction 1029 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads, 1030 | planner_effort=self.planner_effort['ifft']) 1031 | 1032 | # Communicate in xz-plane and do fft in y-direction 1033 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 1034 | 1035 | # Transform to y all but k=N//2 (the neglected Nyquist mode) 1036 | Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_x, self.P1) 1037 | Uc_hat_y[:] = ifft(Uc_hat_y, axis=1, threads=self.threads, 1038 | planner_effort=self.planner_effort['ifft']) 1039 | 1040 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 1041 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1042 | Uc_hat_z[:] = transform_Uc_zy(Uc_hat_z, Uc_hat_y, self.P2) 1043 | 1044 | # Do ifft for z-direction 1045 | Uc_hat_z[:, :, -1] = 0 1046 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 1047 | planner_effort=self.planner_effort['irfft']) 1048 | 1049 | elif self.communication == 'Alltoall': 1050 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1051 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1052 | Uc_hat_y2 = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1053 | xy_plane_T = self.work_arrays[((self.N[1], self.N1[0]), self.complex, 0)] 1054 | xy_plane = xy_plane_T.transpose((1, 0)) 1055 | xy_recv = self.work_arrays[((self.N2[1], self.N1[0]), self.complex, 0)] 1056 | 1057 | # Do first owned direction 1058 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads, 1059 | planner_effort=self.planner_effort['ifft']) 1060 | 1061 | # Communicate in xz-plane and do fft in y-direction 1062 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 1063 | 1064 | # Transform to y all but k=N//2 (the neglected Nyquist mode) 1065 | Uc_hat_y2 = transform_Uc_yx(Uc_hat_y2, Uc_hat_x, self.P1) 1066 | Uc_hat_y2[:] = ifft(Uc_hat_y2, axis=1, threads=self.threads, 1067 | planner_effort=self.planner_effort['ifft']) 1068 | xy_plane[:] = Uc_hat_y2[:, :, -1] 1069 | 1070 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 
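                # Uc_hat_y is a transposed view of the contiguous buffer Uc_hat_y_T,
                # so the assignment below lays the data out with y as the first axis
                # of Uc_hat_y_T. The in-place Alltoall over comm1 then exchanges equal
                # contiguous blocks along that axis, and transform_Uc_zy gathers the
                # result into z-pencils. The Nyquist plane stored in xy_plane above is
                # redistributed separately by the Scatter a few lines further down.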
1071 | Uc_hat_y[:] = Uc_hat_y2[:, :, :self.N2[2]//2] 1072 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1073 | Uc_hat_z = transform_Uc_zy(Uc_hat_z, Uc_hat_y, self.P2) 1074 | 1075 | self.comm1.Scatter(xy_plane_T, xy_recv, root=self.P2-1) 1076 | Uc_hat_z[:, :, -1] = xy_recv.transpose((1, 0)) 1077 | 1078 | # Do ifft for z-direction 1079 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 1080 | planner_effort=self.planner_effort['irfft']) 1081 | 1082 | elif self.communication == 'Alltoallw': 1083 | if len(self._subarrays1A) == 0: 1084 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 1085 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 1086 | 1087 | Uc_hat_y = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1088 | 1089 | # Do first owned direction 1090 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads, 1091 | planner_effort=self.planner_effort['ifft']) 1092 | 1093 | self.comm0.Alltoallw( 1094 | [Uc_hat_x, self._counts_displs1, self._subarrays1A], 1095 | [Uc_hat_y, self._counts_displs1, self._subarrays1B]) 1096 | 1097 | Uc_hat_y[:] = ifft(Uc_hat_y, axis=1, threads=self.threads, 1098 | planner_effort=self.planner_effort['ifft']) 1099 | 1100 | self.comm1.Alltoallw( 1101 | [Uc_hat_y, self._counts_displs2, self._subarrays2A], 1102 | [Uc_hat_z, self._counts_displs2, self._subarrays2B]) 1103 | # Do ifft for z-direction 1104 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 1105 | planner_effort=self.planner_effort['irfft']) 1106 | 1107 | else: 1108 | # Intermediate work arrays required for transform 1109 | Uc_pad_hat_z = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), self.Nf), self.complex, 0)] 1110 | Uc_pad_hat_z2 = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), int(self.padsize*self.N[2]//2)+1), self.complex, 0)] 1111 | Uc_pad_hat_x = self.work_arrays[((int(self.padsize*self.N[0]), self.N1[1], self.N2f), self.complex, 0)] 1112 | 1113 | if self.communication == 'AlltoallN': 1114 | Uc_pad_hat_y_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1115 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1116 | Uc_pad_hat_xy_T= self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1117 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1118 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2[2]//2), self.complex, 0)] 1119 | 1120 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x) 1121 | 1122 | # Do first owned direction 1123 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads, 1124 | planner_effort=self.planner_effort['ifft']) 1125 | 1126 | # Communicate in xz-plane and do fft in y-direction 1127 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 1128 | 1129 | # Transform to y 1130 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_x, self.P1) 1131 | Uc_pad_hat_xy2 = self.copy_to_padded_y(Uc_pad_hat_y, Uc_pad_hat_xy2) 1132 | 1133 | Uc_pad_hat_xy = ifft(Uc_pad_hat_xy2, Uc_pad_hat_xy, overwrite_input=True, axis=1, threads=self.threads, 1134 | planner_effort=self.planner_effort['ifft']) 1135 | 1136 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 
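                # With the 'AlltoallN' scheme the Nyquist plane in z is never
                # communicated; it is simply zeroed below before the final irfft.
                # The factor padsize**3 applied when fu was copied into the padded
                # buffer compensates for the normalization of the larger padded
                # inverse transforms.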
1137 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1138 | Uc_pad_hat_z[:] = transform_Uc_zy(Uc_pad_hat_z, Uc_pad_hat_xy, self.P2) 1139 | Uc_pad_hat_z[:, :, -1] = 0 1140 | 1141 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 1142 | 1143 | # Do ifft for z-direction 1144 | u = irfft(Uc_pad_hat_z2, u, overwrite_input=True, axis=2, threads=self.threads, 1145 | planner_effort=self.planner_effort['irfft']) 1146 | 1147 | elif self.communication == 'Alltoall': 1148 | Uc_pad_hat_y_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1149 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1150 | Uc_pad_hat_xy_T= self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1151 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1152 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2[2]//2), self.complex, 0)] 1153 | Uc_pad_hat_y2_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2f), self.complex, 0)] 1154 | Uc_pad_hat_y2 = Uc_pad_hat_y2_T.transpose((1, 0, 2)) 1155 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2f), self.complex, 0)] 1156 | 1157 | xy_plane_T = self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0])), self.complex, 0)] 1158 | xy_plane = xy_plane_T.transpose((1, 0)) 1159 | xy_recv = self.work_arrays[((int(self.padsize*self.N2[1]), int(self.padsize*self.N1[0])), self.complex, 0)] 1160 | 1161 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x) 1162 | 1163 | # Do first owned direction 1164 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads, 1165 | planner_effort=self.planner_effort['ifft']) 1166 | 1167 | # Communicate in xz-plane and do fft in y-direction 1168 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 1169 | 1170 | # Transform to y 1171 | Uc_pad_hat_y2 = transform_Uc_yx(Uc_pad_hat_y2, Uc_pad_hat_x, self.P1) 1172 | 1173 | Uc_pad_hat_xy2 = self.copy_to_padded_y(Uc_pad_hat_y2, Uc_pad_hat_xy2) 1174 | 1175 | Uc_pad_hat_xy2[:] = ifft(Uc_pad_hat_xy2, axis=1, threads=self.threads, 1176 | planner_effort=self.planner_effort['ifft']) 1177 | xy_plane[:] = Uc_pad_hat_xy2[:, :, -1] 1178 | 1179 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 
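                # Unlike 'AlltoallN', this scheme retains the Nyquist plane in z: it
                # was saved in xy_plane above, and the last rank of comm1 (which owns
                # the extra wavenumber and acts as root of the Scatter below)
                # distributes it so that every rank can restore
                # Uc_pad_hat_z[:, :, -1] before the final irfft on the padded grid.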
1180 | Uc_pad_hat_xy[:] = Uc_pad_hat_xy2[:, :, :self.N2[2]//2] 1181 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1182 | Uc_pad_hat_z = transform_Uc_zy(Uc_pad_hat_z, Uc_pad_hat_xy, self.P2) 1183 | 1184 | self.comm1.Scatter(xy_plane_T, xy_recv, root=self.P2-1) 1185 | Uc_pad_hat_z[:, :, -1] = xy_recv.transpose((1, 0)) 1186 | 1187 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 1188 | 1189 | # Do ifft for z-direction 1190 | u = irfft(Uc_pad_hat_z2, u, axis=2, threads=self.threads, 1191 | planner_effort=self.planner_effort['irfft']) 1192 | 1193 | elif self.communication == 'Alltoallw': 1194 | if len(self._subarrays1A_pad) == 0: 1195 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 1196 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 1197 | 1198 | Uc_pad_hat_y = self.work_arrays[((int(self.padsize*self.N1[0]), self.N[1], self.N2f), self.complex, 0)] 1199 | Uc_pad_hat_xy = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2f), self.complex, 0)] 1200 | 1201 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x) 1202 | 1203 | # Do first owned direction 1204 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads, 1205 | planner_effort=self.planner_effort['ifft']) 1206 | 1207 | self.comm0.Alltoallw( 1208 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1A_pad], 1209 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1B_pad]) 1210 | 1211 | Uc_pad_hat_xy = self.copy_to_padded_y(Uc_pad_hat_y, Uc_pad_hat_xy) 1212 | 1213 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=1, threads=self.threads, 1214 | planner_effort=self.planner_effort['ifft']) 1215 | 1216 | self.comm1.Alltoallw( 1217 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad], 1218 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad]) 1219 | 1220 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 1221 | 1222 | # Do ifft for z-direction 1223 | u = irfft(Uc_pad_hat_z2, u, axis=2, overwrite_input=True, threads=self.threads, 1224 | planner_effort=self.planner_effort['irfft']) 1225 | 1226 | return u 1227 | 1228 | def fftn(self, u, fu, dealias=None): 1229 | """fft in three directions using mpi.""" 1230 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 1231 | 1232 | if not dealias == '3/2-rule': 1233 | 1234 | # Intermediate work arrays required for transform 1235 | Uc_hat_z = self.work_arrays[((self.N1[0], self.N2[1], self.Nf), self.complex, 0)] 1236 | 1237 | if self.communication == 'AlltoallN': 1238 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2[2]//2), self.complex, 0)] 1239 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1240 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1241 | Uc_hat_y2= self.work_arrays[((self.N1[0], self.N[1], self.N2[2]//2), self.complex, 1)] 1242 | 1243 | # Do fft in z direction on owned data 1244 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 1245 | planner_effort=self.planner_effort['rfft']) 1246 | 1247 | # Transform to y direction neglecting k=N//2 (Nyquist) 1248 | Uc_hat_y = transform_Uc_yz(Uc_hat_y, Uc_hat_z, self.P2) 1249 | 1250 | # Communicate and do fft in y-direction. 
Transpose required to put distributed axis first 1251 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1252 | Uc_hat_y2 = fft(Uc_hat_y, Uc_hat_y2, axis=1, threads=self.threads, 1253 | planner_effort=self.planner_effort['fft']) 1254 | 1255 | # Communicate and transform to final x-direction 1256 | Uc_hat_x = transform_Uc_xy(Uc_hat_x, Uc_hat_y2, self.P1) 1257 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 1258 | 1259 | # Do fft for last direction 1260 | fu = fft(Uc_hat_x, fu, axis=0, threads=self.threads, 1261 | planner_effort=self.planner_effort['fft']) 1262 | 1263 | elif self.communication == 'Alltoall': 1264 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2[2]//2), self.complex, 0)] 1265 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1266 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1267 | Uc_hat_y2 = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1268 | Uc_hat_x2 = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)] 1269 | Uc_hat_y3 = self.work_arrays[((self.N1[0], self.N[1], self.N2[2]//2), self.complex, 0)] 1270 | xy_plane_T = self.work_arrays[((self.N[1], self.N1[0]), self.complex, 0)] 1271 | xy_plane = xy_plane_T.transpose((1, 0)) 1272 | xy_plane2 = self.work_arrays[((self.N[1]//2+1, self.N1[0]), self.complex, 0)] 1273 | 1274 | # Do fft in z direction on owned data 1275 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 1276 | planner_effort=self.planner_effort['rfft']) 1277 | 1278 | # Move real part of Nyquist to k=0 1279 | Uc_hat_z[:, :, 0] += 1j*Uc_hat_z[:, :, -1] 1280 | 1281 | # Transform to y direction neglecting k=N//2 (Nyquist) 1282 | Uc_hat_y = transform_Uc_yz(Uc_hat_y, Uc_hat_z, self.P2) 1283 | 1284 | # Communicate and do fft in y-direction. 
Transpose required to put distributed axis first 1285 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1286 | Uc_hat_y3 = fft(Uc_hat_y, Uc_hat_y3, axis=1, threads=self.threads, 1287 | planner_effort=self.planner_effort['fft']) 1288 | Uc_hat_y2[:, :, :self.N2[2]//2] = Uc_hat_y3[:] 1289 | 1290 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 1291 | if self.comm1_rank == 0: 1292 | M = self.N[1] 1293 | xy_plane[:] = Uc_hat_y3[:, :, 0] 1294 | xy_plane2[:] = np.vstack((xy_plane_T[0].real, 0.5*(xy_plane_T[1:M//2]+np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].real)) 1295 | Uc_hat_y2[:, :, 0] = (np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))).transpose((1, 0)) 1296 | xy_plane2[:] = np.vstack((xy_plane_T[0].imag, -0.5*1j*(xy_plane_T[1:M//2]-np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].imag)) 1297 | xy_plane_T[:] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 1298 | self.comm1.Send([xy_plane_T, self.mpitype], dest=self.P2-1, tag=77) 1299 | 1300 | if self.comm1_rank == self.P2-1: 1301 | self.comm1.Recv([xy_plane_T, self.mpitype], source=0, tag=77) 1302 | Uc_hat_y2[:, :, -1] = xy_plane_T.transpose((1, 0)) 1303 | 1304 | # Communicate and transform to final x-direction 1305 | Uc_hat_x2 = transform_Uc_xy(Uc_hat_x2, Uc_hat_y2, self.P1) 1306 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x2, self.mpitype]) 1307 | 1308 | # Do fft for last direction 1309 | fu = fft(Uc_hat_x2, fu, axis=0, threads=self.threads, 1310 | planner_effort=self.planner_effort['fft']) 1311 | 1312 | elif self.communication == 'Alltoallw': 1313 | Uc_hat_y = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1314 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)] 1315 | 1316 | if len(self._subarrays1A) == 0: 1317 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 1318 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 1319 | 1320 | # Do fft in z direction on owned data 1321 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 1322 | planner_effort=self.planner_effort['rfft']) 1323 | 1324 | self.comm1.Alltoallw( 1325 | [Uc_hat_z, self._counts_displs2, self._subarrays2B], 1326 | [Uc_hat_y, self._counts_displs2, self._subarrays2A]) 1327 | Uc_hat_y[:] = fft(Uc_hat_y, axis=1, threads=self.threads, 1328 | planner_effort=self.planner_effort['fft']) 1329 | 1330 | # Communicate and transform to final x-direction 1331 | self.comm0.Alltoallw( 1332 | [Uc_hat_y, self._counts_displs1, self._subarrays1B], 1333 | [Uc_hat_x, self._counts_displs1, self._subarrays1A]) 1334 | 1335 | # Do fft for last direction 1336 | fu = fft(Uc_hat_x, fu, axis=0, threads=self.threads, 1337 | planner_effort=self.planner_effort['fft']) 1338 | 1339 | else: 1340 | 1341 | assert u.shape == self.real_shape_padded() 1342 | padsize = self.padsize 1343 | # Strip off self 1344 | N, N1, N2, Nf, N2f = self.N, self.N1, self.N2, self.Nf, self.N2f 1345 | 1346 | # Intermediate work arrays required for transform 1347 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)] 1348 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)] 1349 | 1350 | if self.communication == 'AlltoallN': 1351 | Uc_pad_hat_x = self.work_arrays[((int(padsize*N[0]), N1[1], N2[2]//2), self.complex, 0)] 1352 | Uc_pad_hat_xy_T= self.work_arrays[((int(padsize*N[1]), int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1353 | Uc_pad_hat_xy = 
Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1354 | Uc_pad_hat_xy2= self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2[2]//2), self.complex, 0)] 1355 | Uc_pad_hat_y_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1356 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1357 | 1358 | # Do fft in z direction on owned data 1359 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 1360 | planner_effort=self.planner_effort['rfft']) 1361 | 1362 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 1363 | 1364 | # Transform to y direction neglecting k=N//2 (Nyquist) 1365 | Uc_pad_hat_xy = transform_Uc_yz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P2) 1366 | 1367 | # Communicate and do fft in y-direction. Transpose required to put distributed axis first 1368 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1369 | Uc_pad_hat_xy2 = fft(Uc_pad_hat_xy, Uc_pad_hat_xy2, axis=1, threads=self.threads, 1370 | planner_effort=self.planner_effort['fft']) 1371 | 1372 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy2, Uc_pad_hat_y) 1373 | 1374 | # Communicate and transform to final x-direction 1375 | Uc_pad_hat_x = transform_Uc_xy(Uc_pad_hat_x, Uc_pad_hat_y, self.P1) 1376 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 1377 | 1378 | # Do fft for last direction 1379 | Uc_pad_hat_x[:] = fft(Uc_pad_hat_x, axis=0, threads=self.threads, 1380 | planner_effort=self.planner_effort['fft']) 1381 | fu = self.copy_from_padded_x(Uc_pad_hat_x, fu) 1382 | fu /= padsize**3 1383 | 1384 | elif self.communication == 'Alltoall': 1385 | Uc_pad_hat_xy_T= self.work_arrays[((int(padsize*N[1]), int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1386 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1387 | Uc_pad_hat_xy2= self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2[2]//2), self.complex, 0)] 1388 | Uc_pad_hat_y_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1389 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1390 | Uc_pad_hat_y2_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2f), self.complex, 0)] 1391 | Uc_pad_hat_y2 = Uc_pad_hat_y2_T.transpose((1, 0, 2)) 1392 | Uc_pad_hat_x2 = self.work_arrays[((int(padsize*N[0]), N1[1], N2f), self.complex, 0)] 1393 | xy_plane_T = self.work_arrays[((self.N[1], int(self.padsize*self.N1[0])), self.complex, 0)] 1394 | xy_plane = xy_plane_T.transpose((1, 0)) 1395 | xy_plane2 = self.work_arrays[((self.N[1]//2+1, int(self.padsize*self.N1[0])), self.complex, 0)] 1396 | 1397 | # Do fft in z direction on owned data 1398 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 1399 | planner_effort=self.planner_effort['rfft']) 1400 | 1401 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 1402 | 1403 | # Move real part of Nyquist to k=0 1404 | Uc_pad_hat_z[:, :, 0] += 1j*Uc_pad_hat_z[:, :, -1] 1405 | 1406 | # Transform to y direction neglecting k=N//2 (Nyquist) 1407 | Uc_pad_hat_xy = transform_Uc_yz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P2) 1408 | 1409 | # Communicate and do fft in y-direction. 
Transpose required to put distributed axis first 1410 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1411 | Uc_pad_hat_xy2 = fft(Uc_pad_hat_xy, Uc_pad_hat_xy2, axis=1, threads=self.threads, 1412 | planner_effort=self.planner_effort['fft']) 1413 | 1414 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy2, Uc_pad_hat_y) 1415 | 1416 | Uc_pad_hat_y2[:, :, :self.N2[2]//2] = Uc_pad_hat_y[:] 1417 | 1418 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 1419 | if self.comm1_rank == 0: 1420 | M = self.N[1] 1421 | xy_plane[:] = Uc_pad_hat_y[:, :, 0] 1422 | xy_plane2[:] = np.vstack((xy_plane_T[0].real, 0.5*(xy_plane_T[1:M//2]+np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].real)) 1423 | Uc_pad_hat_y2[:, :, 0] = (np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))).transpose((1, 0)) 1424 | xy_plane2[:] = np.vstack((xy_plane_T[0].imag, -0.5*1j*(xy_plane_T[1:M//2]-np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].imag)) 1425 | xy_plane_T[:] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 1426 | self.comm1.Send([xy_plane_T, self.mpitype], dest=self.P2-1, tag=77) 1427 | 1428 | if self.comm1_rank == self.P2-1: 1429 | self.comm1.Recv([xy_plane_T, self.mpitype], source=0, tag=77) 1430 | Uc_pad_hat_y2[:, :, -1] = xy_plane_T.transpose((1, 0)) 1431 | 1432 | # Communicate and transform to final x-direction 1433 | Uc_pad_hat_x2 = transform_Uc_xy(Uc_pad_hat_x2, Uc_pad_hat_y2, self.P1) 1434 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x2, self.mpitype]) 1435 | 1436 | # Do fft for last direction 1437 | Uc_pad_hat_x2[:] = fft(Uc_pad_hat_x2, axis=0, threads=self.threads, 1438 | planner_effort=self.planner_effort['fft']) 1439 | fu = self.copy_from_padded_x(Uc_pad_hat_x2, fu) 1440 | fu /= padsize**3 1441 | 1442 | elif self.communication == 'Alltoallw': 1443 | Uc_pad_hat_y = self.work_arrays[((int(padsize*N1[0]), N[1], N2f), self.complex, 0)] 1444 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2f), self.complex, 0)] 1445 | Uc_pad_hat_x = self.work_arrays[((int(padsize*N[0]), N1[1], N2f), self.complex, 0)] 1446 | 1447 | if len(self._subarrays1A_pad) == 0: 1448 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 1449 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 1450 | 1451 | # Do fft in z direction on owned data 1452 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 1453 | planner_effort=self.planner_effort['rfft']) 1454 | 1455 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 1456 | 1457 | self.comm1.Alltoallw( 1458 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad], 1459 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad]) 1460 | 1461 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=1, threads=self.threads, 1462 | planner_effort=self.planner_effort['fft']) 1463 | 1464 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy, Uc_pad_hat_y) 1465 | 1466 | # Communicate and transform to final x-direction 1467 | self.comm0.Alltoallw( 1468 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1B_pad], 1469 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1A_pad]) 1470 | 1471 | # Do fft for last direction 1472 | Uc_pad_hat_x[:] = fft(Uc_pad_hat_x, axis=0, threads=self.threads, 1473 | planner_effort=self.planner_effort['fft']) 1474 | fu = self.copy_from_padded_x(Uc_pad_hat_x, fu) 1475 | fu /= padsize**3 1476 | 1477 | return fu 1478 | 1479 | def R2C(N, L, comm, precision, P1=None, 
communication="Alltoall", padsize=1.5, threads=1, 1480 | alignment="X", planner_effort=defaultdict(lambda : "FFTW_MEASURE")): 1481 | if alignment == 'X': 1482 | return R2CX(N, L, comm, precision, P1, communication, padsize, threads, planner_effort) 1483 | else: 1484 | return R2CY(N, L, comm, precision, P1, communication, padsize, threads, planner_effort) 1485 | -------------------------------------------------------------------------------- /mpiFFT4py/serialFFT/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | #assert False 3 | from .pyfftw_fft import * 4 | 5 | except: 6 | from .numpy_fft import * 7 | -------------------------------------------------------------------------------- /mpiFFT4py/serialFFT/numpy_fft.py: -------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-02-16" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | __all__ = ['dct', 'fft', 'ifft', 'fft2', 'ifft2', 'fftn', 'ifftn', 7 | 'rfft', 'irfft', 'rfft2', 'irfft2', 'rfftn', 'irfftn'] 8 | 9 | from numpy import iscomplexobj 10 | import numpy.fft 11 | from scipy.fftpack import dct 12 | 13 | dct1 = dct 14 | def dct(a, b, type=2, axis=0, **kw): 15 | if iscomplexobj(a): 16 | b.real[:] = dct1(a.real, type=type, axis=axis) 17 | b.imag[:] = dct1(a.imag, type=type, axis=axis) 18 | return b 19 | 20 | else: 21 | b[:] = dct1(a, type=type, axis=axis) 22 | return b 23 | 24 | # Define functions taking both input array and output array 25 | def fft(a, b=None, axis=0, threads=1, **kw): 26 | if b is None: 27 | return numpy.fft.fft(a, axis=axis) 28 | else: 29 | b[:] = numpy.fft.fft(a, axis=axis) 30 | return b 31 | 32 | def ifft(a, b=None, axis=0, threads=1, **kw): 33 | if b is None: 34 | return numpy.fft.ifft(a, axis=axis) 35 | else: 36 | b[:] = numpy.fft.ifft(a, axis=axis) 37 | return b 38 | 39 | def rfft(a, b=None, axis=0, threads=1, **kw): 40 | if b is None: 41 | return numpy.fft.rfft(a, axis=axis) 42 | else: 43 | b[:] = numpy.fft.rfft(a, axis=axis) 44 | return b 45 | 46 | def irfft(a, b=None, axis=0, threads=1, **kw): 47 | if b is None: 48 | return numpy.fft.irfft(a, axis=axis) 49 | else: 50 | b[:] = numpy.fft.irfft(a, axis=axis) 51 | return b 52 | 53 | def fft2(a, b=None, axes=(0, 1), threads=1, **kw): 54 | if b is None: 55 | return numpy.fft.fft2(a, axes=axes) 56 | else: 57 | b[:] = numpy.fft.fft2(a, axes=axes) 58 | return b 59 | 60 | def ifft2(a, b=None, axes=(0, 1), threads=1, **kw): 61 | if b is None: 62 | return numpy.fft.ifft2(a, axes=axes) 63 | else: 64 | b[:] = numpy.fft.ifft2(a, axes=axes) 65 | return b 66 | 67 | def rfft2(a, b=None, axes=(0, 1), threads=1, **kw): 68 | if b is None: 69 | return numpy.fft.rfft2(a, axes=axes) 70 | else: 71 | b[:] = numpy.fft.rfft2(a, axes=axes) 72 | return b 73 | 74 | def irfft2(a, b=None, axes=(0, 1), threads=1, **kw): 75 | if b is None: 76 | return numpy.fft.irfft2(a, axes=axes) 77 | else: 78 | b[:] = numpy.fft.irfft2(a, axes=axes) 79 | return b 80 | 81 | def fftn(a, b=None, axes=(0, 1, 2), threads=1, **kw): 82 | if b is None: 83 | return numpy.fft.fftn(a, axes=axes) 84 | else: 85 | b[:] = numpy.fft.fftn(a, axes=axes) 86 | return b 87 | 88 | def ifftn(a, b=None, axes=(0, 1, 2), threads=1, **kw): 89 | if b is None: 90 | return numpy.fft.ifftn(a, axes=axes) 91 | else: 92 | b[:] = numpy.fft.ifftn(a, axes=axes) 93 | return b 94 | 95 | def rfftn(a, b=None, axes=(0, 1, 2), threads=1, 
**kw): 96 | if b is None: 97 | return numpy.fft.rfftn(a, axes=axes) 98 | else: 99 | b[:] = numpy.fft.rfftn(a, axes=axes) 100 | return b 101 | 102 | def irfftn(a, b=None, axes=(0, 1, 2), threads=1, **kw): 103 | if b is None: 104 | return numpy.fft.irfftn(a, axes=axes) 105 | else: 106 | b[:] = numpy.fft.irfftn(a, axes=axes) 107 | return b 108 | -------------------------------------------------------------------------------- /mpiFFT4py/serialFFT/pyfftw_fft.py: -------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-02-16" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | __all__ = ['dct', 'fft', 'ifft', 'fft2', 'ifft2', 'fftn', 'ifftn', 7 | 'rfft', 'irfft', 'rfft2', 'irfft2', 'rfftn', 'irfftn'] 8 | 9 | import pyfftw 10 | from numpy import iscomplexobj 11 | 12 | dct_object = {} 13 | fft_object = {} 14 | ifft_object = {} 15 | fft2_object = {} 16 | ifft2_object = {} 17 | fftn_object = {} 18 | ifftn_object = {} 19 | irfft_object = {} 20 | irfftn_object = {} 21 | irfft2_object = {} 22 | rfft2_object = {} 23 | rfft_object = {} 24 | rfftn_object = {} 25 | 26 | def ifft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 27 | global ifft_object 28 | if not (a.shape, a.dtype, overwrite_input, axis) in ifft_object: 29 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.ifft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 30 | if not b is None: 31 | if b.flags['C_CONTIGUOUS'] is True: 32 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b) 33 | else: 34 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 35 | b[:] = ifft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 36 | return b 37 | else: 38 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 39 | return ifft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 40 | 41 | def ifft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 42 | global ifft2_object 43 | if not (a.shape, a.dtype, overwrite_input, axes) in ifft2_object: 44 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.ifft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 45 | if not b is None: 46 | if b.flags['C_CONTIGUOUS'] is True: 47 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 48 | else: 49 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 50 | b[:] = ifft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 51 | return b 52 | else: 53 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 54 | return ifft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 55 | 56 | def ifftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 57 | global ifftn_object 58 | if not (a.shape, a.dtype, overwrite_input, axes) in ifftn_object: 59 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.ifftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 60 | if not b is None: 61 | if b.flags['C_CONTIGUOUS'] is True: 62 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 63 | else: 64 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 65 | b[:] = ifftn_object[(a.shape, a.dtype, overwrite_input, 
axes)].output_array 66 | return b 67 | else: 68 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 69 | return ifftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 70 | 71 | def irfft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 72 | global irfft_object 73 | if not (a.shape, a.dtype, axis) in irfft_object: 74 | irfft_object[(a.shape, a.dtype, axis)] = pyfftw.builders.irfft(a, axis=axis, threads=threads, planner_effort=planner_effort) 75 | if overwrite_input: 76 | irfft_object[(a.shape, a.dtype, axis)](a) 77 | else: 78 | irfft_object[(a.shape, a.dtype, axis)](a.copy()) 79 | if not b is None: 80 | b[:] = irfft_object[(a.shape, a.dtype, axis)].output_array 81 | return b 82 | else: 83 | return irfft_object[(a.shape, a.dtype, axis)].output_array 84 | 85 | def irfft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 86 | global irfft2_object 87 | if not (a.shape, a.dtype, axes) in irfft2_object: 88 | irfft2_object[(a.shape, a.dtype, axes)] = pyfftw.builders.irfft2(a, axes=axes, threads=threads, planner_effort=planner_effort) 89 | # Copy required for irfft2 because input is destroyed 90 | if overwrite_input: 91 | irfft2_object[(a.shape, a.dtype, axes)](a) 92 | else: 93 | irfft2_object[(a.shape, a.dtype, axes)](a.copy()) 94 | if not b is None: 95 | b[:] = irfft2_object[(a.shape, a.dtype, axes)].output_array 96 | return b 97 | else: 98 | return irfft2_object[(a.shape, a.dtype, axes)].output_array 99 | 100 | def irfftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 101 | global irfftn_object 102 | if not (a.shape, a.dtype, axes) in irfftn_object: 103 | irfftn_object[(a.shape, a.dtype, axes)] = pyfftw.builders.irfftn(a, axes=axes, threads=threads, planner_effort=planner_effort) 104 | # Copy required because input is always destroyed 105 | if overwrite_input: 106 | irfftn_object[(a.shape, a.dtype, axes)](a) 107 | else: 108 | irfftn_object[(a.shape, a.dtype, axes)](a.copy()) 109 | if not b is None: 110 | b[:] = irfftn_object[(a.shape, a.dtype, axes)].output_array 111 | return b 112 | else: 113 | return irfftn_object[(a.shape, a.dtype, axes)].output_array 114 | 115 | def fft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 116 | global fft_object 117 | if not (a.shape, a.dtype, overwrite_input, axis) in fft_object: 118 | fft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.fft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 119 | if not b is None: 120 | if b.flags['C_CONTIGUOUS'] is True: 121 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b) 122 | else: 123 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 124 | b[:] = fft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 125 | return b 126 | else: 127 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 128 | return fft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 129 | 130 | def fft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 131 | global fft2_object 132 | if not (a.shape, a.dtype, overwrite_input, axes) in fft2_object: 133 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.fft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 134 | if not b is None: 135 | if b.flags['C_CONTIGUOUS'] is True: 136 | fft2_object[(a.shape, a.dtype, 
overwrite_input, axes)](a, b) 137 | else: 138 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 139 | b[:] = fft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 140 | return b 141 | else: 142 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 143 | return fft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 144 | 145 | def fftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 146 | global fftn_object 147 | if not (a.shape, a.dtype, overwrite_input, axes) in fftn_object: 148 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.fftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 149 | if not b is None: 150 | if b.flags['C_CONTIGUOUS'] is True: 151 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 152 | else: 153 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 154 | b[:] = fftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 155 | return b 156 | else: 157 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 158 | return fftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 159 | 160 | def rfft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 161 | global rfft_object 162 | if not (a.shape, a.dtype, overwrite_input, axis) in rfft_object: 163 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.rfft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 164 | if not b is None: 165 | if b.flags['C_CONTIGUOUS'] is True: 166 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b) 167 | else: 168 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 169 | b[:] = rfft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 170 | return b 171 | else: 172 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 173 | return rfft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 174 | 175 | def rfft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 176 | global rfft2_object 177 | if not (a.shape, a.dtype, overwrite_input, axes) in rfft2_object: 178 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.rfft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 179 | if not b is None: 180 | if b.flags['C_CONTIGUOUS'] is True: 181 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 182 | else: 183 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 184 | b[:] = rfft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 185 | return b 186 | else: 187 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 188 | return rfft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 189 | 190 | def rfftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 191 | global rfftn_object 192 | if not (a.shape, a.dtype, overwrite_input, axes) in rfftn_object: 193 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.rfftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 194 | if not b is None: 195 | if b.flags['C_CONTIGUOUS'] is True: 196 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 197 | else: 198 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 199 | b[:] = rfftn_object[(a.shape, a.dtype, 
overwrite_input, axes)].output_array 200 | return b 201 | else: 202 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 203 | return rfftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 204 | 205 | if hasattr(pyfftw.builders, "dct"): 206 | #@profile 207 | def dct(a, b, type=2, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_EXHAUSTIVE"): 208 | global dct_object 209 | key = (a.shape, a.dtype, overwrite_input, axis, type) 210 | if not key in dct_object: 211 | if iscomplexobj(a): 212 | ac = a.real.copy() 213 | else: 214 | ac = a 215 | dct_object[key] = pyfftw.builders.dct(ac, axis=axis, type=type, 216 | overwrite_input=overwrite_input, 217 | threads=threads, 218 | planner_effort=planner_effort) 219 | 220 | dobj = dct_object[key] 221 | c = dobj.get_output_array() 222 | if iscomplexobj(a): 223 | dobj(a.real, c) 224 | b.real[:] = c 225 | dobj(a.imag, c) 226 | b.imag[:] = c 227 | 228 | else: 229 | dobj(a) 230 | b[:] = c 231 | return b 232 | 233 | else: 234 | dct1 = pyfftw.interfaces.scipy_fftpack.dct 235 | #@profile 236 | def dct(a, b, type=2, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 237 | if iscomplexobj(a): 238 | b.real[:] = dct1(a.real, type=type, axis=axis) 239 | b.imag[:] = dct1(a.imag, type=type, axis=axis) 240 | return b 241 | 242 | else: 243 | b[:] = dct1(a, type=type, axis=axis) 244 | return b 245 | 246 | 247 | #def fft(a, b=None, axis=0): 248 | #if b is None: 249 | #b = nfft.fft(a, axis=axis) 250 | #else: 251 | #b[:] = nfft.fft(a, axis=axis) 252 | #return b 253 | 254 | #def ifft(a, b=None, axis=0): 255 | #if b is None: 256 | #b = nfft.ifft(a, axis=axis) 257 | #else: 258 | #b[:] = nfft.ifft(a, axis=axis) 259 | #return b 260 | 261 | #def rfft(a, b, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 262 | #b[:] = nfft.rfft(a, axis=axis, overwrite_input=overwrite_input) 263 | #return b 264 | 265 | #def irfft(a, b, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 266 | #b[:] = nfft.irfft(a, axis=axis, overwrite_input=overwrite_input) 267 | #return b 268 | 269 | #def fft2(a, b=None, axes=(0, 1)): 270 | #if b is None: 271 | #b = nfft.fft2(a, axes=axes) 272 | #else: 273 | #b[:] = nfft.fft2(a, axes=axes) 274 | #return b 275 | 276 | #def ifft2(a, b=None, axes=(0, 1)): 277 | #if b is None: 278 | #b = nfft.ifft2(a, axes=axes) 279 | #else: 280 | #b[:] = nfft.ifft2(a, axes=axes) 281 | #return b 282 | 283 | #def rfft2(a, b, axes=(0, 1), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 284 | #b[:] = nfft.rfft2(a, axes=axes, overwrite_input=overwrite_input) 285 | #return b 286 | 287 | #def irfft2(a, b, axes=(0, 1), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 288 | #b[:] = nfft.irfft2(a, axes=axes, overwrite_input=overwrite_input) 289 | #return b 290 | 291 | #def fftn(a, b=None, axes=(0, 1, 2)): 292 | #if b is None: 293 | #b = nfft.fftn(a, axes=axes) 294 | #else: 295 | #b[:] = nfft.fftn(a, axes=axes) 296 | #return b 297 | 298 | #def ifftn(a, b=None, axes=(0, 1, 2)): 299 | #if b is None: 300 | #b = nfft.ifftn(a, axes=axes) 301 | #else: 302 | #b[:] = nfft.ifftn(a, axes=axes) 303 | #return b 304 | 305 | #def rfftn(a, b, axes=(0, 1, 2), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 306 | #b[:] = nfft.rfftn(a, axes=axes, overwrite_input=overwrite_input) 307 | #return b 308 | 309 | #def irfftn(a, b, axes=(0, 1, 2), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 310 | #b[:] = nfft.irfftn(a, axes=axes, 
overwrite_input=overwrite_input) 311 | #return b 312 | 313 | -------------------------------------------------------------------------------- /mpiFFT4py/slab.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | """Slab decomposition 3 | 4 | This module contains classes for performing FFTs with slab decomposition 5 | of three-dimensional data structures data[Nx, Ny, Nz], where (Nx, Ny, Nz) is 6 | the shape of the input data. With slab decomposition only one of these three 7 | indices is shared, leading to local datastructures on each processor 8 | with shape data[Nx/P, Ny, Nz], where P is the total number of processors. 9 | 10 | classes: 11 | R2C - For real to complex transforms 12 | C2C - For complex to complex transforms 13 | """ 14 | __author__ = "Mikael Mortensen " 15 | __date__ = "2016-02-16" 16 | __copyright__ = "Copyright (C) 2016 " + __author__ 17 | __license__ = "GNU Lesser GPL version 3 or any later version" 18 | 19 | from .serialFFT import * 20 | import numpy as np 21 | from .mpibase import work_arrays, datatypes 22 | from numpy.fft import fftfreq, rfftfreq 23 | from .cython.maths import dealias_filter, transpose_Uc #, transpose_Umpi 24 | from collections import defaultdict 25 | from mpi4py import MPI 26 | 27 | # Using Lisandro Dalcin's code for Alltoallw. 28 | # Note that _subsize and _distribution are only really required for 29 | # general shape meshes. Here we require power two. 30 | 31 | def _subsize(N, size, rank): 32 | return N // size + (N % size > rank) 33 | 34 | def _distribution(N, size): 35 | q = N // size 36 | r = N % size 37 | n = s = i = 0 38 | while i < size: 39 | n = q 40 | s = q * i 41 | if i < r: 42 | n += 1 43 | s += i 44 | else: 45 | s += r 46 | yield n, s 47 | i += 1 48 | 49 | class R2C(object): 50 | """Class for performing FFT in 3D using MPI 51 | 52 | Slab decomposition 53 | 54 | Args: 55 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh 56 | L - NumPy array([Lx, Ly, Lz]) The actual size of the real mesh 57 | comm - The MPI communicator object 58 | precision - "single" or "double" 59 | communication - Method used for communication ('Alltoall', 'Sendrecv_replace', 'Alltoallw') 60 | padsize - Padsize when dealias = 3/2-rule is used 61 | threads - Number of threads used by FFTs 62 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 63 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 64 | 65 | The forward transform is real to complex and the inverse is complex to real 66 | """ 67 | def __init__(self, N, L, comm, precision, 68 | communication="Alltoallw", 69 | padsize=1.5, 70 | threads=1, 71 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 72 | assert len(L) == 3 73 | assert len(N) == 3 74 | self.N = N 75 | self.Nf = N[2]//2+1 # Independent complex wavenumbers in z-direction 76 | self.Nfp = int(padsize*N[2]//2+1) # Independent complex wavenumbers in z-direction for padded array 77 | self.comm = comm 78 | self.float, self.complex, self.mpitype = datatypes(precision) 79 | self.communication = communication 80 | self.num_processes = comm.Get_size() 81 | self.rank = comm.Get_rank() 82 | self.Np = N // self.num_processes 83 | self.L = L.astype(self.float) 84 | self.dealias = np.zeros(0) 85 | self.padsize = padsize 86 | self.threads = threads 87 | self.planner_effort = planner_effort 88 | self.work_arrays = work_arrays() 89 | if not self.num_processes in [2**i for i in range(int(np.log2(N[0]))+1)]: 
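# Example: the power-of-two check above means that with N[0] = 32 the accepted
# values of comm.Get_size() are 1, 2, 4, 8, 16 and 32; any other communicator
# size triggers the IOError raised on the next line.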
90 | raise IOError("Number of cpus must be in ", 91 | [2**i for i in range(int(np.log2(N[0]))+1)]) 92 | self._subarraysA = [] 93 | self._subarraysB = [] 94 | self._counts_displs = 0 95 | self._subarraysA_pad = [] 96 | self._subarraysB_pad = [] 97 | 98 | def real_shape(self): 99 | """The local shape of the real data""" 100 | return (self.Np[0], self.N[1], self.N[2]) 101 | 102 | def complex_shape(self): 103 | """The local shape of the complex data""" 104 | return (self.N[0], self.Np[1], self.Nf) 105 | 106 | def complex_shape_T(self): 107 | """The local transposed shape of the complex data""" 108 | return (self.Np[0], self.N[1], self.Nf) 109 | 110 | def global_real_shape(self): 111 | """Global size of problem in real physical space""" 112 | return (self.N[0], self.N[1], self.N[2]) 113 | 114 | def global_complex_shape(self, padsize=1.): 115 | """Global size of problem in complex wavenumber space""" 116 | return (int(padsize*self.N[0]), int(padsize*self.N[1]), 117 | int(padsize*self.N[2]//2+1)) 118 | 119 | def work_shape(self, dealias): 120 | """Shape of work arrays used in convection with dealiasing. 121 | 122 | Note the different shape whether or not padding is involved. 123 | """ 124 | if dealias == '3/2-rule': 125 | return self.real_shape_padded() 126 | 127 | else: 128 | return self.real_shape() 129 | 130 | def real_local_slice(self, padsize=1): 131 | """Local slice in real space of the input array 132 | 133 | Array can be padded with padsize > 1 134 | """ 135 | return (slice(int(padsize*self.rank*self.Np[0]), 136 | int(padsize*(self.rank+1)*self.Np[0]), 1), 137 | slice(0, int(padsize*self.N[1]), 1), 138 | slice(0, int(padsize*self.N[2]), 1)) 139 | 140 | def complex_local_slice(self): 141 | """Local slice of complex return array""" 142 | return (slice(0, self.N[0], 1), 143 | slice(self.rank*self.Np[1], (self.rank+1)*self.Np[1], 1), 144 | slice(0, self.Nf, 1)) 145 | 146 | def complex_local_wavenumbers(self): 147 | """Returns local wavenumbers of complex space""" 148 | return (fftfreq(self.N[0], 1./self.N[0]).astype(self.float), 149 | fftfreq(self.N[1], 1./self.N[1])[self.complex_local_slice()[1]].astype(self.float), 150 | rfftfreq(self.N[2], 1./self.N[2]).astype(self.float)) 151 | 152 | def get_local_mesh(self): 153 | """Returns the local decomposed physical mesh""" 154 | X = np.ogrid[self.rank*self.Np[0]:(self.rank+1)*self.Np[0], 155 | :self.N[1], :self.N[2]] 156 | X[0] = (X[0]*self.L[0]/self.N[0]).astype(self.float) 157 | X[1] = (X[1]*self.L[1]/self.N[1]).astype(self.float) 158 | X[2] = (X[2]*self.L[2]/self.N[2]).astype(self.float) 159 | X = [np.broadcast_to(x, self.real_shape()) for x in X] 160 | return X 161 | 162 | def get_local_wavenumbermesh(self, scaled=False, broadcast=False, eliminate_highest_freq=False): 163 | """Returns (scaled) local decomposed wavenumbermesh 164 | 165 | If scaled is True, then the wavenumbermesh is scaled with physical mesh 166 | size. This takes care of mapping the physical domain to a computational 167 | cube of size (2pi)**3. 168 | 169 | If eliminate_highest_freq is True, then the Nyquist frequency is set to zero. 
170 | """ 171 | kx, ky, kz = self.complex_local_wavenumbers() 172 | if eliminate_highest_freq: 173 | ky = fftfreq(self.N[1], 1./self.N[1].astype(self.float)) 174 | for i, k in enumerate((kx, ky, kz)): 175 | if self.N[i] % 2 == 0: 176 | k[self.N[i]//2] = 0 177 | ky = ky[self.complex_local_slice()[1]] 178 | 179 | Ks = np.meshgrid(kx, ky, kz, indexing='ij', sparse=True) 180 | for i in range(3): 181 | Ks[i] = Ks[i].astype(self.float) 182 | if scaled: 183 | Lp = 2*np.pi/self.L 184 | for i in range(3): 185 | Ks[i] *= Lp[i] 186 | K = Ks 187 | if broadcast is True: 188 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks] 189 | return K 190 | 191 | def get_dealias_filter(self): 192 | """Filter for dealiasing nonlinear convection""" 193 | K = self.get_local_wavenumbermesh() 194 | kmax = 2./3.*(self.N//2+1) 195 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1])* 196 | (abs(K[2]) < kmax[2]), dtype=np.uint8) 197 | return dealias 198 | 199 | def get_subarrays(self, padsize=1): 200 | """Subarrays for Alltoallw transforms""" 201 | datatype = MPI._typedict[np.dtype(self.complex).char] 202 | _subarraysA = [ 203 | datatype.Create_subarray([int(padsize*self.N[0]), self.Np[1], self.Nf], [l, self.Np[1], self.Nf], [s, 0, 0]).Commit() 204 | for l, s in _distribution(int(padsize*self.N[0]), self.num_processes) 205 | ] 206 | _subarraysB = [ 207 | datatype.Create_subarray([int(padsize*self.Np[0]), self.N[1], self.Nf], [int(padsize*self.Np[0]), l, self.Nf], [0, s, 0]).Commit() 208 | for l, s in _distribution(self.N[1], self.num_processes) 209 | ] 210 | _counts_displs = ([1] * self.num_processes, [0] * self.num_processes) 211 | return _subarraysA, _subarraysB, _counts_displs 212 | 213 | #@profile 214 | def ifftn(self, fu, u, dealias=None): 215 | """ifft in three directions using mpi. 216 | 217 | Need to do ifft in reversed order of fft 218 | 219 | dealias = "3/2-rule" 220 | - Padded transform with 3/2-rule. 
fu is padded with zeros 221 | before transforming to real space of shape real_shape_padded() 222 | - u is of real_shape_padded() 223 | 224 | dealias = "2/3-rule" 225 | - Transform is using 2/3-rule, i.e., frequencies higher than 226 | 2/3*N are set to zero before transforming 227 | - u is of real_shape() 228 | 229 | dealias = None 230 | - Regular transform 231 | - u is of real_shape() 232 | 233 | fu is of complex_shape() 234 | """ 235 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 236 | 237 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 238 | self.dealias = self.get_dealias_filter() 239 | 240 | fu_ = fu 241 | if dealias == '2/3-rule': 242 | fu_ = self.work_arrays[(fu, 0, False)] 243 | fu_[:] = fu 244 | fu_ = dealias_filter(fu_, self.dealias) 245 | #fu_ *= self.dealias 246 | 247 | if self.num_processes == 1: 248 | if not dealias == '3/2-rule': 249 | u = irfftn(fu_, u, axes=(0, 1, 2), threads=self.threads, planner_effort=self.planner_effort['irfftn']) 250 | 251 | else: 252 | assert u.shape == self.real_shape_padded() 253 | 254 | # Scale smallest array with padsize 255 | fu_ = self.work_arrays[(fu, 0, False)] 256 | fu_[:] = fu*self.padsize**3 257 | 258 | # First create padded complex array and then perform irfftn 259 | fu_padded = self.work_arrays[(self.global_complex_shape(padsize=1.5), self.complex, 0)] 260 | fu_padded[:self.N[0]//2, :self.N[1]//2, :self.Nf] = fu_[:self.N[0]//2, :self.N[1]//2] 261 | fu_padded[:self.N[0]//2, -self.N[1]//2:, :self.Nf] = fu_[:self.N[0]//2, self.N[1]//2:] 262 | fu_padded[-self.N[0]//2:, :self.N[1]//2, :self.Nf] = fu_[self.N[0]//2:, :self.N[1]//2] 263 | fu_padded[-self.N[0]//2:, -self.N[1]//2:, :self.Nf] = fu_[self.N[0]//2:, -self.N[1]//2:] 264 | 265 | u[:] = irfftn(fu_padded, overwrite_input=True, 266 | axes=(0, 1, 2), threads=self.threads, 267 | planner_effort=self.planner_effort['irfftn']) 268 | return u 269 | 270 | if not dealias == '3/2-rule': 271 | # Intermediate work arrays required for transform 272 | Uc_hat = self.work_arrays[(self.complex_shape(), self.complex, 0, False)] 273 | 274 | # Do first owned direction 275 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft']) 276 | 277 | if self.communication == 'Alltoall': 278 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 279 | 280 | ## Communicate all values 281 | self.comm.Alltoall([Uc_hat, self.mpitype], [Uc_mpi, self.mpitype]) 282 | #Uc_hatT = np.rollaxis(Uc_mpi, 1).reshape(self.complex_shape_T()) 283 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 284 | Uc_hatT = transpose_Uc(Uc_hatT, Uc_mpi, self.num_processes, self.Np[0], self.Np[1], self.Nf) 285 | 286 | #self.comm.Alltoall(MPI.IN_PLACE, [Uc_hat, self.mpitype]) 287 | #Uc_hatT = np.rollaxis(Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf)), 1).reshape(self.complex_shape_T()) 288 | 289 | elif self.communication == 'Sendrecv_replace': 290 | Uc_send = Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf)) 291 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 292 | for i in xrange(self.num_processes): 293 | if not i == self.rank: 294 | self.comm.Sendrecv_replace([Uc_send[i], self.mpitype], i, 0, i, 0) 295 | Uc_hatT[:, i*self.Np[1]:(i+1)*self.Np[1]] = Uc_send[i] 296 | 297 | elif self.communication == 'Alltoallw': 298 | if len(self._subarraysA) == 0: 299 | self._subarraysA, self._subarraysB, self._counts_displs = self.get_subarrays() 
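            # The committed MPI subarray datatypes let this single Alltoallw call carry
            # out the slab transpose directly: data aligned as (N[0], Np[1], Nf) in Uc_hat
            # is redistributed into the transposed layout (Np[0], N[1], Nf) of Uc_hatT
            # below, so no intermediate Uc_mpi buffer and explicit transpose/reshape is
            # needed as in the 'Alltoall' branch above.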
300 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 301 | self.comm.Alltoallw( 302 | [Uc_hat, self._counts_displs, self._subarraysA], 303 | [Uc_hatT, self._counts_displs, self._subarraysB]) 304 | 305 | # Do last two directions 306 | u = irfft2(Uc_hatT, u, overwrite_input=True, axes=(1, 2), 307 | threads=self.threads, 308 | planner_effort=self.planner_effort['irfft2']) 309 | 310 | else: 311 | assert self.num_processes <= self.N[0]//2, "Number of processors cannot be larger than N[0]//2 for 3/2-rule" 312 | 313 | # Intermediate work arrays required for transform 314 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0)] 315 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0, False)] 316 | Upad_hat2 = self.work_arrays[(self.complex_shape_padded_2(), self.complex, 0)] 317 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0)] 318 | 319 | # Expand in x-direction and perform ifft 320 | Upad_hat = R2C.copy_to_padded(fu*self.padsize**3, Upad_hat, self.N, axis=0) 321 | Upad_hat[:] = ifft(Upad_hat, axis=0, threads=self.threads, 322 | planner_effort=self.planner_effort['ifft']) 323 | 324 | if not self.communication == 'Alltoallw': 325 | # Communicate to distribute first dimension (like Fig. 2b but padded in x-dir) 326 | self.comm.Alltoall(MPI.IN_PLACE, [Upad_hat, self.mpitype]) 327 | Upad_hat1[:] = np.rollaxis(Upad_hat.reshape(self.complex_shape_padded_0_I()), 1).reshape(Upad_hat1.shape) 328 | 329 | else: 330 | if len(self._subarraysA_pad) == 0: 331 | self._subarraysA_pad, self._subarraysB_pad, self._counts_displs = self.get_subarrays(padsize=self.padsize) 332 | self.comm.Alltoallw( 333 | [Upad_hat, self._counts_displs, self._subarraysA_pad], 334 | [Upad_hat1, self._counts_displs, self._subarraysB_pad]) 335 | 336 | # Transpose data and pad in y-direction before doing ifft. Now data is padded in x and y 337 | Upad_hat2 = R2C.copy_to_padded(Upad_hat1, Upad_hat2, self.N, axis=1) 338 | Upad_hat2[:] = ifft(Upad_hat2, axis=1, threads=self.threads, 339 | planner_effort=self.planner_effort['ifft']) 340 | 341 | # pad in z-direction and perform final irfft 342 | Upad_hat3 = R2C.copy_to_padded(Upad_hat2, Upad_hat3, self.N, axis=2) 343 | u[:] = irfft(Upad_hat3, overwrite_input=True, axis=2, threads=self.threads, 344 | planner_effort=self.planner_effort['irfft']) 345 | 346 | return u 347 | 348 | #@profile 349 | def fftn(self, u, fu, dealias=None): 350 | """fft in three directions using mpi 351 | 352 | dealias = "3/2-rule" 353 | - Truncated transform with 3/2-rule. 
The transformed fu is truncated 354 | when copied to complex space of complex_shape() 355 | - fu is of complex_shape() 356 | - u is of real_shape_padded() 357 | 358 | dealias = "2/3-rule" or None 359 | - Regular transform 360 | - fu is of complex_shape() 361 | - u is of real_shape() 362 | 363 | """ 364 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 365 | 366 | if self.num_processes == 1: 367 | if not dealias == '3/2-rule': 368 | assert u.shape == self.real_shape() 369 | fu = rfftn(u, fu, axes=(0, 1, 2), threads=self.threads, 370 | planner_effort=self.planner_effort['rfftn']) 371 | 372 | else: 373 | assert u.shape == self.real_shape_padded() 374 | 375 | fu_padded = self.work_arrays[(self.global_complex_shape(padsize=1.5), 376 | self.complex, 0, False)] 377 | fu_padded = rfftn(u, fu_padded, axes=(0, 1, 2), 378 | planner_effort=self.planner_effort['rfftn']) 379 | 380 | # Copy with truncation 381 | fu.fill(0) 382 | fu[:self.N[0]//2+1, :self.N[1]//2+1] = fu_padded[:self.N[0]//2+1, :self.N[1]//2+1, :self.Nf] 383 | fu[:self.N[0]//2+1, self.N[1]//2:] += fu_padded[:self.N[0]//2+1, -self.N[1]//2:, :self.Nf] 384 | fu[self.N[0]//2:, :self.N[1]//2+1] += fu_padded[-self.N[0]//2:, :self.N[1]//2+1, :self.Nf] 385 | fu[self.N[0]//2:, self.N[1]//2:] += fu_padded[-self.N[0]//2:, -self.N[1]//2:, :self.Nf] 386 | fu /= self.padsize**3 387 | 388 | return fu 389 | 390 | if not dealias == '3/2-rule': 391 | 392 | Uc_hat = self.work_arrays[(fu, 0, False)] 393 | 394 | if self.communication == 'Alltoall': 395 | # Intermediate work arrays required for transform 396 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 397 | U_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 398 | 399 | # Do 2 ffts in y-z directions on owned data 400 | Uc_hatT = rfft2(u, Uc_hatT, axes=(1, 2), threads=self.threads, planner_effort=self.planner_effort['rfft2']) 401 | 402 | #Transform data to align with x-direction 403 | U_mpi[:] = np.rollaxis(Uc_hatT.reshape(self.Np[0], self.num_processes, self.Np[1], self.Nf), 1) 404 | 405 | #Communicate all values 406 | self.comm.Alltoall([U_mpi, self.mpitype], [Uc_hat, self.mpitype]) 407 | 408 | ## Transform data to align with x-direction 409 | #U_mpi = transpose_Umpi(U_mpi, Uc_hatT, self.num_processes, self.Np[0], self.Np[1], self.Nf) 410 | 411 | ## Communicate all values 412 | #self.comm.Alltoall([U_mpi, self.mpitype], [fu, self.mpitype]) 413 | 414 | elif self.communication == 'Sendrecv_replace': 415 | # Communicating intermediate result 416 | ft = Uc_hat.transpose(1, 0, 2) 417 | ft = rfft2(u, ft, axes=(1, 2), threads=self.threads, 418 | planner_effort=self.planner_effort['rfft2']) 419 | fu_send = Uc_hat.reshape((self.num_processes, self.Np[1], 420 | self.Np[1], self.Nf)) 421 | for i in xrange(self.num_processes): 422 | if not i == self.rank: 423 | self.comm.Sendrecv_replace([fu_send[i], self.mpitype], i, 0, i, 0) 424 | fu_send[:] = fu_send.transpose(0, 2, 1, 3) 425 | 426 | elif self.communication == 'Alltoallw': 427 | if len(self._subarraysA) == 0: 428 | self._subarraysA, self._subarraysB, self._counts_displs = self.get_subarrays() 429 | 430 | # Intermediate work arrays required for transform 431 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 432 | 433 | # Do 2 ffts in y-z directions on owned data 434 | Uc_hatT = rfft2(u, Uc_hatT, axes=(1, 2), threads=self.threads, 435 | planner_effort=self.planner_effort['rfft2']) 436 | 437 | self.comm.Alltoallw( 438 | [Uc_hatT, 
self._counts_displs, self._subarraysB], 439 | [Uc_hat, self._counts_displs, self._subarraysA]) 440 | 441 | # Do fft for last direction 442 | fu = fft(Uc_hat, fu, overwrite_input=True, axis=0, 443 | threads=self.threads, planner_effort=self.planner_effort['fft']) 444 | 445 | else: 446 | assert self.num_processes <= self.N[0]//2, "Number of processors cannot be larger than N[0]//2 for 3/2-rule" 447 | assert u.shape == self.real_shape_padded() 448 | 449 | # Intermediate work arrays required for transform 450 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)] 451 | Upad_hat0 = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 1, False)] 452 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0)] 453 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)] 454 | 455 | # Do ffts in the padded y and z directions 456 | Upad_hat3 = rfft2(u, Upad_hat3, axes=(1, 2), threads=self.threads, 457 | planner_effort=self.planner_effort['rfft2']) 458 | 459 | # Copy with truncation 460 | Upad_hat1 = R2C.copy_from_padded(Upad_hat3, Upad_hat1, self.N, 1) 461 | 462 | if self.communication == 'Alltoall': 463 | # Transpose and commuincate data 464 | Upad_hat0[:] = np.rollaxis(Upad_hat1.reshape(self.complex_shape_padded_I()), 1).reshape(Upad_hat0.shape) 465 | self.comm.Alltoall(MPI.IN_PLACE, [Upad_hat0, self.mpitype]) 466 | 467 | elif self.communication == 'Alltoallw': 468 | if len(self._subarraysA_pad) == 0: 469 | self._subarraysA_pad, self._subarraysB_pad, self._counts_displs = self.get_subarrays(padsize=self.padsize) 470 | 471 | self.comm.Alltoallw( 472 | [Upad_hat1, self._counts_displs, self._subarraysB_pad], 473 | [Upad_hat0, self._counts_displs, self._subarraysA_pad]) 474 | 475 | # Perform fft of data in x-direction 476 | Upad_hat = fft(Upad_hat0, Upad_hat, axis=0, threads=self.threads, 477 | planner_effort=self.planner_effort['fft']) 478 | 479 | # Truncate to original complex shape 480 | fu.fill(0) 481 | fu[:self.N[0]//2+1] = Upad_hat[:self.N[0]//2+1] 482 | fu[self.N[0]//2:] += Upad_hat[-self.N[0]//2:] 483 | fu /= self.padsize**3 484 | 485 | return fu 486 | 487 | def real_shape_padded(self): 488 | """The local shape of the real data""" 489 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), int(self.padsize*self.N[2])) 490 | 491 | def complex_shape_padded_0(self): 492 | """Padding in x-direction""" 493 | return (int(self.padsize*self.N[0]), self.Np[1], self.Nf) 494 | 495 | def complex_shape_padded_0_I(self): 496 | """Padding in x-direction - reshaped for MPI communications""" 497 | return (self.num_processes, int(self.padsize*self.Np[0]), self.Np[1], self.Nf) 498 | 499 | def complex_shape_padded_1(self): 500 | """Transpose of complex_shape_padded_0""" 501 | return (int(self.padsize*self.Np[0]), self.N[1], self.Nf) 502 | 503 | def complex_shape_padded_2(self): 504 | """Padding in x and y-directions""" 505 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), self.Nf) 506 | 507 | def complex_shape_padded_3(self): 508 | """Padding in all directions. 
509 | ifft of this shape leads to real_shape_padded""" 510 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), self.Nfp) 511 | 512 | def complex_shape_padded_I(self): 513 | """A local intermediate shape of the complex data""" 514 | return (int(self.padsize*self.Np[0]), self.num_processes, self.Np[1], self.Nf) 515 | 516 | @staticmethod 517 | def copy_to_padded(fu, fp, N, axis=0): 518 | if axis == 0: 519 | fp[:N[0]//2] = fu[:N[0]//2] 520 | fp[-N[0]//2:] = fu[N[0]//2:] 521 | elif axis == 1: 522 | fp[:, :N[1]//2] = fu[:, :N[1]//2] 523 | fp[:, -N[1]//2:] = fu[:, N[1]//2:] 524 | elif axis == 2: 525 | fp[:, :, :(N[2]//2+1)] = fu[:] 526 | return fp 527 | 528 | @staticmethod 529 | def copy_from_padded(fp, fu, N, axis=0): 530 | if axis == 1: 531 | fu.fill(0) 532 | fu[:, :N[1]//2+1] = fp[:, :N[1]//2+1, :(N[2]//2+1)] 533 | fu[:, N[1]//2:] += fp[:, -N[1]//2:, :(N[2]//2+1)] 534 | elif axis == 2: 535 | fu[:] = fp[:, :, :(N[2]//2+1)] 536 | return fu 537 | 538 | class C2C(R2C): 539 | """Class for performing FFT in 3D using MPI 540 | 541 | Slab decomposition 542 | 543 | Args: 544 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh 545 | L - NumPy array([Lx, Ly, Lz]) The actual size of the real mesh 546 | comm - The MPI communicator object 547 | precision - "single" or "double" 548 | communication - Method used for communication ('Alltoall', 'Sendrecv_replace') 549 | padsize - Padsize when dealias = 3/2-rule is used 550 | threads - Number of threads used by FFTs 551 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 552 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 553 | 554 | The transform is complex to complex 555 | """ 556 | def __init__(self, N, L, comm, precision, 557 | communication="Alltoall", 558 | padsize=1.5, 559 | threads=1, 560 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 561 | R2C.__init__(self, N, L, comm, precision, 562 | communication=communication, 563 | padsize=padsize, threads=threads, 564 | planner_effort=planner_effort) 565 | # Reuse all shapes from r2c transform R2C simply by resizing the final complex z-dimension: 566 | self.Nf = N[2] 567 | self.Nfp = int(self.padsize*self.N[2]) # Independent complex wavenumbers in z-direction for padded array 568 | 569 | # Rename since there's no real space 570 | self.original_shape_padded = self.real_shape_padded 571 | self.original_shape = self.real_shape 572 | self.transformed_shape = self.complex_shape 573 | self.original_local_slice = self.real_local_slice 574 | self.transformed_local_slice = self.complex_local_slice 575 | self.ks = (fftfreq(N[2])*N[2]).astype(int) 576 | 577 | def global_shape(self, padsize=1.): 578 | """Global size of problem in transformed space""" 579 | return (int(padsize*self.N[0]), int(padsize*self.N[1]), 580 | int(padsize*self.N[2])) 581 | 582 | def transformed_local_wavenumbers(self): 583 | return (fftfreq(self.N[0], 1./self.N[0]), 584 | fftfreq(self.N[1], 1./self.N[1])[self.transformed_local_slice()[1]], 585 | fftfreq(self.N[2], 1./self.N[2])) 586 | 587 | def ifftn(self, fu, u, dealias=None): 588 | """ifft in three directions using mpi. 589 | Need to do ifft in reversed order of fft 590 | 591 | dealias = "3/2-rule" 592 | - Padded transform with 3/2-rule. 
fu is padded with zeros 593 | before transforming to complex space of shape original_shape_padded() 594 | - u is of original_shape_padded() 595 | 596 | dealias = "2/3-rule" 597 | - Transform is using 2/3-rule, i.e., frequencies higher than 598 | 2/3*N are set to zero before transforming 599 | - u is of original_shape() 600 | 601 | dealias = None 602 | - Regular transform 603 | - u is of original_shape() 604 | 605 | fu is of transformed_shape() 606 | """ 607 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 608 | 609 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 610 | self.dealias = self.get_dealias_filter() 611 | 612 | if self.num_processes == 1: 613 | if not dealias == '3/2-rule': 614 | fu_ = fu 615 | if dealias == '2/3-rule': 616 | fu_ = self.work_arrays[(fu, 0, False)] 617 | fu_[:] = fu 618 | fu_ *= self.dealias 619 | 620 | u = ifftn(fu_, u, axes=(0, 1, 2), threads=self.threads, 621 | planner_effort=self.planner_effort['ifftn']) 622 | 623 | else: 624 | assert u.shape == self.original_shape_padded() 625 | 626 | # First create padded complex array and then perform irfftn 627 | fu_padded = self.work_arrays[(u, 0)] 628 | fu_padded[:self.N[0]//2, :self.N[1]//2, self.ks] = fu[:self.N[0]//2, :self.N[1]//2] 629 | fu_padded[:self.N[0]//2, -self.N[1]//2:, self.ks] = fu[:self.N[0]//2, self.N[1]//2:] 630 | fu_padded[-self.N[0]//2:, :self.N[1]//2, self.ks] = fu[self.N[0]//2:, :self.N[1]//2] 631 | fu_padded[-self.N[0]//2:, -self.N[1]//2:, self.ks] = fu[self.N[0]//2:, self.N[1]//2:] 632 | u = ifftn(fu_padded*self.padsize**3, u, overwrite_input=True, 633 | axes=(0, 1, 2), threads=self.threads, 634 | planner_effort=self.planner_effort['ifftn']) 635 | 636 | return u 637 | 638 | if not dealias == '3/2-rule': 639 | fu_ = fu 640 | if dealias == '2/3-rule': 641 | fu_ = self.work_arrays[(fu, 0, False)] 642 | fu_[:] = fu 643 | fu_ *= self.dealias 644 | 645 | # Intermediate work arrays required for transform 646 | Uc_hat = self.work_arrays[(self.complex_shape(), self.complex, 0, False)] 647 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 648 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 649 | 650 | # Do first owned direction 651 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, 652 | planner_effort=self.planner_effort['ifft']) 653 | 654 | if self.communication == 'Alltoall': 655 | # Communicate all values 656 | self.comm.Alltoall([Uc_hat, self.mpitype], [Uc_mpi, self.mpitype]) 657 | Uc_hatT[:] = np.rollaxis(Uc_mpi, 1).reshape(Uc_hatT.shape) 658 | 659 | else: 660 | Uc_send = Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf)) 661 | for i in xrange(self.num_processes): 662 | if not i == self.rank: 663 | self.comm.Sendrecv_replace([Uc_send[i], self.mpitype], i, 0, i, 0) 664 | Uc_hatT[:, i*self.Np[1]:(i+1)*self.Np[1]] = Uc_send[i] 665 | 666 | # Do last two directions 667 | u = ifft2(Uc_hatT, u, overwrite_input=True, axes=(1, 2), 668 | threads=self.threads, 669 | planner_effort=self.planner_effort['ifft2']) 670 | 671 | else: 672 | # Intermediate work arrays required for transform 673 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)] 674 | U_mpi = self.work_arrays[(self.complex_shape_padded_0_I(), self.complex, 0, False)] 675 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0, False)] 676 | Upad_hat2 = self.work_arrays[(self.complex_shape_padded_2(), self.complex, 0, False)] 677 | Upad_hat3 = 
self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)] 678 | 679 | # Expand in x-direction and perform ifft 680 | Upad_hat = C2C.copy_to_padded(fu*self.padsize**3, Upad_hat, self.N, axis=0) 681 | Upad_hat[:] = ifft(Upad_hat, axis=0, threads=self.threads, 682 | planner_effort=self.planner_effort['ifft']) 683 | 684 | # Communicate to distribute first dimension (like Fig. 2b but padded in x-dir and z-direction of full size) 685 | self.comm.Alltoall([Upad_hat, self.mpitype], [U_mpi, self.mpitype]) 686 | 687 | # Transpose data and pad in y-direction before doing ifft. Now data is padded in x and y 688 | Upad_hat1[:] = np.rollaxis(U_mpi, 1).reshape(Upad_hat1.shape) 689 | Upad_hat2 = C2C.copy_to_padded(Upad_hat1, Upad_hat2, self.N, axis=1) 690 | Upad_hat2[:] = ifft(Upad_hat2, axis=1, threads=self.threads, 691 | planner_effort=self.planner_effort['ifft']) 692 | 693 | # pad in z-direction and perform final ifft 694 | Upad_hat3 = C2C.copy_to_padded(Upad_hat2, Upad_hat3, self.N, axis=2) 695 | u = ifft(Upad_hat3, u, overwrite_input=True, axis=2, 696 | threads=self.threads, planner_effort=self.planner_effort['ifft']) 697 | 698 | return u 699 | 700 | def fftn(self, u, fu, dealias=None): 701 | """fft in three directions using mpi 702 | 703 | dealias = "3/2-rule" 704 | - Truncated transform with 3/2-rule. The transfored fu is truncated 705 | when copied to complex space of complex_shape() 706 | - fu is of transformed_shape() 707 | - u is of original_shape_padded() 708 | 709 | dealias = "2/3-rule" 710 | - Regular transform 711 | - fu is of transformed_shape() 712 | - u is of original_shape() 713 | 714 | dealias = None 715 | - Regular transform 716 | - fu is of transformed_shape() 717 | - u is of original_shape() 718 | """ 719 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 720 | 721 | if self.num_processes == 1: 722 | if not dealias == '3/2-rule': 723 | assert u.shape == self.original_shape() 724 | 725 | fu = fftn(u, fu, axes=(0, 1, 2), threads=self.threads, 726 | planner_effort=self.planner_effort['fftn']) 727 | 728 | else: 729 | assert u.shape == self.original_shape_padded() 730 | 731 | fu_padded = self.work_arrays[(u, 0)] 732 | fu_padded = fftn(u, fu_padded, axes=(0, 1, 2), threads=self.threads, 733 | planner_effort=self.planner_effort['fftn']) 734 | 735 | # Copy with truncation 736 | fu[:self.N[0]//2, :self.N[1]//2] = fu_padded[:self.N[0]//2, :self.N[1]//2, self.ks] 737 | fu[:self.N[0]//2, self.N[1]//2:] = fu_padded[:self.N[0]//2, -self.N[1]//2:, self.ks] 738 | fu[self.N[0]//2:, :self.N[1]//2] = fu_padded[-self.N[0]//2:, :self.N[1]//2, self.ks] 739 | fu[self.N[0]//2:, self.N[1]//2:] = fu_padded[-self.N[0]//2:, -self.N[1]//2:, self.ks] 740 | fu /= self.padsize**3 741 | return fu 742 | 743 | if not dealias == '3/2-rule': 744 | if self.communication == 'Alltoall': 745 | # Intermediate work arrays required for transform 746 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 747 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 748 | 749 | # Do 2 ffts in y-z directions on owned data 750 | Uc_hatT = fft2(u, Uc_hatT, axes=(1,2), threads=self.threads, planner_effort=self.planner_effort['fft2']) 751 | 752 | # Transform data to align with x-direction 753 | Uc_mpi[:] = np.rollaxis(Uc_hatT.reshape(self.Np[0], self.num_processes, self.Np[1], self.Nf), 1) 754 | 755 | # Communicate all values 756 | self.comm.Alltoall([Uc_mpi, self.mpitype], [fu, self.mpitype]) 757 | 758 | else: 759 | # Communicating 
intermediate result 760 | ft = fu.transpose(1, 0, 2) 761 | ft = fft2(u, ft, axes=(1, 2), threads=self.threads, 762 | planner_effort=self.planner_effort['fft2']) 763 | fu_send = fu.reshape((self.num_processes, self.Np[1], 764 | self.Np[1], self.Nf)) 765 | for i in xrange(self.num_processes): 766 | if not i == self.rank: 767 | self.comm.Sendrecv_replace([fu_send[i], self.mpitype], i, 0, i, 0) 768 | fu_send[:] = fu_send.transpose(0, 2, 1, 3) 769 | 770 | # Do fft for last direction 771 | fu[:] = fft(fu, axis=0, threads=self.threads, 772 | planner_effort=self.planner_effort['fft']) 773 | 774 | else: 775 | # Intermediate work arrays required for transform 776 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)] 777 | Upad_hat0 = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 1, False)] 778 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0)] 779 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)] 780 | U_mpi = self.work_arrays[(self.complex_shape_padded_0_I(), self.complex, 0, False)] 781 | 782 | # Do ffts in y and z directions 783 | Upad_hat3 = fft2(u, Upad_hat3, axes=(1, 2), threads=self.threads, 784 | planner_effort=self.planner_effort['fft2']) 785 | 786 | # Copy with truncation 787 | Upad_hat1 = C2C.copy_from_padded(Upad_hat3, Upad_hat1, self.N, 1) 788 | 789 | # Transpose and commuincate data 790 | U_mpi[:] = np.rollaxis(Upad_hat1.reshape(self.complex_shape_padded_I()), 1) 791 | self.comm.Alltoall([U_mpi, self.mpitype], [Upad_hat0, self.mpitype]) 792 | 793 | # Perform fft of data in x-direction 794 | Upad_hat = fft(Upad_hat0, Upad_hat, overwrite_input=True, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft']) 795 | 796 | # Truncate to original complex shape 797 | fu[:self.N[0]//2] = Upad_hat[:self.N[0]//2] 798 | fu[self.N[0]//2:] = Upad_hat[-self.N[0]//2:] 799 | fu /= self.padsize**3 800 | 801 | return fu 802 | 803 | @staticmethod 804 | def copy_to_padded(fu, fp, N, axis=0): 805 | if axis == 0: 806 | fp[:N[0]//2] = fu[:N[0]//2] 807 | fp[-N[0]//2:] = fu[N[0]//2:] 808 | elif axis == 1: 809 | fp[:, :N[1]//2] = fu[:, :N[1]//2] 810 | fp[:, -N[1]//2:] = fu[:, N[1]//2:] 811 | elif axis == 2: 812 | fp[:, :, :N[2]//2] = fu[:, :, :N[2]//2] 813 | fp[:, :, -N[2]//2:] = fu[:, :, N[2]//2:] 814 | return fp 815 | 816 | @staticmethod 817 | def copy_from_padded(fp, fu, N, axis=0): 818 | if axis == 1: 819 | fu.fill(0) 820 | fu[:, :N[1]//2+1, :N[2]//2+1] = fp[:, :N[1]//2+1, :N[2]//2+1] 821 | fu[:, :N[1]//2+1, N[2]//2:] += fp[:, :N[1]//2+1, -N[2]//2:] 822 | fu[:, N[1]//2:, :N[2]//2+1] += fp[:, -N[1]//2:, :N[2]//2+1] 823 | fu[:, N[1]//2:, N[2]//2:] += fp[:, -N[1]//2:, -N[2]//2:] 824 | 825 | return fu 826 | 827 | 828 | #def transpose_Uc(Uc_hatT, U_mpi, num_processes, Np0, Np1, Nf): 829 | #for i in xrange(num_processes): 830 | #Uc_hatT[:, i*Np1:(i+1)*Np1] = U_mpi[i] 831 | #return Uc_hatT 832 | 833 | #def transpose_Umpi(U_mpi, Uc_hatT, num_processes, Np0, Np1, Nf): 834 | #for i in xrange(num_processes): 835 | #U_mpi[i] = Uc_hatT[:, i*Np1:(i+1)*Np1] 836 | #return U_mpi 837 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mpi4py 2 | cython 3 | numpy>=1.15 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | 
description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import re 5 | import subprocess 6 | from setuptools import setup, Extension 7 | from setuptools.command.build_ext import build_ext 8 | from numpy import get_include 9 | 10 | cwd = os.path.abspath(os.path.dirname(__file__)) 11 | cdir = os.path.join(cwd, "mpiFFT4py", "cython") 12 | 13 | def has_flag(compiler, flagname): 14 | """Return a boolean indicating whether a flag name is supported on 15 | the specified compiler. 16 | """ 17 | devnull = open(os.devnull, "w") 18 | p = subprocess.Popen([compiler.compiler[0], '-E', '-'] + [flagname], 19 | stdin=subprocess.PIPE, stdout=devnull, stderr=devnull, 20 | shell=True) 21 | p.communicate("") 22 | return True if p.returncode == 0 else False 23 | 24 | class build_ext_subclass(build_ext): 25 | def build_extensions(self): 26 | extra_compile_args = ['-g0'] 27 | for c in ['-w', '-Ofast', '-ffast-math', '-march=native']: 28 | if has_flag(self.compiler, c): 29 | extra_compile_args.append(c) 30 | 31 | for e in self.extensions: 32 | e.extra_compile_args += extra_compile_args 33 | e.include_dirs.extend([get_include()]) 34 | build_ext.build_extensions(self) 35 | 36 | ext = [Extension('mpiFFT4py.cython.maths', 37 | sources=[os.path.join(cdir, "maths.pyx")])] 38 | 39 | def version(): 40 | srcdir = os.path.join(cwd, 'mpiFFT4py') 41 | with open(os.path.join(srcdir, '__init__.py')) as f: 42 | m = re.search(r"__version__\s*=\s*'(.*)'", f.read()) 43 | return m.groups()[0] 44 | 45 | with open("README.rst", "r") as fh: 46 | long_description = fh.read() 47 | 48 | setup(name = "mpiFFT4py", 49 | version = version(), 50 | description = "mpiFFT4py -- Parallel 3D FFT in Python using MPI for Python", 51 | long_description = long_description, 52 | author = "Mikael Mortensen", 53 | author_email = "mikaem@math.uio.no", 54 | url = 'https://github.com/spectralDNS/mpiFFT4py', 55 | classifiers = [ 56 | 'Development Status :: 5 - Production/Stable', 57 | 'Environment :: Console', 58 | 'Intended Audience :: Developers', 59 | 'Intended Audience :: Science/Research', 60 | 'Intended Audience :: Education', 61 | 'Programming Language :: Python', 62 | 'Programming Language :: Python :: 2', 63 | 'Programming Language :: Python :: 3', 64 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', 65 | 'Topic :: Scientific/Engineering :: Mathematics', 66 | 'Topic :: Software Development :: Libraries :: Python Modules', 67 | ], 68 | packages = ["mpiFFT4py", 69 | "mpiFFT4py.serialFFT", 70 | "mpiFFT4py.cython" 71 | ], 72 | package_dir = {"mpiFFT4py": "mpiFFT4py"}, 73 | install_requires=["numpy"], 74 | setup_requires=["numpy>=1.11", 75 | "cython>=0.25", 76 | "setuptools>=18.0"], 77 | ext_modules = ext, 78 | cmdclass = {'build_ext': build_ext_subclass} 79 | ) 80 | -------------------------------------------------------------------------------- /tests/test_FFT.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import string 3 | import numpy as np 4 | from numpy.random import random, randn 5 | from numpy import allclose, empty, zeros, zeros_like, pi, array, int, all, float64 6 | from numpy.fft import fftfreq 7 | from mpi4py import MPI 8 | 9 | from mpiFFT4py.pencil import R2C as Pencil_R2C 10 | from mpiFFT4py.slab import R2C as Slab_R2C 11 | from mpiFFT4py.line import R2C 
as Line_R2C 12 | from mpiFFT4py import rfft2, rfftn, irfftn, irfft2, fftn, ifftn, irfft, ifft 13 | from mpiFFT4py.slab import C2C 14 | 15 | def reset_profile(prof): 16 | prof.code_map = {} 17 | prof.last_time = {} 18 | prof.enable_count = 0 19 | for func in prof.functions: 20 | prof.add_function(func) 21 | 22 | N = 2**5 23 | L = array([2*pi, 2*pi, 2*pi]) 24 | ks = (fftfreq(N)*N).astype(int) 25 | comm = MPI.COMM_WORLD 26 | 27 | if comm.Get_size() >= 4: 28 | params = ("slabas", "slabad", "slabws", "slabwd", 29 | "pencilsys", "pencilsyd", "pencilnys", "pencilnyd", 30 | "pencilsxd", "pencilsxs", "pencilnxd", "pencilnxs", 31 | "pencilaxd", "pencilaxs", "pencilayd", "pencilays") 32 | 33 | else: 34 | params = ("slabas", "slabad", "slabws", "slabwd") 35 | 36 | @pytest.fixture(params=params, scope='module') 37 | 38 | def FFT(request): 39 | prec = {"s": "single", "d":"double"}[request.param[-1]] 40 | if request.param[:3] == "pen": 41 | communication = {"s": "Alltoall", "n": "AlltoallN", "a": "Alltoallw"}[request.param[-3]] 42 | alignment = request.param[-2].upper() 43 | return Pencil_R2C(array([N, 2*N, 4*N]), L, comm, prec, communication=communication, alignment=alignment) 44 | else: 45 | communication = 'Alltoall' if request.param[-2] == 'a' else 'Alltoallw' 46 | return Slab_R2C(array([N, 2*N, 4*N]), L, comm, prec, communication=communication) 47 | 48 | @pytest.fixture(params=("lines", "lined"), scope='module') 49 | def FFT2(request): 50 | prec = {"s": "single", "d":"double"}[request.param[-1]] 51 | return Line_R2C(array([N, 2*N]), L[:-1], comm, prec) 52 | 53 | 54 | @pytest.fixture(params=("slabd", "slabs"), scope='module') 55 | def FFT_C2C(request): 56 | prec = {"s": "single", "d":"double"}[request.param[-1]] 57 | return C2C(array([N, 2*N, 4*N]), L, comm, prec) 58 | 59 | #@profile 60 | def test_FFT(FFT): 61 | N = FFT.N 62 | if FFT.rank == 0: 63 | A = random(N).astype(FFT.float) 64 | if FFT.communication == 'AlltoallN': 65 | C = empty(FFT.global_complex_shape(), dtype=FFT.complex) 66 | C = rfftn(A, C, axes=(0,1,2)) 67 | C[:, :, -1] = 0 # Remove Nyquist frequency 68 | A = irfftn(C, A, axes=(0,1,2)) 69 | B2 = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 70 | B2 = rfftn(A, B2, axes=(0,1,2)) 71 | 72 | else: 73 | A = zeros(N, dtype=FFT.float) 74 | B2 = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 75 | 76 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4) 77 | FFT.comm.Bcast(A, root=0) 78 | FFT.comm.Bcast(B2, root=0) 79 | 80 | a = zeros(FFT.real_shape(), dtype=FFT.float) 81 | c = zeros(FFT.complex_shape(), dtype=FFT.complex) 82 | a[:] = A[FFT.real_local_slice()] 83 | c = FFT.fftn(a, c) 84 | #print abs((c - B2[FFT.complex_local_slice()])/c.max()).max() 85 | assert all(abs((c - B2[FFT.complex_local_slice()])/c.max()) < rtol) 86 | #assert allclose(c, B2[FFT.complex_local_slice()], rtol, atol) 87 | a = FFT.ifftn(c, a) 88 | #print abs((a - A[FFT.real_local_slice()])/a.max()).max() 89 | 90 | assert all(abs((a - A[FFT.real_local_slice()])/a.max()) < rtol) 91 | #assert allclose(a, A[FFT.real_local_slice()], rtol, atol) 92 | 93 | def test_FFT2(FFT2): 94 | N = FFT2.N 95 | if FFT2.rank == 0: 96 | A = random(N).astype(FFT2.float) 97 | 98 | else: 99 | A = zeros(N, dtype=FFT2.float) 100 | 101 | atol, rtol = (1e-10, 1e-8) if FFT2.float is float64 else (5e-7, 1e-4) 102 | FFT2.comm.Bcast(A, root=0) 103 | a = zeros(FFT2.real_shape(), dtype=FFT2.float) 104 | c = zeros(FFT2.complex_shape(), dtype=FFT2.complex) 105 | a[:] = A[FFT2.real_local_slice()] 106 | c = FFT2.fft2(a, c) 107 | B2 = 
zeros(FFT2.global_complex_shape(), dtype=FFT2.complex) 108 | B2 = rfft2(A, B2, axes=(0,1)) 109 | assert allclose(c, B2[FFT2.complex_local_slice()], rtol, atol) 110 | a = FFT2.ifft2(c, a) 111 | assert allclose(a, A[FFT2.real_local_slice()], rtol, atol) 112 | 113 | def test_FFT2_padded(FFT2): 114 | FFT = FFT2 115 | N = FFT.N 116 | prec = "single" if isinstance(FFT.float, np.float32) else "double" 117 | FFT_SELF = Line_R2C(N, FFT.L, MPI.COMM_SELF, prec) 118 | 119 | if FFT.rank == 0: 120 | A = random(N).astype(FFT.float) 121 | C = zeros((FFT.global_complex_shape()), dtype=FFT.complex) 122 | C = FFT_SELF.fft2(A, C) 123 | 124 | # Eliminate Nyquist, otherwise test will fail 125 | C[-N[0]//2] = 0 126 | 127 | A_pad = np.zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 128 | A_pad = FFT_SELF.ifft2(C, A_pad, dealias="3/2-rule") 129 | 130 | else: 131 | C = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 132 | A_pad = zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 133 | 134 | FFT.comm.Bcast(C, root=0) 135 | FFT.comm.Bcast(A_pad, root=0) 136 | 137 | ae = zeros(FFT.real_shape_padded(), dtype=FFT.float) 138 | c = zeros(FFT.complex_shape(), dtype=FFT.complex) 139 | 140 | c[:] = C[FFT.complex_local_slice()] 141 | ae[:] = A_pad[FFT.real_local_slice(padsize=1.5)] 142 | 143 | ap = zeros(FFT.real_shape_padded(), dtype=FFT.float) 144 | cp = zeros(FFT.complex_shape(), dtype=FFT.complex) 145 | ap = FFT.ifft2(c, ap, dealias="3/2-rule") 146 | 147 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4) 148 | 149 | #from IPython import embed; embed() 150 | #print np.linalg.norm(ap-ae) 151 | assert allclose(ap, ae, rtol, atol) 152 | 153 | cp = FFT.fft2(ap, cp, dealias="3/2-rule") 154 | 155 | #print np.linalg.norm(abs((cp-c)/cp.max())) 156 | assert all(abs((cp-c)/cp.max()) < rtol) 157 | 158 | 159 | def test_FFT_padded(FFT): 160 | N = FFT.N 161 | prec = "single" if isinstance(FFT.float, np.float32) else "double" 162 | FFT_SELF = Slab_R2C(FFT.N, L, MPI.COMM_SELF, prec, 163 | communication=FFT.communication) 164 | 165 | if FFT.rank == 0: 166 | A = random(N).astype(FFT.float) 167 | C = zeros((FFT.global_complex_shape()), dtype=FFT.complex) 168 | C = FFT_SELF.fftn(A, C) 169 | 170 | # Eliminate Nyquist, otherwise test will fail 171 | #C[-N[0]//2] = 0 172 | #C[:, -N[1]//2] = 0 173 | if FFT.communication == 'AlltoallN': 174 | C[:, :, -1] = 0 # Remove Nyquist frequency 175 | 176 | A_pad = np.zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 177 | A_pad = FFT_SELF.ifftn(C, A_pad, dealias='3/2-rule') 178 | 179 | else: 180 | C = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 181 | A_pad = zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 182 | 183 | FFT.comm.Bcast(C, root=0) 184 | FFT.comm.Bcast(A_pad, root=0) 185 | 186 | ae = zeros(FFT.real_shape_padded(), dtype=FFT.float) 187 | c = zeros(FFT.complex_shape(), dtype=FFT.complex) 188 | 189 | c[:] = C[FFT.complex_local_slice()] 190 | ae[:] = A_pad[FFT.real_local_slice(padsize=1.5)] 191 | 192 | ap = zeros(FFT.real_shape_padded(), dtype=FFT.float) 193 | cp = zeros(FFT.complex_shape(), dtype=FFT.complex) 194 | ap = FFT.ifftn(c, ap, dealias="3/2-rule") 195 | 196 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4) 197 | 198 | #print np.linalg.norm(ap-ae) 199 | assert allclose(ap, ae, rtol, atol) 200 | 201 | cp = FFT.fftn(ap, cp, dealias="3/2-rule") 202 | 203 | #from IPython import embed; embed() 204 | #print np.linalg.norm(abs((cp-c)/cp.max())) 205 | assert all(abs((cp-c)/cp.max()) < rtol) 206 | 207 | #aa = 
208 |     #aa = FFT.ifftn(cp, aa)
209 | 
210 |     #a3 = A[FFT.real_local_slice()]
211 |     #assert allclose(aa, a3, rtol, atol)
212 | 
213 | def test_FFT_C2C(FFT_C2C):
214 |     """Test both padded and unpadded transforms"""
215 |     FFT = FFT_C2C
216 |     N = FFT.N
217 |     atol, rtol = (1e-8, 1e-8) if FFT.float is float64 else (5e-7, 1e-4)
218 | 
219 |     if FFT.rank == 0:
220 |         # Create a reference solution using only one CPU
221 |         A = (random(N)+random(N)*1j).astype(FFT.complex)
222 |         C = zeros((FFT.global_shape()), dtype=FFT.complex)
223 |         C = fftn(A, C, axes=(0,1,2))
224 | 
225 |         # Copy to array padded with zeros
226 |         Cp = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
227 |         ks = (fftfreq(N[2])*N[2]).astype(int)
228 |         Cp[:N[0]//2, :N[1]//2, ks] = C[:N[0]//2, :N[1]//2]
229 |         Cp[:N[0]//2, -N[1]//2:, ks] = C[:N[0]//2, N[1]//2:]
230 |         Cp[-N[0]//2:, :N[1]//2, ks] = C[N[0]//2:, :N[1]//2]
231 |         Cp[-N[0]//2:, -N[1]//2:, ks] = C[N[0]//2:, N[1]//2:]
232 | 
233 |         # Get transform of padded array
234 |         Ap = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
235 |         Ap = ifftn(Cp*1.5**3, Ap, axes=(0,1,2))
236 | 
237 |     else:
238 |         C = zeros(FFT.global_shape(), dtype=FFT.complex)
239 |         Ap = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
240 |         A = zeros(N, dtype=FFT.complex)
241 | 
242 |     # For testing broadcast the arrays computed on root to all CPUs
243 |     FFT.comm.Bcast(C, root=0)
244 |     FFT.comm.Bcast(Ap, root=0)
245 |     FFT.comm.Bcast(A, root=0)
246 | 
247 |     # Get the single processor solution on local part of the solution
248 |     ae = zeros(FFT.original_shape_padded(), dtype=FFT.complex)
249 |     ae[:] = Ap[FFT.original_local_slice(padsize=1.5)]
250 |     c = zeros(FFT.transformed_shape(), dtype=FFT.complex)
251 |     c[:] = C[FFT.transformed_local_slice()]
252 | 
253 |     # Perform padded transform with MPI and assert ok
254 |     ap = zeros(FFT.original_shape_padded(), dtype=FFT.complex)
255 |     ap = FFT.ifftn(c, ap, dealias="3/2-rule")
256 |     assert allclose(ap, ae, rtol, atol)
257 | 
258 |     # Perform truncated transform with MPI and assert
259 |     cp = zeros(FFT.transformed_shape(), dtype=FFT.complex)
260 |     cp = FFT.fftn(ap, cp, dealias="3/2-rule")
261 |     assert all(abs(cp-c)/cp.max() < rtol)
262 | 
263 |     # Now without padding
264 |     # Transform back to original
265 |     aa = zeros(FFT.original_shape(), dtype=FFT.complex)
266 |     aa = FFT.ifftn(c, aa)
267 |     # Verify
268 |     a3 = A[FFT.original_local_slice()]
269 |     assert allclose(aa, a3, rtol, atol)
270 |     c2 = zeros(FFT.transformed_shape(), dtype=FFT.complex)
271 |     c2 = FFT.fftn(aa, c2)
272 |     # Verify
273 |     assert all(abs(c2-c)/c2.max() < rtol)
274 |     #assert allclose(c2, c, rtol, atol)
275 | 
276 | #import time
277 | #t0 = time.time()
278 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="Y", communication='Alltoall'))
279 | #t1 = time.time()
280 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI, "double", alignment="X", communication='Alltoall'))
281 | #t2 = time.time()
282 | 
283 | #ty = MPI.COMM_WORLD.reduce(t1-t0, op=MPI.MIN)
284 | #tx = MPI.COMM_WORLD.reduce(t2-t1, op=MPI.MIN)
285 | #if MPI.COMM_WORLD.Get_rank() == 0:
286 | #print "Y: ", ty
287 | #print "X: ", tx
288 | 
289 | #test_FFT(Slab_R2C(array([N, 2*N, 4*N]), L, MPI.COMM_WORLD, "double", communication='Alltoall'))
290 | #test_FFT(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="Y", communication='Alltoall'))
291 | #test_FFT2(Line_R2C(array([N, N]), L[:-1], MPI, "single"))
292 | #test_FFT2_padded(Line_R2C(array([N, N]), L[:-1], MPI, "double"))
293 | #from collections import defaultdict
294 | #FFT = Slab_R2C(array([N//4, N, N]), L, MPI.COMM_WORLD, "double", communication='Alltoallw', threads=2, planner_effort=defaultdict(lambda: "FFTW_MEASURE"))
295 | #test_FFT_padded(FFT)
296 | #reset_profile(profile)
297 | #test_FFT_padded(FFT)
298 | 
299 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI, "double", alignment="X", communication='AlltoallN'))
300 | #test_FFT_C2C(C2C(array([N, N, N]), L, MPI, "double"))
301 | 
--------------------------------------------------------------------------------
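The test file above drives the transform classes through their public API: the constructor, real_shape/complex_shape, the in-place fftn/ifftn pair, and the "3/2-rule" dealiasing option. For orientation, here is a minimal usage sketch of the same round-trip pattern, illustrative only and not part of the repository: the script name is hypothetical, the import path for the slab class is assumed to follow the same pattern as the "from mpiFFT4py.slab import C2C" line at the top of the test file, and mpi4py plus a working MPI installation are assumed.

    # slab_demo.py -- illustrative sketch, not part of the repository
    from numpy import array, pi, zeros
    from numpy.random import random
    from mpi4py import MPI
    from mpiFFT4py.slab import R2C as Slab_R2C  # assumed import path (cf. the C2C import above)

    N = array([32, 32, 32])            # global mesh size
    L = array([2*pi, 2*pi, 2*pi])      # physical domain size
    FFT = Slab_R2C(N, L, MPI.COMM_WORLD, "double", communication='Alltoall')

    u = random(FFT.real_shape()).astype(FFT.float)         # this rank's part of the real field
    u_hat = zeros(FFT.complex_shape(), dtype=FFT.complex)  # this rank's part of the spectrum

    u_hat = FFT.fftn(u, u_hat)    # distributed forward real-to-complex transform
    u = FFT.ifftn(u_hat, u)       # inverse transform recovers u up to round-off

    # Dealiased variant exercised by the *_padded tests: transform to/from a 3/2-padded real grid
    u_pad = zeros(FFT.real_shape_padded(), dtype=FFT.float)
    u_pad = FFT.ifftn(u_hat, u_pad, dealias="3/2-rule")
    u_hat = FFT.fftn(u_pad, u_hat, dealias="3/2-rule")

Since the fixtures above only enable the pencil cases when the communicator has at least four ranks, the suite itself is presumably launched under MPI as well, e.g. mpirun -np 4 py.test tests/test_FFT.py, so that both the slab and pencil decompositions are exercised.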