├── .circleci
│   └── config.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── conf
│   └── conda
│       ├── conda_build_config.yaml
│       ├── meta.yaml
│       └── run_test.sh
├── demo
│   ├── spectral_dns_solver.py
│   └── transforms_realdata.py
├── mpiFFT4py
│   ├── __init__.py
│   ├── cython
│   │   ├── __init__.py
│   │   └── maths.pyx
│   ├── line.py
│   ├── mpibase.py
│   ├── pencil.py
│   ├── serialFFT
│   │   ├── __init__.py
│   │   ├── numpy_fft.py
│   │   └── pyfftw_fft.py
│   └── slab.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    └── test_FFT.py
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | jobs:
3 | build:
4 | machine: true
5 | steps:
6 | - checkout
7 |
8 | - restore_cache:
9 | key: v2-miniconda-{{ .Branch }}
10 |
11 | - run:
12 | name: install miniconda
13 | command: |
14 | if [[ ! -d /home/circleci/miniconda ]]; then
15 | wget https://repo.continuum.io/miniconda/Miniconda3-4.5.1-Linux-x86_64.sh -O miniconda.sh &&
16 | bash miniconda.sh -b -f -p /home/circleci/miniconda;
17 | else
18 | echo "Using cached miniconda";
19 | fi
20 | source ~/miniconda/bin/activate root
21 | conda config --set always_yes yes
22 | conda config --add channels conda-forge
23 | conda config --add channels spectralDNS
24 | conda clean --lock
25 | conda install --yes --quiet conda-forge-ci-setup=1
26 | source run_conda_forge_build_setup
27 |
28 | - save_cache:
29 | key: v2-miniconda-{{ .Branch }}
30 | paths:
31 | - /home/circleci/miniconda
32 |
33 | - run:
34 | name: Build and test
35 | command: |
36 | source ~/miniconda/bin/activate root
37 | cd /home/circleci/project
38 | conda build --python 2.7 ./conf/conda
39 | conda build --python 3.6 ./conf/conda
40 |
41 | - run:
42 | name: Upload packages
43 | command: |
44 | source ~/miniconda/bin/activate root
45 | cd /home/circleci/project
46 | upload_or_check_non_existence ./conf/conda spectralDNS --channel main
47 | export CONDA_PY=36
48 | upload_or_check_non_existence ./conf/conda spectralDNS --channel main
49 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | #Ipython Notebook
62 | .ipynb_checkpoints
63 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: generic
2 | os: osx
3 | osx_image: xcode7.3
4 | sudo: false
5 | env:
6 | matrix:
7 | - CONDA_PY=27
8 | - CONDA_PY=36
9 | global:
10 | - secure: "swxbq67k6ag2v7QjLGMtn72mROxDZ7d+c6X+BgR2YS1XT7l45T9+0Z/PTpCJg+9mmEH3YdlpnlzKjatz9xVNY04a7RljFMsNy/+5oiTOmno2IDq2fAPrUFvGAvdqsVgnc6+e+GUwaDL5n/AfDVOIb18tT4P2VRk3ooCsSILtQYvQWixLw5bx3BhTgAfXnmu7e+oaB+vCDXXjlFINlOvHZCBiVI9g0yXH0sW9gYsR2vsmIdxraChsq/+Q0wkaNUgUaiuHXNWcaZiiWleRYnYsktsNfT1nknkLrkPAtQTC5fYgXj6o9Sh+codcfYH95ztBm83rWzfWo2f+Ok1AtrRdG+CiApCFMQ6T4ZjonxEeZhopvY7+xNLXFoHcmnBdf0NM3wmCdwrzuzdHvpqRnozClTqG6Srvna7X4/WtDbKpF2yEHKdiBmaf8NRcGDpbJeyvnzlNz5HMESltvYUVatLzPTzzJplkvgMX3Ti8xcqYgwB1ayrClGFlpWM33MdzJiSSTptv3WYmhi7rV5xdpCc5pBTF5XLOtEB0dFGY60yQd9SWSxjFAMwo9808V6koiKX3D0Ogin8mQmvR2DqVhkBqfHFf36s38OfG/n1iV/Oednc9pfYP55T7ljKRsPUpavblCPizBfQnQEFivjaDlPGX3/bR0TV9F/pRSiJ84JMgKzs="
11 |
12 | before_install:
13 | - brew remove --force $(brew list)
14 | - brew cleanup -s
15 | - rm -rf $(brew --cache)
16 | install:
17 | - |
18 | MINICONDA_URL="https://repo.continuum.io/miniconda"
19 | MINICONDA_FILE="Miniconda3-latest-MacOSX-x86_64.sh"
20 | curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}"
21 | bash $MINICONDA_FILE -b
22 | source /Users/travis/miniconda3/bin/activate root
23 | conda config --set show_channel_urls true
24 | conda config --add channels conda-forge
25 | conda install --yes --quiet conda-forge-ci-setup=1
26 | source run_conda_forge_build_setup
27 | script:
28 | - conda build conf/conda
29 | after_success:
30 | - export GIT_DESCRIBE_TAG=`git describe --tags | cut -d'-' -f 1`
31 | - upload_or_check_non_existence ./conf/conda spectralDNS --channel main || exit 1
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.py *.txt *.rst
2 | recursive-include mpiFFT4py *.py *.pyx
3 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | VERSION=$(shell python3 -c "import mpiFFT4py; print(mpiFFT4py.__version__)")
2 |
3 | default:
4 | python setup.py build_ext -i
5 |
6 | pip:
7 | rm -f dist/*
8 | python setup.py sdist
9 | twine upload dist/*
10 |
11 | tag:
12 | git tag $(VERSION)
13 | git push --tags
14 |
15 | publish: tag pip
16 |
17 | clean:
18 | git clean mpiFFT4py -fx
19 | git clean tests -fx
20 | cd docs && make clean && cd ..
21 | @rm -rf *.egg-info/ build/ dist/ .eggs/
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | mpiFFT4py
2 | ---------
3 |
4 | .. image:: https://travis-ci.org/spectralDNS/mpiFFT4py.svg?branch=master
5 | :target: https://travis-ci.org/spectralDNS/mpiFFT4py
6 | .. image:: https://circleci.com/gh/spectralDNS/mpiFFT4py/tree/master.svg?style=svg
7 | :target: https://circleci.com/gh/spectralDNS/mpiFFT4py/tree/master
8 | .. image:: https://zenodo.org/badge/51817237.svg
9 | :target: https://zenodo.org/badge/latestdoi/51817237
10 |
11 | Description
12 | -----------
13 | mpiFFT4py performs FFTs in parallel in Python. It is designed to perform FFTs in parallel on a three-dimensional computational box (a structured grid), but it also includes routines for doing FFTs on a 2D mesh. It implements both the *slab* and the *pencil* decompositions.
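
A minimal usage sketch of the slab-decomposed transform (following demo/spectral_dns_solver.py; run under MPI, e.g. ``mpirun -np 4 python your_script.py``)::

    from mpi4py import MPI
    from numpy import array, pi, empty, sin, cos
    from mpiFFT4py.slab import R2C

    N = array([32, 32, 32], dtype=int)          # global mesh size
    L = array([2*pi, 2*pi, 2*pi], dtype=float)  # physical size of the box
    FFT = R2C(N, L, MPI.COMM_WORLD, "double")   # slab-decomposed 3D FFT object

    X = FFT.get_local_mesh()                    # this rank's part of the mesh
    U = sin(X[0])*cos(X[1])*cos(X[2])           # a real field of local shape
    U_hat = empty(FFT.complex_shape(), dtype=complex)

    U_hat = FFT.fftn(U, U_hat)                  # forward real-to-complex FFT
    U = FFT.ifftn(U_hat, U)                     # inverse transform back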
14 |
15 | Installation
16 | ------------
17 | mpiFFT4py requires *numpy* for basic array operations, `pyfftw <https://github.com/pyfftw/pyFFTW>`_ for efficient FFTs and `mpi4py <https://bitbucket.org/mpi4py/mpi4py>`_ for MPI communications. However, if *pyfftw* is not found, then the slower *numpy.fft* is used instead. `cython <http://cython.org>`_ is used to optimize a few routines. Install using regular Python distutils::
18 |
19 | python setup.py install --prefix="Path on the PYTHONPATH"
20 |
21 | To install in place do::
22 |
23 | python setup.py build_ext --inplace
24 |
25 | To install using Anaconda, you may either compile it yourself (from the main directory) using::
26 |
27 | conda config --add channels conda-forge
28 | conda build conf/conda
29 | conda install mpiFFT4py --use-local
30 |
31 | or use precompiled binaries from the `conda-forge <https://anaconda.org/conda-forge/mpifft4py>`_ or the `spectralDNS <https://anaconda.org/spectralDNS/mpifft4py>`_ channel on Anaconda cloud::
32 |
33 | conda install -c conda-forge mpifft4py
34 |
35 | or::
36 |
37 | conda config --add channels conda-forge
38 | conda install -c spectralDNS mpifft4py
39 |
40 | There are binaries compiled for both OSX and Linux, and for several versions of Python. Note that the spectralDNS channel contains bleeding-edge versions of the software, whereas conda-forge is more stable.
41 |
42 | Authors
43 | -------
44 | mpiFFT4py is developed by
45 |
46 | * Mikael Mortensen
47 |
48 | Licence
49 | -------
50 | mpiFFT4py is licensed under the GNU Lesser GPL (LGPL), version 3 or (at your option) any later version. mpiFFT4py is Copyright (2014-2016) by the authors.
51 |
52 | Contact
53 | -------
54 | The latest version of this software can be obtained from
55 |
56 | https://github.com/spectralDNS/mpiFFT4py
57 |
58 | Please report bugs and other issues through the issue tracker at:
59 |
60 | https://github.com/spectralDNS/mpiFFT4py/issues
61 |
--------------------------------------------------------------------------------
/conf/conda/conda_build_config.yaml:
--------------------------------------------------------------------------------
1 | numpy:
2 | - 1.15
3 |
--------------------------------------------------------------------------------
/conf/conda/meta.yaml:
--------------------------------------------------------------------------------
1 | package:
2 | name: mpifft4py
3 | version: "{{ GIT_DESCRIBE_TAG }}"
4 |
5 | source:
6 | git_url: ../../
7 |
8 | build:
9 | number: 0
10 | script: "pip install ."
11 |
12 | requirements:
13 | build:
14 | - python
15 | - pip
16 | - cython
17 | - numpy
18 |
19 | run:
20 | - python
21 | - numpy
22 | - scipy
23 | - mpi4py
24 | - fftw
25 | - pyfftw
26 |
27 | test:
28 | source_files:
29 | - tests
30 |
31 | requires:
32 | - pytest
33 |
--------------------------------------------------------------------------------
/conf/conda/run_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | pushd tests
4 |
5 | export OMPI_MCA_plm=isolated
6 | export OMPI_MCA_btl_vader_single_copy_mechanism=none
7 | export OMPI_MCA_rmaps_base_oversubscribe=yes
8 |
9 | if [ "$(uname)" == "Darwin" ]; then
10 | mpirun -np 2 py.test -v
11 | fi
12 |
13 | if [ "$(uname)" == "Linux" ]; then
14 | mpirun -np 2 py.test -v
15 | fi
16 | # if [ "${CONDA_PY:0:1}" == "3" ]; then
17 | # mpirun -np 4 py.test
18 | # fi
19 | #
20 | # if [ "${CONDA_PY:0:1}" == "2" ]; then
21 | # mpirun -np 1 py.test
22 | # fi
23 | #
24 |
--------------------------------------------------------------------------------
/demo/spectral_dns_solver.py:
--------------------------------------------------------------------------------
1 | """
2 | Demo program that solves the Navier-Stokes equations in a triply
3 | periodic domain. The solution is initialized using the Taylor-Green
4 | vortex and evolved in time with a 4th-order Runge-Kutta method.
5 |
6 | Basically, we create an instance of the R2C class for performing 3D FFTs
7 | in parallel on a cube of size N points and physical size L. The mesh
8 | decomposition is performed by the FFT class using a slab decomposition.
9 | With slab decomposition the first index in real physical space is shared
10 | amongst the processors, whereas in wavenumber space the second index is shared.
11 | """
12 | __author__ = "Mikael Mortensen "
13 | __date__ = "2016-04-07"
14 | __copyright__ = "Copyright (C) 2016 " + __author__
15 | __license__ = "GNU Lesser GPL version 3 or any later version"
16 |
17 | from numpy import array, pi, empty, where, sin, cos, sum
18 | from mpi4py import MPI
19 | from mpiFFT4py import work_arrays
20 | from mpiFFT4py.slab import R2C
21 | from collections import defaultdict
22 |
23 | # Set viscosity, end time and time step
24 | nu = 0.000625
25 | T = 0.1
26 | dt = 0.01
27 |
28 | # Set global size of the computational box
29 | N = array([2**5, 2**5, 2**5], dtype=int)
30 | L = array([2*pi, 2*pi, 2*pi], dtype=float)
31 |
32 | FFT = R2C(N, L, MPI.COMM_WORLD, "double", planner_effort=
33 | defaultdict(lambda: 'FFTW_ESTIMATE', {'irfft2': 'FFTW_PATIENT'}))
34 |
35 | U = empty((3,) + FFT.real_shape()) # real_shape = (N[0]/comm.Get_size(), N[1], N[2])
36 | U_hat = empty((3,) + FFT.complex_shape(), dtype=complex) # complex_shape = (N[0], N[1]//comm.Get_size(), N[2]/2+1)
37 | P = empty(FFT.real_shape())
38 | P_hat = empty(FFT.complex_shape(), dtype=complex)
39 | U_hat0 = empty((3,) + FFT.complex_shape(), dtype=complex)
40 | U_hat1 = empty((3,) + FFT.complex_shape(), dtype=complex)
41 | dU = empty((3,) + FFT.complex_shape(), dtype=complex)
42 | work = work_arrays()
43 | X = FFT.get_local_mesh()
44 | K = FFT.get_local_wavenumbermesh(scaled=True)
45 | K2 = K[0]*K[0] + K[1]*K[1] + K[2]*K[2]
46 | K_over_K2 = empty((3,) + FFT.complex_shape())
47 | for k in range(3):
48 | K_over_K2[k] = K[k].astype(float) / where(K2 == 0, 1, K2).astype(float)
49 | a = [1./6., 1./3., 1./3., 1./6.]
50 | b = [0.5, 0.5, 1.]
51 | dealias = '3/2-rule' # ('2/3-rule', None)
52 |
53 | def cross(x, y, z):
54 | """Cross product z = x X y"""
55 | z[0] = FFT.fftn(x[1]*y[2]-x[2]*y[1], z[0], dealias)
56 | z[1] = FFT.fftn(x[2]*y[0]-x[0]*y[2], z[1], dealias)
57 | z[2] = FFT.fftn(x[0]*y[1]-x[1]*y[0], z[2], dealias)
58 | return z
59 |
60 | def curl(x, z):
61 | z[2] = FFT.ifftn(1j*(K[0]*x[1]-K[1]*x[0]), z[2], dealias)
62 | z[1] = FFT.ifftn(1j*(K[2]*x[0]-K[0]*x[2]), z[1], dealias)
63 | z[0] = FFT.ifftn(1j*(K[1]*x[2]-K[2]*x[1]), z[0], dealias)
64 | return z
65 |
66 | def compute_rhs(rhs):
67 | U_dealiased = work[((3,) + FFT.work_shape(dealias), float, 0)]
68 | curl_dealiased = work[((3,) + FFT.work_shape(dealias), float, 1)]
69 | for i in range(3):
70 | U_dealiased[i] = FFT.ifftn(U_hat[i], U_dealiased[i], dealias)
71 |
72 | curl_dealiased = curl(U_hat, curl_dealiased)
73 | rhs = cross(U_dealiased, curl_dealiased, rhs)
74 | P_hat[:] = sum(rhs*K_over_K2, 0, out=P_hat)
75 | rhs -= P_hat*K
76 | rhs -= nu*K2*U_hat
77 | return rhs
78 |
79 | # Initialize a Taylor Green vortex
80 | U[0] = sin(X[0])*cos(X[1])*cos(X[2])
81 | U[1] = -cos(X[0])*sin(X[1])*cos(X[2])
82 | U[2] = 0
83 | for i in range(3):
84 | U_hat[i] = FFT.fftn(U[i], U_hat[i])
85 |
86 | # Integrate using a 4th-order Runge-Kutta method
87 | t = 0.0
88 | tstep = 0
89 | while t < T-1e-8:
90 | t += dt
91 | tstep += 1
92 | U_hat1[:] = U_hat0[:] = U_hat
93 | for rk in range(4):
94 | dU = compute_rhs(dU)
95 | if rk < 3:
96 | U_hat[:] = U_hat0 + b[rk]*dt*dU
97 | U_hat1[:] += a[rk]*dt*dU
98 | U_hat[:] = U_hat1[:]
99 |
100 | for i in range(3):
101 | U[i] = FFT.ifftn(U_hat[i], U[i])
102 |
103 | k = FFT.comm.reduce(sum(U*U)/N[0]/N[1]/N[2]/2)
104 | if FFT.rank == 0:
105 | assert round(k - 0.124953117517, 7) == 0
106 |
--------------------------------------------------------------------------------
/demo/transforms_realdata.py:
--------------------------------------------------------------------------------
1 | __author__ = "Mikael Mortensen "
2 | __date__ = "2016-03-09"
3 | __copyright__ = "Copyright (C) 2016 " + __author__
4 | __license__ = "GNU Lesser GPL version 3 or any later version"
5 |
6 | from numpy import *
7 | from mpi4py import MPI
8 | #from mpiFFT4py.pencil import R2C
9 | from mpiFFT4py.slab import R2C
10 | from mpi4py_fft.mpifft import PFFT
11 | from time import time
12 |
13 | #assert MPI.COMM_WORLD.Get_size() >= 4
14 |
15 | # Set global size of the computational box
16 | M = 6
17 | N = array([2**M, 2**M, 2**M], dtype=int)
18 | L = array([2*pi, 2*pi, 2*pi], dtype=float)
19 |
20 | # Create an instance of the R2C class for performing 3D FFTs in parallel
21 | # on a cube of size N points and physical size L. The mesh decomposition is
22 | # performed by the FFT class using a slab decomposition. With slab decomposition
23 | # the first index in real physical space is shared amongst the processors,
24 | # whereas in wavenumber space the second index is shared.
25 |
26 | #FFT = R2C(N, L, MPI.COMM_WORLD, "double", None, alignment='X', communication='Alltoallw')
27 | FFT = R2C(N, L, MPI.COMM_WORLD, "double", communication='Alltoallw')
28 | fft = PFFT(MPI.COMM_WORLD, N, collapse=False, slab=2)
29 |
30 | U = random.random(FFT.real_shape()).astype(FFT.float) # real_shape = (N[0]//comm.Get_size(), N[1], N[2])
31 | U_copy = zeros_like(U)
32 | U_hat = zeros(FFT.complex_shape(), dtype=FFT.complex) # complex_shape = (N[0], N[1]//comm.Get_size(), N[2]//2+1)
33 |
34 | # Perform forward FFT. Real transform in third direction, complex in first two
35 | U_hat = FFT.fftn(U, U_hat)
36 |
37 | # Perform inverse FFT.
38 | U_copy = FFT.ifftn(U_hat, U_copy)
39 | MPI.COMM_WORLD.barrier()
40 | t0 = time()
41 | U_hat = FFT.fftn(U, U_hat)
42 | U_copy = FFT.ifftn(U_hat, U_copy)
43 | print("mpiFFT4py ", time()-t0)
44 | ###########
45 | u = random.random(fft.forward.input_array.shape).astype(fft.forward.input_array.dtype)
46 | MPI.COMM_WORLD.barrier()
47 | t0 = time()
48 | u_hat = fft.forward(u)
49 | u_copy = fft.backward(u_hat)
50 | print("mpi4py-fft ", time()-t0)
51 | #########
52 |
53 | tol = 1e-6 if FFT.float == float32 else 1e-10
54 |
55 | assert allclose(U, U_copy, tol, tol)
56 | assert allclose(u, u_copy, tol, tol)
57 |
--------------------------------------------------------------------------------
/mpiFFT4py/__init__.py:
--------------------------------------------------------------------------------
1 | from .serialFFT import *
2 | from .slab import R2C as Slab_R2C
3 | from .pencil import R2C as Pencil_R2C
4 | from .line import R2C as Line_R2C
5 | from .mpibase import work_arrays, datatypes, empty, zeros
6 | from numpy.fft import fftfreq, rfftfreq
7 |
8 | __version__ = '1.1.2'
9 |
--------------------------------------------------------------------------------
/mpiFFT4py/cython/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spectralDNS/mpiFFT4py/61ce6474771efff4e3b280b3f69f09611a2c1150/mpiFFT4py/cython/__init__.py
--------------------------------------------------------------------------------
/mpiFFT4py/cython/maths.pyx:
--------------------------------------------------------------------------------
1 | #cython: boundscheck=False
2 | #cython: wraparound=False
3 | cimport numpy as np
4 |
5 | ctypedef fused complex_t:
6 | np.complex64_t
7 | np.complex128_t
8 |
9 | def dealias_filter(np.ndarray[complex_t, ndim=3] fu,
10 | np.ndarray[np.uint8_t, ndim=3] dealias):
11 | cdef unsigned int i, j, k
12 | cdef np.uint8_t uu
13 | for i in xrange(dealias.shape[0]):
14 | for j in xrange(dealias.shape[1]):
15 | for k in xrange(dealias.shape[2]):
16 | uu = dealias[i, j, k]
17 | fu[i, j, k].real *= uu
18 | fu[i, j, k].imag *= uu
19 | return fu
20 |
21 | def transpose_Uc(np.ndarray[complex_t, ndim=3] Uc_hatT,
22 | np.ndarray[complex_t, ndim=4] U_mpi,
23 | int num_processes, int Np0, int Np1, int Nf):
24 | cdef unsigned int i, j, k, l, kk
25 | for i in xrange(num_processes):
26 | for j in xrange(Np0):
27 | for k in xrange(i*Np1, (i+1)*Np1):
28 | kk = k-i*Np1
29 | for l in xrange(Nf):
30 | Uc_hatT[j, k, l] = U_mpi[i, j, kk, l]
31 | return Uc_hatT
32 |
33 | def transpose_Umpi(np.ndarray[complex_t, ndim=4] U_mpi,
34 | np.ndarray[complex_t, ndim=3] Uc_hatT,
35 | int num_processes, int Np, int Nf):
36 | cdef unsigned int i,j,k,l,kk
37 | for i in xrange(num_processes):
38 | for j in xrange(Np):
39 | for kk in xrange(Np):
40 | k = kk+i*Np
41 | for l in xrange(Nf):
42 | U_mpi[i,j,kk,l] = Uc_hatT[j,k,l]
43 | return U_mpi
44 |
45 | #for i in xrange(num_processes):
46 | #for j in xrange(Np):
47 | #for k in xrange(i*Np, (i+1)*Np):
48 | #kk = k-i*Np
49 | #for l in xrange(Nf):
50 | #U_mpi[i,j,kk,l] = Uc_hatT[j,k,l]
51 | #return U_mpi
52 |
53 | #def copy_to_padded(np.ndarray[complex_t, ndim=3] fu,
54 | #np.ndarray[complex_t, ndim=3] fp,
55 | #np.ndarray[int, ndim=1] N, int axis=0):
56 | #if axis == 0:
57 | #fp[:N[0]/2] = fu[:N[0]/2]
58 | #fp[-N[0]/2:] = fu[N[0]/2:]
59 | #elif axis == 1:
60 | #fp[:, :N[1]/2] = fu[:, :N[1]/2]
61 | #fp[:, -N[1]/2:] = fu[:, N[1]/2:]
62 | #elif axis == 2:
63 | #fp[:, :, :(N[2]/2+1)] = fu[:]
64 | #return fp
65 |
66 | #def copy_to_padded_c(np.ndarray[complex_t, ndim=3] fu,
67 | #np.ndarray[complex_t, ndim=3] fp,
68 | #np.ndarray[int, ndim=1] N, int axis=0):
69 | #if axis == 0:
70 | #fp[:N[0]] = fu[:N[0]]
71 | #elif axis == 1:
72 | #fp[:, :N[1]/2] = fu[:, :N[1]/2]
73 | #fp[:, -N[1]/2:] = fu[:, N[1]/2:]
74 | #elif axis == 2:
75 | #fp[:, :, :(N[2]/2+1)] = fu[:]
76 | #return fp
77 |
78 |
--------------------------------------------------------------------------------
/mpiFFT4py/line.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | __author__ = "Mikael Mortensen "
3 | __date__ = "2016-02-16"
4 | __copyright__ = "Copyright (C) 2016 " + __author__
5 | __license__ = "GNU Lesser GPL version 3 or any later version"
6 |
7 | from .serialFFT import *
8 | import numpy as np
9 | from .mpibase import work_arrays, datatypes, zeros, empty
10 | from numpy.fft import fftfreq, rfftfreq
11 | from collections import defaultdict
12 | from mpi4py import MPI
13 |
14 | def transpose_x(U_send, Uc_hatT, num_processes):
15 | sx = U_send.shape
16 | sy = Uc_hatT.shape
17 | U_send[:] = np.rollaxis(Uc_hatT[:,:-1].reshape(sy[0], num_processes, sx[2]), 1)
18 | return U_send
19 |
20 | def transpose_y(Uc_hatT, U_recv, num_processes):
21 | sx = Uc_hatT.shape
22 | sy = U_recv.shape
23 | Uc_hatT[:, :-1] = np.rollaxis(U_recv.reshape(num_processes, sx[0], sy[1]), 1).reshape((sx[0], sx[1]-1))
24 | return Uc_hatT
25 |
26 | def swap_Nq(fft_y, fu, fft_x, N):
27 | f = fu[:, 0].copy()
28 | fft_x[0] = f[0].real
29 | fft_x[1:N//2] = 0.5*(f[1:N//2] + np.conj(f[:N//2:-1]))
30 | fft_x[N//2] = f[N//2].real
31 | fu[:N//2+1, 0] = fft_x[:N//2+1]
32 | fu[N//2+1:, 0] = np.conj(fft_x[(N//2-1):0:-1])
33 |
34 | fft_y[0] = f[0].imag
35 | fft_y[1:N//2] = -0.5*1j*(f[1:N//2] - np.conj(f[:N//2:-1]))
36 | fft_y[N//2] = f[N//2].imag
37 |
38 | fft_y[N//2+1:] = np.conj(fft_y[(N//2-1):0:-1])
39 | return fft_y
40 |
41 | class R2C(object):
42 | """Class for performing FFT in 2D using MPI
43 |
44 | Slab decomposition
45 |
46 | Args:
47 | N - NumPy array([Nx, Ny]) Number of nodes for the real mesh
48 | L - NumPy array([Lx, Ly]) The actual size of the real mesh
49 | comm - The MPI communicator object
50 | precision - "single" or "double"
51 | padsize - For performing transforms with padding
52 |
53 | """
54 |
55 | def __init__(self, N, L, comm, precision, padsize=1.5, threads=1,
56 | planner_effort=defaultdict(lambda : "FFTW_MEASURE")):
57 | self.N = N # The global size of the problem
58 | self.L = L
59 | assert len(L) == 2
60 | assert len(N) == 2
61 | self.comm = comm
62 | self.float, self.complex, self.mpitype = float, complex, mpitype = datatypes(precision)
63 | self.num_processes = comm.Get_size()
64 | self.rank = comm.Get_rank()
65 | self.padsize = padsize
66 | self.threads = threads
67 | self.planner_effort = planner_effort
68 | # Each cpu gets ownership of Np indices
69 | self.Np = N // self.num_processes
70 | self.Nf = N[1]//2+1
71 | self.Npf = self.Np[1]//2+1 if self.rank+1 == self.num_processes else self.Np[1]//2
72 | self.Nfp = int(padsize*self.N[1]/2+1)
73 | self.ks = (fftfreq(N[0])*N[0]).astype(int)
74 | self.dealias = zeros(0)
75 | self.work_arrays = work_arrays()
76 |
77 | def real_shape(self):
78 | """The local shape of the real data"""
79 | return (self.Np[0], self.N[1])
80 |
81 | def complex_shape(self):
82 | """The local shape of the complex data"""
83 | return (self.N[0], self.Npf)
84 |
85 | def global_complex_shape(self):
86 | """The local shape of the complex data"""
87 | return (self.N[0], self.Nf)
88 |
89 | def global_real_shape(self):
90 | """The local shape of the complex data"""
91 | return (self.N[0], self.N[1])
92 |
93 | def real_local_slice(self, padsize=1):
94 | return (slice(int(padsize*self.rank*self.Np[0]),
95 | int(padsize*(self.rank+1)*self.Np[0]), 1),
96 | slice(0, int(padsize*self.N[1])))
97 |
98 | def complex_local_slice(self):
99 | return (slice(0, self.N[0]),
100 | slice(self.rank*self.Np[1]//2, self.rank*self.Np[1]//2+self.Npf, 1))
101 |
102 | def get_N(self):
103 | return self.N
104 |
105 | def get_local_mesh(self):
106 | # Create the mesh
107 | X = np.mgrid[self.rank*self.Np[0]:(self.rank+1)*self.Np[0], :self.N[1]].astype(self.float)
108 | X[0] *= self.L[0]/self.N[0]
109 | X[1] *= self.L[1]/self.N[1]
110 | return X
111 |
112 | def get_local_wavenumbermesh(self, scaled=True, broadcast=False,
113 | eliminate_highest_freq=False):
114 | kx = fftfreq(self.N[0], 1./self.N[0])
115 | ky = rfftfreq(self.N[1], 1./self.N[1])
116 | if eliminate_highest_freq:
117 | for i, k in enumerate((kx, ky)):
118 | if self.N[i] % 2 == 0:
119 | k[self.N[i]//2] = 0
120 |
121 | Ks = np.meshgrid(kx, ky[self.rank*self.Np[1]//2:(self.rank*self.Np[1]//2+self.Npf)], indexing='ij', sparse=True)
122 | if scaled is True:
123 | Lp = 2*np.pi/self.L
124 | Ks[0] *= Lp[0]
125 | Ks[1] *= Lp[1]
126 | K = Ks
127 | if broadcast is True:
128 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks]
129 | return K
130 |
131 | def get_dealias_filter(self):
132 | """Filter for dealiasing nonlinear convection"""
133 | K = self.get_local_wavenumbermesh()
134 | kmax = 2./3.*(self.N//2+1)
135 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1]), dtype=np.uint8)
136 | return dealias
137 |
138 | def global_complex_shape_padded(self):
139 | """Global size of problem in complex wavenumber space"""
140 | return (int(self.padsize*self.N[0]), int(self.padsize*self.N[1]/2+1))
141 |
142 | def real_shape_padded(self):
143 | """The local shape of the real data"""
144 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]))
145 |
146 | def complex_padded_xy(self):
147 | """The local shape of the real data"""
148 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]/2+1))
149 |
150 | def complex_shape_padded_01(self):
151 | """The local shape of the real data"""
152 | return (int(self.padsize*self.Np[0]), self.Nf)
153 |
154 | def complex_padded_x(self):
155 | """Padding in x-direction"""
156 | return (int(self.padsize*self.N[0]), self.Npf)
157 |
158 | def work_shape(self, dealias):
159 | """Shape of work arrays used in convection with dealiasing. Different shape whether or not padding is involved"""
160 | if dealias == '3/2-rule':
161 | return self.real_shape_padded()
162 |
163 | else:
164 | return self.real_shape()
165 |
166 | def copy_to_padded_x(self, fu, fp):
167 | fp[:self.N[0]//2] = fu[:self.N[0]//2]
168 | fp[-(self.N[0]//2):] = fu[self.N[0]//2:]
169 | return fp
170 |
171 | def copy_to_padded_y(self, fu, fp):
172 | fp[:, :self.Nf] = fu[:]
173 | return fp
174 |
175 | def copy_from_padded_y(self, fp, fu):
176 | fu[:] = fp[:, :self.Nf]
177 | return fu
178 |
179 | def fft2(self, u, fu, dealias=None):
180 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
181 |
182 | if self.num_processes == 1:
183 | if not dealias == '3/2-rule':
184 | fu = rfft2(u, fu, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['rfft2'])
185 |
186 | else:
187 | fu_padded = self.work_arrays[(self.global_complex_shape_padded(), self.complex, 0)]
188 | fu_padded = rfft2(u/self.padsize**2, fu_padded, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['rfft2'])
189 | fu[:] = fu_padded[self.ks, :self.Nf]
190 |
191 | return fu
192 |
193 | if not dealias == '3/2-rule':
194 |
195 | # Work arrays
196 | Uc_hatT = self.work_arrays[((self.Np[0], self.Nf), self.complex, 0)]
197 | U_send = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1]//2), self.complex, 0)]
198 | U_sendr = U_send.reshape((self.N[0], self.Np[1]//2))
199 | Uc = self.work_arrays[((self.N[0], self.Np[1]//2), self.complex, 0)]
200 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)]
201 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)]
202 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)]
203 |
204 | # Transform in y-direction
205 | Uc_hatT = rfft(u, Uc_hatT, axis=1, threads=self.threads, planner_effort=self.planner_effort['rfft'])
206 | Uc_hatT[:, 0] += 1j*Uc_hatT[:, -1]
207 |
208 | U_send = transpose_x(U_send, Uc_hatT, self.num_processes)
209 |
210 | # Communicate all values
211 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype])
212 |
213 | Uc = fft(U_sendr, Uc, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft'])
214 | fu[:, :self.Np[1]//2] = Uc
215 |
216 | # Handle Nyquist frequency
217 | if self.rank == 0:
218 | fft_y = swap_Nq(fft_y, fu, fft_x, self.N[0])
219 | self.comm.Send([fft_y, self.mpitype], dest=self.num_processes-1, tag=77)
220 |
221 | elif self.rank == self.num_processes-1:
222 | self.comm.Recv([fft_y, self.mpitype], source=0, tag=77)
223 | fu[:, -1] = fft_y
224 |
225 | else:
226 | # Work arrays
227 | U_send = self.work_arrays[((self.num_processes, int(self.padsize*self.Np[0]), self.Np[1]//2), self.complex, 0)]
228 | U_sendr = U_send.reshape((int(self.padsize*self.N[0]), self.Np[1]//2))
229 | fu_padded_xy = self.work_arrays[(self.complex_padded_xy(), self.complex, 0)]
230 | fu_padded_xy2 = self.work_arrays[(self.complex_shape_padded_01(), self.complex, 0)]
231 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)]
232 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)]
233 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)]
234 |
235 | # Transform in y-direction
236 | fu_padded_xy = rfft(u/self.padsize, fu_padded_xy, axis=1, threads=self.threads, planner_effort=self.planner_effort['rfft'])
237 | fu_padded_xy2 = self.copy_from_padded_y(fu_padded_xy, fu_padded_xy2)
238 | fu_padded_xy2[:, 0] += 1j*fu_padded_xy2[:, -1]
239 |
240 | U_send = transpose_x(U_send, fu_padded_xy2, self.num_processes)
241 |
242 | # Communicate all values
243 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype])
244 |
245 | U_sendr = fft(U_sendr/self.padsize, U_sendr, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft'])
246 |
247 | fu.fill(0)
248 | fu[:self.N[0]//2+1, :self.Np[1]//2] = U_sendr[:self.N[0]//2+1]
249 | fu[self.N[0]//2:, :self.Np[1]//2] += U_sendr[-self.N[0]//2:]
250 |
251 | # Handle Nyquist frequency
252 | if self.rank == 0:
253 | fft_y = swap_Nq(fft_y, fu, fft_x, self.N[0])
254 | self.comm.Send([fft_y, self.mpitype], dest=self.num_processes-1, tag=77)
255 |
256 | elif self.rank == self.num_processes-1:
257 | self.comm.Recv([fft_y, self.mpitype], source=0, tag=77)
258 | fu[:, -1] = fft_y
259 |
260 | return fu
261 |
262 | def ifft2(self, fu, u, dealias=None):
263 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
264 |
265 | if dealias == '2/3-rule' and self.dealias.shape == (0,):
266 | self.dealias = self.get_dealias_filter()
267 |
268 | fu_ = fu
269 | if dealias == '2/3-rule':
270 | fu_ = self.work_arrays[(fu, 0, False)]
271 | fu_[:] = fu
272 | fu_ *= self.dealias
273 |
274 | if self.num_processes == 1:
275 | if not dealias == '3/2-rule':
276 | u = irfft2(fu_, u, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['irfft2'])
277 |
278 | else:
279 | fu_padded = self.work_arrays[(self.global_complex_shape_padded(), self.complex, 0)]
280 | fu_padded[self.ks, :self.Nf] = fu[:]
281 | u = irfft2(fu_padded*self.padsize**2, u, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['irfft2'])
282 |
283 | return u
284 |
285 | if not dealias == '3/2-rule':
286 | # Get some work arrays
287 | Uc_hat = self.work_arrays[((self.N[0], self.Npf), self.complex, 0)]
288 | Uc_hatT = self.work_arrays[((self.Np[0], self.Nf), self.complex, 0)]
289 | U_send = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1]//2), self.complex, 0)]
290 | U_sendr = U_send.reshape((self.N[0], self.Np[1]//2))
291 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)]
292 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)]
293 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)]
294 |
295 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft'])
296 | U_sendr[:] = Uc_hat[:, :self.Np[1]//2]
297 |
298 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype])
299 |
300 | Uc_hatT = transpose_y(Uc_hatT, U_sendr, self.num_processes)
301 |
302 | if self.rank == self.num_processes-1:
303 | fft_y[:] = Uc_hat[:, -1]
304 |
305 | self.comm.Scatter(fft_y, plane_recv, root=self.num_processes-1)
306 | Uc_hatT[:, -1] = plane_recv
307 |
308 | u = irfft(Uc_hatT, u, axis=1, threads=self.threads, planner_effort=self.planner_effort['irfft'])
309 |
310 | else:
311 | U_send = self.work_arrays[((self.num_processes, int(self.padsize*self.Np[0]), self.Np[1]//2), self.complex, 0)]
312 | U_sendr = U_send.reshape((int(self.padsize*self.N[0]), self.Np[1]//2))
313 | Uc_hatT = self.work_arrays[((int(self.padsize*self.Np[0]), self.Nf), self.complex, 0)]
314 | fu_padded_x = self.work_arrays[(self.complex_padded_x(), self.complex, 0)]
315 | fu_padded_x2= self.work_arrays[(self.complex_padded_x(), self.complex, 1)]
316 | fu_padded_xy = self.work_arrays[(self.complex_padded_xy(), self.complex, 0)]
317 | fft_y = self.work_arrays[((int(self.padsize*self.N[0]),), self.complex, 0)]
318 | fft_x = self.work_arrays[((int(self.padsize*self.N[0]),), self.complex, 1)]
319 | plane_recv = self.work_arrays[((int(self.padsize*self.Np[0]),), self.complex, 2)]
320 |
321 | fu_padded_x2 = self.copy_to_padded_x(fu, fu_padded_x2)
322 | fu_padded_x = ifft(fu_padded_x2, fu_padded_x, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft'])
323 |
324 | U_sendr[:] = fu_padded_x[:, :self.Np[1]//2]
325 |
326 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype])
327 |
328 | Uc_hatT = transpose_y(Uc_hatT, U_sendr, self.num_processes)
329 |
330 | if self.rank == self.num_processes-1:
331 | fft_y[:] = fu_padded_x[:, -1]
332 |
333 | self.comm.Scatter(fft_y, plane_recv, root=self.num_processes-1)
334 | Uc_hatT[:, -1] = plane_recv
335 |
336 | fu_padded_xy = self.copy_to_padded_y(Uc_hatT, fu_padded_xy)
337 |
338 | u = irfft(fu_padded_xy*self.padsize**2, u, axis=1, threads=self.threads, planner_effort=self.planner_effort['irfft'])
339 |
340 | return u
341 |
--------------------------------------------------------------------------------
/mpiFFT4py/mpibase.py:
--------------------------------------------------------------------------------
1 | __author__ = "Mikael Mortensen "
2 | __date__ = "2016-04-14"
3 | __copyright__ = "Copyright (C) 2016 " + __author__
4 | __license__ = "GNU Lesser GPL version 3 or any later version"
5 |
6 | import numpy as np
7 | from mpi4py import MPI
8 | import collections
9 |
10 | # Possible way to give numpy arrays attributes...
11 | #class Empty(np.ndarray):
12 | #"""Numpy empty array with additional info dictionary to hold attributes
13 | #"""
14 | #def __new__(subtype, shape, dtype=np.float, info={}):
15 | #obj = np.ndarray.__new__(subtype, shape, dtype)
16 | #obj.info = info
17 | #return obj
18 |
19 | #def __array_finalize__(self, obj):
20 | #if obj is None: return
21 | #self.info = getattr(obj, 'info', {})
22 |
23 | #class Zeros(np.ndarray):
24 | #"""Numpy zeros array with additional info dictionary to hold attributes
25 | #"""
26 | #def __new__(subtype, shape, dtype=float, info={}):
27 | #obj = np.ndarray.__new__(subtype, shape, dtype)
28 | #obj.fill(0)
29 | #obj.info = info
30 | #return obj
31 |
32 | #def __array_finalize__(self, obj):
33 | #if obj is None: return
34 | #self.info = getattr(obj, 'info', {})
35 |
36 | Empty, Zeros = np.empty, np.zeros
37 |
38 | try:
39 | import pyfftw
40 | def empty(N, dtype=np.float, bytes=16):
41 | return pyfftw.empty_aligned(N, dtype=dtype, n=bytes)
42 |
43 | def zeros(N, dtype=np.float, bytes=16):
44 | return pyfftw.zeros_aligned(N, dtype=dtype, n=bytes)
45 |
46 | except ImportError:
47 | def empty(N, dtype=np.float, bytes=None):
48 | return Empty(N, dtype=dtype)
49 |
50 | def zeros(N, dtype=np.float, bytes=None):
51 | return Zeros(N, dtype=dtype)
52 |
53 | class work_array_dict(dict):
54 | """Dictionary of work arrays indexed by their shape, type and an indicator i."""
55 | def __missing__(self, key):
56 | shape, dtype, i = key
57 | a = zeros(shape, dtype=dtype)
58 | self[key] = a
59 | return self[key]
60 |
61 | class work_arrays(collections.MutableMapping):
62 | """A dictionary to hold numpy work arrays.
63 |
64 | The dictionary allows two types of keys for the same item.
65 |
66 | keys:
67 | - (shape, dtype, index (, fillzero)), where shape is tuple, dtype is np.dtype and
68 | index an integer
69 | - (ndarray, index (, fillzero)), where ndarray is a numpy array and index is
70 | an integer
71 | fillzero is an optional bool that determines
72 | whether the array is initialised to zero
73 |
74 | Usage:
75 | To create two real work arrays of shape (3,3), do:
76 | - work = work_arrays()
77 | - a = work[((3,3), np.float, 0)]
78 | - b = work[(a, 1)]
79 |
80 | Returns:
81 | Numpy array of given shape. The array is by default initialised to zero, but this
82 | can be overridden using the fillzero argument.
83 |
84 | """
85 |
86 | def __init__(self):
87 | self.store = work_array_dict()
88 | self.fillzero = True
89 |
90 | def __getitem__(self, key):
91 | val = self.store[self.__keytransform__(key)]
92 | if self.fillzero is True: val.fill(0)
93 | return val
94 |
95 | def __setitem__(self, key, value):
96 | self.store[self.__keytransform__(key)] = value
97 |
98 | def __delitem__(self, key):
99 | del self.store[self.__keytransform__(key)]
100 |
101 | def __iter__(self):
102 | return iter(self.store)
103 |
104 | def __len__(self):
105 | return len(self.store)
106 |
107 | def values(self):
108 | raise TypeError('Work arrays not iterable')
109 |
110 | def __keytransform__(self, key):
111 | if isinstance(key[0], np.ndarray):
112 | shape = key[0].shape
113 | dtype = key[0].dtype
114 | i = key[1]
115 | zero = True if len(key) == 2 else key[2]
116 |
117 | elif isinstance(key[0], tuple):
118 | if len(key) == 3:
119 | shape, dtype, i = key
120 | zero = True
121 |
122 | elif len(key) == 4:
123 | shape, dtype, i, zero = key
124 |
125 | else:
126 | raise TypeError("Wrong type of key for work array")
127 |
128 | assert isinstance(zero, bool)
129 | assert isinstance(i, int)
130 | self.fillzero = zero
131 | return (shape, np.dtype(dtype), i)
132 |
133 | def datatypes(precision):
134 | """Return datatypes associated with precision."""
135 | assert precision in ("single", "double")
136 | return {"single": (np.float32, np.complex64, MPI.C_FLOAT_COMPLEX),
137 | "double": (np.float64, np.complex128, MPI.C_DOUBLE_COMPLEX)}[precision]
138 |
--------------------------------------------------------------------------------
/mpiFFT4py/pencil.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | """Pencil decomposition
3 |
4 | This module contains classes for performing FFTs with pencil decomposition
5 | of three-dimensional data structures data[Nx,Ny,Nz], where (Nx, Ny, Nz) is
6 | the shape of the input data. With slab decomposition only one of these three
7 | indices is shared, leading to local datastructures on each processor
8 | with shape data[Nx/P, Ny, Nz], where P is the total number of processors.
9 | With pencil decomposition, two of the input array's indices are shared, leading
10 | to local data of shape (Nx/P1, Ny/P2, Nz), i.e., pencils aligned in the z-direction.
11 |
12 | The final transformed data can be aligned in either the y-direction or
13 | the x-direction.
14 |
15 | classes:
16 | R2CX - For real to complex transforms. Final alignment in x-direction
17 | Args:
18 | N - NumPy array([Nx, Ny, Nz]) setting the dimensions of the real mesh
19 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain
20 | comm - The MPI communicator object
21 | precision - "single" or "double"
22 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw')
23 | padsize - The size of padding, if padding is used in transforms
24 | threads - Number of threads used by FFTs
25 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE",
26 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE")
27 |
28 | R2CY - For real to complex transforms. Final alignment in y-direction
29 | Args:
30 | N - NumPy array([Nx, Ny, Nz]) number of nodes for the real mesh
31 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain
32 | comm - The MPI communicator object
33 | precision - "single" or "double"
34 | P1 - Decomposition along first dimension
35 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw')
36 | padsize - The size of padding, if padding is used in transforms
37 | threads - Number of threads used by FFTs
38 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE",
39 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE")
40 |
41 | function:
42 | R2C
43 |
44 | Args:
45 | N - NumPy array([Nx, Ny, Nz]) number of nodes for the real mesh
46 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain
47 | comm - The MPI communicator object
48 | precision - "single" or "double"
49 | P1 - Decomposition along first dimension
50 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw')
51 | padsize - The size of padding, if padding is used in transforms
52 | threads - Number of threads used by FFTs
53 | alignment - Final alignment, ('X' or 'Y')
54 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE",
55 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE")
56 |
57 | """
58 | __author__ = "Mikael Mortensen "
59 | __date__ = "2016-02-16"
60 | __copyright__ = "Copyright (C) 2016 " + __author__
61 | __license__ = "GNU Lesser GPL version 3 or any later version"
62 |
63 | from .serialFFT import *
64 | import numpy as np
65 | from .mpibase import work_arrays, datatypes
66 | from .cython.maths import dealias_filter
67 | from numpy.fft import fftfreq, rfftfreq
68 | from collections import defaultdict
69 | from mpi4py import MPI
70 |
71 | #__all__ = ['R2C']
72 |
73 | # Using Lisandro Dalcin's code for Alltoallw.
74 | # Note that _subsize and _distribution are modified for a mesh of power two.
75 |
76 | def _subsize(N, size, rank):
77 | return N // size + ((N % size) * (rank == size -1))
78 | #return N // size + (N % size > rank) # Generic
79 |
80 | def _distribution(N, size):
81 | q = N // size
82 | r = N % size
83 | n = s = i = 0
84 | while i < size:
85 | n = q
86 | s = q * i
87 | if r == 1 and i+1 == size:
88 | n += 1
89 | yield n, s
90 | i += 1
91 |
92 | # Generic
93 | #def _distribution2(N, size):
94 | #q = N // size
95 | #r = N % size
96 | #n = s = i = 0
97 | #while i < size:
98 | #n = q
99 | #s = q * i
100 | #if i < r:
101 | #n += 1
102 | #s += i
103 | #else:
104 | #s += r
105 | #yield n, s
106 | #i += 1
107 |
108 |
109 | def transform_Uc_xz(Uc_hat_x, Uc_hat_z, P1):
110 | sz = Uc_hat_z.shape
111 | sx = Uc_hat_x.shape
112 | Uc_hat_x[:] = np.rollaxis(Uc_hat_z[:,:,:-1].reshape((sz[0], sz[1], P1, sx[2])), 2).reshape(sx)
113 | return Uc_hat_x
114 |
115 | def transform_Uc_zx(Uc_hat_z, Uc_hat_xr, P1):
116 | sz = Uc_hat_z.shape
117 | sx = Uc_hat_xr.shape
118 | Uc_hat_z[:, :, :-1] = np.rollaxis(Uc_hat_xr.reshape((P1, sz[0], sz[1], sx[2])), 0, 3).reshape((sz[0], sz[1], sz[2]-1))
119 | return Uc_hat_z
120 |
121 | def transform_Uc_xy(Uc_hat_x, Uc_hat_y, P):
122 | sy = Uc_hat_y.shape
123 | sx = Uc_hat_x.shape
124 | Uc_hat_x[:] = np.rollaxis(Uc_hat_y.reshape((sy[0], P, sx[1], sx[2])), 1).reshape(sx)
125 | return Uc_hat_x
126 |
127 | def transform_Uc_yx(Uc_hat_y, Uc_hat_x, P):
128 | sy = Uc_hat_y.shape
129 | sx = Uc_hat_x.shape
130 | Uc_hat_y[:] = np.rollaxis(Uc_hat_x.reshape((P, sx[0]//P, sx[1], sx[2])), 1).reshape(sy)
131 | return Uc_hat_y
132 |
133 | def transform_Uc_yz(Uc_hat_y, Uc_hat_z, P):
134 | sz = Uc_hat_z.shape
135 | sy = Uc_hat_y.shape
136 | Uc_hat_y[:] = np.rollaxis(Uc_hat_z[:,:,:-1].reshape((sz[0], sz[1], P, sy[2])), 1, 3).reshape(sy)
137 | return Uc_hat_y
138 |
139 | def transform_Uc_zy(Uc_hat_z, Uc_hat_y, P):
140 | sz = Uc_hat_z.shape
141 | sy = Uc_hat_y.shape
142 | Uc_hat_z[:, :, :-1] = np.rollaxis(Uc_hat_y.reshape((sy[0], P, sz[1], sy[2])), 1, 3).reshape((sz[0], sz[1], sz[2]-1))
143 | return Uc_hat_z
144 |
145 | class R2CY(object):
146 | """Class for performing FFT in 3D using MPI
147 |
148 | Pencil decomposition
149 |
150 | Args:
151 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh
152 | L - NumPy array([Lx, Ly, Lz]) The actual size of the computational domain
153 | comm - The MPI communicator object
154 | precision - "single" or "double"
155 | P1 - Decomposition along first dimension
156 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw')
157 | padsize - The size of padding, if padding is used in transforms
158 | threads - Number of threads used by FFTs
159 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE")
160 | Give as defaultdict, with keys representing transform (e.g., fft, ifft)
161 |
162 | This version has the final complex data aligned in the y-direction, in agreement
163 | with the paper in CPC (http://arxiv.org/pdf/1602.03638v1.pdf)
164 |
165 | """
166 |
167 | def __init__(self, N, L, comm, precision, P1=None, communication='Alltoallw', padsize=1.5, threads=1,
168 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")):
169 | self.N = N
170 | assert len(L) == 3
171 | assert len(N) == 3
172 | self.Nf = N[2]//2+1 # Number of independent complex wavenumbers in z-direction
173 | self.comm = comm
174 | self.float, self.complex, self.mpitype = float, complex, mpitype = datatypes(precision)
175 | self.num_processes = comm.Get_size()
176 | assert self.num_processes > 1
177 | self.L = L.astype(float)
178 | self.dealias = np.zeros(0)
179 | self.communication = communication
180 | self.padsize = padsize
181 | self.threads = threads
182 | self.planner_effort = planner_effort
183 | self.rank = comm.Get_rank()
184 | if P1 is None:
185 | P1, P2 = MPI.Compute_dims(self.num_processes, 2)
186 | self.P1, self.P2 = P1, P2
187 | else:
188 | self.P1 = P1
189 | self.P2 = P2 = self.num_processes // P1
190 | self.N1 = N // P1
191 | self.N2 = N // P2
192 | self.comm0 = comm.Split(self.rank//P1)  # color for Split must be an integer
193 | self.comm1 = comm.Split(self.rank%P1)
194 | self.comm0_rank = self.comm0.Get_rank()
195 | self.comm1_rank = self.comm1.Get_rank()
196 | self.work_arrays = work_arrays()
197 | self.N1f = self.N1[2]//2 if self.comm0_rank < self.P1-1 else self.N1[2]//2+1
198 | if self.communication == 'AlltoallN':
199 | self.N1f = self.N1[2]//2
200 |
201 | if not (self.num_processes % 2 == 0 or self.num_processes == 1):
202 | raise IOError("Number of cpus must be even")
203 |
204 | if (P1 % 2 != 0) or (P2 % 2 != 0):
205 | raise IOError("Number of cpus in each direction must be even power of 2")
206 |
207 | self._subarrays1A = []
208 | self._subarrays1B = []
209 | self._subarrays2A = []
210 | self._subarrays2B = []
211 | self._subarrays1A_pad = []
212 | self._subarrays1B_pad = []
213 | self._subarrays2A_pad = []
214 | self._subarrays2B_pad = []
215 | self._counts_displs1 = None
216 | self._counts_displs2 = None
217 |
218 | def get_subarrays(self, padsize=1):
219 | datatype = MPI._typedict[np.dtype(self.complex).char]
220 | M, N, Q = self.N[0], self.N[1], self.Nf
221 | m = _subsize(M, self.P2, self.comm1_rank)
222 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank)
223 | q = _subsize(Q, self.P1, self.comm0_rank)
224 | _subarrays1A = [
225 | datatype.Create_subarray([m,int(padsize*N),q], [m,l,q], [0,s,0]).Commit()
226 | for l, s in _distribution(int(padsize*N), self.P2)
227 | ]
228 | _subarrays1B = [
229 | datatype.Create_subarray([M,n,q], [l,n,q], [s,0,0]).Commit()
230 | for l, s in _distribution(M, self.P2)
231 | ]
232 | _counts_displs1 = ([1] * self.P2, [0] * self.P2)
233 |
234 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank)
235 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank)
236 | q = _subsize(Q, self.P1, self.comm0_rank)
237 | _subarrays2A = [
238 | datatype.Create_subarray([int(padsize*M),n,q], [l,n,q], [s,0,0]).Commit()
239 | for l, s in _distribution(int(padsize*M), self.P1)
240 | ]
241 | _subarrays2B = [
242 | datatype.Create_subarray([m,n,Q], [m,n,l], [0,0,s]).Commit()
243 | for l, s in _distribution(Q, self.P1)
244 | ]
245 | _counts_displs2 = ([1] * self.P1, [0] * self.P1)
246 | return _subarrays1A, _subarrays1B, _subarrays2A, _subarrays2B, _counts_displs1, _counts_displs2
247 |
248 | def real_shape(self):
249 | """The local shape of the real data"""
250 | return (self.N1[0], self.N2[1], self.N[2])
251 |
252 | def complex_shape(self):
253 | """The local shape of the complex data"""
254 | return (self.N2[0], self.N[1], self.N1f)
255 |
256 | def complex_shape_T(self):
257 | """The local transposed shape of the complex data"""
258 | return (self.Np[0], self.N[1], self.Nf)
259 |
260 | def complex_shape_I(self):
261 | """A local intermediate shape of the complex data"""
262 | return (self.Np[0], self.num_processes, self.Np[1], self.Nf)
263 |
264 | def real_shape_padded(self):
265 | return (int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), int(self.padsize*self.N[2]))
266 |
267 | def work_shape(self, dealias):
268 |         """Shape of the work arrays used for dealiasing the nonlinear convection term. The shape depends on whether padding (the 3/2-rule) is used."""
269 | if dealias == '3/2-rule':
270 | return self.real_shape_padded()
271 |
272 | else:
273 | return self.real_shape()
274 |
275 | def real_local_slice(self, padsize=1):
276 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane
277 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane
278 | return (slice(int(padsize * xzrank * self.N1[0]), int(padsize * (xzrank+1) * self.N1[0]), 1),
279 | slice(int(padsize * xyrank * self.N2[1]), int(padsize * (xyrank+1) * self.N2[1]), 1),
280 | slice(0, int(padsize*self.N[2])))
281 |
282 | def complex_local_slice(self):
283 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane
284 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane
285 | return (slice(xyrank*self.N2[0], (xyrank+1)*self.N2[0], 1),
286 | slice(0, self.N[1]),
287 | slice(xzrank*self.N1[2]//2, xzrank*self.N1[2]//2 + self.N1f, 1))
288 |
289 | def complex_local_wavenumbers(self):
290 | s = self.complex_local_slice()
291 | return (fftfreq(self.N[0], 1./self.N[0]).astype(int)[s[0]],
292 | fftfreq(self.N[1], 1./self.N[1]).astype(int),
293 | rfftfreq(self.N[2], 1./self.N[2]).astype(int)[s[2]])
294 |
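Both complex_local_wavenumbers above and get_local_wavenumbermesh below rely on fftfreq/rfftfreq returning integer wavenumbers when the sample spacing is 1/N, for example:

    from numpy.fft import fftfreq, rfftfreq

    print(fftfreq(8, 1./8).astype(int))    # [ 0  1  2  3 -4 -3 -2 -1]
    print(rfftfreq(8, 1./8).astype(int))   # [0 1 2 3 4]
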
295 | def get_P(self):
296 | return self.P1, self.P2
297 |
298 | def get_local_mesh(self):
299 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane
300 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane
301 |
302 | # Create the physical mesh
303 | x1 = slice(xzrank * self.N1[0], (xzrank+1) * self.N1[0], 1)
304 | x2 = slice(xyrank * self.N2[1], (xyrank+1) * self.N2[1], 1)
305 | X = np.ogrid[x1, x2, :self.N[2]]
306 |
307 | X[0] = (X[0]*self.L[0]/self.N[0]).astype(self.float)
308 | X[1] = (X[1]*self.L[1]/self.N[1]).astype(self.float)
309 | X[2] = (X[2]*self.L[2]/self.N[2]).astype(self.float)
310 | X = [np.broadcast_to(x, self.real_shape()) for x in X]
311 | return X
312 |
313 | def get_local_wavenumbermesh(self, scaled=False, broadcast=False,
314 | eliminate_highest_freq=False):
315 | """Returns (scaled) local decomposed wavenumbermesh
316 |
317 |         If scaled is True, the wavenumbermesh is scaled with the physical mesh
318 |         size. This takes care of mapping the physical domain to a computational
319 |         cube of size (2*pi)**3.
320 |
321 |
322 | """
323 | s = self.complex_local_slice()
324 | kx = fftfreq(self.N[0], 1./self.N[0]).astype(int)
325 | ky = fftfreq(self.N[1], 1./self.N[1]).astype(int)
326 | kz = rfftfreq(self.N[2], 1./self.N[2]).astype(int)
327 | if eliminate_highest_freq:
328 | for i, k in enumerate((kx, ky, kz)):
329 | if self.N[i] % 2 == 0:
330 | k[self.N[i]//2] = 0
331 | kx = kx[s[0]]
332 | kz = kz[s[2]]
333 | Ks = np.meshgrid(kx, ky, kz, indexing='ij', sparse=True)
334 | if scaled is True:
335 | Lp = 2*np.pi/self.L
336 | for i in range(3):
337 | Ks[i] = (Ks[i]*Lp[i]).astype(self.float)
338 | K = Ks
339 | if broadcast is True:
340 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks]
341 | return K
342 |
343 | def get_dealias_filter(self):
344 | """Filter for dealiasing nonlinear convection"""
345 | K = self.get_local_wavenumbermesh()
346 | kmax = 2./3.*(self.N//2+1)
347 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1])*
348 | (abs(K[2]) < kmax[2]), dtype=np.uint8)
349 | return dealias
350 |
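For example, with N = (64, 64, 64) the cutoff is kmax = 2/3*(64//2 + 1) = 22, so the filter keeps only modes with |k| < 22 in each direction; applying it is a plain elementwise multiply (cf. the commented-out `fu_ *= self.dealias` in ifftn below). A serial, non-decomposed sketch of the same mask:

    import numpy as np
    from numpy.fft import fftfreq, rfftfreq

    N = np.array([64, 64, 64])
    kmax = 2./3.*(N//2 + 1)                              # array([22., 22., 22.])

    kx = fftfreq(N[0], 1./N[0]).astype(int)
    ky = fftfreq(N[1], 1./N[1]).astype(int)
    kz = rfftfreq(N[2], 1./N[2]).astype(int)
    K = np.meshgrid(kx, ky, kz, indexing='ij', sparse=True)
    dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1])*(abs(K[2]) < kmax[2]),
                       dtype=np.uint8)

    fu_hat = np.ones((N[0], N[1], N[2]//2 + 1), dtype=np.complex128)  # stand-in spectral array
    fu_hat *= dealias                                    # zero the aliased top third of the spectrum
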
351 | def copy_to_padded_x(self, fu, fp):
352 | fp[:self.N[0]//2] = fu[:self.N[0]//2]
353 | fp[-(self.N[0]//2):] = fu[self.N[0]//2:]
354 | return fp
355 |
356 | def copy_to_padded_y(self, fu, fp):
357 | fp[:, :self.N[1]//2] = fu[:, :self.N[1]//2]
358 | fp[:, -(self.N[1]//2):] = fu[:, self.N[1]//2:]
359 | return fp
360 |
361 | def copy_to_padded_z(self, fu, fp):
362 | fp[:, :, :self.Nf] = fu[:]
363 | return fp
364 |
365 | def copy_from_padded_z(self, fp, fu):
366 | fu[:] = fp[:, :, :self.Nf]
367 | return fu
368 |
369 | def copy_from_padded_x(self, fp, fu):
370 | fu.fill(0)
371 | fu[:self.N[0]//2+1] = fp[:self.N[0]//2+1]
372 | fu[self.N[0]//2:] += fp[-self.N[0]//2:]
373 | return fu
374 |
375 | def copy_from_padded_y(self, fp, fu):
376 | fu.fill(0)
377 | fu[:, :self.N[1]//2+1] = fp[:, :self.N[1]//2+1]
378 | fu[:, self.N[1]//2:] += fp[:, -self.N[1]//2:]
379 | return fu
380 |
381 | def global_complex_shape(self, padsize=1.0):
382 | """Global size of problem in complex wavenumber space"""
383 | return (int(padsize*self.N[0]), int(padsize*self.N[1]),
384 | int(padsize*self.N[2]//2+1))
385 |
386 | def ifftn(self, fu, u, dealias=None):
387 | """ifft in three directions using mpi.
388 |         The ifft is performed in the reverse order of the fft.
389 | """
390 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
391 |
392 | if dealias == '2/3-rule' and self.dealias.shape == (0,):
393 | self.dealias = self.get_dealias_filter()
394 |
395 | # Strip off self
396 | N, N1, N2, Nf, N1f = self.N, self.N1, self.N2, self.Nf, self.N1f
397 |
398 | if not dealias == '3/2-rule':
399 |
400 | fu_ = fu
401 | if dealias == '2/3-rule':
402 | fu_ = self.work_arrays[(fu, 0, False)]
403 | fu_[:] = fu
404 | fu_ = dealias_filter(fu_, self.dealias)
405 | #fu_ *= self.dealias
406 |
407 | Uc_hat_y = self.work_arrays[((N2[0], N[1], N1f), self.complex, 0, False)]
408 | Uc_hat_z = self.work_arrays[((N1[0], N2[1], Nf), self.complex, 0, False)]
409 |
410 | if self.communication == 'AlltoallN':
411 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0, False)]
412 |
413 | # Do first owned direction
414 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads,
415 | planner_effort=self.planner_effort['ifft'])
416 |
417 | # Transform to x all but k=N//2 (the neglected Nyquist mode)
418 | Uc_hat_x[:] = transform_Uc_xy(Uc_hat_x, Uc_hat_y, self.P2)
419 |
420 |                 # Communicate in xz-plane and do ifft in x-direction
421 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
422 | Uc_hat_x[:] = ifft(Uc_hat_x, axis=0, threads=self.threads,
423 | planner_effort=self.planner_effort['ifft'])
424 |
425 | # Communicate and transform in xy-plane
426 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
427 | Uc_hat_z[:] = transform_Uc_zx(Uc_hat_z, Uc_hat_x, self.P1)
428 |
429 |                 # Do ifft for z-direction
430 | Uc_hat_z[:, :, -1] = 0
431 | u[:] = irfft(Uc_hat_z, overwrite_input=True, axis=2, threads=self.threads,
432 | planner_effort=self.planner_effort['irfft'])
433 |
434 | elif self.communication == 'Alltoall':
435 | # Additional work arrays
436 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0, False)]
437 | Uc_hat_xp = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0, False)]
438 | xy_plane = self.work_arrays[((N[0], N2[1]), self.complex, 0, False)]
439 | xy_recv = self.work_arrays[((N1[0], N2[1]), self.complex, 0, False)]
440 |
441 | # Do first owned direction
442 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads,
443 | planner_effort=self.planner_effort['ifft'])
444 |
445 | # Transform to x
446 | Uc_hat_xp = transform_Uc_xy(Uc_hat_xp, Uc_hat_y, self.P2)
447 |
448 | ###### In-place
449 | ## Communicate in xz-plane and do fft in x-direction
450 | #self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_xp, self.mpitype])
451 | #Uc_hat_xp[:] = ifft(Uc_hat_xp, axis=0, threads=self.threads,
452 | #planner_effort=self.planner_effort['ifft'])
453 |
454 | #Uc_hat_x[:] = Uc_hat_xp[:, :, :self.N1[2]//2]
455 |
456 | ## Communicate and transform in xy-plane all but k=N//2
457 | #self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
458 |
459 | ####### Not in-place
460 |                 # Communicate in xz-plane and do ifft in x-direction
461 | Uc_hat_xp2 = self.work_arrays[((N[0], N2[1], N1f), self.complex, 1, False)]
462 | self.comm1.Alltoall([Uc_hat_xp, self.mpitype], [Uc_hat_xp2, self.mpitype])
463 | Uc_hat_xp = ifft(Uc_hat_xp2, Uc_hat_xp, axis=0, threads=self.threads,
464 | planner_effort=self.planner_effort['ifft'])
465 |
466 | Uc_hat_x2 = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 1, False)]
467 | Uc_hat_x2[:] = Uc_hat_xp[:, :, :N1[2]//2]
468 |
469 | # Communicate and transform in xy-plane all but k=N//2
470 | self.comm0.Alltoall([Uc_hat_x2, self.mpitype], [Uc_hat_x, self.mpitype])
471 | #########################
472 |
473 | Uc_hat_z[:] = transform_Uc_zx(Uc_hat_z, Uc_hat_x, self.P1)
474 |
475 | xy_plane[:] = Uc_hat_xp[:, :, -1]
476 | self.comm0.Scatter(xy_plane, xy_recv, root=self.P1-1)
477 | Uc_hat_z[:, :, -1] = xy_recv
478 |
479 | # Do ifft for z-direction
480 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads,
481 | planner_effort=self.planner_effort['irfft'])
482 |
483 | elif self.communication == 'Alltoallw':
484 | if len(self._subarrays1A) == 0:
485 | (self._subarrays1A, self._subarrays1B, self._subarrays2A,
486 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays()
487 |
488 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0, False)]
489 |
490 | # Do first owned direction
491 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads,
492 | planner_effort=self.planner_effort['ifft'])
493 |
494 | self.comm1.Alltoallw(
495 | [Uc_hat_y, self._counts_displs1, self._subarrays1A],
496 | [Uc_hat_x, self._counts_displs1, self._subarrays1B])
497 |
498 | Uc_hat_x[:] = ifft(Uc_hat_x, axis=0, threads=self.threads,
499 | planner_effort=self.planner_effort['ifft'])
500 |
501 | self.comm0.Alltoallw(
502 | [Uc_hat_x, self._counts_displs2, self._subarrays2A],
503 | [Uc_hat_z, self._counts_displs2, self._subarrays2B])
504 |
505 |                 # Do ifft for z-direction
506 | u[:] = irfft(Uc_hat_z, overwrite_input=True, axis=2, threads=self.threads,
507 | planner_effort=self.planner_effort['irfft'])
508 |
509 | return u
510 |
511 | else: # padded
512 |
513 | padsize = self.padsize
514 | Uc_pad_hat_y = self.work_arrays[((N2[0], int(padsize*N[1]), N1f), self.complex, 0)]
515 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)]
516 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)]
517 |
518 | if self.communication == 'AlltoallN':
519 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
520 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
521 |
522 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y)
523 |
524 | # Do first owned direction
525 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads,
526 | planner_effort=self.planner_effort['ifft'])
527 |
528 | # Transform to x all but k=N//2 (the neglected Nyquist mode)
529 | Uc_pad_hat_x = transform_Uc_xy(Uc_pad_hat_x, Uc_pad_hat_y, self.P2)
530 |
531 | # Communicate in xz-plane
532 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype])
533 |
534 |                 # Pad and do ifft in x-direction
535 | Uc_pad_hat_xy = self.copy_to_padded_x(Uc_pad_hat_x, Uc_pad_hat_xy)
536 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=0, threads=self.threads,
537 | planner_effort=self.planner_effort['ifft'])
538 |
539 | # Communicate in xy-plane
540 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype])
541 |
542 | # Transform
543 | Uc_pad_hat_z[:] = transform_Uc_zx(Uc_pad_hat_z, Uc_pad_hat_xy, self.P1)
544 | Uc_pad_hat_z[:, :, -1] = 0
545 |
546 | # Pad in z-dir
547 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2)
548 |
549 | # Do ifft for z-direction
550 | u = irfft(Uc_pad_hat_z2, u, axis=2, threads=self.threads,
551 | planner_effort=self.planner_effort['irfft'])
552 |
553 | elif self.communication == 'Alltoall':
554 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
555 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
556 | Uc_pad_hat_xr2 = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)]
557 | Uc_pad_hat_xy3 = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)]
558 | xy2_pad_plane = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1])), self.complex, 0)]
559 | xy2_pad_recv = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1])), self.complex, 1)]
560 |
561 | # Pad in y-direction
562 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y)
563 |
564 | # Transform first owned direction
565 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads,
566 | planner_effort=self.planner_effort['ifft'])
567 |
568 | # Transpose datastructure to x
569 | Uc_pad_hat_xr2[:] = transform_Uc_xy(Uc_pad_hat_xr2, Uc_pad_hat_y, self.P2)
570 |
571 | # Communicate in xz-plane and do fft in x-direction
572 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xr2, self.mpitype])
573 |
574 |                 # Pad and do ifft in x-direction
575 | Uc_pad_hat_xy3 = self.copy_to_padded_x(Uc_pad_hat_xr2, Uc_pad_hat_xy3)
576 | Uc_pad_hat_xy3[:] = ifft(Uc_pad_hat_xy3, axis=0, threads=self.threads,
577 | planner_effort=self.planner_effort['ifft'])
578 |
579 | Uc_pad_hat_xy[:] = Uc_pad_hat_xy3[:, :, :N1[2]//2]
580 |
581 | # Communicate and transform in xy-plane all but k=N//2
582 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype])
583 |
584 | Uc_pad_hat_z[:] = transform_Uc_zx(Uc_pad_hat_z, Uc_pad_hat_xy, self.P1)
585 |
586 | xy2_pad_plane[:] = Uc_pad_hat_xy3[:, :, -1]
587 | self.comm0.Scatter(xy2_pad_plane, xy2_pad_recv, root=self.P1-1)
588 | Uc_pad_hat_z[:, :, -1] = xy2_pad_recv
589 |
590 | # Pad in z-dir
591 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2)
592 |
593 | # Do ifft for z-direction
594 | u = irfft(Uc_pad_hat_z2, u, axis=2, overwrite_input=True, threads=self.threads,
595 | planner_effort=self.planner_effort['irfft'])
596 |
597 | elif self.communication == 'Alltoallw':
598 | if len(self._subarrays1A_pad) == 0:
599 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad,
600 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize)
601 |
602 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)]
603 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)]
604 |
605 | # Pad in y-direction
606 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y)
607 |
608 | # Transform first owned direction
609 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads,
610 | planner_effort=self.planner_effort['ifft'])
611 |
612 | self.comm1.Alltoallw(
613 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1A_pad],
614 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1B_pad])
615 |
616 |                 # Pad and do ifft in x-direction
617 | Uc_pad_hat_xy = self.copy_to_padded_x(Uc_pad_hat_x, Uc_pad_hat_xy)
618 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=0, threads=self.threads,
619 | planner_effort=self.planner_effort['ifft'])
620 |
621 | self.comm0.Alltoallw(
622 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad],
623 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad])
624 |
625 | # Pad in z-dir
626 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2)
627 |
628 |                 # Do ifft for z-direction
629 | u = irfft(Uc_pad_hat_z2, u, overwrite_input=True, axis=2, threads=self.threads,
630 | planner_effort=self.planner_effort['irfft'])
631 |
632 | return u
633 |
634 | def fftn(self, u, fu, dealias=None):
635 | """fft in three directions using mpi."""
636 |
637 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
638 |
639 | # Strip off self
640 | N, N1, N2, Nf, N1f = self.N, self.N1, self.N2, self.Nf, self.N1f
641 |
642 | if not dealias == '3/2-rule':
643 |
644 | Uc_hat_y = self.work_arrays[((N2[0], N[1], N1f), self.complex, 0)]
645 | Uc_hat_z = self.work_arrays[((N1[0], N2[1], Nf), self.complex, 0)]
646 |
647 | if self.communication == 'AlltoallN':
648 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0)]
649 |
650 | # Do fft in z direction on owned data
651 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads,
652 | planner_effort=self.planner_effort['rfft'])
653 |
654 | # Transform to x direction neglecting k=N//2 (Nyquist)
655 | Uc_hat_x = transform_Uc_xz(Uc_hat_x, Uc_hat_z, self.P1)
656 |
657 | # Communicate and do fft in x-direction
658 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
659 | Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads,
660 | planner_effort=self.planner_effort['fft'])
661 |
662 | # Communicate and transform to final y-direction
663 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
664 | Uc_hat_y[:] = transform_Uc_yx(Uc_hat_y, Uc_hat_x, self.P2)
665 |
666 | # Do fft for last direction
667 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads,
668 | planner_effort=self.planner_effort['fft'])
669 |
670 | elif self.communication == 'Alltoall':
671 |
672 | # Additional work arrays
673 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0)]
674 | Uc_hat_xr2= self.work_arrays[((N[0], N2[1], N1f), self.complex, 1)]
675 | xy_plane = self.work_arrays[((N[0], N2[1]), self.complex, 0)]
676 | xy_plane2 = self.work_arrays[((N[0]//2+1, N2[1]), self.complex, 0)]
677 | xy_recv = self.work_arrays[((N1[0], N2[1]), self.complex, 0)]
678 |
679 | # Do fft in z direction on owned data
680 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads,
681 | planner_effort=self.planner_effort['rfft'])
682 |
683 | # Move real part of Nyquist to k=0
684 | Uc_hat_z[:, :, 0] += 1j*Uc_hat_z[:, :, -1]
685 |
686 | # Transform to x direction neglecting k=N//2 (Nyquist)
687 | Uc_hat_x = transform_Uc_xz(Uc_hat_x, Uc_hat_z, self.P1)
688 |
689 | # In-place
690 | # Communicate and do fft in x-direction
691 | #self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
692 | #Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads,
693 | #planner_effort=self.planner_effort['fft'])
694 |
695 | # Not in-place
696 | Uc_hat_x2 = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 2, False)]
697 | self.comm0.Alltoall([Uc_hat_x, self.mpitype], [Uc_hat_x2, self.mpitype])
698 | Uc_hat_x = fft(Uc_hat_x2, Uc_hat_x, axis=0, threads=self.threads,
699 | planner_effort=self.planner_effort['fft'])
700 | ################
701 |
702 | Uc_hat_xr2[:, :, :N1[2]//2] = Uc_hat_x[:]
703 |
704 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0
705 | if self.comm0_rank == 0:
706 | M = N[0]
707 | xy_plane[:] = Uc_hat_x[:, :, 0]
708 | xy_plane2[:] = np.vstack((xy_plane[0].real, 0.5*(xy_plane[1:M//2]+np.conj(xy_plane[:M//2:-1])), xy_plane[M//2].real))
709 | Uc_hat_xr2[:, :, 0] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))
710 | xy_plane2[:] = np.vstack((xy_plane[0].imag, -0.5*1j*(xy_plane[1:M//2]-np.conj(xy_plane[:M//2:-1])), xy_plane[M//2].imag))
711 | xy_plane[:] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))
712 | self.comm0.Send([xy_plane, self.mpitype], dest=self.P1-1, tag=77)
713 |
714 | if self.comm0_rank == self.P1-1:
715 | self.comm0.Recv([xy_plane, self.mpitype], source=0, tag=77)
716 | Uc_hat_xr2[:, :, -1] = xy_plane
717 |
718 | # Communicate and transform to final y-direction
719 | #self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_xr2, self.mpitype])
720 | #Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_xr2, self.P2)
721 | # Not in-place
722 | Uc_hat_xr3 = self.work_arrays[((N[0], N2[1], N1f), self.complex, 3)]
723 | self.comm1.Alltoall([Uc_hat_xr2, self.mpitype], [Uc_hat_xr3, self.mpitype])
724 | Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_xr3, self.P2)
725 |
726 | # Do fft for last direction
727 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads,
728 | planner_effort=self.planner_effort['fft'])
729 |
730 | elif self.communication == 'Alltoallw':
731 | if len(self._subarrays1A) == 0:
732 | (self._subarrays1A, self._subarrays1B, self._subarrays2A,
733 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays()
734 |
735 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0)]
736 |
737 | # Do fft in z direction on owned data
738 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads,
739 | planner_effort=self.planner_effort['rfft'])
740 |
741 | self.comm0.Alltoallw(
742 | [Uc_hat_z, self._counts_displs2, self._subarrays2B],
743 | [Uc_hat_x, self._counts_displs2, self._subarrays2A])
744 |
745 | Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads,
746 | planner_effort=self.planner_effort['fft'])
747 |
748 | self.comm1.Alltoallw(
749 | [Uc_hat_x, self._counts_displs1, self._subarrays1B],
750 | [Uc_hat_y, self._counts_displs1, self._subarrays1A])
751 |
752 | # Do fft for last direction
753 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads,
754 | planner_effort=self.planner_effort['fft'])
755 |
756 | return fu
757 |
758 | else: # padded
759 |
760 | assert u.shape == self.real_shape_padded()
761 |
762 | padsize = self.padsize
763 | Uc_pad_hat_y = self.work_arrays[((N2[0], int(padsize*N[1]), N1f), self.complex, 0)]
764 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)]
765 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)]
766 |
767 | if self.communication == 'AlltoallN':
768 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
769 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
770 |
771 | # Do fft in z direction on owned data
772 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads,
773 | planner_effort=self.planner_effort['rfft'])
774 |
775 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z)
776 |
777 | # Transform to x direction neglecting k=N//2 (Nyquist)
778 | Uc_pad_hat_xy = transform_Uc_xz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P1)
779 |
780 | # Communicate and do fft in x-direction
781 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype])
782 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads,
783 | planner_effort=self.planner_effort['fft'])
784 |
785 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x)
786 |
787 | # Communicate and transform to final y-direction
788 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype])
789 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_x, self.P2)
790 |
791 | # Do fft for last direction
792 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads,
793 | planner_effort=self.planner_effort['fft'])
794 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu)
795 | fu /= padsize**3
796 |
797 | elif self.communication == 'Alltoall':
798 |
799 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
800 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)]
801 | xy_pad_plane = self.work_arrays[((N[0], int(padsize*N2[1])), self.complex, 0)]
802 | xy_pad_plane2= self.work_arrays[((N[0]//2+1, int(padsize*N2[1])), self.complex, 0)]
803 | Uc_pad_hat_xr2 = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)]
804 |
805 | # Do fft in z direction on owned data
806 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads,
807 | planner_effort=self.planner_effort['rfft'])
808 |
809 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z)
810 |
811 | # Move real part of Nyquist to k=0
812 | Uc_pad_hat_z[:, :, 0] += 1j*Uc_pad_hat_z[:, :, -1]
813 |
814 | # Transform to x direction neglecting k=N//2 (Nyquist)
815 | Uc_pad_hat_xy[:] = transform_Uc_xz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P1)
816 |
817 | # Communicate and do fft in x-direction
818 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype])
819 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads,
820 | planner_effort=self.planner_effort['fft'])
821 |
822 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x)
823 |
824 | Uc_pad_hat_xr2[:, :, :N1[2]//2] = Uc_pad_hat_x[:]
825 |
826 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0
827 | if self.comm0_rank == 0:
828 | N = self.N[0]
829 | xy_pad_plane[:] = Uc_pad_hat_x[:, :, 0]
830 | xy_pad_plane2[:] = np.vstack((xy_pad_plane[0].real, 0.5*(xy_pad_plane[1:N//2]+np.conj(xy_pad_plane[:N//2:-1])), xy_pad_plane[N//2].real))
831 | Uc_pad_hat_xr2[:, :, 0] = np.vstack((xy_pad_plane2, np.conj(xy_pad_plane2[(N//2-1):0:-1])))
832 | xy_pad_plane2[:] = np.vstack((xy_pad_plane[0].imag, -0.5*1j*(xy_pad_plane[1:N//2]-np.conj(xy_pad_plane[:N//2:-1])), xy_pad_plane[N//2].imag))
833 | xy_pad_plane[:] = np.vstack((xy_pad_plane2, np.conj(xy_pad_plane2[(N//2-1):0:-1])))
834 | self.comm0.Send([xy_pad_plane, self.mpitype], dest=self.P1-1, tag=77)
835 |
836 | if self.comm0_rank == self.P1-1:
837 | self.comm0.Recv([xy_pad_plane, self.mpitype], source=0, tag=77)
838 | Uc_pad_hat_xr2[:, :, -1] = xy_pad_plane
839 |
840 | # Communicate and transform to final y-direction
841 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xr2, self.mpitype])
842 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_xr2, self.P2)
843 |
844 | # Do fft for last direction
845 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads,
846 | planner_effort=self.planner_effort['fft'])
847 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu)
848 | fu /= padsize**3
849 |
850 | elif self.communication == 'Alltoallw':
851 | if len(self._subarrays1A_pad) == 0:
852 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad,
853 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize)
854 |
855 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)]
856 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)]
857 |
858 | # Do fft in z direction on owned data
859 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads,
860 | planner_effort=self.planner_effort['rfft'])
861 |
862 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z)
863 |
864 | self.comm0.Alltoallw(
865 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad],
866 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad])
867 |
868 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads,
869 | planner_effort=self.planner_effort['fft'])
870 |
871 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x)
872 |
873 | self.comm1.Alltoallw(
874 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1B_pad],
875 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1A_pad])
876 |
877 | # Do fft for last direction
878 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads,
879 | planner_effort=self.planner_effort['fft'])
880 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu)
881 | fu /= padsize**3
882 |
883 | return fu
884 |
885 | class R2CX(R2CY):
886 | """Class for performing FFT in 3D using MPI
887 |
888 | Pencil decomposition
889 |
890 | Args:
891 | N - NumPy array([Nx, Ny, Nz]) setting the dimensions of the real mesh
892 | L - NumPy array([Lx, Ly, Lz]) setting the actual size of the computational domain
893 |         comm - The MPI communicator (e.g., MPI.COMM_WORLD from mpi4py)
894 |         precision - "single" or "double"
895 |         communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw')
896 |         padsize - The size of padding, if padding is used in transforms
897 |         threads - Number of threads used by FFTs
898 |         planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE")
899 |                          Given as a defaultdict with keys naming the transform (e.g., fft, ifft)
900 |
901 | This version has the final complex data aligned in the x-direction
902 | """
903 | def __init__(self, N, L, comm, precision, P1=None, communication='Alltoall',
904 | padsize=1.5, threads=1,
905 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")):
906 | R2CY.__init__(self, N, L, comm, precision, P1=P1, communication=communication,
907 | padsize=padsize, threads=threads, planner_effort=planner_effort)
908 | self.N2f = self.N2[2]//2 if self.comm1_rank < self.P2-1 else self.N2[2]//2+1
909 | if self.communication == 'AlltoallN':
910 | self.N2f = self.N2[2]//2
911 | if self.communication == 'Alltoallw':
912 | q = _subsize(self.Nf, self.P2, self.comm1_rank)
913 | self.N2f = q
914 |
915 | def real_shape(self):
916 | """The local shape of the real data"""
917 | return (self.N1[0], self.N2[1], self.N[2])
918 |
919 | def complex_shape(self):
920 | """The local shape of the complex data"""
921 | return (self.N[0], self.N1[1], self.N2f)
922 |
923 | def complex_shape_T(self):
924 | """The local transposed shape of the complex data"""
925 | return (self.Np[0], self.N[1], self.Nf)
926 |
927 | def complex_shape_I(self):
928 | """A local intermediate shape of the complex data"""
929 | return (self.Np[0], self.num_processes, self.Np[1], self.Nf)
930 |
931 | def real_local_slice(self, padsize=1):
932 |         xyrank = self.comm0.Get_rank() # Local rank in xy-plane
933 |         yzrank = self.comm1.Get_rank() # Local rank in yz-plane
934 | return (slice(int(padsize * xyrank * self.N1[0]), int(padsize * (xyrank+1) * self.N1[0]), 1),
935 | slice(int(padsize * yzrank * self.N2[1]), int(padsize * (yzrank+1) * self.N2[1]), 1),
936 | slice(0, int(padsize * self.N[2])))
937 |
938 | def complex_local_slice(self):
939 |         xyrank = self.comm0.Get_rank() # Local rank in xy-plane
940 | yzrank = self.comm1.Get_rank() # Local rank in yz-plane
941 | return (slice(0, self.N[0]),
942 | slice(xyrank*self.N1[1], (xyrank+1)*self.N1[1], 1),
943 | slice(yzrank*self.N2[2]//2, yzrank*self.N2[2]//2 + self.N2f, 1))
944 |
945 | def get_local_mesh(self):
946 |         xyrank = self.comm0.Get_rank() # Local rank in xy-plane
947 |         yzrank = self.comm1.Get_rank() # Local rank in yz-plane
948 |
949 | # Create the physical mesh
950 | x1 = slice(xyrank * self.N1[0], (xyrank+1) * self.N1[0], 1)
951 | x2 = slice(yzrank * self.N2[1], (yzrank+1) * self.N2[1], 1)
952 | X = np.mgrid[x1, x2, :self.N[2]].astype(self.float)
953 | X[0] *= self.L[0]/self.N[0]
954 | X[1] *= self.L[1]/self.N[1]
955 | X[2] *= self.L[2]/self.N[2]
956 | return X
957 |
958 | def get_local_wavenumbermesh(self):
959 |         xyrank = self.comm0.Get_rank() # Local rank in xy-plane
960 | yzrank = self.comm1.Get_rank() # Local rank in yz-plane
961 |
962 | # Set wavenumbers in grid
963 | kx = fftfreq(self.N[0], 1./self.N[0]).astype(int)
964 | ky = fftfreq(self.N[1], 1./self.N[1]).astype(int)
965 | kz = fftfreq(self.N[2], 1./self.N[2]).astype(int)
966 | k2 = slice(xyrank*self.N1[1], (xyrank+1)*self.N1[1], 1)
967 | k1 = slice(yzrank*self.N2[2]//2, (yzrank+1)*self.N2[2]//2, 1)
968 | K = np.array(np.meshgrid(kx, ky[k2], kz[k1], indexing='ij'), dtype=self.float)
969 | return K
970 |
971 | def get_subarrays(self, padsize=1):
972 | datatype = MPI._typedict[np.dtype(self.complex).char]
973 | M, N, Q = self.N[0], self.N[1], self.Nf
974 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank)
975 | n = _subsize(N, self.P1, self.comm0_rank)
976 | q = _subsize(Q, self.P2, self.comm1_rank)
977 | _subarrays1A = [
978 | datatype.Create_subarray([int(padsize*M),n,q], [l,n,q], [s,0,0]).Commit()
979 | for l, s in _distribution(int(padsize*M), self.P1)
980 | ]
981 | _subarrays1B = [
982 | datatype.Create_subarray([m,N,q], [m,l,q], [0,s,0]).Commit()
983 | for l, s in _distribution(N, self.P1)
984 | ]
985 | _counts_displs1 = ([1] * self.P1, [0] * self.P1)
986 |
987 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank)
988 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank)
989 | q = _subsize(Q, self.P2, self.comm1_rank)
990 | _subarrays2A = [
991 | datatype.Create_subarray([m,int(padsize*N),q], [m,l,q], [0,s,0]).Commit()
992 | for l, s in _distribution(int(padsize*N), self.P2)
993 | ]
994 | _subarrays2B = [
995 | datatype.Create_subarray([m,n,Q], [m,n,l], [0,0,s]).Commit()
996 | for l, s in _distribution(Q, self.P2)
997 | ]
998 | _counts_displs2 = ([1] * self.P2, [0] * self.P2)
999 | return _subarrays1A, _subarrays1B, _subarrays2A, _subarrays2B, _counts_displs1, _counts_displs2
1000 |
1001 | def ifftn(self, fu, u, dealias=None):
1002 | """ifft in three directions using mpi
1003 |
1004 |         The ifft is performed in the reverse order of the fft.
1005 | """
1006 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
1007 |
1008 | if dealias == '2/3-rule' and self.dealias.shape == (0,):
1009 | self.dealias = self.get_dealias_filter()
1010 |
1011 | if not dealias == '3/2-rule':
1012 |
1013 | fu_ = fu
1014 | if dealias == '2/3-rule':
1015 | fu_ = self.work_arrays[(fu, 0, False)]
1016 | fu_[:] = fu
1017 | fu_ = dealias_filter(fu_, self.dealias)
1018 | #fu_ *= self.dealias
1019 |
1020 | # Intermediate work arrays required for transform
1021 | Uc_hat_z = self.work_arrays[((self.N1[0], self.N2[1], self.Nf), self.complex, 0)]
1022 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)]
1023 |
1024 | if self.communication == 'AlltoallN':
1025 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)]
1026 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2))
1027 |
1028 | # Do first owned direction
1029 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads,
1030 | planner_effort=self.planner_effort['ifft'])
1031 |
1032 |                 # Communicate in xz-plane and do ifft in y-direction
1033 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
1034 |
1035 | # Transform to y all but k=N//2 (the neglected Nyquist mode)
1036 | Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_x, self.P1)
1037 | Uc_hat_y[:] = ifft(Uc_hat_y, axis=1, threads=self.threads,
1038 | planner_effort=self.planner_effort['ifft'])
1039 |
1040 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first.
1041 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype])
1042 | Uc_hat_z[:] = transform_Uc_zy(Uc_hat_z, Uc_hat_y, self.P2)
1043 |
1044 | # Do ifft for z-direction
1045 | Uc_hat_z[:, :, -1] = 0
1046 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads,
1047 | planner_effort=self.planner_effort['irfft'])
1048 |
1049 | elif self.communication == 'Alltoall':
1050 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)]
1051 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2))
1052 | Uc_hat_y2 = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)]
1053 | xy_plane_T = self.work_arrays[((self.N[1], self.N1[0]), self.complex, 0)]
1054 | xy_plane = xy_plane_T.transpose((1, 0))
1055 | xy_recv = self.work_arrays[((self.N2[1], self.N1[0]), self.complex, 0)]
1056 |
1057 | # Do first owned direction
1058 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads,
1059 | planner_effort=self.planner_effort['ifft'])
1060 |
1061 |                 # Communicate in xz-plane and do ifft in y-direction
1062 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
1063 |
1064 | # Transform to y all but k=N//2 (the neglected Nyquist mode)
1065 | Uc_hat_y2 = transform_Uc_yx(Uc_hat_y2, Uc_hat_x, self.P1)
1066 | Uc_hat_y2[:] = ifft(Uc_hat_y2, axis=1, threads=self.threads,
1067 | planner_effort=self.planner_effort['ifft'])
1068 | xy_plane[:] = Uc_hat_y2[:, :, -1]
1069 |
1070 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first.
1071 | Uc_hat_y[:] = Uc_hat_y2[:, :, :self.N2[2]//2]
1072 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype])
1073 | Uc_hat_z = transform_Uc_zy(Uc_hat_z, Uc_hat_y, self.P2)
1074 |
1075 | self.comm1.Scatter(xy_plane_T, xy_recv, root=self.P2-1)
1076 | Uc_hat_z[:, :, -1] = xy_recv.transpose((1, 0))
1077 |
1078 | # Do ifft for z-direction
1079 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads,
1080 | planner_effort=self.planner_effort['irfft'])
1081 |
1082 | elif self.communication == 'Alltoallw':
1083 | if len(self._subarrays1A) == 0:
1084 | (self._subarrays1A, self._subarrays1B, self._subarrays2A,
1085 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays()
1086 |
1087 | Uc_hat_y = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)]
1088 |
1089 | # Do first owned direction
1090 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads,
1091 | planner_effort=self.planner_effort['ifft'])
1092 |
1093 | self.comm0.Alltoallw(
1094 | [Uc_hat_x, self._counts_displs1, self._subarrays1A],
1095 | [Uc_hat_y, self._counts_displs1, self._subarrays1B])
1096 |
1097 | Uc_hat_y[:] = ifft(Uc_hat_y, axis=1, threads=self.threads,
1098 | planner_effort=self.planner_effort['ifft'])
1099 |
1100 | self.comm1.Alltoallw(
1101 | [Uc_hat_y, self._counts_displs2, self._subarrays2A],
1102 | [Uc_hat_z, self._counts_displs2, self._subarrays2B])
1103 | # Do ifft for z-direction
1104 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads,
1105 | planner_effort=self.planner_effort['irfft'])
1106 |
1107 | else:
1108 | # Intermediate work arrays required for transform
1109 | Uc_pad_hat_z = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), self.Nf), self.complex, 0)]
1110 | Uc_pad_hat_z2 = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), int(self.padsize*self.N[2]//2)+1), self.complex, 0)]
1111 | Uc_pad_hat_x = self.work_arrays[((int(self.padsize*self.N[0]), self.N1[1], self.N2f), self.complex, 0)]
1112 |
1113 | if self.communication == 'AlltoallN':
1114 | Uc_pad_hat_y_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)]
1115 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2))
1116 | Uc_pad_hat_xy_T= self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)]
1117 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2))
1118 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2[2]//2), self.complex, 0)]
1119 |
1120 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x)
1121 |
1122 | # Do first owned direction
1123 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads,
1124 | planner_effort=self.planner_effort['ifft'])
1125 |
1126 |                 # Communicate in xz-plane and do ifft in y-direction
1127 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype])
1128 |
1129 | # Transform to y
1130 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_x, self.P1)
1131 | Uc_pad_hat_xy2 = self.copy_to_padded_y(Uc_pad_hat_y, Uc_pad_hat_xy2)
1132 |
1133 | Uc_pad_hat_xy = ifft(Uc_pad_hat_xy2, Uc_pad_hat_xy, overwrite_input=True, axis=1, threads=self.threads,
1134 | planner_effort=self.planner_effort['ifft'])
1135 |
1136 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first.
1137 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype])
1138 | Uc_pad_hat_z[:] = transform_Uc_zy(Uc_pad_hat_z, Uc_pad_hat_xy, self.P2)
1139 | Uc_pad_hat_z[:, :, -1] = 0
1140 |
1141 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2)
1142 |
1143 | # Do ifft for z-direction
1144 | u = irfft(Uc_pad_hat_z2, u, overwrite_input=True, axis=2, threads=self.threads,
1145 | planner_effort=self.planner_effort['irfft'])
1146 |
1147 | elif self.communication == 'Alltoall':
1148 | Uc_pad_hat_y_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)]
1149 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2))
1150 | Uc_pad_hat_xy_T= self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)]
1151 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2))
1152 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2[2]//2), self.complex, 0)]
1153 | Uc_pad_hat_y2_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2f), self.complex, 0)]
1154 | Uc_pad_hat_y2 = Uc_pad_hat_y2_T.transpose((1, 0, 2))
1155 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2f), self.complex, 0)]
1156 |
1157 | xy_plane_T = self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0])), self.complex, 0)]
1158 | xy_plane = xy_plane_T.transpose((1, 0))
1159 | xy_recv = self.work_arrays[((int(self.padsize*self.N2[1]), int(self.padsize*self.N1[0])), self.complex, 0)]
1160 |
1161 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x)
1162 |
1163 | # Do first owned direction
1164 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads,
1165 | planner_effort=self.planner_effort['ifft'])
1166 |
1167 |                 # Communicate in xz-plane and do ifft in y-direction
1168 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype])
1169 |
1170 | # Transform to y
1171 | Uc_pad_hat_y2 = transform_Uc_yx(Uc_pad_hat_y2, Uc_pad_hat_x, self.P1)
1172 |
1173 | Uc_pad_hat_xy2 = self.copy_to_padded_y(Uc_pad_hat_y2, Uc_pad_hat_xy2)
1174 |
1175 | Uc_pad_hat_xy2[:] = ifft(Uc_pad_hat_xy2, axis=1, threads=self.threads,
1176 | planner_effort=self.planner_effort['ifft'])
1177 | xy_plane[:] = Uc_pad_hat_xy2[:, :, -1]
1178 |
1179 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first.
1180 | Uc_pad_hat_xy[:] = Uc_pad_hat_xy2[:, :, :self.N2[2]//2]
1181 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype])
1182 | Uc_pad_hat_z = transform_Uc_zy(Uc_pad_hat_z, Uc_pad_hat_xy, self.P2)
1183 |
1184 | self.comm1.Scatter(xy_plane_T, xy_recv, root=self.P2-1)
1185 | Uc_pad_hat_z[:, :, -1] = xy_recv.transpose((1, 0))
1186 |
1187 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2)
1188 |
1189 | # Do ifft for z-direction
1190 | u = irfft(Uc_pad_hat_z2, u, axis=2, threads=self.threads,
1191 | planner_effort=self.planner_effort['irfft'])
1192 |
1193 | elif self.communication == 'Alltoallw':
1194 | if len(self._subarrays1A_pad) == 0:
1195 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad,
1196 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize)
1197 |
1198 | Uc_pad_hat_y = self.work_arrays[((int(self.padsize*self.N1[0]), self.N[1], self.N2f), self.complex, 0)]
1199 | Uc_pad_hat_xy = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2f), self.complex, 0)]
1200 |
1201 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x)
1202 |
1203 | # Do first owned direction
1204 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads,
1205 | planner_effort=self.planner_effort['ifft'])
1206 |
1207 | self.comm0.Alltoallw(
1208 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1A_pad],
1209 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1B_pad])
1210 |
1211 | Uc_pad_hat_xy = self.copy_to_padded_y(Uc_pad_hat_y, Uc_pad_hat_xy)
1212 |
1213 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=1, threads=self.threads,
1214 | planner_effort=self.planner_effort['ifft'])
1215 |
1216 | self.comm1.Alltoallw(
1217 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad],
1218 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad])
1219 |
1220 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2)
1221 |
1222 | # Do ifft for z-direction
1223 | u = irfft(Uc_pad_hat_z2, u, axis=2, overwrite_input=True, threads=self.threads,
1224 | planner_effort=self.planner_effort['irfft'])
1225 |
1226 | return u
1227 |
1228 | def fftn(self, u, fu, dealias=None):
1229 | """fft in three directions using mpi."""
1230 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
1231 |
1232 | if not dealias == '3/2-rule':
1233 |
1234 | # Intermediate work arrays required for transform
1235 | Uc_hat_z = self.work_arrays[((self.N1[0], self.N2[1], self.Nf), self.complex, 0)]
1236 |
1237 | if self.communication == 'AlltoallN':
1238 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2[2]//2), self.complex, 0)]
1239 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)]
1240 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2))
1241 | Uc_hat_y2= self.work_arrays[((self.N1[0], self.N[1], self.N2[2]//2), self.complex, 1)]
1242 |
1243 | # Do fft in z direction on owned data
1244 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads,
1245 | planner_effort=self.planner_effort['rfft'])
1246 |
1247 | # Transform to y direction neglecting k=N//2 (Nyquist)
1248 | Uc_hat_y = transform_Uc_yz(Uc_hat_y, Uc_hat_z, self.P2)
1249 |
1250 | # Communicate and do fft in y-direction. Transpose required to put distributed axis first
1251 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype])
1252 | Uc_hat_y2 = fft(Uc_hat_y, Uc_hat_y2, axis=1, threads=self.threads,
1253 | planner_effort=self.planner_effort['fft'])
1254 |
1255 | # Communicate and transform to final x-direction
1256 | Uc_hat_x = transform_Uc_xy(Uc_hat_x, Uc_hat_y2, self.P1)
1257 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype])
1258 |
1259 | # Do fft for last direction
1260 | fu = fft(Uc_hat_x, fu, axis=0, threads=self.threads,
1261 | planner_effort=self.planner_effort['fft'])
1262 |
1263 | elif self.communication == 'Alltoall':
1264 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2[2]//2), self.complex, 0)]
1265 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)]
1266 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2))
1267 | Uc_hat_y2 = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)]
1268 | Uc_hat_x2 = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)]
1269 | Uc_hat_y3 = self.work_arrays[((self.N1[0], self.N[1], self.N2[2]//2), self.complex, 0)]
1270 | xy_plane_T = self.work_arrays[((self.N[1], self.N1[0]), self.complex, 0)]
1271 | xy_plane = xy_plane_T.transpose((1, 0))
1272 | xy_plane2 = self.work_arrays[((self.N[1]//2+1, self.N1[0]), self.complex, 0)]
1273 |
1274 | # Do fft in z direction on owned data
1275 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads,
1276 | planner_effort=self.planner_effort['rfft'])
1277 |
1278 | # Move real part of Nyquist to k=0
1279 | Uc_hat_z[:, :, 0] += 1j*Uc_hat_z[:, :, -1]
1280 |
1281 | # Transform to y direction neglecting k=N//2 (Nyquist)
1282 | Uc_hat_y = transform_Uc_yz(Uc_hat_y, Uc_hat_z, self.P2)
1283 |
1284 | # Communicate and do fft in y-direction. Transpose required to put distributed axis first
1285 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype])
1286 | Uc_hat_y3 = fft(Uc_hat_y, Uc_hat_y3, axis=1, threads=self.threads,
1287 | planner_effort=self.planner_effort['fft'])
1288 | Uc_hat_y2[:, :, :self.N2[2]//2] = Uc_hat_y3[:]
1289 |
1290 |                 # Now both k=0 and k=N//2 are contained in 0 of comm1_rank = 0
1291 | if self.comm1_rank == 0:
1292 | M = self.N[1]
1293 | xy_plane[:] = Uc_hat_y3[:, :, 0]
1294 | xy_plane2[:] = np.vstack((xy_plane_T[0].real, 0.5*(xy_plane_T[1:M//2]+np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].real))
1295 | Uc_hat_y2[:, :, 0] = (np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))).transpose((1, 0))
1296 | xy_plane2[:] = np.vstack((xy_plane_T[0].imag, -0.5*1j*(xy_plane_T[1:M//2]-np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].imag))
1297 | xy_plane_T[:] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))
1298 | self.comm1.Send([xy_plane_T, self.mpitype], dest=self.P2-1, tag=77)
1299 |
1300 | if self.comm1_rank == self.P2-1:
1301 | self.comm1.Recv([xy_plane_T, self.mpitype], source=0, tag=77)
1302 | Uc_hat_y2[:, :, -1] = xy_plane_T.transpose((1, 0))
1303 |
1304 | # Communicate and transform to final x-direction
1305 | Uc_hat_x2 = transform_Uc_xy(Uc_hat_x2, Uc_hat_y2, self.P1)
1306 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x2, self.mpitype])
1307 |
1308 | # Do fft for last direction
1309 | fu = fft(Uc_hat_x2, fu, axis=0, threads=self.threads,
1310 | planner_effort=self.planner_effort['fft'])
1311 |
1312 | elif self.communication == 'Alltoallw':
1313 | Uc_hat_y = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)]
1314 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)]
1315 |
1316 | if len(self._subarrays1A) == 0:
1317 | (self._subarrays1A, self._subarrays1B, self._subarrays2A,
1318 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays()
1319 |
1320 | # Do fft in z direction on owned data
1321 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads,
1322 | planner_effort=self.planner_effort['rfft'])
1323 |
1324 | self.comm1.Alltoallw(
1325 | [Uc_hat_z, self._counts_displs2, self._subarrays2B],
1326 | [Uc_hat_y, self._counts_displs2, self._subarrays2A])
1327 | Uc_hat_y[:] = fft(Uc_hat_y, axis=1, threads=self.threads,
1328 | planner_effort=self.planner_effort['fft'])
1329 |
1330 | # Communicate and transform to final x-direction
1331 | self.comm0.Alltoallw(
1332 | [Uc_hat_y, self._counts_displs1, self._subarrays1B],
1333 | [Uc_hat_x, self._counts_displs1, self._subarrays1A])
1334 |
1335 | # Do fft for last direction
1336 | fu = fft(Uc_hat_x, fu, axis=0, threads=self.threads,
1337 | planner_effort=self.planner_effort['fft'])
1338 |
1339 | else:
1340 |
1341 | assert u.shape == self.real_shape_padded()
1342 | padsize = self.padsize
1343 | # Strip off self
1344 | N, N1, N2, Nf, N2f = self.N, self.N1, self.N2, self.Nf, self.N2f
1345 |
1346 | # Intermediate work arrays required for transform
1347 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)]
1348 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)]
1349 |
1350 | if self.communication == 'AlltoallN':
1351 | Uc_pad_hat_x = self.work_arrays[((int(padsize*N[0]), N1[1], N2[2]//2), self.complex, 0)]
1352 | Uc_pad_hat_xy_T= self.work_arrays[((int(padsize*N[1]), int(padsize*N1[0]), N2[2]//2), self.complex, 0)]
1353 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2))
1354 | Uc_pad_hat_xy2= self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2[2]//2), self.complex, 0)]
1355 | Uc_pad_hat_y_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2[2]//2), self.complex, 0)]
1356 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2))
1357 |
1358 | # Do fft in z direction on owned data
1359 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads,
1360 | planner_effort=self.planner_effort['rfft'])
1361 |
1362 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z)
1363 |
1364 | # Transform to y direction neglecting k=N//2 (Nyquist)
1365 | Uc_pad_hat_xy = transform_Uc_yz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P2)
1366 |
1367 | # Communicate and do fft in y-direction. Transpose required to put distributed axis first
1368 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype])
1369 | Uc_pad_hat_xy2 = fft(Uc_pad_hat_xy, Uc_pad_hat_xy2, axis=1, threads=self.threads,
1370 | planner_effort=self.planner_effort['fft'])
1371 |
1372 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy2, Uc_pad_hat_y)
1373 |
1374 | # Communicate and transform to final x-direction
1375 | Uc_pad_hat_x = transform_Uc_xy(Uc_pad_hat_x, Uc_pad_hat_y, self.P1)
1376 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype])
1377 |
1378 | # Do fft for last direction
1379 | Uc_pad_hat_x[:] = fft(Uc_pad_hat_x, axis=0, threads=self.threads,
1380 | planner_effort=self.planner_effort['fft'])
1381 | fu = self.copy_from_padded_x(Uc_pad_hat_x, fu)
1382 | fu /= padsize**3
1383 |
1384 | elif self.communication == 'Alltoall':
1385 | Uc_pad_hat_xy_T= self.work_arrays[((int(padsize*N[1]), int(padsize*N1[0]), N2[2]//2), self.complex, 0)]
1386 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2))
1387 | Uc_pad_hat_xy2= self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2[2]//2), self.complex, 0)]
1388 | Uc_pad_hat_y_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2[2]//2), self.complex, 0)]
1389 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2))
1390 | Uc_pad_hat_y2_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2f), self.complex, 0)]
1391 | Uc_pad_hat_y2 = Uc_pad_hat_y2_T.transpose((1, 0, 2))
1392 | Uc_pad_hat_x2 = self.work_arrays[((int(padsize*N[0]), N1[1], N2f), self.complex, 0)]
1393 | xy_plane_T = self.work_arrays[((self.N[1], int(self.padsize*self.N1[0])), self.complex, 0)]
1394 | xy_plane = xy_plane_T.transpose((1, 0))
1395 | xy_plane2 = self.work_arrays[((self.N[1]//2+1, int(self.padsize*self.N1[0])), self.complex, 0)]
1396 |
1397 | # Do fft in z direction on owned data
1398 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads,
1399 | planner_effort=self.planner_effort['rfft'])
1400 |
1401 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z)
1402 |
1403 | # Move real part of Nyquist to k=0
1404 | Uc_pad_hat_z[:, :, 0] += 1j*Uc_pad_hat_z[:, :, -1]
1405 |
1406 | # Transform to y direction neglecting k=N//2 (Nyquist)
1407 | Uc_pad_hat_xy = transform_Uc_yz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P2)
1408 |
1409 | # Communicate and do fft in y-direction. Transpose required to put distributed axis first
1410 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype])
1411 | Uc_pad_hat_xy2 = fft(Uc_pad_hat_xy, Uc_pad_hat_xy2, axis=1, threads=self.threads,
1412 | planner_effort=self.planner_effort['fft'])
1413 |
1414 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy2, Uc_pad_hat_y)
1415 |
1416 | Uc_pad_hat_y2[:, :, :self.N2[2]//2] = Uc_pad_hat_y[:]
1417 |
1418 |                 # Now both k=0 and k=N//2 are contained in 0 of comm1_rank = 0
1419 | if self.comm1_rank == 0:
1420 | M = self.N[1]
1421 | xy_plane[:] = Uc_pad_hat_y[:, :, 0]
1422 | xy_plane2[:] = np.vstack((xy_plane_T[0].real, 0.5*(xy_plane_T[1:M//2]+np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].real))
1423 | Uc_pad_hat_y2[:, :, 0] = (np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))).transpose((1, 0))
1424 | xy_plane2[:] = np.vstack((xy_plane_T[0].imag, -0.5*1j*(xy_plane_T[1:M//2]-np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].imag))
1425 | xy_plane_T[:] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))
1426 | self.comm1.Send([xy_plane_T, self.mpitype], dest=self.P2-1, tag=77)
1427 |
1428 | if self.comm1_rank == self.P2-1:
1429 | self.comm1.Recv([xy_plane_T, self.mpitype], source=0, tag=77)
1430 | Uc_pad_hat_y2[:, :, -1] = xy_plane_T.transpose((1, 0))
1431 |
1432 | # Communicate and transform to final x-direction
1433 | Uc_pad_hat_x2 = transform_Uc_xy(Uc_pad_hat_x2, Uc_pad_hat_y2, self.P1)
1434 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x2, self.mpitype])
1435 |
1436 | # Do fft for last direction
1437 | Uc_pad_hat_x2[:] = fft(Uc_pad_hat_x2, axis=0, threads=self.threads,
1438 | planner_effort=self.planner_effort['fft'])
1439 | fu = self.copy_from_padded_x(Uc_pad_hat_x2, fu)
1440 | fu /= padsize**3
1441 |
1442 | elif self.communication == 'Alltoallw':
1443 | Uc_pad_hat_y = self.work_arrays[((int(padsize*N1[0]), N[1], N2f), self.complex, 0)]
1444 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2f), self.complex, 0)]
1445 | Uc_pad_hat_x = self.work_arrays[((int(padsize*N[0]), N1[1], N2f), self.complex, 0)]
1446 |
1447 | if len(self._subarrays1A_pad) == 0:
1448 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad,
1449 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize)
1450 |
1451 | # Do fft in z direction on owned data
1452 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads,
1453 | planner_effort=self.planner_effort['rfft'])
1454 |
1455 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z)
1456 |
1457 | self.comm1.Alltoallw(
1458 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad],
1459 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad])
1460 |
1461 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=1, threads=self.threads,
1462 | planner_effort=self.planner_effort['fft'])
1463 |
1464 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy, Uc_pad_hat_y)
1465 |
1466 | # Communicate and transform to final x-direction
1467 | self.comm0.Alltoallw(
1468 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1B_pad],
1469 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1A_pad])
1470 |
1471 | # Do fft for last direction
1472 | Uc_pad_hat_x[:] = fft(Uc_pad_hat_x, axis=0, threads=self.threads,
1473 | planner_effort=self.planner_effort['fft'])
1474 | fu = self.copy_from_padded_x(Uc_pad_hat_x, fu)
1475 | fu /= padsize**3
1476 |
1477 | return fu
1478 |
1479 | def R2C(N, L, comm, precision, P1=None, communication="Alltoall", padsize=1.5, threads=1,
1480 | alignment="X", planner_effort=defaultdict(lambda : "FFTW_MEASURE")):
1481 | if alignment == 'X':
1482 | return R2CX(N, L, comm, precision, P1, communication, padsize, threads, planner_effort)
1483 | else:
1484 | return R2CY(N, L, comm, precision, P1, communication, padsize, threads, planner_effort)
1485 |
--------------------------------------------------------------------------------
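
The R2C factory above only dispatches on the alignment argument and returns the
corresponding pencil-decomposed transform. As a rough usage sketch (not part of the
source tree; it relies on the fftn/ifftn and shape helpers exercised in
tests/test_FFT.py and must be launched under mpirun with a rank count the pencil
grid can be built from):

    import numpy as np
    from mpi4py import MPI
    from mpiFFT4py.pencil import R2C

    comm = MPI.COMM_WORLD
    N = np.array([64, 64, 64])                      # global real mesh
    L = np.array([2*np.pi, 2*np.pi, 2*np.pi])       # physical domain size
    FFT = R2C(N, L, comm, "double", alignment="X")  # pencil-decomposed transform

    u = np.random.random(FFT.real_shape()).astype(FFT.float)   # local real data
    u_hat = np.zeros(FFT.complex_shape(), dtype=FFT.complex)
    u_hat = FFT.fftn(u, u_hat)     # forward real-to-complex transform
    u2 = np.zeros_like(u)
    u2 = FFT.ifftn(u_hat, u2)      # inverse transform should recover u
    assert np.allclose(u, u2, 1e-8, 1e-10)
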
/mpiFFT4py/serialFFT/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     # Prefer the pyfftw wrappers; fall back to the NumPy backend if pyfftw is missing
3 |     from .pyfftw_fft import *
4 | 
5 | except ImportError:
6 |     from .numpy_fft import *
7 |
--------------------------------------------------------------------------------
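
The package falls back from the pyfftw wrappers to the pure-NumPy backend whenever
pyfftw cannot be imported. A quick way to check which backend was actually picked up
(the serial transforms are re-exported at package level, as tests/test_FFT.py relies on):

    import mpiFFT4py

    # Prints 'mpiFFT4py.serialFFT.pyfftw_fft' or 'mpiFFT4py.serialFFT.numpy_fft'
    print(mpiFFT4py.rfftn.__module__)
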
/mpiFFT4py/serialFFT/numpy_fft.py:
--------------------------------------------------------------------------------
1 | __author__ = "Mikael Mortensen <mikaem@math.uio.no>"
2 | __date__ = "2016-02-16"
3 | __copyright__ = "Copyright (C) 2016 " + __author__
4 | __license__ = "GNU Lesser GPL version 3 or any later version"
5 |
6 | __all__ = ['dct', 'fft', 'ifft', 'fft2', 'ifft2', 'fftn', 'ifftn',
7 | 'rfft', 'irfft', 'rfft2', 'irfft2', 'rfftn', 'irfftn']
8 |
9 | from numpy import iscomplexobj
10 | import numpy.fft
11 | from scipy.fftpack import dct
12 |
13 | dct1 = dct
14 | def dct(a, b, type=2, axis=0, **kw):
15 | if iscomplexobj(a):
16 | b.real[:] = dct1(a.real, type=type, axis=axis)
17 | b.imag[:] = dct1(a.imag, type=type, axis=axis)
18 | return b
19 |
20 | else:
21 | b[:] = dct1(a, type=type, axis=axis)
22 | return b
23 |
24 | # Define functions taking both input array and output array
25 | def fft(a, b=None, axis=0, threads=1, **kw):
26 | if b is None:
27 | return numpy.fft.fft(a, axis=axis)
28 | else:
29 | b[:] = numpy.fft.fft(a, axis=axis)
30 | return b
31 |
32 | def ifft(a, b=None, axis=0, threads=1, **kw):
33 | if b is None:
34 | return numpy.fft.ifft(a, axis=axis)
35 | else:
36 | b[:] = numpy.fft.ifft(a, axis=axis)
37 | return b
38 |
39 | def rfft(a, b=None, axis=0, threads=1, **kw):
40 | if b is None:
41 | return numpy.fft.rfft(a, axis=axis)
42 | else:
43 | b[:] = numpy.fft.rfft(a, axis=axis)
44 | return b
45 |
46 | def irfft(a, b=None, axis=0, threads=1, **kw):
47 | if b is None:
48 | return numpy.fft.irfft(a, axis=axis)
49 | else:
50 | b[:] = numpy.fft.irfft(a, axis=axis)
51 | return b
52 |
53 | def fft2(a, b=None, axes=(0, 1), threads=1, **kw):
54 | if b is None:
55 | return numpy.fft.fft2(a, axes=axes)
56 | else:
57 | b[:] = numpy.fft.fft2(a, axes=axes)
58 | return b
59 |
60 | def ifft2(a, b=None, axes=(0, 1), threads=1, **kw):
61 | if b is None:
62 | return numpy.fft.ifft2(a, axes=axes)
63 | else:
64 | b[:] = numpy.fft.ifft2(a, axes=axes)
65 | return b
66 |
67 | def rfft2(a, b=None, axes=(0, 1), threads=1, **kw):
68 | if b is None:
69 | return numpy.fft.rfft2(a, axes=axes)
70 | else:
71 | b[:] = numpy.fft.rfft2(a, axes=axes)
72 | return b
73 |
74 | def irfft2(a, b=None, axes=(0, 1), threads=1, **kw):
75 | if b is None:
76 | return numpy.fft.irfft2(a, axes=axes)
77 | else:
78 | b[:] = numpy.fft.irfft2(a, axes=axes)
79 | return b
80 |
81 | def fftn(a, b=None, axes=(0, 1, 2), threads=1, **kw):
82 | if b is None:
83 | return numpy.fft.fftn(a, axes=axes)
84 | else:
85 | b[:] = numpy.fft.fftn(a, axes=axes)
86 | return b
87 |
88 | def ifftn(a, b=None, axes=(0, 1, 2), threads=1, **kw):
89 | if b is None:
90 | return numpy.fft.ifftn(a, axes=axes)
91 | else:
92 | b[:] = numpy.fft.ifftn(a, axes=axes)
93 | return b
94 |
95 | def rfftn(a, b=None, axes=(0, 1, 2), threads=1, **kw):
96 | if b is None:
97 | return numpy.fft.rfftn(a, axes=axes)
98 | else:
99 | b[:] = numpy.fft.rfftn(a, axes=axes)
100 | return b
101 |
102 | def irfftn(a, b=None, axes=(0, 1, 2), threads=1, **kw):
103 | if b is None:
104 | return numpy.fft.irfftn(a, axes=axes)
105 | else:
106 | b[:] = numpy.fft.irfftn(a, axes=axes)
107 | return b
108 |
--------------------------------------------------------------------------------
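
Both serial backends expose the same calling convention: every transform accepts an
optional pre-allocated output array b and returns it, so the parallel classes can
recycle work arrays instead of allocating on every call (the threads and planner
keyword arguments are accepted but ignored by this NumPy backend). A minimal sketch
of that convention:

    import numpy as np
    from mpiFFT4py.serialFFT.numpy_fft import rfft, irfft

    a = np.random.random(16)
    b = np.zeros(9, dtype=np.complex128)    # 16//2 + 1 independent coefficients
    b = rfft(a, b, axis=0)                  # fills and returns the supplied output
    assert np.allclose(b, rfft(a, axis=0))  # with b=None a fresh array is allocated
    assert np.allclose(a, irfft(b, axis=0))
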
/mpiFFT4py/serialFFT/pyfftw_fft.py:
--------------------------------------------------------------------------------
1 | __author__ = "Mikael Mortensen <mikaem@math.uio.no>"
2 | __date__ = "2016-02-16"
3 | __copyright__ = "Copyright (C) 2016 " + __author__
4 | __license__ = "GNU Lesser GPL version 3 or any later version"
5 |
6 | __all__ = ['dct', 'fft', 'ifft', 'fft2', 'ifft2', 'fftn', 'ifftn',
7 | 'rfft', 'irfft', 'rfft2', 'irfft2', 'rfftn', 'irfftn']
8 |
9 | import pyfftw
10 | from numpy import iscomplexobj
11 |
12 | dct_object = {}
13 | fft_object = {}
14 | ifft_object = {}
15 | fft2_object = {}
16 | ifft2_object = {}
17 | fftn_object = {}
18 | ifftn_object = {}
19 | irfft_object = {}
20 | irfftn_object = {}
21 | irfft2_object = {}
22 | rfft2_object = {}
23 | rfft_object = {}
24 | rfftn_object = {}
25 |
26 | def ifft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
27 | global ifft_object
28 | if not (a.shape, a.dtype, overwrite_input, axis) in ifft_object:
29 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.ifft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
30 | if not b is None:
31 | if b.flags['C_CONTIGUOUS'] is True:
32 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b)
33 | else:
34 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a)
35 | b[:] = ifft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array
36 | return b
37 | else:
38 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a)
39 | return ifft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array
40 |
41 | def ifft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
42 | global ifft2_object
43 | if not (a.shape, a.dtype, overwrite_input, axes) in ifft2_object:
44 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.ifft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
45 | if not b is None:
46 | if b.flags['C_CONTIGUOUS'] is True:
47 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a, b)
48 | else:
49 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a)
50 | b[:] = ifft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
51 | return b
52 | else:
53 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a)
54 | return ifft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
55 |
56 | def ifftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
57 | global ifftn_object
58 | if not (a.shape, a.dtype, overwrite_input, axes) in ifftn_object:
59 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.ifftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
60 | if not b is None:
61 | if b.flags['C_CONTIGUOUS'] is True:
62 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b)
63 | else:
64 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a)
65 | b[:] = ifftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
66 | return b
67 | else:
68 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a)
69 | return ifftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
70 |
71 | def irfft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
72 | global irfft_object
73 | if not (a.shape, a.dtype, axis) in irfft_object:
74 | irfft_object[(a.shape, a.dtype, axis)] = pyfftw.builders.irfft(a, axis=axis, threads=threads, planner_effort=planner_effort)
75 | if overwrite_input:
76 | irfft_object[(a.shape, a.dtype, axis)](a)
77 | else:
78 | irfft_object[(a.shape, a.dtype, axis)](a.copy())
79 | if not b is None:
80 | b[:] = irfft_object[(a.shape, a.dtype, axis)].output_array
81 | return b
82 | else:
83 | return irfft_object[(a.shape, a.dtype, axis)].output_array
84 |
85 | def irfft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
86 | global irfft2_object
87 | if not (a.shape, a.dtype, axes) in irfft2_object:
88 | irfft2_object[(a.shape, a.dtype, axes)] = pyfftw.builders.irfft2(a, axes=axes, threads=threads, planner_effort=planner_effort)
89 | # Copy required for irfft2 because input is destroyed
90 | if overwrite_input:
91 | irfft2_object[(a.shape, a.dtype, axes)](a)
92 | else:
93 | irfft2_object[(a.shape, a.dtype, axes)](a.copy())
94 | if not b is None:
95 | b[:] = irfft2_object[(a.shape, a.dtype, axes)].output_array
96 | return b
97 | else:
98 | return irfft2_object[(a.shape, a.dtype, axes)].output_array
99 |
100 | def irfftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
101 | global irfftn_object
102 | if not (a.shape, a.dtype, axes) in irfftn_object:
103 | irfftn_object[(a.shape, a.dtype, axes)] = pyfftw.builders.irfftn(a, axes=axes, threads=threads, planner_effort=planner_effort)
104 | # Copy required because input is always destroyed
105 | if overwrite_input:
106 | irfftn_object[(a.shape, a.dtype, axes)](a)
107 | else:
108 | irfftn_object[(a.shape, a.dtype, axes)](a.copy())
109 | if not b is None:
110 | b[:] = irfftn_object[(a.shape, a.dtype, axes)].output_array
111 | return b
112 | else:
113 | return irfftn_object[(a.shape, a.dtype, axes)].output_array
114 |
115 | def fft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
116 | global fft_object
117 | if not (a.shape, a.dtype, overwrite_input, axis) in fft_object:
118 | fft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.fft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
119 | if not b is None:
120 | if b.flags['C_CONTIGUOUS'] is True:
121 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b)
122 | else:
123 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a)
124 | b[:] = fft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array
125 | return b
126 | else:
127 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a)
128 | return fft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array
129 |
130 | def fft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
131 | global fft2_object
132 | if not (a.shape, a.dtype, overwrite_input, axes) in fft2_object:
133 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.fft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
134 | if not b is None:
135 | if b.flags['C_CONTIGUOUS'] is True:
136 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)](a, b)
137 | else:
138 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)](a)
139 | b[:] = fft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
140 | return b
141 | else:
142 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)](a)
143 | return fft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
144 |
145 | def fftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
146 | global fftn_object
147 | if not (a.shape, a.dtype, overwrite_input, axes) in fftn_object:
148 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.fftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
149 | if not b is None:
150 | if b.flags['C_CONTIGUOUS'] is True:
151 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b)
152 | else:
153 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a)
154 | b[:] = fftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
155 | return b
156 | else:
157 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a)
158 | return fftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
159 |
160 | def rfft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
161 | global rfft_object
162 | if not (a.shape, a.dtype, overwrite_input, axis) in rfft_object:
163 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.rfft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
164 | if not b is None:
165 | if b.flags['C_CONTIGUOUS'] is True:
166 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b)
167 | else:
168 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a)
169 | b[:] = rfft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array
170 | return b
171 | else:
172 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a)
173 | return rfft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array
174 |
175 | def rfft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
176 | global rfft2_object
177 | if not (a.shape, a.dtype, overwrite_input, axes) in rfft2_object:
178 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.rfft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
179 | if not b is None:
180 | if b.flags['C_CONTIGUOUS'] is True:
181 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a, b)
182 | else:
183 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a)
184 | b[:] = rfft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
185 | return b
186 | else:
187 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a)
188 | return rfft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
189 |
190 | def rfftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
191 | global rfftn_object
192 | if not (a.shape, a.dtype, overwrite_input, axes) in rfftn_object:
193 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.rfftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort)
194 | if not b is None:
195 | if b.flags['C_CONTIGUOUS'] is True:
196 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b)
197 | else:
198 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a)
199 | b[:] = rfftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
200 | return b
201 | else:
202 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a)
203 | return rfftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array
204 |
205 | if hasattr(pyfftw.builders, "dct"):
206 | #@profile
207 | def dct(a, b, type=2, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_EXHAUSTIVE"):
208 | global dct_object
209 | key = (a.shape, a.dtype, overwrite_input, axis, type)
210 | if not key in dct_object:
211 | if iscomplexobj(a):
212 | ac = a.real.copy()
213 | else:
214 | ac = a
215 | dct_object[key] = pyfftw.builders.dct(ac, axis=axis, type=type,
216 | overwrite_input=overwrite_input,
217 | threads=threads,
218 | planner_effort=planner_effort)
219 |
220 | dobj = dct_object[key]
221 | c = dobj.get_output_array()
222 | if iscomplexobj(a):
223 | dobj(a.real, c)
224 | b.real[:] = c
225 | dobj(a.imag, c)
226 | b.imag[:] = c
227 |
228 | else:
229 | dobj(a)
230 | b[:] = c
231 | return b
232 |
233 | else:
234 | dct1 = pyfftw.interfaces.scipy_fftpack.dct
235 | #@profile
236 | def dct(a, b, type=2, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
237 | if iscomplexobj(a):
238 | b.real[:] = dct1(a.real, type=type, axis=axis)
239 | b.imag[:] = dct1(a.imag, type=type, axis=axis)
240 | return b
241 |
242 | else:
243 | b[:] = dct1(a, type=type, axis=axis)
244 | return b
245 |
246 |
247 | #def fft(a, b=None, axis=0):
248 | #if b is None:
249 | #b = nfft.fft(a, axis=axis)
250 | #else:
251 | #b[:] = nfft.fft(a, axis=axis)
252 | #return b
253 |
254 | #def ifft(a, b=None, axis=0):
255 | #if b is None:
256 | #b = nfft.ifft(a, axis=axis)
257 | #else:
258 | #b[:] = nfft.ifft(a, axis=axis)
259 | #return b
260 |
261 | #def rfft(a, b, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
262 | #b[:] = nfft.rfft(a, axis=axis, overwrite_input=overwrite_input)
263 | #return b
264 |
265 | #def irfft(a, b, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
266 | #b[:] = nfft.irfft(a, axis=axis, overwrite_input=overwrite_input)
267 | #return b
268 |
269 | #def fft2(a, b=None, axes=(0, 1)):
270 | #if b is None:
271 | #b = nfft.fft2(a, axes=axes)
272 | #else:
273 | #b[:] = nfft.fft2(a, axes=axes)
274 | #return b
275 |
276 | #def ifft2(a, b=None, axes=(0, 1)):
277 | #if b is None:
278 | #b = nfft.ifft2(a, axes=axes)
279 | #else:
280 | #b[:] = nfft.ifft2(a, axes=axes)
281 | #return b
282 |
283 | #def rfft2(a, b, axes=(0, 1), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
284 | #b[:] = nfft.rfft2(a, axes=axes, overwrite_input=overwrite_input)
285 | #return b
286 |
287 | #def irfft2(a, b, axes=(0, 1), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
288 | #b[:] = nfft.irfft2(a, axes=axes, overwrite_input=overwrite_input)
289 | #return b
290 |
291 | #def fftn(a, b=None, axes=(0, 1, 2)):
292 | #if b is None:
293 | #b = nfft.fftn(a, axes=axes)
294 | #else:
295 | #b[:] = nfft.fftn(a, axes=axes)
296 | #return b
297 |
298 | #def ifftn(a, b=None, axes=(0, 1, 2)):
299 | #if b is None:
300 | #b = nfft.ifftn(a, axes=axes)
301 | #else:
302 | #b[:] = nfft.ifftn(a, axes=axes)
303 | #return b
304 |
305 | #def rfftn(a, b, axes=(0, 1, 2), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
306 | #b[:] = nfft.rfftn(a, axes=axes, overwrite_input=overwrite_input)
307 | #return b
308 |
309 | #def irfftn(a, b, axes=(0, 1, 2), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"):
310 | #b[:] = nfft.irfftn(a, axes=axes, overwrite_input=overwrite_input)
311 | #return b
312 |
313 |
--------------------------------------------------------------------------------
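
The wrappers above all follow one pattern: build an FFTW plan once with
pyfftw.builders, cache it in a module-level dict keyed on
(shape, dtype, overwrite_input, axis/axes), and reuse the plan on later calls.
A condensed sketch of that caching idea (illustration only, assuming pyfftw is
installed; the real wrappers additionally handle user-supplied and non-contiguous
output arrays):

    import numpy as np
    import pyfftw

    _plans = {}

    def cached_rfft(a, axis=-1, threads=1, planner_effort="FFTW_MEASURE"):
        # Planning is expensive, execution is cheap: keep one plan per (shape, dtype, axis)
        key = (a.shape, a.dtype, axis)
        if key not in _plans:
            _plans[key] = pyfftw.builders.rfft(a, axis=axis, threads=threads,
                                               planner_effort=planner_effort)
        return _plans[key](a)

    x = np.random.random((4, 32))
    y = cached_rfft(x, axis=1)     # first call plans and executes
    y = cached_rfft(x, axis=1)     # second call reuses the cached plan
    assert np.allclose(y, np.fft.rfft(x, axis=1))
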
/mpiFFT4py/slab.py:
--------------------------------------------------------------------------------
1 | """Slab decomposition
2 | 
3 | This module contains classes for performing FFTs with slab decomposition
4 | of three-dimensional data structures data[Nx, Ny, Nz], where (Nx, Ny, Nz) is
5 | the shape of the input data. With slab decomposition only one of the three
6 | indices is distributed among the processors, so each processor holds a local
7 | data structure of shape data[Nx/P, Ny, Nz], where P is the total number of processors.
8 | 
9 | classes:
10 | R2C - For real to complex transforms
11 | C2C - For complex to complex transforms
12 | """
13 | from __future__ import division
14 | __author__ = "Mikael Mortensen <mikaem@math.uio.no>"
15 | __date__ = "2016-02-16"
16 | __copyright__ = "Copyright (C) 2016 " + __author__
17 | __license__ = "GNU Lesser GPL version 3 or any later version"
18 |
19 | from .serialFFT import *
20 | import numpy as np
21 | from .mpibase import work_arrays, datatypes
22 | from numpy.fft import fftfreq, rfftfreq
23 | from .cython.maths import dealias_filter, transpose_Uc #, transpose_Umpi
24 | from collections import defaultdict
25 | from mpi4py import MPI
26 |
27 | # Using Lisandro Dalcin's code for Alltoallw.
28 | # Note that _subsize and _distribution are only really required for
29 | # general shape meshes. Here we require power two.
30 |
31 | def _subsize(N, size, rank):
32 | return N // size + (N % size > rank)
33 |
34 | def _distribution(N, size):
35 | q = N // size
36 | r = N % size
37 | n = s = i = 0
38 | while i < size:
39 | n = q
40 | s = q * i
41 | if i < r:
42 | n += 1
43 | s += i
44 | else:
45 | s += r
46 | yield n, s
47 | i += 1
48 |
49 | class R2C(object):
50 | """Class for performing FFT in 3D using MPI
51 |
52 | Slab decomposition
53 |
54 | Args:
55 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh
56 | L - NumPy array([Lx, Ly, Lz]) The actual size of the real mesh
57 | comm - The MPI communicator object
58 | precision - "single" or "double"
59 | communication - Method used for communication ('Alltoall', 'Sendrecv_replace', 'Alltoallw')
60 | padsize - Padsize when dealias = 3/2-rule is used
61 | threads - Number of threads used by FFTs
62 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE")
63 | Give as defaultdict, with keys representing transform (e.g., fft, ifft)
64 |
65 | The forward transform is real to complex and the inverse is complex to real
66 | """
67 | def __init__(self, N, L, comm, precision,
68 | communication="Alltoallw",
69 | padsize=1.5,
70 | threads=1,
71 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")):
72 | assert len(L) == 3
73 | assert len(N) == 3
74 | self.N = N
75 | self.Nf = N[2]//2+1 # Independent complex wavenumbers in z-direction
76 | self.Nfp = int(padsize*N[2]//2+1) # Independent complex wavenumbers in z-direction for padded array
77 | self.comm = comm
78 | self.float, self.complex, self.mpitype = datatypes(precision)
79 | self.communication = communication
80 | self.num_processes = comm.Get_size()
81 | self.rank = comm.Get_rank()
82 | self.Np = N // self.num_processes
83 | self.L = L.astype(self.float)
84 | self.dealias = np.zeros(0)
85 | self.padsize = padsize
86 | self.threads = threads
87 | self.planner_effort = planner_effort
88 | self.work_arrays = work_arrays()
89 |         if self.num_processes not in [2**i for i in range(int(np.log2(N[0]))+1)]:
90 |             raise IOError("Number of cpus must be one of %s"
91 |                           % ([2**i for i in range(int(np.log2(N[0]))+1)],))
92 | self._subarraysA = []
93 | self._subarraysB = []
94 | self._counts_displs = 0
95 | self._subarraysA_pad = []
96 | self._subarraysB_pad = []
97 |
98 | def real_shape(self):
99 | """The local shape of the real data"""
100 | return (self.Np[0], self.N[1], self.N[2])
101 |
102 | def complex_shape(self):
103 | """The local shape of the complex data"""
104 | return (self.N[0], self.Np[1], self.Nf)
105 |
106 | def complex_shape_T(self):
107 | """The local transposed shape of the complex data"""
108 | return (self.Np[0], self.N[1], self.Nf)
109 |
110 | def global_real_shape(self):
111 | """Global size of problem in real physical space"""
112 | return (self.N[0], self.N[1], self.N[2])
113 |
114 | def global_complex_shape(self, padsize=1.):
115 | """Global size of problem in complex wavenumber space"""
116 | return (int(padsize*self.N[0]), int(padsize*self.N[1]),
117 | int(padsize*self.N[2]//2+1))
118 |
119 | def work_shape(self, dealias):
120 | """Shape of work arrays used in convection with dealiasing.
121 |
122 | Note the different shape whether or not padding is involved.
123 | """
124 | if dealias == '3/2-rule':
125 | return self.real_shape_padded()
126 |
127 | else:
128 | return self.real_shape()
129 |
130 | def real_local_slice(self, padsize=1):
131 | """Local slice in real space of the input array
132 |
133 | Array can be padded with padsize > 1
134 | """
135 | return (slice(int(padsize*self.rank*self.Np[0]),
136 | int(padsize*(self.rank+1)*self.Np[0]), 1),
137 | slice(0, int(padsize*self.N[1]), 1),
138 | slice(0, int(padsize*self.N[2]), 1))
139 |
140 | def complex_local_slice(self):
141 | """Local slice of complex return array"""
142 | return (slice(0, self.N[0], 1),
143 | slice(self.rank*self.Np[1], (self.rank+1)*self.Np[1], 1),
144 | slice(0, self.Nf, 1))
145 |
146 | def complex_local_wavenumbers(self):
147 | """Returns local wavenumbers of complex space"""
148 | return (fftfreq(self.N[0], 1./self.N[0]).astype(self.float),
149 | fftfreq(self.N[1], 1./self.N[1])[self.complex_local_slice()[1]].astype(self.float),
150 | rfftfreq(self.N[2], 1./self.N[2]).astype(self.float))
151 |
152 | def get_local_mesh(self):
153 | """Returns the local decomposed physical mesh"""
154 | X = np.ogrid[self.rank*self.Np[0]:(self.rank+1)*self.Np[0],
155 | :self.N[1], :self.N[2]]
156 | X[0] = (X[0]*self.L[0]/self.N[0]).astype(self.float)
157 | X[1] = (X[1]*self.L[1]/self.N[1]).astype(self.float)
158 | X[2] = (X[2]*self.L[2]/self.N[2]).astype(self.float)
159 | X = [np.broadcast_to(x, self.real_shape()) for x in X]
160 | return X
161 |
162 | def get_local_wavenumbermesh(self, scaled=False, broadcast=False, eliminate_highest_freq=False):
163 | """Returns (scaled) local decomposed wavenumbermesh
164 |
165 | If scaled is True, then the wavenumbermesh is scaled with physical mesh
166 | size. This takes care of mapping the physical domain to a computational
167 | cube of size (2pi)**3.
168 |
169 | If eliminate_highest_freq is True, then the Nyquist frequency is set to zero.
170 | """
171 | kx, ky, kz = self.complex_local_wavenumbers()
172 | if eliminate_highest_freq:
173 |             ky = fftfreq(self.N[1], 1./self.N[1]).astype(self.float)
174 | for i, k in enumerate((kx, ky, kz)):
175 | if self.N[i] % 2 == 0:
176 | k[self.N[i]//2] = 0
177 | ky = ky[self.complex_local_slice()[1]]
178 |
179 | Ks = np.meshgrid(kx, ky, kz, indexing='ij', sparse=True)
180 | for i in range(3):
181 | Ks[i] = Ks[i].astype(self.float)
182 | if scaled:
183 | Lp = 2*np.pi/self.L
184 | for i in range(3):
185 | Ks[i] *= Lp[i]
186 | K = Ks
187 | if broadcast is True:
188 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks]
189 | return K
190 |
191 | def get_dealias_filter(self):
192 | """Filter for dealiasing nonlinear convection"""
193 | K = self.get_local_wavenumbermesh()
194 | kmax = 2./3.*(self.N//2+1)
195 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1])*
196 | (abs(K[2]) < kmax[2]), dtype=np.uint8)
197 | return dealias
198 |
199 | def get_subarrays(self, padsize=1):
200 | """Subarrays for Alltoallw transforms"""
201 | datatype = MPI._typedict[np.dtype(self.complex).char]
202 | _subarraysA = [
203 | datatype.Create_subarray([int(padsize*self.N[0]), self.Np[1], self.Nf], [l, self.Np[1], self.Nf], [s, 0, 0]).Commit()
204 | for l, s in _distribution(int(padsize*self.N[0]), self.num_processes)
205 | ]
206 | _subarraysB = [
207 | datatype.Create_subarray([int(padsize*self.Np[0]), self.N[1], self.Nf], [int(padsize*self.Np[0]), l, self.Nf], [0, s, 0]).Commit()
208 | for l, s in _distribution(self.N[1], self.num_processes)
209 | ]
210 | _counts_displs = ([1] * self.num_processes, [0] * self.num_processes)
211 | return _subarraysA, _subarraysB, _counts_displs
212 |
213 | #@profile
214 | def ifftn(self, fu, u, dealias=None):
215 | """ifft in three directions using mpi.
216 |
217 | Need to do ifft in reversed order of fft
218 |
219 | dealias = "3/2-rule"
220 | - Padded transform with 3/2-rule. fu is padded with zeros
221 | before transforming to real space of shape real_shape_padded()
222 | - u is of real_shape_padded()
223 |
224 | dealias = "2/3-rule"
225 | - Transform is using 2/3-rule, i.e., frequencies higher than
226 | 2/3*N are set to zero before transforming
227 | - u is of real_shape()
228 |
229 | dealias = None
230 | - Regular transform
231 | - u is of real_shape()
232 |
233 | fu is of complex_shape()
234 | """
235 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
236 |
237 | if dealias == '2/3-rule' and self.dealias.shape == (0,):
238 | self.dealias = self.get_dealias_filter()
239 |
240 | fu_ = fu
241 | if dealias == '2/3-rule':
242 | fu_ = self.work_arrays[(fu, 0, False)]
243 | fu_[:] = fu
244 | fu_ = dealias_filter(fu_, self.dealias)
245 | #fu_ *= self.dealias
246 |
247 | if self.num_processes == 1:
248 | if not dealias == '3/2-rule':
249 | u = irfftn(fu_, u, axes=(0, 1, 2), threads=self.threads, planner_effort=self.planner_effort['irfftn'])
250 |
251 | else:
252 | assert u.shape == self.real_shape_padded()
253 |
254 | # Scale smallest array with padsize
255 | fu_ = self.work_arrays[(fu, 0, False)]
256 | fu_[:] = fu*self.padsize**3
257 |
258 | # First create padded complex array and then perform irfftn
259 | fu_padded = self.work_arrays[(self.global_complex_shape(padsize=1.5), self.complex, 0)]
260 | fu_padded[:self.N[0]//2, :self.N[1]//2, :self.Nf] = fu_[:self.N[0]//2, :self.N[1]//2]
261 | fu_padded[:self.N[0]//2, -self.N[1]//2:, :self.Nf] = fu_[:self.N[0]//2, self.N[1]//2:]
262 | fu_padded[-self.N[0]//2:, :self.N[1]//2, :self.Nf] = fu_[self.N[0]//2:, :self.N[1]//2]
263 | fu_padded[-self.N[0]//2:, -self.N[1]//2:, :self.Nf] = fu_[self.N[0]//2:, -self.N[1]//2:]
264 |
265 | u[:] = irfftn(fu_padded, overwrite_input=True,
266 | axes=(0, 1, 2), threads=self.threads,
267 | planner_effort=self.planner_effort['irfftn'])
268 | return u
269 |
270 | if not dealias == '3/2-rule':
271 | # Intermediate work arrays required for transform
272 | Uc_hat = self.work_arrays[(self.complex_shape(), self.complex, 0, False)]
273 |
274 | # Do first owned direction
275 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft'])
276 |
277 | if self.communication == 'Alltoall':
278 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)]
279 |
280 | ## Communicate all values
281 | self.comm.Alltoall([Uc_hat, self.mpitype], [Uc_mpi, self.mpitype])
282 | #Uc_hatT = np.rollaxis(Uc_mpi, 1).reshape(self.complex_shape_T())
283 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)]
284 | Uc_hatT = transpose_Uc(Uc_hatT, Uc_mpi, self.num_processes, self.Np[0], self.Np[1], self.Nf)
285 |
286 | #self.comm.Alltoall(MPI.IN_PLACE, [Uc_hat, self.mpitype])
287 | #Uc_hatT = np.rollaxis(Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf)), 1).reshape(self.complex_shape_T())
288 |
289 | elif self.communication == 'Sendrecv_replace':
290 | Uc_send = Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf))
291 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)]
292 |                 for i in range(self.num_processes):
293 | if not i == self.rank:
294 | self.comm.Sendrecv_replace([Uc_send[i], self.mpitype], i, 0, i, 0)
295 | Uc_hatT[:, i*self.Np[1]:(i+1)*self.Np[1]] = Uc_send[i]
296 |
297 | elif self.communication == 'Alltoallw':
298 | if len(self._subarraysA) == 0:
299 | self._subarraysA, self._subarraysB, self._counts_displs = self.get_subarrays()
300 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)]
301 | self.comm.Alltoallw(
302 | [Uc_hat, self._counts_displs, self._subarraysA],
303 | [Uc_hatT, self._counts_displs, self._subarraysB])
304 |
305 | # Do last two directions
306 | u = irfft2(Uc_hatT, u, overwrite_input=True, axes=(1, 2),
307 | threads=self.threads,
308 | planner_effort=self.planner_effort['irfft2'])
309 |
310 | else:
311 | assert self.num_processes <= self.N[0]//2, "Number of processors cannot be larger than N[0]//2 for 3/2-rule"
312 |
313 | # Intermediate work arrays required for transform
314 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0)]
315 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0, False)]
316 | Upad_hat2 = self.work_arrays[(self.complex_shape_padded_2(), self.complex, 0)]
317 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0)]
318 |
319 | # Expand in x-direction and perform ifft
320 | Upad_hat = R2C.copy_to_padded(fu*self.padsize**3, Upad_hat, self.N, axis=0)
321 | Upad_hat[:] = ifft(Upad_hat, axis=0, threads=self.threads,
322 | planner_effort=self.planner_effort['ifft'])
323 |
324 | if not self.communication == 'Alltoallw':
325 | # Communicate to distribute first dimension (like Fig. 2b but padded in x-dir)
326 | self.comm.Alltoall(MPI.IN_PLACE, [Upad_hat, self.mpitype])
327 | Upad_hat1[:] = np.rollaxis(Upad_hat.reshape(self.complex_shape_padded_0_I()), 1).reshape(Upad_hat1.shape)
328 |
329 | else:
330 | if len(self._subarraysA_pad) == 0:
331 | self._subarraysA_pad, self._subarraysB_pad, self._counts_displs = self.get_subarrays(padsize=self.padsize)
332 | self.comm.Alltoallw(
333 | [Upad_hat, self._counts_displs, self._subarraysA_pad],
334 | [Upad_hat1, self._counts_displs, self._subarraysB_pad])
335 |
336 | # Transpose data and pad in y-direction before doing ifft. Now data is padded in x and y
337 | Upad_hat2 = R2C.copy_to_padded(Upad_hat1, Upad_hat2, self.N, axis=1)
338 | Upad_hat2[:] = ifft(Upad_hat2, axis=1, threads=self.threads,
339 | planner_effort=self.planner_effort['ifft'])
340 |
341 | # pad in z-direction and perform final irfft
342 | Upad_hat3 = R2C.copy_to_padded(Upad_hat2, Upad_hat3, self.N, axis=2)
343 | u[:] = irfft(Upad_hat3, overwrite_input=True, axis=2, threads=self.threads,
344 | planner_effort=self.planner_effort['irfft'])
345 |
346 | return u
347 |
348 | #@profile
349 | def fftn(self, u, fu, dealias=None):
350 | """fft in three directions using mpi
351 |
352 | dealias = "3/2-rule"
353 | - Truncated transform with 3/2-rule. The transformed fu is truncated
354 | when copied to complex space of complex_shape()
355 | - fu is of complex_shape()
356 | - u is of real_shape_padded()
357 |
358 | dealias = "2/3-rule" or None
359 | - Regular transform
360 | - fu is of complex_shape()
361 | - u is of real_shape()
362 |
363 | """
364 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
365 |
366 | if self.num_processes == 1:
367 | if not dealias == '3/2-rule':
368 | assert u.shape == self.real_shape()
369 | fu = rfftn(u, fu, axes=(0, 1, 2), threads=self.threads,
370 | planner_effort=self.planner_effort['rfftn'])
371 |
372 | else:
373 | assert u.shape == self.real_shape_padded()
374 |
375 | fu_padded = self.work_arrays[(self.global_complex_shape(padsize=1.5),
376 | self.complex, 0, False)]
377 | fu_padded = rfftn(u, fu_padded, axes=(0, 1, 2),
378 | planner_effort=self.planner_effort['rfftn'])
379 |
380 | # Copy with truncation
381 | fu.fill(0)
382 | fu[:self.N[0]//2+1, :self.N[1]//2+1] = fu_padded[:self.N[0]//2+1, :self.N[1]//2+1, :self.Nf]
383 | fu[:self.N[0]//2+1, self.N[1]//2:] += fu_padded[:self.N[0]//2+1, -self.N[1]//2:, :self.Nf]
384 | fu[self.N[0]//2:, :self.N[1]//2+1] += fu_padded[-self.N[0]//2:, :self.N[1]//2+1, :self.Nf]
385 | fu[self.N[0]//2:, self.N[1]//2:] += fu_padded[-self.N[0]//2:, -self.N[1]//2:, :self.Nf]
386 | fu /= self.padsize**3
387 |
388 | return fu
389 |
390 | if not dealias == '3/2-rule':
391 |
392 | Uc_hat = self.work_arrays[(fu, 0, False)]
393 |
394 | if self.communication == 'Alltoall':
395 | # Intermediate work arrays required for transform
396 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)]
397 | U_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)]
398 |
399 | # Do 2 ffts in y-z directions on owned data
400 | Uc_hatT = rfft2(u, Uc_hatT, axes=(1, 2), threads=self.threads, planner_effort=self.planner_effort['rfft2'])
401 |
402 | #Transform data to align with x-direction
403 | U_mpi[:] = np.rollaxis(Uc_hatT.reshape(self.Np[0], self.num_processes, self.Np[1], self.Nf), 1)
404 |
405 | #Communicate all values
406 | self.comm.Alltoall([U_mpi, self.mpitype], [Uc_hat, self.mpitype])
407 |
408 | ## Transform data to align with x-direction
409 | #U_mpi = transpose_Umpi(U_mpi, Uc_hatT, self.num_processes, self.Np[0], self.Np[1], self.Nf)
410 |
411 | ## Communicate all values
412 | #self.comm.Alltoall([U_mpi, self.mpitype], [fu, self.mpitype])
413 |
414 | elif self.communication == 'Sendrecv_replace':
415 | # Communicating intermediate result
416 | ft = Uc_hat.transpose(1, 0, 2)
417 | ft = rfft2(u, ft, axes=(1, 2), threads=self.threads,
418 | planner_effort=self.planner_effort['rfft2'])
419 | fu_send = Uc_hat.reshape((self.num_processes, self.Np[1],
420 | self.Np[1], self.Nf))
421 |                 for i in range(self.num_processes):
422 | if not i == self.rank:
423 | self.comm.Sendrecv_replace([fu_send[i], self.mpitype], i, 0, i, 0)
424 | fu_send[:] = fu_send.transpose(0, 2, 1, 3)
425 |
426 | elif self.communication == 'Alltoallw':
427 | if len(self._subarraysA) == 0:
428 | self._subarraysA, self._subarraysB, self._counts_displs = self.get_subarrays()
429 |
430 | # Intermediate work arrays required for transform
431 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)]
432 |
433 | # Do 2 ffts in y-z directions on owned data
434 | Uc_hatT = rfft2(u, Uc_hatT, axes=(1, 2), threads=self.threads,
435 | planner_effort=self.planner_effort['rfft2'])
436 |
437 | self.comm.Alltoallw(
438 | [Uc_hatT, self._counts_displs, self._subarraysB],
439 | [Uc_hat, self._counts_displs, self._subarraysA])
440 |
441 | # Do fft for last direction
442 | fu = fft(Uc_hat, fu, overwrite_input=True, axis=0,
443 | threads=self.threads, planner_effort=self.planner_effort['fft'])
444 |
445 | else:
446 | assert self.num_processes <= self.N[0]//2, "Number of processors cannot be larger than N[0]//2 for 3/2-rule"
447 | assert u.shape == self.real_shape_padded()
448 |
449 | # Intermediate work arrays required for transform
450 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)]
451 | Upad_hat0 = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 1, False)]
452 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0)]
453 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)]
454 |
455 | # Do ffts in the padded y and z directions
456 | Upad_hat3 = rfft2(u, Upad_hat3, axes=(1, 2), threads=self.threads,
457 | planner_effort=self.planner_effort['rfft2'])
458 |
459 | # Copy with truncation
460 | Upad_hat1 = R2C.copy_from_padded(Upad_hat3, Upad_hat1, self.N, 1)
461 |
462 | if self.communication == 'Alltoall':
463 |                 # Transpose and communicate data
464 | Upad_hat0[:] = np.rollaxis(Upad_hat1.reshape(self.complex_shape_padded_I()), 1).reshape(Upad_hat0.shape)
465 | self.comm.Alltoall(MPI.IN_PLACE, [Upad_hat0, self.mpitype])
466 |
467 | elif self.communication == 'Alltoallw':
468 | if len(self._subarraysA_pad) == 0:
469 | self._subarraysA_pad, self._subarraysB_pad, self._counts_displs = self.get_subarrays(padsize=self.padsize)
470 |
471 | self.comm.Alltoallw(
472 | [Upad_hat1, self._counts_displs, self._subarraysB_pad],
473 | [Upad_hat0, self._counts_displs, self._subarraysA_pad])
474 |
475 | # Perform fft of data in x-direction
476 | Upad_hat = fft(Upad_hat0, Upad_hat, axis=0, threads=self.threads,
477 | planner_effort=self.planner_effort['fft'])
478 |
479 | # Truncate to original complex shape
480 | fu.fill(0)
481 | fu[:self.N[0]//2+1] = Upad_hat[:self.N[0]//2+1]
482 | fu[self.N[0]//2:] += Upad_hat[-self.N[0]//2:]
483 | fu /= self.padsize**3
484 |
485 | return fu
486 |
487 | def real_shape_padded(self):
488 | """The local shape of the real data"""
489 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), int(self.padsize*self.N[2]))
490 |
491 | def complex_shape_padded_0(self):
492 | """Padding in x-direction"""
493 | return (int(self.padsize*self.N[0]), self.Np[1], self.Nf)
494 |
495 | def complex_shape_padded_0_I(self):
496 | """Padding in x-direction - reshaped for MPI communications"""
497 | return (self.num_processes, int(self.padsize*self.Np[0]), self.Np[1], self.Nf)
498 |
499 | def complex_shape_padded_1(self):
500 | """Transpose of complex_shape_padded_0"""
501 | return (int(self.padsize*self.Np[0]), self.N[1], self.Nf)
502 |
503 | def complex_shape_padded_2(self):
504 | """Padding in x and y-directions"""
505 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), self.Nf)
506 |
507 | def complex_shape_padded_3(self):
508 | """Padding in all directions.
509 | ifft of this shape leads to real_shape_padded"""
510 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), self.Nfp)
511 |
512 | def complex_shape_padded_I(self):
513 | """A local intermediate shape of the complex data"""
514 | return (int(self.padsize*self.Np[0]), self.num_processes, self.Np[1], self.Nf)
515 |
516 | @staticmethod
517 | def copy_to_padded(fu, fp, N, axis=0):
518 | if axis == 0:
519 | fp[:N[0]//2] = fu[:N[0]//2]
520 | fp[-N[0]//2:] = fu[N[0]//2:]
521 | elif axis == 1:
522 | fp[:, :N[1]//2] = fu[:, :N[1]//2]
523 | fp[:, -N[1]//2:] = fu[:, N[1]//2:]
524 | elif axis == 2:
525 | fp[:, :, :(N[2]//2+1)] = fu[:]
526 | return fp
527 |
528 | @staticmethod
529 | def copy_from_padded(fp, fu, N, axis=0):
530 | if axis == 1:
531 | fu.fill(0)
532 | fu[:, :N[1]//2+1] = fp[:, :N[1]//2+1, :(N[2]//2+1)]
533 | fu[:, N[1]//2:] += fp[:, -N[1]//2:, :(N[2]//2+1)]
534 | elif axis == 2:
535 | fu[:] = fp[:, :, :(N[2]//2+1)]
536 | return fu
537 |
538 | class C2C(R2C):
539 | """Class for performing FFT in 3D using MPI
540 |
541 | Slab decomposition
542 |
543 | Args:
544 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh
545 | L - NumPy array([Lx, Ly, Lz]) The actual size of the real mesh
546 | comm - The MPI communicator object
547 | precision - "single" or "double"
548 | communication - Method used for communication ('Alltoall', 'Sendrecv_replace')
549 | padsize - Padsize when dealias = 3/2-rule is used
550 | threads - Number of threads used by FFTs
551 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE")
552 | Give as defaultdict, with keys representing transform (e.g., fft, ifft)
553 |
554 | The transform is complex to complex
555 | """
556 | def __init__(self, N, L, comm, precision,
557 | communication="Alltoall",
558 | padsize=1.5,
559 | threads=1,
560 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")):
561 | R2C.__init__(self, N, L, comm, precision,
562 | communication=communication,
563 | padsize=padsize, threads=threads,
564 | planner_effort=planner_effort)
565 |         # Reuse all shapes from the R2C transform simply by resizing the final complex z-dimension:
566 | self.Nf = N[2]
567 | self.Nfp = int(self.padsize*self.N[2]) # Independent complex wavenumbers in z-direction for padded array
568 |
569 | # Rename since there's no real space
570 | self.original_shape_padded = self.real_shape_padded
571 | self.original_shape = self.real_shape
572 | self.transformed_shape = self.complex_shape
573 | self.original_local_slice = self.real_local_slice
574 | self.transformed_local_slice = self.complex_local_slice
575 | self.ks = (fftfreq(N[2])*N[2]).astype(int)
576 |
577 | def global_shape(self, padsize=1.):
578 | """Global size of problem in transformed space"""
579 | return (int(padsize*self.N[0]), int(padsize*self.N[1]),
580 | int(padsize*self.N[2]))
581 |
582 | def transformed_local_wavenumbers(self):
583 | return (fftfreq(self.N[0], 1./self.N[0]),
584 | fftfreq(self.N[1], 1./self.N[1])[self.transformed_local_slice()[1]],
585 | fftfreq(self.N[2], 1./self.N[2]))
586 |
587 | def ifftn(self, fu, u, dealias=None):
588 | """ifft in three directions using mpi.
589 | Need to do ifft in reversed order of fft
590 |
591 | dealias = "3/2-rule"
592 | - Padded transform with 3/2-rule. fu is padded with zeros
593 | before transforming to complex space of shape original_shape_padded()
594 | - u is of original_shape_padded()
595 |
596 | dealias = "2/3-rule"
597 | - Transform is using 2/3-rule, i.e., frequencies higher than
598 | 2/3*N are set to zero before transforming
599 | - u is of original_shape()
600 |
601 | dealias = None
602 | - Regular transform
603 | - u is of original_shape()
604 |
605 | fu is of transformed_shape()
606 | """
607 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
608 |
609 | if dealias == '2/3-rule' and self.dealias.shape == (0,):
610 | self.dealias = self.get_dealias_filter()
611 |
612 | if self.num_processes == 1:
613 | if not dealias == '3/2-rule':
614 | fu_ = fu
615 | if dealias == '2/3-rule':
616 | fu_ = self.work_arrays[(fu, 0, False)]
617 | fu_[:] = fu
618 | fu_ *= self.dealias
619 |
620 | u = ifftn(fu_, u, axes=(0, 1, 2), threads=self.threads,
621 | planner_effort=self.planner_effort['ifftn'])
622 |
623 | else:
624 | assert u.shape == self.original_shape_padded()
625 |
626 |                 # First create padded complex array and then perform ifftn
627 | fu_padded = self.work_arrays[(u, 0)]
628 | fu_padded[:self.N[0]//2, :self.N[1]//2, self.ks] = fu[:self.N[0]//2, :self.N[1]//2]
629 | fu_padded[:self.N[0]//2, -self.N[1]//2:, self.ks] = fu[:self.N[0]//2, self.N[1]//2:]
630 | fu_padded[-self.N[0]//2:, :self.N[1]//2, self.ks] = fu[self.N[0]//2:, :self.N[1]//2]
631 | fu_padded[-self.N[0]//2:, -self.N[1]//2:, self.ks] = fu[self.N[0]//2:, self.N[1]//2:]
632 | u = ifftn(fu_padded*self.padsize**3, u, overwrite_input=True,
633 | axes=(0, 1, 2), threads=self.threads,
634 | planner_effort=self.planner_effort['ifftn'])
635 |
636 | return u
637 |
638 | if not dealias == '3/2-rule':
639 | fu_ = fu
640 | if dealias == '2/3-rule':
641 | fu_ = self.work_arrays[(fu, 0, False)]
642 | fu_[:] = fu
643 | fu_ *= self.dealias
644 |
645 | # Intermediate work arrays required for transform
646 | Uc_hat = self.work_arrays[(self.complex_shape(), self.complex, 0, False)]
647 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)]
648 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)]
649 |
650 | # Do first owned direction
651 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads,
652 | planner_effort=self.planner_effort['ifft'])
653 |
654 | if self.communication == 'Alltoall':
655 | # Communicate all values
656 | self.comm.Alltoall([Uc_hat, self.mpitype], [Uc_mpi, self.mpitype])
657 | Uc_hatT[:] = np.rollaxis(Uc_mpi, 1).reshape(Uc_hatT.shape)
658 |
659 | else:
660 | Uc_send = Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf))
661 |                 for i in range(self.num_processes):
662 | if not i == self.rank:
663 | self.comm.Sendrecv_replace([Uc_send[i], self.mpitype], i, 0, i, 0)
664 | Uc_hatT[:, i*self.Np[1]:(i+1)*self.Np[1]] = Uc_send[i]
665 |
666 | # Do last two directions
667 | u = ifft2(Uc_hatT, u, overwrite_input=True, axes=(1, 2),
668 | threads=self.threads,
669 | planner_effort=self.planner_effort['ifft2'])
670 |
671 | else:
672 | # Intermediate work arrays required for transform
673 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)]
674 | U_mpi = self.work_arrays[(self.complex_shape_padded_0_I(), self.complex, 0, False)]
675 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0, False)]
676 | Upad_hat2 = self.work_arrays[(self.complex_shape_padded_2(), self.complex, 0, False)]
677 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)]
678 |
679 | # Expand in x-direction and perform ifft
680 | Upad_hat = C2C.copy_to_padded(fu*self.padsize**3, Upad_hat, self.N, axis=0)
681 | Upad_hat[:] = ifft(Upad_hat, axis=0, threads=self.threads,
682 | planner_effort=self.planner_effort['ifft'])
683 |
684 | # Communicate to distribute first dimension (like Fig. 2b but padded in x-dir and z-direction of full size)
685 | self.comm.Alltoall([Upad_hat, self.mpitype], [U_mpi, self.mpitype])
686 |
687 | # Transpose data and pad in y-direction before doing ifft. Now data is padded in x and y
688 | Upad_hat1[:] = np.rollaxis(U_mpi, 1).reshape(Upad_hat1.shape)
689 | Upad_hat2 = C2C.copy_to_padded(Upad_hat1, Upad_hat2, self.N, axis=1)
690 | Upad_hat2[:] = ifft(Upad_hat2, axis=1, threads=self.threads,
691 | planner_effort=self.planner_effort['ifft'])
692 |
693 | # pad in z-direction and perform final ifft
694 | Upad_hat3 = C2C.copy_to_padded(Upad_hat2, Upad_hat3, self.N, axis=2)
695 | u = ifft(Upad_hat3, u, overwrite_input=True, axis=2,
696 | threads=self.threads, planner_effort=self.planner_effort['ifft'])
697 |
698 | return u
699 |
700 | def fftn(self, u, fu, dealias=None):
701 | """fft in three directions using mpi
702 |
703 | dealias = "3/2-rule"
704 |           - Truncated transform with 3/2-rule. The transformed fu is truncated
705 | when copied to complex space of complex_shape()
706 | - fu is of transformed_shape()
707 | - u is of original_shape_padded()
708 |
709 | dealias = "2/3-rule"
710 | - Regular transform
711 | - fu is of transformed_shape()
712 | - u is of original_shape()
713 |
714 | dealias = None
715 | - Regular transform
716 | - fu is of transformed_shape()
717 | - u is of original_shape()
718 | """
719 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None)
720 |
721 | if self.num_processes == 1:
722 | if not dealias == '3/2-rule':
723 | assert u.shape == self.original_shape()
724 |
725 | fu = fftn(u, fu, axes=(0, 1, 2), threads=self.threads,
726 | planner_effort=self.planner_effort['fftn'])
727 |
728 | else:
729 | assert u.shape == self.original_shape_padded()
730 |
731 | fu_padded = self.work_arrays[(u, 0)]
732 | fu_padded = fftn(u, fu_padded, axes=(0, 1, 2), threads=self.threads,
733 | planner_effort=self.planner_effort['fftn'])
734 |
735 | # Copy with truncation
736 | fu[:self.N[0]//2, :self.N[1]//2] = fu_padded[:self.N[0]//2, :self.N[1]//2, self.ks]
737 | fu[:self.N[0]//2, self.N[1]//2:] = fu_padded[:self.N[0]//2, -self.N[1]//2:, self.ks]
738 | fu[self.N[0]//2:, :self.N[1]//2] = fu_padded[-self.N[0]//2:, :self.N[1]//2, self.ks]
739 | fu[self.N[0]//2:, self.N[1]//2:] = fu_padded[-self.N[0]//2:, -self.N[1]//2:, self.ks]
740 | fu /= self.padsize**3
741 | return fu
742 |
743 | if not dealias == '3/2-rule':
744 | if self.communication == 'Alltoall':
745 | # Intermediate work arrays required for transform
746 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)]
747 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)]
748 |
749 | # Do 2 ffts in y-z directions on owned data
750 | Uc_hatT = fft2(u, Uc_hatT, axes=(1,2), threads=self.threads, planner_effort=self.planner_effort['fft2'])
751 |
752 | # Transform data to align with x-direction
753 | Uc_mpi[:] = np.rollaxis(Uc_hatT.reshape(self.Np[0], self.num_processes, self.Np[1], self.Nf), 1)
754 |
755 | # Communicate all values
756 | self.comm.Alltoall([Uc_mpi, self.mpitype], [fu, self.mpitype])
757 |
758 | else:
759 | # Communicating intermediate result
760 | ft = fu.transpose(1, 0, 2)
761 | ft = fft2(u, ft, axes=(1, 2), threads=self.threads,
762 | planner_effort=self.planner_effort['fft2'])
763 | fu_send = fu.reshape((self.num_processes, self.Np[1],
764 | self.Np[1], self.Nf))
765 |                 for i in range(self.num_processes):
766 | if not i == self.rank:
767 | self.comm.Sendrecv_replace([fu_send[i], self.mpitype], i, 0, i, 0)
768 | fu_send[:] = fu_send.transpose(0, 2, 1, 3)
769 |
770 | # Do fft for last direction
771 | fu[:] = fft(fu, axis=0, threads=self.threads,
772 | planner_effort=self.planner_effort['fft'])
773 |
774 | else:
775 | # Intermediate work arrays required for transform
776 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)]
777 | Upad_hat0 = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 1, False)]
778 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0)]
779 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)]
780 | U_mpi = self.work_arrays[(self.complex_shape_padded_0_I(), self.complex, 0, False)]
781 |
782 | # Do ffts in y and z directions
783 | Upad_hat3 = fft2(u, Upad_hat3, axes=(1, 2), threads=self.threads,
784 | planner_effort=self.planner_effort['fft2'])
785 |
786 | # Copy with truncation
787 | Upad_hat1 = C2C.copy_from_padded(Upad_hat3, Upad_hat1, self.N, 1)
788 |
789 |             # Transpose and communicate data
790 | U_mpi[:] = np.rollaxis(Upad_hat1.reshape(self.complex_shape_padded_I()), 1)
791 | self.comm.Alltoall([U_mpi, self.mpitype], [Upad_hat0, self.mpitype])
792 |
793 | # Perform fft of data in x-direction
794 | Upad_hat = fft(Upad_hat0, Upad_hat, overwrite_input=True, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft'])
795 |
796 | # Truncate to original complex shape
797 | fu[:self.N[0]//2] = Upad_hat[:self.N[0]//2]
798 | fu[self.N[0]//2:] = Upad_hat[-self.N[0]//2:]
799 | fu /= self.padsize**3
800 |
801 | return fu
802 |
803 | @staticmethod
804 | def copy_to_padded(fu, fp, N, axis=0):
805 | if axis == 0:
806 | fp[:N[0]//2] = fu[:N[0]//2]
807 | fp[-N[0]//2:] = fu[N[0]//2:]
808 | elif axis == 1:
809 | fp[:, :N[1]//2] = fu[:, :N[1]//2]
810 | fp[:, -N[1]//2:] = fu[:, N[1]//2:]
811 | elif axis == 2:
812 | fp[:, :, :N[2]//2] = fu[:, :, :N[2]//2]
813 | fp[:, :, -N[2]//2:] = fu[:, :, N[2]//2:]
814 | return fp
815 |
816 | @staticmethod
817 | def copy_from_padded(fp, fu, N, axis=0):
818 | if axis == 1:
819 | fu.fill(0)
820 | fu[:, :N[1]//2+1, :N[2]//2+1] = fp[:, :N[1]//2+1, :N[2]//2+1]
821 | fu[:, :N[1]//2+1, N[2]//2:] += fp[:, :N[1]//2+1, -N[2]//2:]
822 | fu[:, N[1]//2:, :N[2]//2+1] += fp[:, -N[1]//2:, :N[2]//2+1]
823 | fu[:, N[1]//2:, N[2]//2:] += fp[:, -N[1]//2:, -N[2]//2:]
824 |
825 | return fu
826 |
827 |
828 | #def transpose_Uc(Uc_hatT, U_mpi, num_processes, Np0, Np1, Nf):
829 | #for i in xrange(num_processes):
830 | #Uc_hatT[:, i*Np1:(i+1)*Np1] = U_mpi[i]
831 | #return Uc_hatT
832 |
833 | #def transpose_Umpi(U_mpi, Uc_hatT, num_processes, Np0, Np1, Nf):
834 | #for i in xrange(num_processes):
835 | #U_mpi[i] = Uc_hatT[:, i*Np1:(i+1)*Np1]
836 | #return U_mpi
837 |
--------------------------------------------------------------------------------
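
A short sketch of how the slab R2C class above is typically driven, essentially what
tests/test_FFT.py does (run under mpirun with a power-of-two number of ranks not
exceeding Nx):

    import numpy as np
    from mpi4py import MPI
    from mpiFFT4py.slab import R2C

    comm = MPI.COMM_WORLD
    N = np.array([32, 32, 32])
    L = np.array([2*np.pi, 2*np.pi, 2*np.pi])
    FFT = R2C(N, L, comm, "double", communication="Alltoallw")

    U = np.random.random(FFT.real_shape()).astype(FFT.float)   # local slab of real data
    U_hat = np.zeros(FFT.complex_shape(), dtype=FFT.complex)
    U_hat = FFT.fftn(U, U_hat)     # parallel forward transform
    U2 = np.zeros_like(U)
    U2 = FFT.ifftn(U_hat, U2)      # parallel inverse transform recovers U
    assert np.allclose(U, U2, 1e-8, 1e-10)
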
/requirements.txt:
--------------------------------------------------------------------------------
1 | mpi4py
2 | cython
3 | numpy>=1.15
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import re
5 | import subprocess
6 | from setuptools import setup, Extension
7 | from setuptools.command.build_ext import build_ext
8 | from numpy import get_include
9 |
10 | cwd = os.path.abspath(os.path.dirname(__file__))
11 | cdir = os.path.join(cwd, "mpiFFT4py", "cython")
12 |
13 | def has_flag(compiler, flagname):
14 | """Return a boolean indicating whether a flag name is supported on
15 | the specified compiler.
16 | """
17 |     # Run the preprocessor directly (no shell) so the flag actually reaches the compiler
18 |     with open(os.devnull, "w") as devnull:
19 |         p = subprocess.Popen([compiler.compiler[0], '-E', '-'] + [flagname],
20 |                              stdin=subprocess.PIPE, stdout=devnull, stderr=devnull)
21 |         p.communicate(b"")
22 |     return p.returncode == 0
23 |
24 | class build_ext_subclass(build_ext):
25 | def build_extensions(self):
26 | extra_compile_args = ['-g0']
27 | for c in ['-w', '-Ofast', '-ffast-math', '-march=native']:
28 | if has_flag(self.compiler, c):
29 | extra_compile_args.append(c)
30 |
31 | for e in self.extensions:
32 | e.extra_compile_args += extra_compile_args
33 | e.include_dirs.extend([get_include()])
34 | build_ext.build_extensions(self)
35 |
36 | ext = [Extension('mpiFFT4py.cython.maths',
37 | sources=[os.path.join(cdir, "maths.pyx")])]
38 |
39 | def version():
40 | srcdir = os.path.join(cwd, 'mpiFFT4py')
41 | with open(os.path.join(srcdir, '__init__.py')) as f:
42 | m = re.search(r"__version__\s*=\s*'(.*)'", f.read())
43 | return m.groups()[0]
44 |
45 | with open("README.rst", "r") as fh:
46 | long_description = fh.read()
47 |
48 | setup(name = "mpiFFT4py",
49 | version = version(),
50 | description = "mpiFFT4py -- Parallel 3D FFT in Python using MPI for Python",
51 | long_description = long_description,
52 | author = "Mikael Mortensen",
53 | author_email = "mikaem@math.uio.no",
54 | url = 'https://github.com/spectralDNS/mpiFFT4py',
55 | classifiers = [
56 | 'Development Status :: 5 - Production/Stable',
57 | 'Environment :: Console',
58 | 'Intended Audience :: Developers',
59 | 'Intended Audience :: Science/Research',
60 | 'Intended Audience :: Education',
61 | 'Programming Language :: Python',
62 | 'Programming Language :: Python :: 2',
63 | 'Programming Language :: Python :: 3',
64 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)',
65 | 'Topic :: Scientific/Engineering :: Mathematics',
66 | 'Topic :: Software Development :: Libraries :: Python Modules',
67 | ],
68 | packages = ["mpiFFT4py",
69 | "mpiFFT4py.serialFFT",
70 | "mpiFFT4py.cython"
71 | ],
72 | package_dir = {"mpiFFT4py": "mpiFFT4py"},
73 | install_requires=["numpy"],
74 | setup_requires=["numpy>=1.11",
75 | "cython>=0.25",
76 | "setuptools>=18.0"],
77 | ext_modules = ext,
78 | cmdclass = {'build_ext': build_ext_subclass}
79 | )
80 |
--------------------------------------------------------------------------------
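
The version() helper above scrapes __version__ out of mpiFFT4py/__init__.py with a
regular expression instead of importing the package, since importing would require
the compiled Cython extension to exist already. A minimal illustration of that
pattern on a stand-in string (the '2.0.0' value is hypothetical, not the real
package version):

    import re

    init_py = "__version__ = '2.0.0'\n"   # stand-in for mpiFFT4py/__init__.py contents
    match = re.search(r"__version__\s*=\s*'(.*)'", init_py)
    assert match.groups()[0] == '2.0.0'
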
/tests/test_FFT.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import string
3 | import numpy as np
4 | from numpy.random import random, randn
5 | from numpy import allclose, empty, zeros, zeros_like, pi, array, all, float64
6 | from numpy.fft import fftfreq
7 | from mpi4py import MPI
8 |
9 | from mpiFFT4py.pencil import R2C as Pencil_R2C
10 | from mpiFFT4py.slab import R2C as Slab_R2C
11 | from mpiFFT4py.line import R2C as Line_R2C
12 | from mpiFFT4py import rfft2, rfftn, irfftn, irfft2, fftn, ifftn, irfft, ifft
13 | from mpiFFT4py.slab import C2C
14 |
15 | def reset_profile(prof):
16 | prof.code_map = {}
17 | prof.last_time = {}
18 | prof.enable_count = 0
19 | for func in prof.functions:
20 | prof.add_function(func)
21 |
22 | N = 2**5
23 | L = array([2*pi, 2*pi, 2*pi])
24 | ks = (fftfreq(N)*N).astype(int)
25 | comm = MPI.COMM_WORLD
26 |
27 | if comm.Get_size() >= 4:
28 | params = ("slabas", "slabad", "slabws", "slabwd",
29 | "pencilsys", "pencilsyd", "pencilnys", "pencilnyd",
30 | "pencilsxd", "pencilsxs", "pencilnxd", "pencilnxs",
31 | "pencilaxd", "pencilaxs", "pencilayd", "pencilays")
32 |
33 | else:
34 | params = ("slabas", "slabad", "slabws", "slabwd")
35 |
36 |
37 | @pytest.fixture(params=params, scope='module')
38 | def FFT(request):
39 | prec = {"s": "single", "d":"double"}[request.param[-1]]
40 | if request.param[:3] == "pen":
41 | communication = {"s": "Alltoall", "n": "AlltoallN", "a": "Alltoallw"}[request.param[-3]]
42 | alignment = request.param[-2].upper()
43 | return Pencil_R2C(array([N, 2*N, 4*N]), L, comm, prec, communication=communication, alignment=alignment)
44 | else:
45 | communication = 'Alltoall' if request.param[-2] == 'a' else 'Alltoallw'
46 | return Slab_R2C(array([N, 2*N, 4*N]), L, comm, prec, communication=communication)
47 |
48 | @pytest.fixture(params=("lines", "lined"), scope='module')
49 | def FFT2(request):
50 | prec = {"s": "single", "d":"double"}[request.param[-1]]
51 | return Line_R2C(array([N, 2*N]), L[:-1], comm, prec)
52 |
53 |
54 | @pytest.fixture(params=("slabd", "slabs"), scope='module')
55 | def FFT_C2C(request):
56 | prec = {"s": "single", "d":"double"}[request.param[-1]]
57 | return C2C(array([N, 2*N, 4*N]), L, comm, prec)
58 |
59 | #@profile
60 | def test_FFT(FFT):
61 | N = FFT.N
62 | if FFT.rank == 0:
63 | A = random(N).astype(FFT.float)
64 | if FFT.communication == 'AlltoallN':
65 | C = empty(FFT.global_complex_shape(), dtype=FFT.complex)
66 | C = rfftn(A, C, axes=(0,1,2))
67 | C[:, :, -1] = 0 # Remove Nyquist frequency
68 | A = irfftn(C, A, axes=(0,1,2))
69 | B2 = zeros(FFT.global_complex_shape(), dtype=FFT.complex)
70 | B2 = rfftn(A, B2, axes=(0,1,2))
71 |
72 | else:
73 | A = zeros(N, dtype=FFT.float)
74 | B2 = zeros(FFT.global_complex_shape(), dtype=FFT.complex)
75 |
76 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4)
77 | FFT.comm.Bcast(A, root=0)
78 | FFT.comm.Bcast(B2, root=0)
79 |
80 | a = zeros(FFT.real_shape(), dtype=FFT.float)
81 | c = zeros(FFT.complex_shape(), dtype=FFT.complex)
82 | a[:] = A[FFT.real_local_slice()]
83 | c = FFT.fftn(a, c)
84 | #print abs((c - B2[FFT.complex_local_slice()])/c.max()).max()
85 | assert all(abs((c - B2[FFT.complex_local_slice()])/c.max()) < rtol)
86 | #assert allclose(c, B2[FFT.complex_local_slice()], rtol, atol)
87 | a = FFT.ifftn(c, a)
88 | #print abs((a - A[FFT.real_local_slice()])/a.max()).max()
89 |
90 | assert all(abs((a - A[FFT.real_local_slice()])/a.max()) < rtol)
91 | #assert allclose(a, A[FFT.real_local_slice()], rtol, atol)
92 |
93 | def test_FFT2(FFT2):
94 | N = FFT2.N
95 | if FFT2.rank == 0:
96 | A = random(N).astype(FFT2.float)
97 |
98 | else:
99 | A = zeros(N, dtype=FFT2.float)
100 |
101 | atol, rtol = (1e-10, 1e-8) if FFT2.float is float64 else (5e-7, 1e-4)
102 | FFT2.comm.Bcast(A, root=0)
103 | a = zeros(FFT2.real_shape(), dtype=FFT2.float)
104 | c = zeros(FFT2.complex_shape(), dtype=FFT2.complex)
105 | a[:] = A[FFT2.real_local_slice()]
106 | c = FFT2.fft2(a, c)
107 | B2 = zeros(FFT2.global_complex_shape(), dtype=FFT2.complex)
108 | B2 = rfft2(A, B2, axes=(0,1))
109 | assert allclose(c, B2[FFT2.complex_local_slice()], rtol, atol)
110 | a = FFT2.ifft2(c, a)
111 | assert allclose(a, A[FFT2.real_local_slice()], rtol, atol)
112 |
113 | def test_FFT2_padded(FFT2):
114 | FFT = FFT2
115 | N = FFT.N
116 |     prec = "single" if FFT.float is np.float32 else "double"
117 | FFT_SELF = Line_R2C(N, FFT.L, MPI.COMM_SELF, prec)
118 |
119 | if FFT.rank == 0:
120 | A = random(N).astype(FFT.float)
121 | C = zeros((FFT.global_complex_shape()), dtype=FFT.complex)
122 | C = FFT_SELF.fft2(A, C)
123 |
124 | # Eliminate Nyquist, otherwise test will fail
125 | C[-N[0]//2] = 0
126 |
127 | A_pad = np.zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float)
128 | A_pad = FFT_SELF.ifft2(C, A_pad, dealias="3/2-rule")
129 |
130 | else:
131 | C = zeros(FFT.global_complex_shape(), dtype=FFT.complex)
132 | A_pad = zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float)
133 |
134 | FFT.comm.Bcast(C, root=0)
135 | FFT.comm.Bcast(A_pad, root=0)
136 |
137 | ae = zeros(FFT.real_shape_padded(), dtype=FFT.float)
138 | c = zeros(FFT.complex_shape(), dtype=FFT.complex)
139 |
140 | c[:] = C[FFT.complex_local_slice()]
141 | ae[:] = A_pad[FFT.real_local_slice(padsize=1.5)]
142 |
143 | ap = zeros(FFT.real_shape_padded(), dtype=FFT.float)
144 | cp = zeros(FFT.complex_shape(), dtype=FFT.complex)
145 | ap = FFT.ifft2(c, ap, dealias="3/2-rule")
146 |
147 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4)
148 |
149 | #from IPython import embed; embed()
150 | #print np.linalg.norm(ap-ae)
151 | assert allclose(ap, ae, rtol, atol)
152 |
153 | cp = FFT.fft2(ap, cp, dealias="3/2-rule")
154 |
155 | #print np.linalg.norm(abs((cp-c)/cp.max()))
156 | assert all(abs((cp-c)/cp.max()) < rtol)
157 |
158 |
159 | def test_FFT_padded(FFT):
160 | N = FFT.N
161 |     prec = "single" if FFT.float is np.float32 else "double"
162 | FFT_SELF = Slab_R2C(FFT.N, L, MPI.COMM_SELF, prec,
163 | communication=FFT.communication)
164 |
165 | if FFT.rank == 0:
166 | A = random(N).astype(FFT.float)
167 | C = zeros((FFT.global_complex_shape()), dtype=FFT.complex)
168 | C = FFT_SELF.fftn(A, C)
169 |
170 | # Eliminate Nyquist, otherwise test will fail
171 | #C[-N[0]//2] = 0
172 | #C[:, -N[1]//2] = 0
173 | if FFT.communication == 'AlltoallN':
174 | C[:, :, -1] = 0 # Remove Nyquist frequency
175 |
176 | A_pad = np.zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float)
177 | A_pad = FFT_SELF.ifftn(C, A_pad, dealias='3/2-rule')
178 |
179 | else:
180 | C = zeros(FFT.global_complex_shape(), dtype=FFT.complex)
181 | A_pad = zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float)
182 |
183 | FFT.comm.Bcast(C, root=0)
184 | FFT.comm.Bcast(A_pad, root=0)
185 |
186 | ae = zeros(FFT.real_shape_padded(), dtype=FFT.float)
187 | c = zeros(FFT.complex_shape(), dtype=FFT.complex)
188 |
189 | c[:] = C[FFT.complex_local_slice()]
190 | ae[:] = A_pad[FFT.real_local_slice(padsize=1.5)]
191 |
192 | ap = zeros(FFT.real_shape_padded(), dtype=FFT.float)
193 | cp = zeros(FFT.complex_shape(), dtype=FFT.complex)
194 | ap = FFT.ifftn(c, ap, dealias="3/2-rule")
195 |
196 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4)
197 |
198 | #print np.linalg.norm(ap-ae)
199 | assert allclose(ap, ae, rtol, atol)
200 |
201 | cp = FFT.fftn(ap, cp, dealias="3/2-rule")
202 |
203 | #from IPython import embed; embed()
204 | #print np.linalg.norm(abs((cp-c)/cp.max()))
205 | assert all(abs((cp-c)/cp.max()) < rtol)
206 |
207 | #aa = zeros(FFT.real_shape(), dtype=FFT.float)
208 | #aa = FFT.ifftn(cp, aa)
209 |
210 | #a3 = A[FFT.real_local_slice()]
211 | #assert allclose(aa, a3, rtol, atol)
212 |
213 | def test_FFT_C2C(FFT_C2C):
214 | """Test both padded and unpadded transforms"""
215 | FFT = FFT_C2C
216 | N = FFT.N
217 | atol, rtol = (1e-8, 1e-8) if FFT.float is float64 else (5e-7, 1e-4)
218 |
219 | if FFT.rank == 0:
220 | # Create a reference solution using only one CPU
221 | A = (random(N)+random(N)*1j).astype(FFT.complex)
222 | C = zeros((FFT.global_shape()), dtype=FFT.complex)
223 | C = fftn(A, C, axes=(0,1,2))
224 |
225 | # Copy to array padded with zeros
226 | Cp = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
227 | ks = (fftfreq(N[2])*N[2]).astype(int)
228 | Cp[:N[0]//2, :N[1]//2, ks] = C[:N[0]//2, :N[1]//2]
229 | Cp[:N[0]//2, -N[1]//2:, ks] = C[:N[0]//2, N[1]//2:]
230 | Cp[-N[0]//2:, :N[1]//2, ks] = C[N[0]//2:, :N[1]//2]
231 | Cp[-N[0]//2:, -N[1]//2:, ks] = C[N[0]//2:, N[1]//2:]
232 |
233 | # Get transform of padded array
234 | Ap = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
235 | Ap = ifftn(Cp*1.5**3, Ap, axes=(0,1,2))
236 |
237 | else:
238 | C = zeros(FFT.global_shape(), dtype=FFT.complex)
239 | Ap = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
240 | A = zeros(N, dtype=FFT.complex)
241 |
242 | # For testing broadcast the arrays computed on root to all CPUs
243 | FFT.comm.Bcast(C, root=0)
244 | FFT.comm.Bcast(Ap, root=0)
245 | FFT.comm.Bcast(A, root=0)
246 |
247 | # Get the single processor solution on local part of the solution
248 | ae = zeros(FFT.original_shape_padded(), dtype=FFT.complex)
249 | ae[:] = Ap[FFT.original_local_slice(padsize=1.5)]
250 | c = zeros(FFT.transformed_shape(), dtype=FFT.complex)
251 | c[:] = C[FFT.transformed_local_slice()]
252 |
253 | # Perform padded transform with MPI and assert ok
254 | ap = zeros(FFT.original_shape_padded(), dtype=FFT.complex)
255 | ap = FFT.ifftn(c, ap, dealias="3/2-rule")
256 | assert allclose(ap, ae, rtol, atol)
257 |
258 | # Perform truncated transform with MPI and assert
259 | cp = zeros(FFT.transformed_shape(), dtype=FFT.complex)
260 | cp = FFT.fftn(ap, cp, dealias="3/2-rule")
261 | assert all(abs(cp-c)/cp.max() < rtol)
262 |
263 | # Now without padding
264 | # Transform back to original
265 | aa = zeros(FFT.original_shape(), dtype=FFT.complex)
266 | aa = FFT.ifftn(c, aa)
267 | # Verify
268 | a3 = A[FFT.original_local_slice()]
269 | assert allclose(aa, a3, rtol, atol)
270 | c2 = zeros(FFT.transformed_shape(), dtype=FFT.complex)
271 | c2 = FFT.fftn(aa, c2)
272 | # Verify
273 | assert all(abs(c2-c)/c2.max() < rtol)
274 | #assert allclose(c2, c, rtol, atol)
275 |
276 | #import time
277 | #t0 = time.time()
278 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="Y", communication='Alltoall'))
279 | #t1 = time.time()
280 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="X", communication='Alltoall'))
281 | #t2 = time.time()
282 |
283 | #ty = MPI.COMM_WORLD.reduce(t1-t0, op=MPI.MIN)
284 | #tx = MPI.COMM_WORLD.reduce(t2-t1, op=MPI.MIN)
285 | #if MPI.COMM_WORLD.Get_rank() == 0:
286 | #print "Y: ", ty
287 | #print "X: ", tx
288 |
289 | #test_FFT(Slab_R2C(array([N, 2*N, 4*N]), L, MPI.COMM_WORLD, "double", communication='Alltoall'))
290 | #test_FFT(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="Y", communication='Alltoall'))
291 | #test_FFT2(Line_R2C(array([N, N]), L[:-1], MPI.COMM_WORLD, "single"))
292 | #test_FFT2_padded(Line_R2C(array([N, N]), L[:-1], MPI.COMM_WORLD, "double"))
293 | #from collections import defaultdict
294 | #FFT = Slab_R2C(array([N//4, N, N]), L, MPI.COMM_WORLD, "double", communication='Alltoallw', threads=2, planner_effort=defaultdict(lambda: "FFTW_MEASURE"))
295 | #test_FFT_padded(FFT)
296 | #reset_profile(profile)
297 | #test_FFT_padded(FFT)
298 |
299 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="X", communication='AlltoallN'))
300 | #test_FFT_C2C(C2C(array([N, N, N]), L, MPI.COMM_WORLD, "double"))
301 |
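302 | # A minimal usage sketch, assuming pytest and an MPI launcher are available
303 | # (four or more ranks enable the pencil-decomposition cases):
304 | #
305 | #     mpirun -np 4 python -m pytest tests/test_FFT.py
306 | #
307 | # The guarded call below mirrors the commented examples above and runs a
308 | # single slab check when this file is executed directly.
309 | if __name__ == '__main__':
310 |     test_FFT(Slab_R2C(array([N, 2*N, 4*N]), L, MPI.COMM_WORLD, "double",
311 |                       communication='Alltoall'))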
--------------------------------------------------------------------------------