├── .circleci └── config.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── conf └── conda │ ├── conda_build_config.yaml │ ├── meta.yaml │ └── run_test.sh ├── demo ├── spectral_dns_solver.py └── transforms_realdata.py ├── mpiFFT4py ├── __init__.py ├── cython │ ├── __init__.py │ └── maths.pyx ├── line.py ├── mpibase.py ├── pencil.py ├── serialFFT │ ├── __init__.py │ ├── numpy_fft.py │ └── pyfftw_fft.py └── slab.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests └── test_FFT.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | machine: true 5 | steps: 6 | - checkout 7 | 8 | - restore_cache: 9 | key: v2-miniconda-{{ .Branch }} 10 | 11 | - run: 12 | name: install miniconda 13 | command: | 14 | if [[ ! -d /home/circleci/miniconda ]]; then 15 | wget https://repo.continuum.io/miniconda/Miniconda3-4.5.1-Linux-x86_64.sh -O miniconda.sh && 16 | bash miniconda.sh -b -f -p /home/circleci/miniconda; 17 | else 18 | echo "Using cached miniconda"; 19 | fi 20 | source ~/miniconda/bin/activate root 21 | conda config --set always_yes yes 22 | conda config --add channels conda-forge 23 | conda config --add channels spectralDNS 24 | conda clean --lock 25 | conda install --yes --quiet conda-forge-ci-setup=1 26 | source run_conda_forge_build_setup 27 | 28 | - save_cache: 29 | key: v2-miniconda-{{ .Branch }} 30 | paths: 31 | - /home/circleci/miniconda 32 | 33 | - run: 34 | name: Build and test 35 | command: | 36 | source ~/miniconda/bin/activate root 37 | cd /home/circleci/project 38 | conda build --python 2.7 ./conf/conda 39 | conda build --python 3.6 ./conf/conda 40 | 41 | - run: 42 | name: Upload packages 43 | command: | 44 | source ~/miniconda/bin/activate root 45 | cd /home/circleci/project 46 | upload_or_check_non_existence ./conf/conda spectralDNS --channel main 47 | export CONDA_PY=36 48 | upload_or_check_non_existence ./conf/conda spectralDNS --channel main 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: generic 2 | os: osx 3 | osx_image: xcode7.3 4 | sudo: false 5 | env: 6 | matrix: 7 | - CONDA_PY=27 8 | - CONDA_PY=36 9 | global: 10 | - secure: "swxbq67k6ag2v7QjLGMtn72mROxDZ7d+c6X+BgR2YS1XT7l45T9+0Z/PTpCJg+9mmEH3YdlpnlzKjatz9xVNY04a7RljFMsNy/+5oiTOmno2IDq2fAPrUFvGAvdqsVgnc6+e+GUwaDL5n/AfDVOIb18tT4P2VRk3ooCsSILtQYvQWixLw5bx3BhTgAfXnmu7e+oaB+vCDXXjlFINlOvHZCBiVI9g0yXH0sW9gYsR2vsmIdxraChsq/+Q0wkaNUgUaiuHXNWcaZiiWleRYnYsktsNfT1nknkLrkPAtQTC5fYgXj6o9Sh+codcfYH95ztBm83rWzfWo2f+Ok1AtrRdG+CiApCFMQ6T4ZjonxEeZhopvY7+xNLXFoHcmnBdf0NM3wmCdwrzuzdHvpqRnozClTqG6Srvna7X4/WtDbKpF2yEHKdiBmaf8NRcGDpbJeyvnzlNz5HMESltvYUVatLzPTzzJplkvgMX3Ti8xcqYgwB1ayrClGFlpWM33MdzJiSSTptv3WYmhi7rV5xdpCc5pBTF5XLOtEB0dFGY60yQd9SWSxjFAMwo9808V6koiKX3D0Ogin8mQmvR2DqVhkBqfHFf36s38OfG/n1iV/Oednc9pfYP55T7ljKRsPUpavblCPizBfQnQEFivjaDlPGX3/bR0TV9F/pRSiJ84JMgKzs=" 11 | 12 | before_install: 13 | - brew remove --force $(brew list) 14 | - brew cleanup -s 15 | - rm -rf $(brew --cache) 16 | install: 17 | - | 18 | MINICONDA_URL="https://repo.continuum.io/miniconda" 19 | MINICONDA_FILE="Miniconda3-latest-MacOSX-x86_64.sh" 20 | curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}" 21 | bash $MINICONDA_FILE -b 22 | source /Users/travis/miniconda3/bin/activate root 23 | conda config --set show_channel_urls true 24 | conda config --add channels conda-forge 25 | conda install --yes --quiet conda-forge-ci-setup=1 26 | source run_conda_forge_build_setup 27 | script: 28 | - conda build conf/conda 29 | after_success: 30 | - export GIT_DESCRIBE_TAG=`git describe --tags | cut -d'-' -f 1` 31 | - upload_or_check_non_existence ./conf/conda spectralDNS --channel main || exit 1 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 
24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.py *.txt *.rst 2 | recursive-include mpiFFT4py *.py *.pyx 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VERSION=$(shell python3 -c "import mpiFFT4py; print(mpiFFT4py.__version__)") 2 | 3 | default: 4 | python setup.py build_ext -i 5 | 6 | pip: 7 | rm -f dist/* 8 | python setup.py sdist 9 | twine upload dist/* 10 | 11 | tag: 12 | git tag $(VERSION) 13 | git push --tags 14 | 15 | publish: tag pip 16 | 17 | clean: 18 | git clean mpiFFT4py -fx 19 | git clean tests -fx 20 | cd docs && make clean && cd .. 21 | @rm -rf *.egg-info/ build/ dist/ .eggs/ -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | mpiFFT4py 2 | --------- 3 | 4 | .. image:: https://travis-ci.org/spectralDNS/mpiFFT4py.svg?branch=master 5 | :target: https://travis-ci.org/spectralDNS/mpiFFT4py 6 | .. image:: https://circleci.com/gh/spectralDNS/mpiFFT4py/tree/master.svg?style=svg 7 | :target: https://circleci.com/gh/spectralDNS/mpiFFT4py/tree/master 8 | .. image:: https://zenodo.org/badge/51817237.svg 9 | :target: https://zenodo.org/badge/latestdoi/51817237 10 | 11 | Description 12 | ----------- 13 | mpiFFT4py performs FFTs in parallel in Python. It is developed to perform FFTs in parallel on a three-dimensional computational box (a structured grid), but there are also routines for doing the FFTs on a 2D mesh. It implements both the *slab* and the *pencil* decompositions. 14 | 15 | Installation 16 | ------------ 17 | mpiFFT4py requires *numpy* for basic array operations, `pyfftw <https://github.com/pyfftw/pyFFTW>`_ for efficient FFTs and `mpi4py <https://bitbucket.org/mpi4py/mpi4py>`_ for MPI communications. However, if *pyfftw* is not found, then the slower *numpy.fft* is used instead. `cython <http://cython.org>`_ is used to optimize a few routines. Install using regular python distutils:: 18 | 19 | python setup.py install --prefix="Path on the PYTHONPATH" 20 | 21 | To install in place do:: 22 | 23 | python setup.py build_ext --inplace 24 | 25 | To install using Anaconda, you may either compile it yourself (from the main directory) using:: 26 | 27 | conda config --add channels conda-forge 28 | conda build conf/conda 29 | conda install mpiFFT4py --use-local 30 | 31 | or use precompiled binaries from the `conda-forge <https://anaconda.org/conda-forge/mpifft4py>`_ or the `spectralDNS <https://anaconda.org/spectralDNS/mpifft4py>`_ channel on Anaconda cloud:: 32 | 33 | conda install -c conda-forge mpifft4py 34 | 35 | or:: 36 | 37 | conda config --add channels conda-forge 38 | conda install -c spectralDNS mpifft4py 39 | 40 | There are binaries compiled for both OSX and Linux, and for several versions of Python. Note that the spectralDNS channel contains bleeding-edge versions of the software, whereas conda-forge is more stable.
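Usage
-----
A minimal usage sketch (assuming *numpy*, *mpi4py* and, optionally, *pyfftw* are installed), distilled from the demo scripts in this repository. It sets up a slab-decomposed real-to-complex transform and checks a forward/inverse round trip::

    from mpi4py import MPI
    import numpy as np
    from mpiFFT4py.slab import R2C

    N = np.array([32, 32, 32], dtype=int)       # global mesh size
    L = np.array([2*np.pi, 2*np.pi, 2*np.pi])   # physical size of the box
    FFT = R2C(N, L, MPI.COMM_WORLD, "double")

    u = np.random.random(FFT.real_shape()).astype(FFT.float)
    u_hat = np.zeros(FFT.complex_shape(), dtype=FFT.complex)

    u_hat = FFT.fftn(u, u_hat)          # forward transform (real to complex)
    u_copy = np.zeros_like(u)
    u_copy = FFT.ifftn(u_hat, u_copy)   # inverse transform
    assert np.allclose(u, u_copy)

Run with, e.g., ``mpirun -np 4 python yourscript.py`` (the script name is just a placeholder); the demo folder contains complete examples.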
41 | 42 | Authors 43 | ------- 44 | mpiFFT4py is developed by 45 | 46 | * Mikael Mortensen 47 | 48 | Licence 49 | ------- 50 | mpiFFT4py is licensed under the GNU Lesser GPL (LGPL), version 3 or (at your option) any later version. mpiFFT4py is Copyright (2014-2016) by the authors. 51 | 52 | Contact 53 | ------- 54 | The latest version of this software can be obtained from 55 | 56 | https://github.com/spectralDNS/mpiFFT4py 57 | 58 | Please report bugs and other issues through the issue tracker at: 59 | 60 | https://github.com/spectralDNS/mpiFFT4py/issues 61 | -------------------------------------------------------------------------------- /conf/conda/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | numpy: 2 | - 1.15 3 | -------------------------------------------------------------------------------- /conf/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: mpifft4py 3 | version: "{{ GIT_DESCRIBE_TAG }}" 4 | 5 | source: 6 | git_url: ../../ 7 | 8 | build: 9 | number: 0 10 | script: "pip install ." 11 | 12 | requirements: 13 | build: 14 | - python 15 | - pip 16 | - cython 17 | - numpy 18 | 19 | run: 20 | - python 21 | - numpy 22 | - scipy 23 | - mpi4py 24 | - fftw 25 | - pyfftw 26 | 27 | test: 28 | source_files: 29 | - tests 30 | 31 | requires: 32 | - pytest 33 | -------------------------------------------------------------------------------- /conf/conda/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pushd tests 4 | 5 | export OMPI_MCA_plm=isolated 6 | export OMPI_MCA_btl_vader_single_copy_mechanism=none 7 | export OMPI_MCA_rmaps_base_oversubscribe=yes 8 | 9 | if [ "$(uname)" == "Darwin" ]; then 10 | mpirun -np 2 py.test -v 11 | fi 12 | 13 | if [ "$(uname)" == "Linux" ]; then 14 | mpirun -np 2 py.test -v 15 | fi 16 | # if [ "${CONDA_PY:0:1}" == "3" ]; then 17 | # mpirun -np 4 py.test 18 | # fi 19 | # 20 | # if [ "${CONDA_PY:0:1}" == "2" ]; then 21 | # mpirun -np 1 py.test 22 | # fi 23 | # 24 | -------------------------------------------------------------------------------- /demo/spectral_dns_solver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Demo program that solves the Navier-Stokes equations in a triply 3 | periodic domain. The solution is initialized using the Taylor-Green 4 | vortex and evolved in time with a 4th-order Runge-Kutta method. 5 | 6 | Basically, we create an instance of the R2C class for performing 3D FFTs 7 | in parallel on a cube of size N points and physical size L. The mesh 8 | decomposition is performed by the FFT class using a slab decomposition. 9 | With slab decomposition the first index in real physical space is shared 10 | amongst the processors, whereas in wavenumber space the second index is shared.
11 | """ 12 | __author__ = "Mikael Mortensen " 13 | __date__ = "2016-04-07" 14 | __copyright__ = "Copyright (C) 2016 " + __author__ 15 | __license__ = "GNU Lesser GPL version 3 or any later version" 16 | 17 | from numpy import array, pi, empty, where, sin, cos, sum 18 | from mpi4py import MPI 19 | from mpiFFT4py import work_arrays 20 | from mpiFFT4py.slab import R2C 21 | from collections import defaultdict 22 | 23 | # Set viscosity, end time and time step 24 | nu = 0.000625 25 | T = 0.1 26 | dt = 0.01 27 | 28 | # Set global size of the computational box 29 | N = array([2**5, 2**5, 2**5], dtype=int) 30 | L = array([2*pi, 2*pi, 2*pi], dtype=float) 31 | 32 | FFT = R2C(N, L, MPI.COMM_WORLD, "double", planner_effort= 33 | defaultdict(lambda: 'FFTW_ESTIMATE', {'irfft2': 'FFTW_PATIENT'})) 34 | 35 | U = empty((3,) + FFT.real_shape()) # real_shape = (N[0]/comm.Get_size(), N[1], N[2]) 36 | U_hat = empty((3,) + FFT.complex_shape(), dtype=complex) # complex_shape = (N[0], N[1]//comm.Get_size(), N[2]/2+1) 37 | P = empty(FFT.real_shape()) 38 | P_hat = empty(FFT.complex_shape(), dtype=complex) 39 | U_hat0 = empty((3,) + FFT.complex_shape(), dtype=complex) 40 | U_hat1 = empty((3,) + FFT.complex_shape(), dtype=complex) 41 | dU = empty((3,) + FFT.complex_shape(), dtype=complex) 42 | work = work_arrays() 43 | X = FFT.get_local_mesh() 44 | K = FFT.get_local_wavenumbermesh(scaled=True) 45 | K2 = K[0]*K[0] + K[1]*K[1] + K[2]*K[2] 46 | K_over_K2 = empty((3,) + FFT.complex_shape()) 47 | for k in range(3): 48 | K_over_K2[k] = K[k].astype(float) / where(K2 == 0, 1, K2).astype(float) 49 | a = [1./6., 1./3., 1./3., 1./6.] 50 | b = [0.5, 0.5, 1.] 51 | dealias = '3/2-rule' # ('2/3-rule', None) 52 | 53 | def cross(x, y, z): 54 | """Cross product z = x X y""" 55 | z[0] = FFT.fftn(x[1]*y[2]-x[2]*y[1], z[0], dealias) 56 | z[1] = FFT.fftn(x[2]*y[0]-x[0]*y[2], z[1], dealias) 57 | z[2] = FFT.fftn(x[0]*y[1]-x[1]*y[0], z[2], dealias) 58 | return z 59 | 60 | def curl(x, z): 61 | z[2] = FFT.ifftn(1j*(K[0]*x[1]-K[1]*x[0]), z[2], dealias) 62 | z[1] = FFT.ifftn(1j*(K[2]*x[0]-K[0]*x[2]), z[1], dealias) 63 | z[0] = FFT.ifftn(1j*(K[1]*x[2]-K[2]*x[1]), z[0], dealias) 64 | return z 65 | 66 | def compute_rhs(rhs): 67 | U_dealiased = work[((3,) + FFT.work_shape(dealias), float, 0)] 68 | curl_dealiased = work[((3,) + FFT.work_shape(dealias), float, 1)] 69 | for i in range(3): 70 | U_dealiased[i] = FFT.ifftn(U_hat[i], U_dealiased[i], dealias) 71 | 72 | curl_dealiased = curl(U_hat, curl_dealiased) 73 | rhs = cross(U_dealiased, curl_dealiased, rhs) 74 | P_hat[:] = sum(rhs*K_over_K2, 0, out=P_hat) 75 | rhs -= P_hat*K 76 | rhs -= nu*K2*U_hat 77 | return rhs 78 | 79 | # Initialize a Taylor Green vortex 80 | U[0] = sin(X[0])*cos(X[1])*cos(X[2]) 81 | U[1] = -cos(X[0])*sin(X[1])*cos(X[2]) 82 | U[2] = 0 83 | for i in range(3): 84 | U_hat[i] = FFT.fftn(U[i], U_hat[i]) 85 | 86 | # Integrate using a 4th order Rung-Kutta method 87 | t = 0.0 88 | tstep = 0 89 | while t < T-1e-8: 90 | t += dt 91 | tstep += 1 92 | U_hat1[:] = U_hat0[:] = U_hat 93 | for rk in range(4): 94 | dU = compute_rhs(dU) 95 | if rk < 3: 96 | U_hat[:] = U_hat0 + b[rk]*dt*dU 97 | U_hat1[:] += a[rk]*dt*dU 98 | U_hat[:] = U_hat1[:] 99 | 100 | for i in range(3): 101 | U[i] = FFT.ifftn(U_hat[i], U[i]) 102 | 103 | k = FFT.comm.reduce(sum(U*U)/N[0]/N[1]/N[2]/2) 104 | if FFT.rank == 0: 105 | assert round(k - 0.124953117517, 7) == 0 106 | -------------------------------------------------------------------------------- /demo/transforms_realdata.py: 
-------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-03-09" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | from numpy import * 7 | from mpi4py import MPI 8 | #from mpiFFT4py.pencil import R2C 9 | from mpiFFT4py.slab import R2C 10 | from mpi4py_fft.mpifft import PFFT 11 | from time import time 12 | 13 | #assert MPI.COMM_WORLD.Get_size() >= 4 14 | 15 | # Set global size of the computational box 16 | M = 6 17 | N = array([2**M, 2**M, 2**M], dtype=int) 18 | L = array([2*pi, 2*pi, 2*pi], dtype=float) 19 | 20 | # Create an instance of the R2C class for performing 3D FFTs in parallel 21 | # on a cube of size N points and physical size L. The mesh decomposition is 22 | # performed by the FFT class using a slab decomposition. With slab decomposition 23 | # the first index in real physical space is shared amongst the processors, 24 | # whereas in wavenumber space the second index is shared. 25 | 26 | #FFT = R2C(N, L, MPI.COMM_WORLD, "double", None, alignment='X', communication='Alltoallw') 27 | FFT = R2C(N, L, MPI.COMM_WORLD, "double", communication='Alltoallw') 28 | fft = PFFT(MPI.COMM_WORLD, N, collapse=False, slab=2) 29 | 30 | U = random.random(FFT.real_shape()).astype(FFT.float) # real_shape = (N[0]//comm.Get_size(), N[1], N[2]) 31 | U_copy = zeros_like(U) 32 | U_hat = zeros(FFT.complex_shape(), dtype=FFT.complex) # complex_shape = (N[0], N[1]//comm.Get_size(), N[2]//2+1) 33 | 34 | # Perform forward FFT. Real transform in third direction, complex in first two 35 | U_hat = FFT.fftn(U, U_hat) 36 | 37 | # Perform inverse FFT. 38 | U_copy = FFT.ifftn(U_hat, U_copy) 39 | MPI.COMM_WORLD.barrier() 40 | t0 = time() 41 | U_hat = FFT.fftn(U, U_hat) 42 | U_copy = FFT.ifftn(U_hat, U_copy) 43 | print("mpiFFT4py ", time()-t0) 44 | ########### 45 | u = random.random(fft.forward.input_array.shape).astype(fft.forward.input_array.dtype) 46 | MPI.COMM_WORLD.barrier() 47 | t0 = time() 48 | u_hat = fft.forward(u) 49 | u_copy = fft.backward(u_hat) 50 | print("mpi4py-fft ", time()-t0) 51 | ######### 52 | 53 | tol = 1e-6 if FFT.float == float32 else 1e-10 54 | 55 | assert allclose(U, U_copy, tol, tol) 56 | assert allclose(u, u_copy, tol, tol) 57 | -------------------------------------------------------------------------------- /mpiFFT4py/__init__.py: -------------------------------------------------------------------------------- 1 | from .serialFFT import * 2 | from .slab import R2C as Slab_R2C 3 | from .pencil import R2C as Pencil_R2C 4 | from .line import R2C as Line_R2C 5 | from .mpibase import work_arrays, datatypes, empty, zeros 6 | from numpy.fft import fftfreq, rfftfreq 7 | 8 | __version__ = '1.1.2' 9 | -------------------------------------------------------------------------------- /mpiFFT4py/cython/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spectralDNS/mpiFFT4py/61ce6474771efff4e3b280b3f69f09611a2c1150/mpiFFT4py/cython/__init__.py -------------------------------------------------------------------------------- /mpiFFT4py/cython/maths.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False 2 | #cython: wraparound=False 3 | cimport numpy as np 4 | 5 | ctypedef fused complex_t: 6 | np.complex64_t 7 | np.complex128_t 8 | 9 | def dealias_filter(np.ndarray[complex_t, ndim=3] fu, 10 | np.ndarray[np.uint8_t, 
ndim=3] dealias): 11 | cdef unsigned int i, j, k 12 | cdef np.uint8_t uu 13 | for i in xrange(dealias.shape[0]): 14 | for j in xrange(dealias.shape[1]): 15 | for k in xrange(dealias.shape[2]): 16 | uu = dealias[i, j, k] 17 | fu[i, j, k].real *= uu 18 | fu[i, j, k].imag *= uu 19 | return fu 20 | 21 | def transpose_Uc(np.ndarray[complex_t, ndim=3] Uc_hatT, 22 | np.ndarray[complex_t, ndim=4] U_mpi, 23 | int num_processes, int Np0, int Np1, int Nf): 24 | cdef unsigned int i, j, k, l, kk 25 | for i in xrange(num_processes): 26 | for j in xrange(Np0): 27 | for k in xrange(i*Np1, (i+1)*Np1): 28 | kk = k-i*Np1 29 | for l in xrange(Nf): 30 | Uc_hatT[j, k, l] = U_mpi[i, j, kk, l] 31 | return Uc_hatT 32 | 33 | def transpose_Umpi(np.ndarray[complex_t, ndim=4] U_mpi, 34 | np.ndarray[complex_t, ndim=3] Uc_hatT, 35 | int num_processes, int Np, int Nf): 36 | cdef unsigned int i,j,k,l,kk 37 | for i in xrange(num_processes): 38 | for j in xrange(Np): 39 | for kk in xrange(Np): 40 | k = kk+i*Np 41 | for l in xrange(Nf): 42 | U_mpi[i,j,kk,l] = Uc_hatT[j,k,l] 43 | return U_mpi 44 | 45 | #for i in xrange(num_processes): 46 | #for j in xrange(Np): 47 | #for k in xrange(i*Np, (i+1)*Np): 48 | #kk = k-i*Np 49 | #for l in xrange(Nf): 50 | #U_mpi[i,j,kk,l] = Uc_hatT[j,k,l] 51 | #return U_mpi 52 | 53 | #def copy_to_padded(np.ndarray[complex_t, ndim=3] fu, 54 | #np.ndarray[complex_t, ndim=3] fp, 55 | #np.ndarray[int, ndim=1] N, int axis=0): 56 | #if axis == 0: 57 | #fp[:N[0]/2] = fu[:N[0]/2] 58 | #fp[-N[0]/2:] = fu[N[0]/2:] 59 | #elif axis == 1: 60 | #fp[:, :N[1]/2] = fu[:, :N[1]/2] 61 | #fp[:, -N[1]/2:] = fu[:, N[1]/2:] 62 | #elif axis == 2: 63 | #fp[:, :, :(N[2]/2+1)] = fu[:] 64 | #return fp 65 | 66 | #def copy_to_padded_c(np.ndarray[complex_t, ndim=3] fu, 67 | #np.ndarray[complex_t, ndim=3] fp, 68 | #np.ndarray[int, ndim=1] N, int axis=0): 69 | #if axis == 0: 70 | #fp[:N[0]] = fu[:N[0]] 71 | #elif axis == 1: 72 | #fp[:, :N[1]/2] = fu[:, :N[1]/2] 73 | #fp[:, -N[1]/2:] = fu[:, N[1]/2:] 74 | #elif axis == 2: 75 | #fp[:, :, :(N[2]/2+1)] = fu[:] 76 | #return fp 77 | 78 | -------------------------------------------------------------------------------- /mpiFFT4py/line.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | __author__ = "Mikael Mortensen " 3 | __date__ = "2016-02-16" 4 | __copyright__ = "Copyright (C) 2016 " + __author__ 5 | __license__ = "GNU Lesser GPL version 3 or any later version" 6 | 7 | from .serialFFT import * 8 | import numpy as np 9 | from .mpibase import work_arrays, datatypes, zeros, empty 10 | from numpy.fft import fftfreq, rfftfreq 11 | from collections import defaultdict 12 | from mpi4py import MPI 13 | 14 | def transpose_x(U_send, Uc_hatT, num_processes): 15 | sx = U_send.shape 16 | sy = Uc_hatT.shape 17 | U_send[:] = np.rollaxis(Uc_hatT[:,:-1].reshape(sy[0], num_processes, sx[2]), 1) 18 | return U_send 19 | 20 | def transpose_y(Uc_hatT, U_recv, num_processes): 21 | sx = Uc_hatT.shape 22 | sy = U_recv.shape 23 | Uc_hatT[:, :-1] = np.rollaxis(U_recv.reshape(num_processes, sx[0], sy[1]), 1).reshape((sx[0], sx[1]-1)) 24 | return Uc_hatT 25 | 26 | def swap_Nq(fft_y, fu, fft_x, N): 27 | f = fu[:, 0].copy() 28 | fft_x[0] = f[0].real 29 | fft_x[1:N//2] = 0.5*(f[1:N//2] + np.conj(f[:N//2:-1])) 30 | fft_x[N//2] = f[N//2].real 31 | fu[:N//2+1, 0] = fft_x[:N//2+1] 32 | fu[N//2+1:, 0] = np.conj(fft_x[(N//2-1):0:-1]) 33 | 34 | fft_y[0] = f[0].imag 35 | fft_y[1:N//2] = -0.5*1j*(f[1:N//2] - np.conj(f[:N//2:-1])) 36 | fft_y[N//2] = 
f[N//2].imag 37 | 38 | fft_y[N//2+1:] = np.conj(fft_y[(N//2-1):0:-1]) 39 | return fft_y 40 | 41 | class R2C(object): 42 | """Class for performing FFT in 2D using MPI 43 | 44 | Slab decomposition 45 | 46 | Args: 47 | N - NumPy array([Nx, Ny]) Number of nodes for the real mesh 48 | L - NumPy array([Lx, Ly]) The actual size of the real mesh 49 | comm - The MPI communicator object 50 | precision - "single" or "double" 51 | padsize - For performing transforms with padding 52 | 53 | """ 54 | 55 | def __init__(self, N, L, comm, precision, padsize=1.5, threads=1, 56 | planner_effort=defaultdict(lambda : "FFTW_MEASURE")): 57 | self.N = N # The global size of the problem 58 | self.L = L 59 | assert len(L) == 2 60 | assert len(N) == 2 61 | self.comm = comm 62 | self.float, self.complex, self.mpitype = float, complex, mpitype = datatypes(precision) 63 | self.num_processes = comm.Get_size() 64 | self.rank = comm.Get_rank() 65 | self.padsize = padsize 66 | self.threads = threads 67 | self.planner_effort = planner_effort 68 | # Each cpu gets ownership of Np indices 69 | self.Np = N // self.num_processes 70 | self.Nf = N[1]//2+1 71 | self.Npf = self.Np[1]//2+1 if self.rank+1 == self.num_processes else self.Np[1]//2 72 | self.Nfp = int(padsize*self.N[1]/2+1) 73 | self.ks = (fftfreq(N[0])*N[0]).astype(int) 74 | self.dealias = zeros(0) 75 | self.work_arrays = work_arrays() 76 | 77 | def real_shape(self): 78 | """The local shape of the real data""" 79 | return (self.Np[0], self.N[1]) 80 | 81 | def complex_shape(self): 82 | """The local shape of the complex data""" 83 | return (self.N[0], self.Npf) 84 | 85 | def global_complex_shape(self): 86 | """The local shape of the complex data""" 87 | return (self.N[0], self.Nf) 88 | 89 | def global_real_shape(self): 90 | """The local shape of the complex data""" 91 | return (self.N[0], self.N[1]) 92 | 93 | def real_local_slice(self, padsize=1): 94 | return (slice(int(padsize*self.rank*self.Np[0]), 95 | int(padsize*(self.rank+1)*self.Np[0]), 1), 96 | slice(0, int(padsize*self.N[1]))) 97 | 98 | def complex_local_slice(self): 99 | return (slice(0, self.N[0]), 100 | slice(self.rank*self.Np[1]//2, self.rank*self.Np[1]//2+self.Npf, 1)) 101 | 102 | def get_N(self): 103 | return self.N 104 | 105 | def get_local_mesh(self): 106 | # Create the mesh 107 | X = np.mgrid[self.rank*self.Np[0]:(self.rank+1)*self.Np[0], :self.N[1]].astype(self.float) 108 | X[0] *= self.L[0]/self.N[0] 109 | X[1] *= self.L[1]/self.N[1] 110 | return X 111 | 112 | def get_local_wavenumbermesh(self, scaled=True, broadcast=False, 113 | eliminate_highest_freq=False): 114 | kx = fftfreq(self.N[0], 1./self.N[0]) 115 | ky = rfftfreq(self.N[1], 1./self.N[1]) 116 | if eliminate_highest_freq: 117 | for i, k in enumerate((kx, ky)): 118 | if self.N[i] % 2 == 0: 119 | k[self.N[i]//2] = 0 120 | 121 | Ks = np.meshgrid(kx, ky[self.rank*self.Np[1]//2:(self.rank*self.Np[1]//2+self.Npf)], indexing='ij', sparse=True) 122 | if scaled is True: 123 | Lp = 2*np.pi/self.L 124 | Ks[0] *= Lp[0] 125 | Ks[1] *= Lp[1] 126 | K = Ks 127 | if broadcast is True: 128 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks] 129 | return K 130 | 131 | def get_dealias_filter(self): 132 | """Filter for dealiasing nonlinear convection""" 133 | K = self.get_local_wavenumbermesh() 134 | kmax = 2./3.*(self.N//2+1) 135 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1]), dtype=np.uint8) 136 | return dealias 137 | 138 | def global_complex_shape_padded(self): 139 | """Global size of problem in complex wavenumber space""" 140 | return 
(int(self.padsize*self.N[0]), int(self.padsize*self.N[1]/2+1)) 141 | 142 | def real_shape_padded(self): 143 | """The local shape of the real data""" 144 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1])) 145 | 146 | def complex_padded_xy(self): 147 | """The local shape of the real data""" 148 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]/2+1)) 149 | 150 | def complex_shape_padded_01(self): 151 | """The local shape of the real data""" 152 | return (int(self.padsize*self.Np[0]), self.Nf) 153 | 154 | def complex_padded_x(self): 155 | """Padding in x-direction""" 156 | return (int(self.padsize*self.N[0]), self.Npf) 157 | 158 | def work_shape(self, dealias): 159 | """Shape of work arrays used in convection with dealiasing. Different shape whether or not padding is involved""" 160 | if dealias == '3/2-rule': 161 | return self.real_shape_padded() 162 | 163 | else: 164 | return self.real_shape() 165 | 166 | def copy_to_padded_x(self, fu, fp): 167 | fp[:self.N[0]//2] = fu[:self.N[0]//2] 168 | fp[-(self.N[0]//2):] = fu[self.N[0]//2:] 169 | return fp 170 | 171 | def copy_to_padded_y(self, fu, fp): 172 | fp[:, :self.Nf] = fu[:] 173 | return fp 174 | 175 | def copy_from_padded_y(self, fp, fu): 176 | fu[:] = fp[:, :self.Nf] 177 | return fu 178 | 179 | def fft2(self, u, fu, dealias=None): 180 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 181 | 182 | if self.num_processes == 1: 183 | if not dealias == '3/2-rule': 184 | fu = rfft2(u, fu, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['rfft2']) 185 | 186 | else: 187 | fu_padded = self.work_arrays[(self.global_complex_shape_padded(), self.complex, 0)] 188 | fu_padded = rfft2(u/self.padsize**2, fu_padded, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['rfft2']) 189 | fu[:] = fu_padded[self.ks, :self.Nf] 190 | 191 | return fu 192 | 193 | if not dealias == '3/2-rule': 194 | 195 | # Work arrays 196 | Uc_hatT = self.work_arrays[((self.Np[0], self.Nf), self.complex, 0)] 197 | U_send = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1]//2), self.complex, 0)] 198 | U_sendr = U_send.reshape((self.N[0], self.Np[1]//2)) 199 | Uc = self.work_arrays[((self.N[0], self.Np[1]//2), self.complex, 0)] 200 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)] 201 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)] 202 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)] 203 | 204 | # Transform in y-direction 205 | Uc_hatT = rfft(u, Uc_hatT, axis=1, threads=self.threads, planner_effort=self.planner_effort['rfft']) 206 | Uc_hatT[:, 0] += 1j*Uc_hatT[:, -1] 207 | 208 | U_send = transpose_x(U_send, Uc_hatT, self.num_processes) 209 | 210 | # Communicate all values 211 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 212 | 213 | Uc = fft(U_sendr, Uc, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft']) 214 | fu[:, :self.Np[1]//2] = Uc 215 | 216 | # Handle Nyquist frequency 217 | if self.rank == 0: 218 | fft_y = swap_Nq(fft_y, fu, fft_x, self.N[0]) 219 | self.comm.Send([fft_y, self.mpitype], dest=self.num_processes-1, tag=77) 220 | 221 | elif self.rank == self.num_processes-1: 222 | self.comm.Recv([fft_y, self.mpitype], source=0, tag=77) 223 | fu[:, -1] = fft_y 224 | 225 | else: 226 | # Work arrays 227 | U_send = self.work_arrays[((self.num_processes, int(self.padsize*self.Np[0]), self.Np[1]//2), self.complex, 0)] 228 | U_sendr = U_send.reshape((int(self.padsize*self.N[0]), self.Np[1]//2)) 229 | fu_padded_xy = 
self.work_arrays[(self.complex_padded_xy(), self.complex, 0)] 230 | fu_padded_xy2 = self.work_arrays[(self.complex_shape_padded_01(), self.complex, 0)] 231 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)] 232 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)] 233 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)] 234 | 235 | # Transform in y-direction 236 | fu_padded_xy = rfft(u/self.padsize, fu_padded_xy, axis=1, threads=self.threads, planner_effort=self.planner_effort['rfft']) 237 | fu_padded_xy2 = self.copy_from_padded_y(fu_padded_xy, fu_padded_xy2) 238 | fu_padded_xy2[:, 0] += 1j*fu_padded_xy2[:, -1] 239 | 240 | U_send = transpose_x(U_send, fu_padded_xy2, self.num_processes) 241 | 242 | # Communicate all values 243 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 244 | 245 | U_sendr = fft(U_sendr/self.padsize, U_sendr, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft']) 246 | 247 | fu.fill(0) 248 | fu[:self.N[0]//2+1, :self.Np[1]//2] = U_sendr[:self.N[0]//2+1] 249 | fu[self.N[0]//2:, :self.Np[1]//2] += U_sendr[-self.N[0]//2:] 250 | 251 | # Handle Nyquist frequency 252 | if self.rank == 0: 253 | fft_y = swap_Nq(fft_y, fu, fft_x, self.N[0]) 254 | self.comm.Send([fft_y, self.mpitype], dest=self.num_processes-1, tag=77) 255 | 256 | elif self.rank == self.num_processes-1: 257 | self.comm.Recv([fft_y, self.mpitype], source=0, tag=77) 258 | fu[:, -1] = fft_y 259 | 260 | return fu 261 | 262 | def ifft2(self, fu, u, dealias=None): 263 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 264 | 265 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 266 | self.dealias = self.get_dealias_filter() 267 | 268 | fu_ = fu 269 | if dealias == '2/3-rule': 270 | fu_ = self.work_arrays[(fu, 0, False)] 271 | fu_[:] = fu 272 | fu_ *= self.dealias 273 | 274 | if self.num_processes == 1: 275 | if not dealias == '3/2-rule': 276 | u = irfft2(fu_, u, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['irfft2']) 277 | 278 | else: 279 | fu_padded = self.work_arrays[(self.global_complex_shape_padded(), self.complex, 0)] 280 | fu_padded[self.ks, :self.Nf] = fu[:] 281 | u = irfft2(fu_padded*self.padsize**2, u, axes=(0,1), threads=self.threads, planner_effort=self.planner_effort['irfft2']) 282 | 283 | return u 284 | 285 | if not dealias == '3/2-rule': 286 | # Get some work arrays 287 | Uc_hat = self.work_arrays[((self.N[0], self.Npf), self.complex, 0)] 288 | Uc_hatT = self.work_arrays[((self.Np[0], self.Nf), self.complex, 0)] 289 | U_send = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1]//2), self.complex, 0)] 290 | U_sendr = U_send.reshape((self.N[0], self.Np[1]//2)) 291 | fft_y = self.work_arrays[((self.N[0],), self.complex, 0)] 292 | fft_x = self.work_arrays[((self.N[0],), self.complex, 1)] 293 | plane_recv = self.work_arrays[((self.Np[0],), self.complex, 2)] 294 | 295 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft']) 296 | U_sendr[:] = Uc_hat[:, :self.Np[1]//2] 297 | 298 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 299 | 300 | Uc_hatT = transpose_y(Uc_hatT, U_sendr, self.num_processes) 301 | 302 | if self.rank == self.num_processes-1: 303 | fft_y[:] = Uc_hat[:, -1] 304 | 305 | self.comm.Scatter(fft_y, plane_recv, root=self.num_processes-1) 306 | Uc_hatT[:, -1] = plane_recv 307 | 308 | u = irfft(Uc_hatT, u, axis=1, threads=self.threads, planner_effort=self.planner_effort['irfft']) 309 | 310 | else: 311 | U_send = 
self.work_arrays[((self.num_processes, int(self.padsize*self.Np[0]), self.Np[1]//2), self.complex, 0)] 312 | U_sendr = U_send.reshape((int(self.padsize*self.N[0]), self.Np[1]//2)) 313 | Uc_hatT = self.work_arrays[((int(self.padsize*self.Np[0]), self.Nf), self.complex, 0)] 314 | fu_padded_x = self.work_arrays[(self.complex_padded_x(), self.complex, 0)] 315 | fu_padded_x2= self.work_arrays[(self.complex_padded_x(), self.complex, 1)] 316 | fu_padded_xy = self.work_arrays[(self.complex_padded_xy(), self.complex, 0)] 317 | fft_y = self.work_arrays[((int(self.padsize*self.N[0]),), self.complex, 0)] 318 | fft_x = self.work_arrays[((int(self.padsize*self.N[0]),), self.complex, 1)] 319 | plane_recv = self.work_arrays[((int(self.padsize*self.Np[0]),), self.complex, 2)] 320 | 321 | fu_padded_x2 = self.copy_to_padded_x(fu, fu_padded_x2) 322 | fu_padded_x = ifft(fu_padded_x2, fu_padded_x, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft']) 323 | 324 | U_sendr[:] = fu_padded_x[:, :self.Np[1]//2] 325 | 326 | self.comm.Alltoall(MPI.IN_PLACE, [U_send, self.mpitype]) 327 | 328 | Uc_hatT = transpose_y(Uc_hatT, U_sendr, self.num_processes) 329 | 330 | if self.rank == self.num_processes-1: 331 | fft_y[:] = fu_padded_x[:, -1] 332 | 333 | self.comm.Scatter(fft_y, plane_recv, root=self.num_processes-1) 334 | Uc_hatT[:, -1] = plane_recv 335 | 336 | fu_padded_xy = self.copy_to_padded_y(Uc_hatT, fu_padded_xy) 337 | 338 | u = irfft(fu_padded_xy*self.padsize**2, u, axis=1, threads=self.threads, planner_effort=self.planner_effort['irfft']) 339 | 340 | return u 341 | -------------------------------------------------------------------------------- /mpiFFT4py/mpibase.py: -------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-04-14" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | import numpy as np 7 | from mpi4py import MPI 8 | import collections 9 | 10 | # Possible way to give numpy arrays attributes... 
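# (Added summary, for orientation: the subclass sketch below is kept only as a
# commented-out reference. The module as it stands simply aliases
# Empty, Zeros = np.empty, np.zeros and, when pyfftw can be imported, provides
# empty()/zeros() wrappers around pyfftw.empty_aligned/pyfftw.zeros_aligned so
# that work arrays get the byte alignment (n=16 by default) that FFTW prefers.)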
11 | #class Empty(np.ndarray): 12 | #"""Numpy empty array with additional info dictionary to hold attributes 13 | #""" 14 | #def __new__(subtype, shape, dtype=np.float, info={}): 15 | #obj = np.ndarray.__new__(subtype, shape, dtype) 16 | #obj.info = info 17 | #return obj 18 | 19 | #def __array_finalize__(self, obj): 20 | #if obj is None: return 21 | #self.info = getattr(obj, 'info', {}) 22 | 23 | #class Zeros(np.ndarray): 24 | #"""Numpy zeros array with additional info dictionary to hold attributes 25 | #""" 26 | #def __new__(subtype, shape, dtype=float, info={}): 27 | #obj = np.ndarray.__new__(subtype, shape, dtype) 28 | #obj.fill(0) 29 | #obj.info = info 30 | #return obj 31 | 32 | #def __array_finalize__(self, obj): 33 | #if obj is None: return 34 | #self.info = getattr(obj, 'info', {}) 35 | 36 | Empty, Zeros = np.empty, np.zeros 37 | 38 | try: 39 | import pyfftw 40 | def empty(N, dtype=np.float, bytes=16): 41 | return pyfftw.empty_aligned(N, dtype=dtype, n=bytes) 42 | 43 | def zeros(N, dtype=np.float, bytes=16): 44 | return pyfftw.zeros_aligned(N, dtype=dtype, n=bytes) 45 | 46 | except ImportError: 47 | def empty(N, dtype=np.float, bytes=None): 48 | return Empty(N, dtype=dtype) 49 | 50 | def zeros(N, dtype=np.float, bytes=None): 51 | return Zeros(N, dtype=dtype) 52 | 53 | class work_array_dict(dict): 54 | """Dictionary of work arrays indexed by their shape, type and an indicator i.""" 55 | def __missing__(self, key): 56 | shape, dtype, i = key 57 | a = zeros(shape, dtype=dtype) 58 | self[key] = a 59 | return self[key] 60 | 61 | class work_arrays(collections.MutableMapping): 62 | """A dictionary to hold numpy work arrays. 63 | 64 | The dictionary allows two types of keys for the same item. 65 | 66 | keys: 67 | - (shape, dtype, index (, fillzero)), where shape is tuple, dtype is np.dtype and 68 | index an integer 69 | - (ndarray, index (, fillzero)), where ndarray is a numpy array and index is 70 | an integer 71 | fillzero is an optional bool that determines 72 | whether the array is initialised to zero 73 | 74 | Usage: 75 | To create two real work arrays of shape (3,3), do: 76 | - work = work_arrays() 77 | - a = work[((3,3), np.float, 0)] 78 | - b = work[(a, 1)] 79 | 80 | Returns: 81 | Numpy array of given shape. The array is by default initialised to zero, but this 82 | can be overridden using the fillzero argument.
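        For illustration (an added example, following the key formats above), a
        fourth key entry set to False returns the same cached array without
        zero-filling it first:
        - c = work[((3, 3), np.float, 0, False)]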
83 | 84 | """ 85 | 86 | def __init__(self): 87 | self.store = work_array_dict() 88 | self.fillzero = True 89 | 90 | def __getitem__(self, key): 91 | val = self.store[self.__keytransform__(key)] 92 | if self.fillzero is True: val.fill(0) 93 | return val 94 | 95 | def __setitem__(self, key, value): 96 | self.store[self.__keytransform__(key)] = value 97 | 98 | def __delitem__(self, key): 99 | del self.store[self.__keytransform__(key)] 100 | 101 | def __iter__(self): 102 | return iter(self.store) 103 | 104 | def __len__(self): 105 | return len(self.store) 106 | 107 | def values(self): 108 | raise TypeError('Work arrays not iterable') 109 | 110 | def __keytransform__(self, key): 111 | if isinstance(key[0], np.ndarray): 112 | shape = key[0].shape 113 | dtype = key[0].dtype 114 | i = key[1] 115 | zero = True if len(key) == 2 else key[2] 116 | 117 | elif isinstance(key[0], tuple): 118 | if len(key) == 3: 119 | shape, dtype, i = key 120 | zero = True 121 | 122 | elif len(key) == 4: 123 | shape, dtype, i, zero = key 124 | 125 | else: 126 | raise TypeError("Wrong type of key for work array") 127 | 128 | assert isinstance(zero, bool) 129 | assert isinstance(i, int) 130 | self.fillzero = zero 131 | return (shape, np.dtype(dtype), i) 132 | 133 | def datatypes(precision): 134 | """Return datatypes associated with precision.""" 135 | assert precision in ("single", "double") 136 | return {"single": (np.float32, np.complex64, MPI.C_FLOAT_COMPLEX), 137 | "double": (np.float64, np.complex128, MPI.C_DOUBLE_COMPLEX)}[precision] 138 | -------------------------------------------------------------------------------- /mpiFFT4py/pencil.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | """Pencil decomposition 3 | 4 | This module contains classes for performing FFTs with pencil decomposition 5 | of three-dimensional data structures data[Nx,Ny,Nz], where (Nx, Ny, Nz) is 6 | the shape of the input data. With slab decomposition only one of these three 7 | indices is shared, leading to local datastructures on each processor 8 | with shape data[Nx/P, Ny, Nz], where P is the total number of processors. 9 | With pencil, two of the input arrays indices are shared, leading to local 10 | data of shape (Nx/P1, Ny/P2, Nz), i.e., pencils aligned in the z-direction. 11 | 12 | The final transformed data can be aligned in either the y-direction or 13 | the x-direction. 14 | 15 | classes: 16 | R2CX - For real to complex transforms. Final alignment in x-direction 17 | Args: 18 | N - NumPy array([Nx, Ny, Nz]) setting the dimensions of the real mesh 19 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain 20 | comm - The MPI communicator object 21 | precision - "single" or "double" 22 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 23 | padsize - The size of padding, if padding is used in transforms 24 | threads - Number of threads used by FFTs 25 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", 26 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 27 | 28 | R2CY - For real to complex transforms. 
Final alignment in y-direction 29 | Args: 30 | N - NumPy array([Nx, Ny, Nz]) number of nodes for the real mesh 31 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain 32 | comm - The MPI communicator object 33 | precision - "single" or "double" 34 | P1 - Decomposition along first dimension 35 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 36 | padsize - The size of padding, if padding is used in transforms 37 | threads - Number of threads used by FFTs 38 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE", 39 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 40 | 41 | function: 42 | R2C 43 | 44 | Args: 45 | N - NumPy array([Nx, Ny, Nz]) number of nodes for the real mesh 46 | L - NumPy array([Lx, Ly, Lz]) size of the computational domain 47 | comm - The MPI communicator object 48 | precision - "single" or "double" 49 | P1 - Decomposition along first dimension 50 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 51 | padsize - The size of padding, if padding is used in transforms 52 | threads - Number of threads used by FFTs 53 | alignment - Final alignment, ('X' or 'Y') 54 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE", 55 | "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 56 | 57 | """ 58 | __author__ = "Mikael Mortensen " 59 | __date__ = "2016-02-16" 60 | __copyright__ = "Copyright (C) 2016 " + __author__ 61 | __license__ = "GNU Lesser GPL version 3 or any later version" 62 | 63 | from .serialFFT import * 64 | import numpy as np 65 | from .mpibase import work_arrays, datatypes 66 | from .cython.maths import dealias_filter 67 | from numpy.fft import fftfreq, rfftfreq 68 | from collections import defaultdict 69 | from mpi4py import MPI 70 | 71 | #__all__ = ['R2C'] 72 | 73 | # Using Lisandro Dalcin's code for Alltoallw. 74 | # Note that _subsize and _distribution are modified for a mesh of power two. 
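# Added illustration of the assumption made here: for the shapes passed in below
# the remainder r = N % size is either 0 (power-of-two mesh) or 1 (the extra
# point from the real-to-complex direction, Q = N[2]//2 + 1), and that single
# extra point goes to the last rank. For example, _subsize(33, 4, rank) gives
# 8 for ranks 0-2 and 9 for rank 3, consistent with
# _distribution(33, 4) -> (8, 0), (8, 8), (8, 16), (9, 24).
# The generic variants (commented out) would spread any remainder over the
# first ranks instead.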
75 | 76 | def _subsize(N, size, rank): 77 | return N // size + ((N % size) * (rank == size -1)) 78 | #return N // size + (N % size > rank) # Generic 79 | 80 | def _distribution(N, size): 81 | q = N // size 82 | r = N % size 83 | n = s = i = 0 84 | while i < size: 85 | n = q 86 | s = q * i 87 | if r == 1 and i+1 == size: 88 | n += 1 89 | yield n, s 90 | i += 1 91 | 92 | # Generic 93 | #def _distribution2(N, size): 94 | #q = N // size 95 | #r = N % size 96 | #n = s = i = 0 97 | #while i < size: 98 | #n = q 99 | #s = q * i 100 | #if i < r: 101 | #n += 1 102 | #s += i 103 | #else: 104 | #s += r 105 | #yield n, s 106 | #i += 1 107 | 108 | 109 | def transform_Uc_xz(Uc_hat_x, Uc_hat_z, P1): 110 | sz = Uc_hat_z.shape 111 | sx = Uc_hat_x.shape 112 | Uc_hat_x[:] = np.rollaxis(Uc_hat_z[:,:,:-1].reshape((sz[0], sz[1], P1, sx[2])), 2).reshape(sx) 113 | return Uc_hat_x 114 | 115 | def transform_Uc_zx(Uc_hat_z, Uc_hat_xr, P1): 116 | sz = Uc_hat_z.shape 117 | sx = Uc_hat_xr.shape 118 | Uc_hat_z[:, :, :-1] = np.rollaxis(Uc_hat_xr.reshape((P1, sz[0], sz[1], sx[2])), 0, 3).reshape((sz[0], sz[1], sz[2]-1)) 119 | return Uc_hat_z 120 | 121 | def transform_Uc_xy(Uc_hat_x, Uc_hat_y, P): 122 | sy = Uc_hat_y.shape 123 | sx = Uc_hat_x.shape 124 | Uc_hat_x[:] = np.rollaxis(Uc_hat_y.reshape((sy[0], P, sx[1], sx[2])), 1).reshape(sx) 125 | return Uc_hat_x 126 | 127 | def transform_Uc_yx(Uc_hat_y, Uc_hat_x, P): 128 | sy = Uc_hat_y.shape 129 | sx = Uc_hat_x.shape 130 | Uc_hat_y[:] = np.rollaxis(Uc_hat_x.reshape((P, sx[0]//P, sx[1], sx[2])), 1).reshape(sy) 131 | return Uc_hat_y 132 | 133 | def transform_Uc_yz(Uc_hat_y, Uc_hat_z, P): 134 | sz = Uc_hat_z.shape 135 | sy = Uc_hat_y.shape 136 | Uc_hat_y[:] = np.rollaxis(Uc_hat_z[:,:,:-1].reshape((sz[0], sz[1], P, sy[2])), 1, 3).reshape(sy) 137 | return Uc_hat_y 138 | 139 | def transform_Uc_zy(Uc_hat_z, Uc_hat_y, P): 140 | sz = Uc_hat_z.shape 141 | sy = Uc_hat_y.shape 142 | Uc_hat_z[:, :, :-1] = np.rollaxis(Uc_hat_y.reshape((sy[0], P, sz[1], sy[2])), 1, 3).reshape((sz[0], sz[1], sz[2]-1)) 143 | return Uc_hat_z 144 | 145 | class R2CY(object): 146 | """Class for performing FFT in 3D using MPI 147 | 148 | Pencil decomposition 149 | 150 | Args: 151 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh 152 | L - NumPy array([Lx, Ly, Lz]) The actual size of the computational domain 153 | comm - The MPI communicator object 154 | precision - "single" or "double" 155 | P1 - Decomposition along first dimension 156 | communication - Communication scheme ('AlltoallN', 'Alltoall' or 'Alltoallw') 157 | padsize - The size of padding, if padding is used in transforms 158 | threads - Number of threads used by FFTs 159 | planner_effort - Planner effort used by FFTs ("FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 160 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 161 | 162 | This version has the final complex data aligned in the y-direction, in agreement 163 | with the paper in CPC (http://arxiv.org/pdf/1602.03638v1.pdf) 164 | 165 | """ 166 | 167 | def __init__(self, N, L, comm, precision, P1=None, communication='Alltoallw', padsize=1.5, threads=1, 168 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 169 | self.N = N 170 | assert len(L) == 3 171 | assert len(N) == 3 172 | self.Nf = N[2]//2+1 # Number of independent complex wavenumbers in z-direction 173 | self.comm = comm 174 | self.float, self.complex, self.mpitype = float, complex, mpitype = datatypes(precision) 175 | self.num_processes = comm.Get_size() 176 | assert self.num_processes > 1 177 | 
self.L = L.astype(float) 178 | self.dealias = np.zeros(0) 179 | self.communication = communication 180 | self.padsize = padsize 181 | self.threads = threads 182 | self.planner_effort = planner_effort 183 | self.rank = comm.Get_rank() 184 | if P1 is None: 185 | P1, P2 = MPI.Compute_dims(self.num_processes, 2) 186 | self.P1, self.P2 = P1, P2 187 | else: 188 | self.P1 = P1 189 | self.P2 = P2 = self.num_processes // P1 190 | self.N1 = N // P1 191 | self.N2 = N // P2 192 | self.comm0 = comm.Split(self.rank/P1) 193 | self.comm1 = comm.Split(self.rank%P1) 194 | self.comm0_rank = self.comm0.Get_rank() 195 | self.comm1_rank = self.comm1.Get_rank() 196 | self.work_arrays = work_arrays() 197 | self.N1f = self.N1[2]//2 if self.comm0_rank < self.P1-1 else self.N1[2]//2+1 198 | if self.communication == 'AlltoallN': 199 | self.N1f = self.N1[2]//2 200 | 201 | if not (self.num_processes % 2 == 0 or self.num_processes == 1): 202 | raise IOError("Number of cpus must be even") 203 | 204 | if (P1 % 2 != 0) or (P2 % 2 != 0): 205 | raise IOError("Number of cpus in each direction must be even power of 2") 206 | 207 | self._subarrays1A = [] 208 | self._subarrays1B = [] 209 | self._subarrays2A = [] 210 | self._subarrays2B = [] 211 | self._subarrays1A_pad = [] 212 | self._subarrays1B_pad = [] 213 | self._subarrays2A_pad = [] 214 | self._subarrays2B_pad = [] 215 | self._counts_displs1 = None 216 | self._counts_displs2 = None 217 | 218 | def get_subarrays(self, padsize=1): 219 | datatype = MPI._typedict[np.dtype(self.complex).char] 220 | M, N, Q = self.N[0], self.N[1], self.Nf 221 | m = _subsize(M, self.P2, self.comm1_rank) 222 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank) 223 | q = _subsize(Q, self.P1, self.comm0_rank) 224 | _subarrays1A = [ 225 | datatype.Create_subarray([m,int(padsize*N),q], [m,l,q], [0,s,0]).Commit() 226 | for l, s in _distribution(int(padsize*N), self.P2) 227 | ] 228 | _subarrays1B = [ 229 | datatype.Create_subarray([M,n,q], [l,n,q], [s,0,0]).Commit() 230 | for l, s in _distribution(M, self.P2) 231 | ] 232 | _counts_displs1 = ([1] * self.P2, [0] * self.P2) 233 | 234 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank) 235 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank) 236 | q = _subsize(Q, self.P1, self.comm0_rank) 237 | _subarrays2A = [ 238 | datatype.Create_subarray([int(padsize*M),n,q], [l,n,q], [s,0,0]).Commit() 239 | for l, s in _distribution(int(padsize*M), self.P1) 240 | ] 241 | _subarrays2B = [ 242 | datatype.Create_subarray([m,n,Q], [m,n,l], [0,0,s]).Commit() 243 | for l, s in _distribution(Q, self.P1) 244 | ] 245 | _counts_displs2 = ([1] * self.P1, [0] * self.P1) 246 | return _subarrays1A, _subarrays1B, _subarrays2A, _subarrays2B, _counts_displs1, _counts_displs2 247 | 248 | def real_shape(self): 249 | """The local shape of the real data""" 250 | return (self.N1[0], self.N2[1], self.N[2]) 251 | 252 | def complex_shape(self): 253 | """The local shape of the complex data""" 254 | return (self.N2[0], self.N[1], self.N1f) 255 | 256 | def complex_shape_T(self): 257 | """The local transposed shape of the complex data""" 258 | return (self.Np[0], self.N[1], self.Nf) 259 | 260 | def complex_shape_I(self): 261 | """A local intermediate shape of the complex data""" 262 | return (self.Np[0], self.num_processes, self.Np[1], self.Nf) 263 | 264 | def real_shape_padded(self): 265 | return (int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), int(self.padsize*self.N[2])) 266 | 267 | def work_shape(self, dealias): 268 | """Shape of work arrays used in convection with 
dealiasing. Different shape whether or not padding is involved""" 269 | if dealias == '3/2-rule': 270 | return self.real_shape_padded() 271 | 272 | else: 273 | return self.real_shape() 274 | 275 | def real_local_slice(self, padsize=1): 276 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane 277 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane 278 | return (slice(int(padsize * xzrank * self.N1[0]), int(padsize * (xzrank+1) * self.N1[0]), 1), 279 | slice(int(padsize * xyrank * self.N2[1]), int(padsize * (xyrank+1) * self.N2[1]), 1), 280 | slice(0, int(padsize*self.N[2]))) 281 | 282 | def complex_local_slice(self): 283 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane 284 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane 285 | return (slice(xyrank*self.N2[0], (xyrank+1)*self.N2[0], 1), 286 | slice(0, self.N[1]), 287 | slice(xzrank*self.N1[2]//2, xzrank*self.N1[2]//2 + self.N1f, 1)) 288 | 289 | def complex_local_wavenumbers(self): 290 | s = self.complex_local_slice() 291 | return (fftfreq(self.N[0], 1./self.N[0]).astype(int)[s[0]], 292 | fftfreq(self.N[1], 1./self.N[1]).astype(int), 293 | rfftfreq(self.N[2], 1./self.N[2]).astype(int)[s[2]]) 294 | 295 | def get_P(self): 296 | return self.P1, self.P2 297 | 298 | def get_local_mesh(self): 299 | xzrank = self.comm0.Get_rank() # Local rank in xz-plane 300 | xyrank = self.comm1.Get_rank() # Local rank in xy-plane 301 | 302 | # Create the physical mesh 303 | x1 = slice(xzrank * self.N1[0], (xzrank+1) * self.N1[0], 1) 304 | x2 = slice(xyrank * self.N2[1], (xyrank+1) * self.N2[1], 1) 305 | X = np.ogrid[x1, x2, :self.N[2]] 306 | 307 | X[0] = (X[0]*self.L[0]/self.N[0]).astype(self.float) 308 | X[1] = (X[1]*self.L[1]/self.N[1]).astype(self.float) 309 | X[2] = (X[2]*self.L[2]/self.N[2]).astype(self.float) 310 | X = [np.broadcast_to(x, self.real_shape()) for x in X] 311 | return X 312 | 313 | def get_local_wavenumbermesh(self, scaled=False, broadcast=False, 314 | eliminate_highest_freq=False): 315 | """Returns (scaled) local decomposed wavenumbermesh 316 | 317 | If scaled is True, then the wavenumbermesh is scaled with physical mesh 318 | size. 
This takes care of mapping the physical domain to a computational 319 | cube of size (2pi)**3 320 | 321 | 322 | """ 323 | s = self.complex_local_slice() 324 | kx = fftfreq(self.N[0], 1./self.N[0]).astype(int) 325 | ky = fftfreq(self.N[1], 1./self.N[1]).astype(int) 326 | kz = rfftfreq(self.N[2], 1./self.N[2]).astype(int) 327 | if eliminate_highest_freq: 328 | for i, k in enumerate((kx, ky, kz)): 329 | if self.N[i] % 2 == 0: 330 | k[self.N[i]//2] = 0 331 | kx = kx[s[0]] 332 | kz = kz[s[2]] 333 | Ks = np.meshgrid(kx, ky, kz, indexing='ij', sparse=True) 334 | if scaled is True: 335 | Lp = 2*np.pi/self.L 336 | for i in range(3): 337 | Ks[i] = (Ks[i]*Lp[i]).astype(self.float) 338 | K = Ks 339 | if broadcast is True: 340 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks] 341 | return K 342 | 343 | def get_dealias_filter(self): 344 | """Filter for dealiasing nonlinear convection""" 345 | K = self.get_local_wavenumbermesh() 346 | kmax = 2./3.*(self.N//2+1) 347 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1])* 348 | (abs(K[2]) < kmax[2]), dtype=np.uint8) 349 | return dealias 350 | 351 | def copy_to_padded_x(self, fu, fp): 352 | fp[:self.N[0]//2] = fu[:self.N[0]//2] 353 | fp[-(self.N[0]//2):] = fu[self.N[0]//2:] 354 | return fp 355 | 356 | def copy_to_padded_y(self, fu, fp): 357 | fp[:, :self.N[1]//2] = fu[:, :self.N[1]//2] 358 | fp[:, -(self.N[1]//2):] = fu[:, self.N[1]//2:] 359 | return fp 360 | 361 | def copy_to_padded_z(self, fu, fp): 362 | fp[:, :, :self.Nf] = fu[:] 363 | return fp 364 | 365 | def copy_from_padded_z(self, fp, fu): 366 | fu[:] = fp[:, :, :self.Nf] 367 | return fu 368 | 369 | def copy_from_padded_x(self, fp, fu): 370 | fu.fill(0) 371 | fu[:self.N[0]//2+1] = fp[:self.N[0]//2+1] 372 | fu[self.N[0]//2:] += fp[-self.N[0]//2:] 373 | return fu 374 | 375 | def copy_from_padded_y(self, fp, fu): 376 | fu.fill(0) 377 | fu[:, :self.N[1]//2+1] = fp[:, :self.N[1]//2+1] 378 | fu[:, self.N[1]//2:] += fp[:, -self.N[1]//2:] 379 | return fu 380 | 381 | def global_complex_shape(self, padsize=1.0): 382 | """Global size of problem in complex wavenumber space""" 383 | return (int(padsize*self.N[0]), int(padsize*self.N[1]), 384 | int(padsize*self.N[2]//2+1)) 385 | 386 | def ifftn(self, fu, u, dealias=None): 387 | """ifft in three directions using mpi. 
388 | Need to do ifft in reversed order of fft 389 | """ 390 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 391 | 392 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 393 | self.dealias = self.get_dealias_filter() 394 | 395 | # Strip off self 396 | N, N1, N2, Nf, N1f = self.N, self.N1, self.N2, self.Nf, self.N1f 397 | 398 | if not dealias == '3/2-rule': 399 | 400 | fu_ = fu 401 | if dealias == '2/3-rule': 402 | fu_ = self.work_arrays[(fu, 0, False)] 403 | fu_[:] = fu 404 | fu_ = dealias_filter(fu_, self.dealias) 405 | #fu_ *= self.dealias 406 | 407 | Uc_hat_y = self.work_arrays[((N2[0], N[1], N1f), self.complex, 0, False)] 408 | Uc_hat_z = self.work_arrays[((N1[0], N2[1], Nf), self.complex, 0, False)] 409 | 410 | if self.communication == 'AlltoallN': 411 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0, False)] 412 | 413 | # Do first owned direction 414 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads, 415 | planner_effort=self.planner_effort['ifft']) 416 | 417 | # Transform to x all but k=N//2 (the neglected Nyquist mode) 418 | Uc_hat_x[:] = transform_Uc_xy(Uc_hat_x, Uc_hat_y, self.P2) 419 | 420 | # Communicate in xz-plane and do fft in x-direction 421 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 422 | Uc_hat_x[:] = ifft(Uc_hat_x, axis=0, threads=self.threads, 423 | planner_effort=self.planner_effort['ifft']) 424 | 425 | # Communicate and transform in xy-plane 426 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 427 | Uc_hat_z[:] = transform_Uc_zx(Uc_hat_z, Uc_hat_x, self.P1) 428 | 429 | # Do fft for z-direction 430 | Uc_hat_z[:, :, -1] = 0 431 | u[:] = irfft(Uc_hat_z, overwrite_input=True, axis=2, threads=self.threads, 432 | planner_effort=self.planner_effort['irfft']) 433 | 434 | elif self.communication == 'Alltoall': 435 | # Additional work arrays 436 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0, False)] 437 | Uc_hat_xp = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0, False)] 438 | xy_plane = self.work_arrays[((N[0], N2[1]), self.complex, 0, False)] 439 | xy_recv = self.work_arrays[((N1[0], N2[1]), self.complex, 0, False)] 440 | 441 | # Do first owned direction 442 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads, 443 | planner_effort=self.planner_effort['ifft']) 444 | 445 | # Transform to x 446 | Uc_hat_xp = transform_Uc_xy(Uc_hat_xp, Uc_hat_y, self.P2) 447 | 448 | ###### In-place 449 | ## Communicate in xz-plane and do fft in x-direction 450 | #self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_xp, self.mpitype]) 451 | #Uc_hat_xp[:] = ifft(Uc_hat_xp, axis=0, threads=self.threads, 452 | #planner_effort=self.planner_effort['ifft']) 453 | 454 | #Uc_hat_x[:] = Uc_hat_xp[:, :, :self.N1[2]//2] 455 | 456 | ## Communicate and transform in xy-plane all but k=N//2 457 | #self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 458 | 459 | ####### Not in-place 460 | # Communicate in xz-plane and do fft in x-direction 461 | Uc_hat_xp2 = self.work_arrays[((N[0], N2[1], N1f), self.complex, 1, False)] 462 | self.comm1.Alltoall([Uc_hat_xp, self.mpitype], [Uc_hat_xp2, self.mpitype]) 463 | Uc_hat_xp = ifft(Uc_hat_xp2, Uc_hat_xp, axis=0, threads=self.threads, 464 | planner_effort=self.planner_effort['ifft']) 465 | 466 | Uc_hat_x2 = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 1, False)] 467 | Uc_hat_x2[:] = Uc_hat_xp[:, :, :N1[2]//2] 468 | 469 | # Communicate and transform in xy-plane all but k=N//2 470 | self.comm0.Alltoall([Uc_hat_x2, self.mpitype], [Uc_hat_x, 
self.mpitype]) 471 | ######################### 472 | 473 | Uc_hat_z[:] = transform_Uc_zx(Uc_hat_z, Uc_hat_x, self.P1) 474 | 475 | xy_plane[:] = Uc_hat_xp[:, :, -1] 476 | self.comm0.Scatter(xy_plane, xy_recv, root=self.P1-1) 477 | Uc_hat_z[:, :, -1] = xy_recv 478 | 479 | # Do ifft for z-direction 480 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 481 | planner_effort=self.planner_effort['irfft']) 482 | 483 | elif self.communication == 'Alltoallw': 484 | if len(self._subarrays1A) == 0: 485 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 486 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 487 | 488 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0, False)] 489 | 490 | # Do first owned direction 491 | Uc_hat_y = ifft(fu_, Uc_hat_y, axis=1, threads=self.threads, 492 | planner_effort=self.planner_effort['ifft']) 493 | 494 | self.comm1.Alltoallw( 495 | [Uc_hat_y, self._counts_displs1, self._subarrays1A], 496 | [Uc_hat_x, self._counts_displs1, self._subarrays1B]) 497 | 498 | Uc_hat_x[:] = ifft(Uc_hat_x, axis=0, threads=self.threads, 499 | planner_effort=self.planner_effort['ifft']) 500 | 501 | self.comm0.Alltoallw( 502 | [Uc_hat_x, self._counts_displs2, self._subarrays2A], 503 | [Uc_hat_z, self._counts_displs2, self._subarrays2B]) 504 | 505 | # Do fft for z-direction 506 | u[:] = irfft(Uc_hat_z, overwrite_input=True, axis=2, threads=self.threads, 507 | planner_effort=self.planner_effort['irfft']) 508 | 509 | return u 510 | 511 | else: # padded 512 | 513 | padsize = self.padsize 514 | Uc_pad_hat_y = self.work_arrays[((N2[0], int(padsize*N[1]), N1f), self.complex, 0)] 515 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)] 516 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)] 517 | 518 | if self.communication == 'AlltoallN': 519 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 520 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 521 | 522 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y) 523 | 524 | # Do first owned direction 525 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads, 526 | planner_effort=self.planner_effort['ifft']) 527 | 528 | # Transform to x all but k=N//2 (the neglected Nyquist mode) 529 | Uc_pad_hat_x = transform_Uc_xy(Uc_pad_hat_x, Uc_pad_hat_y, self.P2) 530 | 531 | # Communicate in xz-plane 532 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 533 | 534 | # Pad and do fft in x-direction 535 | Uc_pad_hat_xy = self.copy_to_padded_x(Uc_pad_hat_x, Uc_pad_hat_xy) 536 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=0, threads=self.threads, 537 | planner_effort=self.planner_effort['ifft']) 538 | 539 | # Communicate in xy-plane 540 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 541 | 542 | # Transform 543 | Uc_pad_hat_z[:] = transform_Uc_zx(Uc_pad_hat_z, Uc_pad_hat_xy, self.P1) 544 | Uc_pad_hat_z[:, :, -1] = 0 545 | 546 | # Pad in z-dir 547 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 548 | 549 | # Do ifft for z-direction 550 | u = irfft(Uc_pad_hat_z2, u, axis=2, threads=self.threads, 551 | planner_effort=self.planner_effort['irfft']) 552 | 553 | elif self.communication == 'Alltoall': 554 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 555 | Uc_pad_hat_xy = 
self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 556 | Uc_pad_hat_xr2 = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 557 | Uc_pad_hat_xy3 = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)] 558 | xy2_pad_plane = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1])), self.complex, 0)] 559 | xy2_pad_recv = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1])), self.complex, 1)] 560 | 561 | # Pad in y-direction 562 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y) 563 | 564 | # Transform first owned direction 565 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads, 566 | planner_effort=self.planner_effort['ifft']) 567 | 568 | # Transpose datastructure to x 569 | Uc_pad_hat_xr2[:] = transform_Uc_xy(Uc_pad_hat_xr2, Uc_pad_hat_y, self.P2) 570 | 571 | # Communicate in xz-plane and do fft in x-direction 572 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xr2, self.mpitype]) 573 | 574 | # Pad and do fft in x-direction 575 | Uc_pad_hat_xy3 = self.copy_to_padded_x(Uc_pad_hat_xr2, Uc_pad_hat_xy3) 576 | Uc_pad_hat_xy3[:] = ifft(Uc_pad_hat_xy3, axis=0, threads=self.threads, 577 | planner_effort=self.planner_effort['ifft']) 578 | 579 | Uc_pad_hat_xy[:] = Uc_pad_hat_xy3[:, :, :N1[2]//2] 580 | 581 | # Communicate and transform in xy-plane all but k=N//2 582 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 583 | 584 | Uc_pad_hat_z[:] = transform_Uc_zx(Uc_pad_hat_z, Uc_pad_hat_xy, self.P1) 585 | 586 | xy2_pad_plane[:] = Uc_pad_hat_xy3[:, :, -1] 587 | self.comm0.Scatter(xy2_pad_plane, xy2_pad_recv, root=self.P1-1) 588 | Uc_pad_hat_z[:, :, -1] = xy2_pad_recv 589 | 590 | # Pad in z-dir 591 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 592 | 593 | # Do ifft for z-direction 594 | u = irfft(Uc_pad_hat_z2, u, axis=2, overwrite_input=True, threads=self.threads, 595 | planner_effort=self.planner_effort['irfft']) 596 | 597 | elif self.communication == 'Alltoallw': 598 | if len(self._subarrays1A_pad) == 0: 599 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 600 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 601 | 602 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 603 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)] 604 | 605 | # Pad in y-direction 606 | Uc_pad_hat_y = self.copy_to_padded_y(fu*padsize**3, Uc_pad_hat_y) 607 | 608 | # Transform first owned direction 609 | Uc_pad_hat_y[:] = ifft(Uc_pad_hat_y, axis=1, threads=self.threads, 610 | planner_effort=self.planner_effort['ifft']) 611 | 612 | self.comm1.Alltoallw( 613 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1A_pad], 614 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1B_pad]) 615 | 616 | # Pad and do fft in x-direction 617 | Uc_pad_hat_xy = self.copy_to_padded_x(Uc_pad_hat_x, Uc_pad_hat_xy) 618 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=0, threads=self.threads, 619 | planner_effort=self.planner_effort['ifft']) 620 | 621 | self.comm0.Alltoallw( 622 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad], 623 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad]) 624 | 625 | # Pad in z-dir 626 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 627 | 628 | # Do fft for z-direction 629 | u = irfft(Uc_pad_hat_z2, u, overwrite_input=True, axis=2, 
threads=self.threads, 630 | planner_effort=self.planner_effort['irfft']) 631 | 632 | return u 633 | 634 | def fftn(self, u, fu, dealias=None): 635 | """fft in three directions using mpi.""" 636 | 637 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 638 | 639 | # Strip off self 640 | N, N1, N2, Nf, N1f = self.N, self.N1, self.N2, self.Nf, self.N1f 641 | 642 | if not dealias == '3/2-rule': 643 | 644 | Uc_hat_y = self.work_arrays[((N2[0], N[1], N1f), self.complex, 0)] 645 | Uc_hat_z = self.work_arrays[((N1[0], N2[1], Nf), self.complex, 0)] 646 | 647 | if self.communication == 'AlltoallN': 648 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0)] 649 | 650 | # Do fft in z direction on owned data 651 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 652 | planner_effort=self.planner_effort['rfft']) 653 | 654 | # Transform to x direction neglecting k=N//2 (Nyquist) 655 | Uc_hat_x = transform_Uc_xz(Uc_hat_x, Uc_hat_z, self.P1) 656 | 657 | # Communicate and do fft in x-direction 658 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 659 | Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads, 660 | planner_effort=self.planner_effort['fft']) 661 | 662 | # Communicate and transform to final y-direction 663 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 664 | Uc_hat_y[:] = transform_Uc_yx(Uc_hat_y, Uc_hat_x, self.P2) 665 | 666 | # Do fft for last direction 667 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads, 668 | planner_effort=self.planner_effort['fft']) 669 | 670 | elif self.communication == 'Alltoall': 671 | 672 | # Additional work arrays 673 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 0)] 674 | Uc_hat_xr2= self.work_arrays[((N[0], N2[1], N1f), self.complex, 1)] 675 | xy_plane = self.work_arrays[((N[0], N2[1]), self.complex, 0)] 676 | xy_plane2 = self.work_arrays[((N[0]//2+1, N2[1]), self.complex, 0)] 677 | xy_recv = self.work_arrays[((N1[0], N2[1]), self.complex, 0)] 678 | 679 | # Do fft in z direction on owned data 680 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 681 | planner_effort=self.planner_effort['rfft']) 682 | 683 | # Move real part of Nyquist to k=0 684 | Uc_hat_z[:, :, 0] += 1j*Uc_hat_z[:, :, -1] 685 | 686 | # Transform to x direction neglecting k=N//2 (Nyquist) 687 | Uc_hat_x = transform_Uc_xz(Uc_hat_x, Uc_hat_z, self.P1) 688 | 689 | # In-place 690 | # Communicate and do fft in x-direction 691 | #self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 692 | #Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads, 693 | #planner_effort=self.planner_effort['fft']) 694 | 695 | # Not in-place 696 | Uc_hat_x2 = self.work_arrays[((N[0], N2[1], N1[2]//2), self.complex, 2, False)] 697 | self.comm0.Alltoall([Uc_hat_x, self.mpitype], [Uc_hat_x2, self.mpitype]) 698 | Uc_hat_x = fft(Uc_hat_x2, Uc_hat_x, axis=0, threads=self.threads, 699 | planner_effort=self.planner_effort['fft']) 700 | ################ 701 | 702 | Uc_hat_xr2[:, :, :N1[2]//2] = Uc_hat_x[:] 703 | 704 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 705 | if self.comm0_rank == 0: 706 | M = N[0] 707 | xy_plane[:] = Uc_hat_x[:, :, 0] 708 | xy_plane2[:] = np.vstack((xy_plane[0].real, 0.5*(xy_plane[1:M//2]+np.conj(xy_plane[:M//2:-1])), xy_plane[M//2].real)) 709 | Uc_hat_xr2[:, :, 0] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 710 | xy_plane2[:] = np.vstack((xy_plane[0].imag, -0.5*1j*(xy_plane[1:M//2]-np.conj(xy_plane[:M//2:-1])), xy_plane[M//2].imag)) 711 | xy_plane[:] = 
np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 712 | self.comm0.Send([xy_plane, self.mpitype], dest=self.P1-1, tag=77) 713 | 714 | if self.comm0_rank == self.P1-1: 715 | self.comm0.Recv([xy_plane, self.mpitype], source=0, tag=77) 716 | Uc_hat_xr2[:, :, -1] = xy_plane 717 | 718 | # Communicate and transform to final y-direction 719 | #self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_xr2, self.mpitype]) 720 | #Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_xr2, self.P2) 721 | # Not in-place 722 | Uc_hat_xr3 = self.work_arrays[((N[0], N2[1], N1f), self.complex, 3)] 723 | self.comm1.Alltoall([Uc_hat_xr2, self.mpitype], [Uc_hat_xr3, self.mpitype]) 724 | Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_xr3, self.P2) 725 | 726 | # Do fft for last direction 727 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads, 728 | planner_effort=self.planner_effort['fft']) 729 | 730 | elif self.communication == 'Alltoallw': 731 | if len(self._subarrays1A) == 0: 732 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 733 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 734 | 735 | Uc_hat_x = self.work_arrays[((N[0], N2[1], N1f), self.complex, 0)] 736 | 737 | # Do fft in z direction on owned data 738 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 739 | planner_effort=self.planner_effort['rfft']) 740 | 741 | self.comm0.Alltoallw( 742 | [Uc_hat_z, self._counts_displs2, self._subarrays2B], 743 | [Uc_hat_x, self._counts_displs2, self._subarrays2A]) 744 | 745 | Uc_hat_x[:] = fft(Uc_hat_x, axis=0, threads=self.threads, 746 | planner_effort=self.planner_effort['fft']) 747 | 748 | self.comm1.Alltoallw( 749 | [Uc_hat_x, self._counts_displs1, self._subarrays1B], 750 | [Uc_hat_y, self._counts_displs1, self._subarrays1A]) 751 | 752 | # Do fft for last direction 753 | fu = fft(Uc_hat_y, fu, axis=1, threads=self.threads, 754 | planner_effort=self.planner_effort['fft']) 755 | 756 | return fu 757 | 758 | else: # padded 759 | 760 | assert u.shape == self.real_shape_padded() 761 | 762 | padsize = self.padsize 763 | Uc_pad_hat_y = self.work_arrays[((N2[0], int(padsize*N[1]), N1f), self.complex, 0)] 764 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)] 765 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)] 766 | 767 | if self.communication == 'AlltoallN': 768 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 769 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 770 | 771 | # Do fft in z direction on owned data 772 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 773 | planner_effort=self.planner_effort['rfft']) 774 | 775 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 776 | 777 | # Transform to x direction neglecting k=N//2 (Nyquist) 778 | Uc_pad_hat_xy = transform_Uc_xz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P1) 779 | 780 | # Communicate and do fft in x-direction 781 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 782 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads, 783 | planner_effort=self.planner_effort['fft']) 784 | 785 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x) 786 | 787 | # Communicate and transform to final y-direction 788 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 789 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_x, 
self.P2) 790 | 791 | # Do fft for last direction 792 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads, 793 | planner_effort=self.planner_effort['fft']) 794 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu) 795 | fu /= padsize**3 796 | 797 | elif self.communication == 'Alltoall': 798 | 799 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 800 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1[2]//2), self.complex, 0)] 801 | xy_pad_plane = self.work_arrays[((N[0], int(padsize*N2[1])), self.complex, 0)] 802 | xy_pad_plane2= self.work_arrays[((N[0]//2+1, int(padsize*N2[1])), self.complex, 0)] 803 | Uc_pad_hat_xr2 = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 804 | 805 | # Do fft in z direction on owned data 806 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 807 | planner_effort=self.planner_effort['rfft']) 808 | 809 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 810 | 811 | # Move real part of Nyquist to k=0 812 | Uc_pad_hat_z[:, :, 0] += 1j*Uc_pad_hat_z[:, :, -1] 813 | 814 | # Transform to x direction neglecting k=N//2 (Nyquist) 815 | Uc_pad_hat_xy[:] = transform_Uc_xz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P1) 816 | 817 | # Communicate and do fft in x-direction 818 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy, self.mpitype]) 819 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads, 820 | planner_effort=self.planner_effort['fft']) 821 | 822 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x) 823 | 824 | Uc_pad_hat_xr2[:, :, :N1[2]//2] = Uc_pad_hat_x[:] 825 | 826 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 827 | if self.comm0_rank == 0: 828 | N = self.N[0] 829 | xy_pad_plane[:] = Uc_pad_hat_x[:, :, 0] 830 | xy_pad_plane2[:] = np.vstack((xy_pad_plane[0].real, 0.5*(xy_pad_plane[1:N//2]+np.conj(xy_pad_plane[:N//2:-1])), xy_pad_plane[N//2].real)) 831 | Uc_pad_hat_xr2[:, :, 0] = np.vstack((xy_pad_plane2, np.conj(xy_pad_plane2[(N//2-1):0:-1]))) 832 | xy_pad_plane2[:] = np.vstack((xy_pad_plane[0].imag, -0.5*1j*(xy_pad_plane[1:N//2]-np.conj(xy_pad_plane[:N//2:-1])), xy_pad_plane[N//2].imag)) 833 | xy_pad_plane[:] = np.vstack((xy_pad_plane2, np.conj(xy_pad_plane2[(N//2-1):0:-1]))) 834 | self.comm0.Send([xy_pad_plane, self.mpitype], dest=self.P1-1, tag=77) 835 | 836 | if self.comm0_rank == self.P1-1: 837 | self.comm0.Recv([xy_pad_plane, self.mpitype], source=0, tag=77) 838 | Uc_pad_hat_xr2[:, :, -1] = xy_pad_plane 839 | 840 | # Communicate and transform to final y-direction 841 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xr2, self.mpitype]) 842 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_xr2, self.P2) 843 | 844 | # Do fft for last direction 845 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads, 846 | planner_effort=self.planner_effort['fft']) 847 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu) 848 | fu /= padsize**3 849 | 850 | elif self.communication == 'Alltoallw': 851 | if len(self._subarrays1A_pad) == 0: 852 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 853 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 854 | 855 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N[0]), int(padsize*N2[1]), N1f), self.complex, 0)] 856 | Uc_pad_hat_x = self.work_arrays[((N[0], int(padsize*N2[1]), N1f), self.complex, 0)] 857 | 858 | # Do fft in z direction on owned data 859 | 
Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 860 | planner_effort=self.planner_effort['rfft']) 861 | 862 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 863 | 864 | self.comm0.Alltoallw( 865 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad], 866 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad]) 867 | 868 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=0, threads=self.threads, 869 | planner_effort=self.planner_effort['fft']) 870 | 871 | Uc_pad_hat_x = self.copy_from_padded_x(Uc_pad_hat_xy, Uc_pad_hat_x) 872 | 873 | self.comm1.Alltoallw( 874 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1B_pad], 875 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1A_pad]) 876 | 877 | # Do fft for last direction 878 | Uc_pad_hat_y[:] = fft(Uc_pad_hat_y, axis=1, threads=self.threads, 879 | planner_effort=self.planner_effort['fft']) 880 | fu = self.copy_from_padded_y(Uc_pad_hat_y, fu) 881 | fu /= padsize**3 882 | 883 | return fu 884 | 885 | class R2CX(R2CY): 886 | """Class for performing FFT in 3D using MPI 887 | 888 | Pencil decomposition 889 | 890 | Args: 891 | N - NumPy array([Nx, Ny, Nz]) setting the dimensions of the real mesh 892 | L - NumPy array([Lx, Ly, Lz]) setting the actual size of the computational domain 893 | MPI - The MPI object (from mpi4py import MPI) 894 | precision - "single" or "double" 895 | communication - Communication scheme. ('AlltoallN', 'Alltoall' or 'Alltoallw') 896 | padsize - The size of padding, if padding is used in transforms 897 | threads - Number of threads used by FFTs 898 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 899 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 900 | 901 | This version has the final complex data aligned in the x-direction 902 | """ 903 | def __init__(self, N, L, comm, precision, P1=None, communication='Alltoall', 904 | padsize=1.5, threads=1, 905 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 906 | R2CY.__init__(self, N, L, comm, precision, P1=P1, communication=communication, 907 | padsize=padsize, threads=threads, planner_effort=planner_effort) 908 | self.N2f = self.N2[2]//2 if self.comm1_rank < self.P2-1 else self.N2[2]//2+1 909 | if self.communication == 'AlltoallN': 910 | self.N2f = self.N2[2]//2 911 | if self.communication == 'Alltoallw': 912 | q = _subsize(self.Nf, self.P2, self.comm1_rank) 913 | self.N2f = q 914 | 915 | def real_shape(self): 916 | """The local shape of the real data""" 917 | return (self.N1[0], self.N2[1], self.N[2]) 918 | 919 | def complex_shape(self): 920 | """The local shape of the complex data""" 921 | return (self.N[0], self.N1[1], self.N2f) 922 | 923 | def complex_shape_T(self): 924 | """The local transposed shape of the complex data""" 925 | return (self.Np[0], self.N[1], self.Nf) 926 | 927 | def complex_shape_I(self): 928 | """A local intermediate shape of the complex data""" 929 | return (self.Np[0], self.num_processes, self.Np[1], self.Nf) 930 | 931 | def real_local_slice(self, padsize=1): 932 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 933 | yzrank = self.comm1.Get_rank() # Local rank in xy-plane 934 | return (slice(int(padsize * xyrank * self.N1[0]), int(padsize * (xyrank+1) * self.N1[0]), 1), 935 | slice(int(padsize * yzrank * self.N2[1]), int(padsize * (yzrank+1) * self.N2[1]), 1), 936 | slice(0, int(padsize * self.N[2]))) 937 | 938 | def complex_local_slice(self): 939 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 
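        # The x-aligned pencils keep the full x-range locally in spectral space;
        # y is split over comm0 in blocks of N1[1], and z is split over comm1,
        # with self.N2f absorbing the extra Nyquist wavenumber that some of the
        # communication schemes place on the last rank.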
940 | yzrank = self.comm1.Get_rank() # Local rank in yz-plane 941 | return (slice(0, self.N[0]), 942 | slice(xyrank*self.N1[1], (xyrank+1)*self.N1[1], 1), 943 | slice(yzrank*self.N2[2]//2, yzrank*self.N2[2]//2 + self.N2f, 1)) 944 | 945 | def get_local_mesh(self): 946 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 947 | yzrank = self.comm1.Get_rank() # Local rank in xy-plane 948 | 949 | # Create the physical mesh 950 | x1 = slice(xyrank * self.N1[0], (xyrank+1) * self.N1[0], 1) 951 | x2 = slice(yzrank * self.N2[1], (yzrank+1) * self.N2[1], 1) 952 | X = np.mgrid[x1, x2, :self.N[2]].astype(self.float) 953 | X[0] *= self.L[0]/self.N[0] 954 | X[1] *= self.L[1]/self.N[1] 955 | X[2] *= self.L[2]/self.N[2] 956 | return X 957 | 958 | def get_local_wavenumbermesh(self): 959 | xyrank = self.comm0.Get_rank() # Local rank in xz-plane 960 | yzrank = self.comm1.Get_rank() # Local rank in yz-plane 961 | 962 | # Set wavenumbers in grid 963 | kx = fftfreq(self.N[0], 1./self.N[0]).astype(int) 964 | ky = fftfreq(self.N[1], 1./self.N[1]).astype(int) 965 | kz = fftfreq(self.N[2], 1./self.N[2]).astype(int) 966 | k2 = slice(xyrank*self.N1[1], (xyrank+1)*self.N1[1], 1) 967 | k1 = slice(yzrank*self.N2[2]//2, (yzrank+1)*self.N2[2]//2, 1) 968 | K = np.array(np.meshgrid(kx, ky[k2], kz[k1], indexing='ij'), dtype=self.float) 969 | return K 970 | 971 | def get_subarrays(self, padsize=1): 972 | datatype = MPI._typedict[np.dtype(self.complex).char] 973 | M, N, Q = self.N[0], self.N[1], self.Nf 974 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank) 975 | n = _subsize(N, self.P1, self.comm0_rank) 976 | q = _subsize(Q, self.P2, self.comm1_rank) 977 | _subarrays1A = [ 978 | datatype.Create_subarray([int(padsize*M),n,q], [l,n,q], [s,0,0]).Commit() 979 | for l, s in _distribution(int(padsize*M), self.P1) 980 | ] 981 | _subarrays1B = [ 982 | datatype.Create_subarray([m,N,q], [m,l,q], [0,s,0]).Commit() 983 | for l, s in _distribution(N, self.P1) 984 | ] 985 | _counts_displs1 = ([1] * self.P1, [0] * self.P1) 986 | 987 | m = _subsize(int(padsize*M), self.P1, self.comm0_rank) 988 | n = _subsize(int(padsize*N), self.P2, self.comm1_rank) 989 | q = _subsize(Q, self.P2, self.comm1_rank) 990 | _subarrays2A = [ 991 | datatype.Create_subarray([m,int(padsize*N),q], [m,l,q], [0,s,0]).Commit() 992 | for l, s in _distribution(int(padsize*N), self.P2) 993 | ] 994 | _subarrays2B = [ 995 | datatype.Create_subarray([m,n,Q], [m,n,l], [0,0,s]).Commit() 996 | for l, s in _distribution(Q, self.P2) 997 | ] 998 | _counts_displs2 = ([1] * self.P2, [0] * self.P2) 999 | return _subarrays1A, _subarrays1B, _subarrays2A, _subarrays2B, _counts_displs1, _counts_displs2 1000 | 1001 | def ifftn(self, fu, u, dealias=None): 1002 | """ifft in three directions using mpi 1003 | 1004 | Need to do ifft in reversed order of fft 1005 | """ 1006 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 1007 | 1008 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 1009 | self.dealias = self.get_dealias_filter() 1010 | 1011 | if not dealias == '3/2-rule': 1012 | 1013 | fu_ = fu 1014 | if dealias == '2/3-rule': 1015 | fu_ = self.work_arrays[(fu, 0, False)] 1016 | fu_[:] = fu 1017 | fu_ = dealias_filter(fu_, self.dealias) 1018 | #fu_ *= self.dealias 1019 | 1020 | # Intermediate work arrays required for transform 1021 | Uc_hat_z = self.work_arrays[((self.N1[0], self.N2[1], self.Nf), self.complex, 0)] 1022 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)] 1023 | 1024 | if self.communication == 'AlltoallN': 1025 | 
Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1026 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1027 | 1028 | # Do first owned direction 1029 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads, 1030 | planner_effort=self.planner_effort['ifft']) 1031 | 1032 | # Communicate in xz-plane and do fft in y-direction 1033 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 1034 | 1035 | # Transform to y all but k=N//2 (the neglected Nyquist mode) 1036 | Uc_hat_y = transform_Uc_yx(Uc_hat_y, Uc_hat_x, self.P1) 1037 | Uc_hat_y[:] = ifft(Uc_hat_y, axis=1, threads=self.threads, 1038 | planner_effort=self.planner_effort['ifft']) 1039 | 1040 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 1041 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1042 | Uc_hat_z[:] = transform_Uc_zy(Uc_hat_z, Uc_hat_y, self.P2) 1043 | 1044 | # Do ifft for z-direction 1045 | Uc_hat_z[:, :, -1] = 0 1046 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 1047 | planner_effort=self.planner_effort['irfft']) 1048 | 1049 | elif self.communication == 'Alltoall': 1050 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1051 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1052 | Uc_hat_y2 = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1053 | xy_plane_T = self.work_arrays[((self.N[1], self.N1[0]), self.complex, 0)] 1054 | xy_plane = xy_plane_T.transpose((1, 0)) 1055 | xy_recv = self.work_arrays[((self.N2[1], self.N1[0]), self.complex, 0)] 1056 | 1057 | # Do first owned direction 1058 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads, 1059 | planner_effort=self.planner_effort['ifft']) 1060 | 1061 | # Communicate in xz-plane and do fft in y-direction 1062 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 1063 | 1064 | # Transform to y all but k=N//2 (the neglected Nyquist mode) 1065 | Uc_hat_y2 = transform_Uc_yx(Uc_hat_y2, Uc_hat_x, self.P1) 1066 | Uc_hat_y2[:] = ifft(Uc_hat_y2, axis=1, threads=self.threads, 1067 | planner_effort=self.planner_effort['ifft']) 1068 | xy_plane[:] = Uc_hat_y2[:, :, -1] 1069 | 1070 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 
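                # Uc_hat_y is a transposed view of the contiguous buffer Uc_hat_y_T,
                # so the assignment below lays the data out with y as the first axis
                # of Uc_hat_y_T. The in-place Alltoall over comm1 then exchanges equal
                # contiguous blocks along that axis, and transform_Uc_zy gathers the
                # result into z-pencils. The Nyquist plane stored in xy_plane above is
                # redistributed separately by the Scatter a few lines further down.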
1071 | Uc_hat_y[:] = Uc_hat_y2[:, :, :self.N2[2]//2] 1072 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1073 | Uc_hat_z = transform_Uc_zy(Uc_hat_z, Uc_hat_y, self.P2) 1074 | 1075 | self.comm1.Scatter(xy_plane_T, xy_recv, root=self.P2-1) 1076 | Uc_hat_z[:, :, -1] = xy_recv.transpose((1, 0)) 1077 | 1078 | # Do ifft for z-direction 1079 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 1080 | planner_effort=self.planner_effort['irfft']) 1081 | 1082 | elif self.communication == 'Alltoallw': 1083 | if len(self._subarrays1A) == 0: 1084 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 1085 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 1086 | 1087 | Uc_hat_y = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1088 | 1089 | # Do first owned direction 1090 | Uc_hat_x = ifft(fu_, Uc_hat_x, axis=0, threads=self.threads, 1091 | planner_effort=self.planner_effort['ifft']) 1092 | 1093 | self.comm0.Alltoallw( 1094 | [Uc_hat_x, self._counts_displs1, self._subarrays1A], 1095 | [Uc_hat_y, self._counts_displs1, self._subarrays1B]) 1096 | 1097 | Uc_hat_y[:] = ifft(Uc_hat_y, axis=1, threads=self.threads, 1098 | planner_effort=self.planner_effort['ifft']) 1099 | 1100 | self.comm1.Alltoallw( 1101 | [Uc_hat_y, self._counts_displs2, self._subarrays2A], 1102 | [Uc_hat_z, self._counts_displs2, self._subarrays2B]) 1103 | # Do ifft for z-direction 1104 | u = irfft(Uc_hat_z, u, axis=2, threads=self.threads, 1105 | planner_effort=self.planner_effort['irfft']) 1106 | 1107 | else: 1108 | # Intermediate work arrays required for transform 1109 | Uc_pad_hat_z = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), self.Nf), self.complex, 0)] 1110 | Uc_pad_hat_z2 = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N2[1]), int(self.padsize*self.N[2]//2)+1), self.complex, 0)] 1111 | Uc_pad_hat_x = self.work_arrays[((int(self.padsize*self.N[0]), self.N1[1], self.N2f), self.complex, 0)] 1112 | 1113 | if self.communication == 'AlltoallN': 1114 | Uc_pad_hat_y_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1115 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1116 | Uc_pad_hat_xy_T= self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1117 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1118 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2[2]//2), self.complex, 0)] 1119 | 1120 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x) 1121 | 1122 | # Do first owned direction 1123 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads, 1124 | planner_effort=self.planner_effort['ifft']) 1125 | 1126 | # Communicate in xz-plane and do fft in y-direction 1127 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 1128 | 1129 | # Transform to y 1130 | Uc_pad_hat_y = transform_Uc_yx(Uc_pad_hat_y, Uc_pad_hat_x, self.P1) 1131 | Uc_pad_hat_xy2 = self.copy_to_padded_y(Uc_pad_hat_y, Uc_pad_hat_xy2) 1132 | 1133 | Uc_pad_hat_xy = ifft(Uc_pad_hat_xy2, Uc_pad_hat_xy, overwrite_input=True, axis=1, threads=self.threads, 1134 | planner_effort=self.planner_effort['ifft']) 1135 | 1136 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 
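                # With the 'AlltoallN' scheme the Nyquist plane in z is never
                # communicated; it is simply zeroed below before the final irfft.
                # The factor padsize**3 applied when fu was copied into the padded
                # buffer compensates for the normalization of the larger padded
                # inverse transforms.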
1137 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1138 | Uc_pad_hat_z[:] = transform_Uc_zy(Uc_pad_hat_z, Uc_pad_hat_xy, self.P2) 1139 | Uc_pad_hat_z[:, :, -1] = 0 1140 | 1141 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 1142 | 1143 | # Do ifft for z-direction 1144 | u = irfft(Uc_pad_hat_z2, u, overwrite_input=True, axis=2, threads=self.threads, 1145 | planner_effort=self.planner_effort['irfft']) 1146 | 1147 | elif self.communication == 'Alltoall': 1148 | Uc_pad_hat_y_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1149 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1150 | Uc_pad_hat_xy_T= self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0]), self.N2[2]//2), self.complex, 0)] 1151 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1152 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2[2]//2), self.complex, 0)] 1153 | Uc_pad_hat_y2_T= self.work_arrays[((self.N[1], int(self.padsize*self.N1[0]), self.N2f), self.complex, 0)] 1154 | Uc_pad_hat_y2 = Uc_pad_hat_y2_T.transpose((1, 0, 2)) 1155 | Uc_pad_hat_xy2= self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2f), self.complex, 0)] 1156 | 1157 | xy_plane_T = self.work_arrays[((int(self.padsize*self.N[1]), int(self.padsize*self.N1[0])), self.complex, 0)] 1158 | xy_plane = xy_plane_T.transpose((1, 0)) 1159 | xy_recv = self.work_arrays[((int(self.padsize*self.N2[1]), int(self.padsize*self.N1[0])), self.complex, 0)] 1160 | 1161 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x) 1162 | 1163 | # Do first owned direction 1164 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads, 1165 | planner_effort=self.planner_effort['ifft']) 1166 | 1167 | # Communicate in xz-plane and do fft in y-direction 1168 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 1169 | 1170 | # Transform to y 1171 | Uc_pad_hat_y2 = transform_Uc_yx(Uc_pad_hat_y2, Uc_pad_hat_x, self.P1) 1172 | 1173 | Uc_pad_hat_xy2 = self.copy_to_padded_y(Uc_pad_hat_y2, Uc_pad_hat_xy2) 1174 | 1175 | Uc_pad_hat_xy2[:] = ifft(Uc_pad_hat_xy2, axis=1, threads=self.threads, 1176 | planner_effort=self.planner_effort['ifft']) 1177 | xy_plane[:] = Uc_pad_hat_xy2[:, :, -1] 1178 | 1179 | # Communicate and transform in yz-plane. Transpose required to put distributed axis first. 
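                # Unlike 'AlltoallN', this scheme retains the Nyquist plane in z: it
                # was saved in xy_plane above, and the last rank of comm1 (which owns
                # the extra wavenumber and acts as root of the Scatter below)
                # distributes it so that every rank can restore
                # Uc_pad_hat_z[:, :, -1] before the final irfft on the padded grid.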
1180 | Uc_pad_hat_xy[:] = Uc_pad_hat_xy2[:, :, :self.N2[2]//2] 1181 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1182 | Uc_pad_hat_z = transform_Uc_zy(Uc_pad_hat_z, Uc_pad_hat_xy, self.P2) 1183 | 1184 | self.comm1.Scatter(xy_plane_T, xy_recv, root=self.P2-1) 1185 | Uc_pad_hat_z[:, :, -1] = xy_recv.transpose((1, 0)) 1186 | 1187 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 1188 | 1189 | # Do ifft for z-direction 1190 | u = irfft(Uc_pad_hat_z2, u, axis=2, threads=self.threads, 1191 | planner_effort=self.planner_effort['irfft']) 1192 | 1193 | elif self.communication == 'Alltoallw': 1194 | if len(self._subarrays1A_pad) == 0: 1195 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 1196 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 1197 | 1198 | Uc_pad_hat_y = self.work_arrays[((int(self.padsize*self.N1[0]), self.N[1], self.N2f), self.complex, 0)] 1199 | Uc_pad_hat_xy = self.work_arrays[((int(self.padsize*self.N1[0]), int(self.padsize*self.N[1]), self.N2f), self.complex, 0)] 1200 | 1201 | Uc_pad_hat_x = self.copy_to_padded_x(fu*self.padsize**3, Uc_pad_hat_x) 1202 | 1203 | # Do first owned direction 1204 | Uc_pad_hat_x[:] = ifft(Uc_pad_hat_x, axis=0, threads=self.threads, 1205 | planner_effort=self.planner_effort['ifft']) 1206 | 1207 | self.comm0.Alltoallw( 1208 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1A_pad], 1209 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1B_pad]) 1210 | 1211 | Uc_pad_hat_xy = self.copy_to_padded_y(Uc_pad_hat_y, Uc_pad_hat_xy) 1212 | 1213 | Uc_pad_hat_xy[:] = ifft(Uc_pad_hat_xy, axis=1, threads=self.threads, 1214 | planner_effort=self.planner_effort['ifft']) 1215 | 1216 | self.comm1.Alltoallw( 1217 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad], 1218 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad]) 1219 | 1220 | Uc_pad_hat_z2 = self.copy_to_padded_z(Uc_pad_hat_z, Uc_pad_hat_z2) 1221 | 1222 | # Do ifft for z-direction 1223 | u = irfft(Uc_pad_hat_z2, u, axis=2, overwrite_input=True, threads=self.threads, 1224 | planner_effort=self.planner_effort['irfft']) 1225 | 1226 | return u 1227 | 1228 | def fftn(self, u, fu, dealias=None): 1229 | """fft in three directions using mpi.""" 1230 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 1231 | 1232 | if not dealias == '3/2-rule': 1233 | 1234 | # Intermediate work arrays required for transform 1235 | Uc_hat_z = self.work_arrays[((self.N1[0], self.N2[1], self.Nf), self.complex, 0)] 1236 | 1237 | if self.communication == 'AlltoallN': 1238 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2[2]//2), self.complex, 0)] 1239 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1240 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1241 | Uc_hat_y2= self.work_arrays[((self.N1[0], self.N[1], self.N2[2]//2), self.complex, 1)] 1242 | 1243 | # Do fft in z direction on owned data 1244 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 1245 | planner_effort=self.planner_effort['rfft']) 1246 | 1247 | # Transform to y direction neglecting k=N//2 (Nyquist) 1248 | Uc_hat_y = transform_Uc_yz(Uc_hat_y, Uc_hat_z, self.P2) 1249 | 1250 | # Communicate and do fft in y-direction. 
Transpose required to put distributed axis first 1251 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1252 | Uc_hat_y2 = fft(Uc_hat_y, Uc_hat_y2, axis=1, threads=self.threads, 1253 | planner_effort=self.planner_effort['fft']) 1254 | 1255 | # Communicate and transform to final x-direction 1256 | Uc_hat_x = transform_Uc_xy(Uc_hat_x, Uc_hat_y2, self.P1) 1257 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x, self.mpitype]) 1258 | 1259 | # Do fft for last direction 1260 | fu = fft(Uc_hat_x, fu, axis=0, threads=self.threads, 1261 | planner_effort=self.planner_effort['fft']) 1262 | 1263 | elif self.communication == 'Alltoall': 1264 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2[2]//2), self.complex, 0)] 1265 | Uc_hat_y_T= self.work_arrays[((self.N[1], self.N1[0], self.N2[2]//2), self.complex, 0)] 1266 | Uc_hat_y = Uc_hat_y_T.transpose((1, 0, 2)) 1267 | Uc_hat_y2 = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1268 | Uc_hat_x2 = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)] 1269 | Uc_hat_y3 = self.work_arrays[((self.N1[0], self.N[1], self.N2[2]//2), self.complex, 0)] 1270 | xy_plane_T = self.work_arrays[((self.N[1], self.N1[0]), self.complex, 0)] 1271 | xy_plane = xy_plane_T.transpose((1, 0)) 1272 | xy_plane2 = self.work_arrays[((self.N[1]//2+1, self.N1[0]), self.complex, 0)] 1273 | 1274 | # Do fft in z direction on owned data 1275 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 1276 | planner_effort=self.planner_effort['rfft']) 1277 | 1278 | # Move real part of Nyquist to k=0 1279 | Uc_hat_z[:, :, 0] += 1j*Uc_hat_z[:, :, -1] 1280 | 1281 | # Transform to y direction neglecting k=N//2 (Nyquist) 1282 | Uc_hat_y = transform_Uc_yz(Uc_hat_y, Uc_hat_z, self.P2) 1283 | 1284 | # Communicate and do fft in y-direction. 
Transpose required to put distributed axis first 1285 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_hat_y_T, self.mpitype]) 1286 | Uc_hat_y3 = fft(Uc_hat_y, Uc_hat_y3, axis=1, threads=self.threads, 1287 | planner_effort=self.planner_effort['fft']) 1288 | Uc_hat_y2[:, :, :self.N2[2]//2] = Uc_hat_y3[:] 1289 | 1290 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 1291 | if self.comm1_rank == 0: 1292 | M = self.N[1] 1293 | xy_plane[:] = Uc_hat_y3[:, :, 0] 1294 | xy_plane2[:] = np.vstack((xy_plane_T[0].real, 0.5*(xy_plane_T[1:M//2]+np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].real)) 1295 | Uc_hat_y2[:, :, 0] = (np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))).transpose((1, 0)) 1296 | xy_plane2[:] = np.vstack((xy_plane_T[0].imag, -0.5*1j*(xy_plane_T[1:M//2]-np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].imag)) 1297 | xy_plane_T[:] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 1298 | self.comm1.Send([xy_plane_T, self.mpitype], dest=self.P2-1, tag=77) 1299 | 1300 | if self.comm1_rank == self.P2-1: 1301 | self.comm1.Recv([xy_plane_T, self.mpitype], source=0, tag=77) 1302 | Uc_hat_y2[:, :, -1] = xy_plane_T.transpose((1, 0)) 1303 | 1304 | # Communicate and transform to final x-direction 1305 | Uc_hat_x2 = transform_Uc_xy(Uc_hat_x2, Uc_hat_y2, self.P1) 1306 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_hat_x2, self.mpitype]) 1307 | 1308 | # Do fft for last direction 1309 | fu = fft(Uc_hat_x2, fu, axis=0, threads=self.threads, 1310 | planner_effort=self.planner_effort['fft']) 1311 | 1312 | elif self.communication == 'Alltoallw': 1313 | Uc_hat_y = self.work_arrays[((self.N1[0], self.N[1], self.N2f), self.complex, 0)] 1314 | Uc_hat_x = self.work_arrays[((self.N[0], self.N1[1], self.N2f), self.complex, 0)] 1315 | 1316 | if len(self._subarrays1A) == 0: 1317 | (self._subarrays1A, self._subarrays1B, self._subarrays2A, 1318 | self._subarrays2B, self._counts_displs1, self._counts_displs2) = self.get_subarrays() 1319 | 1320 | # Do fft in z direction on owned data 1321 | Uc_hat_z = rfft(u, Uc_hat_z, axis=2, threads=self.threads, 1322 | planner_effort=self.planner_effort['rfft']) 1323 | 1324 | self.comm1.Alltoallw( 1325 | [Uc_hat_z, self._counts_displs2, self._subarrays2B], 1326 | [Uc_hat_y, self._counts_displs2, self._subarrays2A]) 1327 | Uc_hat_y[:] = fft(Uc_hat_y, axis=1, threads=self.threads, 1328 | planner_effort=self.planner_effort['fft']) 1329 | 1330 | # Communicate and transform to final x-direction 1331 | self.comm0.Alltoallw( 1332 | [Uc_hat_y, self._counts_displs1, self._subarrays1B], 1333 | [Uc_hat_x, self._counts_displs1, self._subarrays1A]) 1334 | 1335 | # Do fft for last direction 1336 | fu = fft(Uc_hat_x, fu, axis=0, threads=self.threads, 1337 | planner_effort=self.planner_effort['fft']) 1338 | 1339 | else: 1340 | 1341 | assert u.shape == self.real_shape_padded() 1342 | padsize = self.padsize 1343 | # Strip off self 1344 | N, N1, N2, Nf, N2f = self.N, self.N1, self.N2, self.Nf, self.N2f 1345 | 1346 | # Intermediate work arrays required for transform 1347 | Uc_pad_hat_z = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), Nf), self.complex, 0)] 1348 | Uc_pad_hat_z2 = self.work_arrays[((int(padsize*N1[0]), int(padsize*N2[1]), int(padsize*N[2]//2)+1), self.complex, 0)] 1349 | 1350 | if self.communication == 'AlltoallN': 1351 | Uc_pad_hat_x = self.work_arrays[((int(padsize*N[0]), N1[1], N2[2]//2), self.complex, 0)] 1352 | Uc_pad_hat_xy_T= self.work_arrays[((int(padsize*N[1]), int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1353 | Uc_pad_hat_xy = 
Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1354 | Uc_pad_hat_xy2= self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2[2]//2), self.complex, 0)] 1355 | Uc_pad_hat_y_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1356 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1357 | 1358 | # Do fft in z direction on owned data 1359 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 1360 | planner_effort=self.planner_effort['rfft']) 1361 | 1362 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 1363 | 1364 | # Transform to y direction neglecting k=N//2 (Nyquist) 1365 | Uc_pad_hat_xy = transform_Uc_yz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P2) 1366 | 1367 | # Communicate and do fft in y-direction. Transpose required to put distributed axis first 1368 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1369 | Uc_pad_hat_xy2 = fft(Uc_pad_hat_xy, Uc_pad_hat_xy2, axis=1, threads=self.threads, 1370 | planner_effort=self.planner_effort['fft']) 1371 | 1372 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy2, Uc_pad_hat_y) 1373 | 1374 | # Communicate and transform to final x-direction 1375 | Uc_pad_hat_x = transform_Uc_xy(Uc_pad_hat_x, Uc_pad_hat_y, self.P1) 1376 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x, self.mpitype]) 1377 | 1378 | # Do fft for last direction 1379 | Uc_pad_hat_x[:] = fft(Uc_pad_hat_x, axis=0, threads=self.threads, 1380 | planner_effort=self.planner_effort['fft']) 1381 | fu = self.copy_from_padded_x(Uc_pad_hat_x, fu) 1382 | fu /= padsize**3 1383 | 1384 | elif self.communication == 'Alltoall': 1385 | Uc_pad_hat_xy_T= self.work_arrays[((int(padsize*N[1]), int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1386 | Uc_pad_hat_xy = Uc_pad_hat_xy_T.transpose((1, 0, 2)) 1387 | Uc_pad_hat_xy2= self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2[2]//2), self.complex, 0)] 1388 | Uc_pad_hat_y_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2[2]//2), self.complex, 0)] 1389 | Uc_pad_hat_y = Uc_pad_hat_y_T.transpose((1, 0, 2)) 1390 | Uc_pad_hat_y2_T= self.work_arrays[((N[1], int(padsize*N1[0]), N2f), self.complex, 0)] 1391 | Uc_pad_hat_y2 = Uc_pad_hat_y2_T.transpose((1, 0, 2)) 1392 | Uc_pad_hat_x2 = self.work_arrays[((int(padsize*N[0]), N1[1], N2f), self.complex, 0)] 1393 | xy_plane_T = self.work_arrays[((self.N[1], int(self.padsize*self.N1[0])), self.complex, 0)] 1394 | xy_plane = xy_plane_T.transpose((1, 0)) 1395 | xy_plane2 = self.work_arrays[((self.N[1]//2+1, int(self.padsize*self.N1[0])), self.complex, 0)] 1396 | 1397 | # Do fft in z direction on owned data 1398 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 1399 | planner_effort=self.planner_effort['rfft']) 1400 | 1401 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 1402 | 1403 | # Move real part of Nyquist to k=0 1404 | Uc_pad_hat_z[:, :, 0] += 1j*Uc_pad_hat_z[:, :, -1] 1405 | 1406 | # Transform to y direction neglecting k=N//2 (Nyquist) 1407 | Uc_pad_hat_xy = transform_Uc_yz(Uc_pad_hat_xy, Uc_pad_hat_z, self.P2) 1408 | 1409 | # Communicate and do fft in y-direction. 
Transpose required to put distributed axis first 1410 | self.comm1.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_xy_T, self.mpitype]) 1411 | Uc_pad_hat_xy2 = fft(Uc_pad_hat_xy, Uc_pad_hat_xy2, axis=1, threads=self.threads, 1412 | planner_effort=self.planner_effort['fft']) 1413 | 1414 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy2, Uc_pad_hat_y) 1415 | 1416 | Uc_pad_hat_y2[:, :, :self.N2[2]//2] = Uc_pad_hat_y[:] 1417 | 1418 | # Now both k=0 and k=N//2 are contained in 0 of comm0_rank = 0 1419 | if self.comm1_rank == 0: 1420 | M = self.N[1] 1421 | xy_plane[:] = Uc_pad_hat_y[:, :, 0] 1422 | xy_plane2[:] = np.vstack((xy_plane_T[0].real, 0.5*(xy_plane_T[1:M//2]+np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].real)) 1423 | Uc_pad_hat_y2[:, :, 0] = (np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1])))).transpose((1, 0)) 1424 | xy_plane2[:] = np.vstack((xy_plane_T[0].imag, -0.5*1j*(xy_plane_T[1:M//2]-np.conj(xy_plane_T[:M//2:-1])), xy_plane_T[M//2].imag)) 1425 | xy_plane_T[:] = np.vstack((xy_plane2, np.conj(xy_plane2[(M//2-1):0:-1]))) 1426 | self.comm1.Send([xy_plane_T, self.mpitype], dest=self.P2-1, tag=77) 1427 | 1428 | if self.comm1_rank == self.P2-1: 1429 | self.comm1.Recv([xy_plane_T, self.mpitype], source=0, tag=77) 1430 | Uc_pad_hat_y2[:, :, -1] = xy_plane_T.transpose((1, 0)) 1431 | 1432 | # Communicate and transform to final x-direction 1433 | Uc_pad_hat_x2 = transform_Uc_xy(Uc_pad_hat_x2, Uc_pad_hat_y2, self.P1) 1434 | self.comm0.Alltoall(MPI.IN_PLACE, [Uc_pad_hat_x2, self.mpitype]) 1435 | 1436 | # Do fft for last direction 1437 | Uc_pad_hat_x2[:] = fft(Uc_pad_hat_x2, axis=0, threads=self.threads, 1438 | planner_effort=self.planner_effort['fft']) 1439 | fu = self.copy_from_padded_x(Uc_pad_hat_x2, fu) 1440 | fu /= padsize**3 1441 | 1442 | elif self.communication == 'Alltoallw': 1443 | Uc_pad_hat_y = self.work_arrays[((int(padsize*N1[0]), N[1], N2f), self.complex, 0)] 1444 | Uc_pad_hat_xy = self.work_arrays[((int(padsize*N1[0]), int(padsize*N[1]), N2f), self.complex, 0)] 1445 | Uc_pad_hat_x = self.work_arrays[((int(padsize*N[0]), N1[1], N2f), self.complex, 0)] 1446 | 1447 | if len(self._subarrays1A_pad) == 0: 1448 | (self._subarrays1A_pad, self._subarrays1B_pad, self._subarrays2A_pad, 1449 | self._subarrays2B_pad, self._counts_displs1, self._counts_displs2) = self.get_subarrays(padsize=self.padsize) 1450 | 1451 | # Do fft in z direction on owned data 1452 | Uc_pad_hat_z2 = rfft(u, Uc_pad_hat_z2, axis=2, threads=self.threads, 1453 | planner_effort=self.planner_effort['rfft']) 1454 | 1455 | Uc_pad_hat_z = self.copy_from_padded_z(Uc_pad_hat_z2, Uc_pad_hat_z) 1456 | 1457 | self.comm1.Alltoallw( 1458 | [Uc_pad_hat_z, self._counts_displs2, self._subarrays2B_pad], 1459 | [Uc_pad_hat_xy, self._counts_displs2, self._subarrays2A_pad]) 1460 | 1461 | Uc_pad_hat_xy[:] = fft(Uc_pad_hat_xy, axis=1, threads=self.threads, 1462 | planner_effort=self.planner_effort['fft']) 1463 | 1464 | Uc_pad_hat_y = self.copy_from_padded_y(Uc_pad_hat_xy, Uc_pad_hat_y) 1465 | 1466 | # Communicate and transform to final x-direction 1467 | self.comm0.Alltoallw( 1468 | [Uc_pad_hat_y, self._counts_displs1, self._subarrays1B_pad], 1469 | [Uc_pad_hat_x, self._counts_displs1, self._subarrays1A_pad]) 1470 | 1471 | # Do fft for last direction 1472 | Uc_pad_hat_x[:] = fft(Uc_pad_hat_x, axis=0, threads=self.threads, 1473 | planner_effort=self.planner_effort['fft']) 1474 | fu = self.copy_from_padded_x(Uc_pad_hat_x, fu) 1475 | fu /= padsize**3 1476 | 1477 | return fu 1478 | 1479 | def R2C(N, L, comm, precision, P1=None, 
communication="Alltoall", padsize=1.5, threads=1, 1480 | alignment="X", planner_effort=defaultdict(lambda : "FFTW_MEASURE")): 1481 | if alignment == 'X': 1482 | return R2CX(N, L, comm, precision, P1, communication, padsize, threads, planner_effort) 1483 | else: 1484 | return R2CY(N, L, comm, precision, P1, communication, padsize, threads, planner_effort) 1485 | -------------------------------------------------------------------------------- /mpiFFT4py/serialFFT/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | #assert False 3 | from .pyfftw_fft import * 4 | 5 | except: 6 | from .numpy_fft import * 7 | -------------------------------------------------------------------------------- /mpiFFT4py/serialFFT/numpy_fft.py: -------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-02-16" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | __all__ = ['dct', 'fft', 'ifft', 'fft2', 'ifft2', 'fftn', 'ifftn', 7 | 'rfft', 'irfft', 'rfft2', 'irfft2', 'rfftn', 'irfftn'] 8 | 9 | from numpy import iscomplexobj 10 | import numpy.fft 11 | from scipy.fftpack import dct 12 | 13 | dct1 = dct 14 | def dct(a, b, type=2, axis=0, **kw): 15 | if iscomplexobj(a): 16 | b.real[:] = dct1(a.real, type=type, axis=axis) 17 | b.imag[:] = dct1(a.imag, type=type, axis=axis) 18 | return b 19 | 20 | else: 21 | b[:] = dct1(a, type=type, axis=axis) 22 | return b 23 | 24 | # Define functions taking both input array and output array 25 | def fft(a, b=None, axis=0, threads=1, **kw): 26 | if b is None: 27 | return numpy.fft.fft(a, axis=axis) 28 | else: 29 | b[:] = numpy.fft.fft(a, axis=axis) 30 | return b 31 | 32 | def ifft(a, b=None, axis=0, threads=1, **kw): 33 | if b is None: 34 | return numpy.fft.ifft(a, axis=axis) 35 | else: 36 | b[:] = numpy.fft.ifft(a, axis=axis) 37 | return b 38 | 39 | def rfft(a, b=None, axis=0, threads=1, **kw): 40 | if b is None: 41 | return numpy.fft.rfft(a, axis=axis) 42 | else: 43 | b[:] = numpy.fft.rfft(a, axis=axis) 44 | return b 45 | 46 | def irfft(a, b=None, axis=0, threads=1, **kw): 47 | if b is None: 48 | return numpy.fft.irfft(a, axis=axis) 49 | else: 50 | b[:] = numpy.fft.irfft(a, axis=axis) 51 | return b 52 | 53 | def fft2(a, b=None, axes=(0, 1), threads=1, **kw): 54 | if b is None: 55 | return numpy.fft.fft2(a, axes=axes) 56 | else: 57 | b[:] = numpy.fft.fft2(a, axes=axes) 58 | return b 59 | 60 | def ifft2(a, b=None, axes=(0, 1), threads=1, **kw): 61 | if b is None: 62 | return numpy.fft.ifft2(a, axes=axes) 63 | else: 64 | b[:] = numpy.fft.ifft2(a, axes=axes) 65 | return b 66 | 67 | def rfft2(a, b=None, axes=(0, 1), threads=1, **kw): 68 | if b is None: 69 | return numpy.fft.rfft2(a, axes=axes) 70 | else: 71 | b[:] = numpy.fft.rfft2(a, axes=axes) 72 | return b 73 | 74 | def irfft2(a, b=None, axes=(0, 1), threads=1, **kw): 75 | if b is None: 76 | return numpy.fft.irfft2(a, axes=axes) 77 | else: 78 | b[:] = numpy.fft.irfft2(a, axes=axes) 79 | return b 80 | 81 | def fftn(a, b=None, axes=(0, 1, 2), threads=1, **kw): 82 | if b is None: 83 | return numpy.fft.fftn(a, axes=axes) 84 | else: 85 | b[:] = numpy.fft.fftn(a, axes=axes) 86 | return b 87 | 88 | def ifftn(a, b=None, axes=(0, 1, 2), threads=1, **kw): 89 | if b is None: 90 | return numpy.fft.ifftn(a, axes=axes) 91 | else: 92 | b[:] = numpy.fft.ifftn(a, axes=axes) 93 | return b 94 | 95 | def rfftn(a, b=None, axes=(0, 1, 2), threads=1, 
**kw): 96 | if b is None: 97 | return numpy.fft.rfftn(a, axes=axes) 98 | else: 99 | b[:] = numpy.fft.rfftn(a, axes=axes) 100 | return b 101 | 102 | def irfftn(a, b=None, axes=(0, 1, 2), threads=1, **kw): 103 | if b is None: 104 | return numpy.fft.irfftn(a, axes=axes) 105 | else: 106 | b[:] = numpy.fft.irfftn(a, axes=axes) 107 | return b 108 | -------------------------------------------------------------------------------- /mpiFFT4py/serialFFT/pyfftw_fft.py: -------------------------------------------------------------------------------- 1 | __author__ = "Mikael Mortensen " 2 | __date__ = "2016-02-16" 3 | __copyright__ = "Copyright (C) 2016 " + __author__ 4 | __license__ = "GNU Lesser GPL version 3 or any later version" 5 | 6 | __all__ = ['dct', 'fft', 'ifft', 'fft2', 'ifft2', 'fftn', 'ifftn', 7 | 'rfft', 'irfft', 'rfft2', 'irfft2', 'rfftn', 'irfftn'] 8 | 9 | import pyfftw 10 | from numpy import iscomplexobj 11 | 12 | dct_object = {} 13 | fft_object = {} 14 | ifft_object = {} 15 | fft2_object = {} 16 | ifft2_object = {} 17 | fftn_object = {} 18 | ifftn_object = {} 19 | irfft_object = {} 20 | irfftn_object = {} 21 | irfft2_object = {} 22 | rfft2_object = {} 23 | rfft_object = {} 24 | rfftn_object = {} 25 | 26 | def ifft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 27 | global ifft_object 28 | if not (a.shape, a.dtype, overwrite_input, axis) in ifft_object: 29 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.ifft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 30 | if not b is None: 31 | if b.flags['C_CONTIGUOUS'] is True: 32 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b) 33 | else: 34 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 35 | b[:] = ifft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 36 | return b 37 | else: 38 | ifft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 39 | return ifft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 40 | 41 | def ifft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 42 | global ifft2_object 43 | if not (a.shape, a.dtype, overwrite_input, axes) in ifft2_object: 44 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.ifft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 45 | if not b is None: 46 | if b.flags['C_CONTIGUOUS'] is True: 47 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 48 | else: 49 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 50 | b[:] = ifft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 51 | return b 52 | else: 53 | ifft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 54 | return ifft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 55 | 56 | def ifftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 57 | global ifftn_object 58 | if not (a.shape, a.dtype, overwrite_input, axes) in ifftn_object: 59 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.ifftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 60 | if not b is None: 61 | if b.flags['C_CONTIGUOUS'] is True: 62 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 63 | else: 64 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 65 | b[:] = ifftn_object[(a.shape, a.dtype, overwrite_input, 
axes)].output_array 66 | return b 67 | else: 68 | ifftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 69 | return ifftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 70 | 71 | def irfft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 72 | global irfft_object 73 | if not (a.shape, a.dtype, axis) in irfft_object: 74 | irfft_object[(a.shape, a.dtype, axis)] = pyfftw.builders.irfft(a, axis=axis, threads=threads, planner_effort=planner_effort) 75 | if overwrite_input: 76 | irfft_object[(a.shape, a.dtype, axis)](a) 77 | else: 78 | irfft_object[(a.shape, a.dtype, axis)](a.copy()) 79 | if not b is None: 80 | b[:] = irfft_object[(a.shape, a.dtype, axis)].output_array 81 | return b 82 | else: 83 | return irfft_object[(a.shape, a.dtype, axis)].output_array 84 | 85 | def irfft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 86 | global irfft2_object 87 | if not (a.shape, a.dtype, axes) in irfft2_object: 88 | irfft2_object[(a.shape, a.dtype, axes)] = pyfftw.builders.irfft2(a, axes=axes, threads=threads, planner_effort=planner_effort) 89 | # Copy required for irfft2 because input is destroyed 90 | if overwrite_input: 91 | irfft2_object[(a.shape, a.dtype, axes)](a) 92 | else: 93 | irfft2_object[(a.shape, a.dtype, axes)](a.copy()) 94 | if not b is None: 95 | b[:] = irfft2_object[(a.shape, a.dtype, axes)].output_array 96 | return b 97 | else: 98 | return irfft2_object[(a.shape, a.dtype, axes)].output_array 99 | 100 | def irfftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 101 | global irfftn_object 102 | if not (a.shape, a.dtype, axes) in irfftn_object: 103 | irfftn_object[(a.shape, a.dtype, axes)] = pyfftw.builders.irfftn(a, axes=axes, threads=threads, planner_effort=planner_effort) 104 | # Copy required because input is always destroyed 105 | if overwrite_input: 106 | irfftn_object[(a.shape, a.dtype, axes)](a) 107 | else: 108 | irfftn_object[(a.shape, a.dtype, axes)](a.copy()) 109 | if not b is None: 110 | b[:] = irfftn_object[(a.shape, a.dtype, axes)].output_array 111 | return b 112 | else: 113 | return irfftn_object[(a.shape, a.dtype, axes)].output_array 114 | 115 | def fft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 116 | global fft_object 117 | if not (a.shape, a.dtype, overwrite_input, axis) in fft_object: 118 | fft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.fft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 119 | if not b is None: 120 | if b.flags['C_CONTIGUOUS'] is True: 121 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b) 122 | else: 123 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 124 | b[:] = fft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 125 | return b 126 | else: 127 | fft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 128 | return fft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 129 | 130 | def fft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 131 | global fft2_object 132 | if not (a.shape, a.dtype, overwrite_input, axes) in fft2_object: 133 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.fft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 134 | if not b is None: 135 | if b.flags['C_CONTIGUOUS'] is True: 136 | fft2_object[(a.shape, a.dtype, 
overwrite_input, axes)](a, b) 137 | else: 138 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 139 | b[:] = fft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 140 | return b 141 | else: 142 | fft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 143 | return fft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 144 | 145 | def fftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 146 | global fftn_object 147 | if not (a.shape, a.dtype, overwrite_input, axes) in fftn_object: 148 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.fftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 149 | if not b is None: 150 | if b.flags['C_CONTIGUOUS'] is True: 151 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 152 | else: 153 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 154 | b[:] = fftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 155 | return b 156 | else: 157 | fftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 158 | return fftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 159 | 160 | def rfft(a, b=None, axis=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 161 | global rfft_object 162 | if not (a.shape, a.dtype, overwrite_input, axis) in rfft_object: 163 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)] = pyfftw.builders.rfft(a, axis=axis, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 164 | if not b is None: 165 | if b.flags['C_CONTIGUOUS'] is True: 166 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a, b) 167 | else: 168 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 169 | b[:] = rfft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 170 | return b 171 | else: 172 | rfft_object[(a.shape, a.dtype, overwrite_input, axis)](a) 173 | return rfft_object[(a.shape, a.dtype, overwrite_input, axis)].output_array 174 | 175 | def rfft2(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 176 | global rfft2_object 177 | if not (a.shape, a.dtype, overwrite_input, axes) in rfft2_object: 178 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.rfft2(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 179 | if not b is None: 180 | if b.flags['C_CONTIGUOUS'] is True: 181 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 182 | else: 183 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 184 | b[:] = rfft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 185 | return b 186 | else: 187 | rfft2_object[(a.shape, a.dtype, overwrite_input, axes)](a) 188 | return rfft2_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 189 | 190 | def rfftn(a, b=None, axes=None, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 191 | global rfftn_object 192 | if not (a.shape, a.dtype, overwrite_input, axes) in rfftn_object: 193 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)] = pyfftw.builders.rfftn(a, axes=axes, overwrite_input=overwrite_input, threads=threads, planner_effort=planner_effort) 194 | if not b is None: 195 | if b.flags['C_CONTIGUOUS'] is True: 196 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a, b) 197 | else: 198 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 199 | b[:] = rfftn_object[(a.shape, a.dtype, 
overwrite_input, axes)].output_array 200 | return b 201 | else: 202 | rfftn_object[(a.shape, a.dtype, overwrite_input, axes)](a) 203 | return rfftn_object[(a.shape, a.dtype, overwrite_input, axes)].output_array 204 | 205 | if hasattr(pyfftw.builders, "dct"): 206 | #@profile 207 | def dct(a, b, type=2, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_EXHAUSTIVE"): 208 | global dct_object 209 | key = (a.shape, a.dtype, overwrite_input, axis, type) 210 | if not key in dct_object: 211 | if iscomplexobj(a): 212 | ac = a.real.copy() 213 | else: 214 | ac = a 215 | dct_object[key] = pyfftw.builders.dct(ac, axis=axis, type=type, 216 | overwrite_input=overwrite_input, 217 | threads=threads, 218 | planner_effort=planner_effort) 219 | 220 | dobj = dct_object[key] 221 | c = dobj.get_output_array() 222 | if iscomplexobj(a): 223 | dobj(a.real, c) 224 | b.real[:] = c 225 | dobj(a.imag, c) 226 | b.imag[:] = c 227 | 228 | else: 229 | dobj(a) 230 | b[:] = c 231 | return b 232 | 233 | else: 234 | dct1 = pyfftw.interfaces.scipy_fftpack.dct 235 | #@profile 236 | def dct(a, b, type=2, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 237 | if iscomplexobj(a): 238 | b.real[:] = dct1(a.real, type=type, axis=axis) 239 | b.imag[:] = dct1(a.imag, type=type, axis=axis) 240 | return b 241 | 242 | else: 243 | b[:] = dct1(a, type=type, axis=axis) 244 | return b 245 | 246 | 247 | #def fft(a, b=None, axis=0): 248 | #if b is None: 249 | #b = nfft.fft(a, axis=axis) 250 | #else: 251 | #b[:] = nfft.fft(a, axis=axis) 252 | #return b 253 | 254 | #def ifft(a, b=None, axis=0): 255 | #if b is None: 256 | #b = nfft.ifft(a, axis=axis) 257 | #else: 258 | #b[:] = nfft.ifft(a, axis=axis) 259 | #return b 260 | 261 | #def rfft(a, b, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 262 | #b[:] = nfft.rfft(a, axis=axis, overwrite_input=overwrite_input) 263 | #return b 264 | 265 | #def irfft(a, b, axis=0, overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 266 | #b[:] = nfft.irfft(a, axis=axis, overwrite_input=overwrite_input) 267 | #return b 268 | 269 | #def fft2(a, b=None, axes=(0, 1)): 270 | #if b is None: 271 | #b = nfft.fft2(a, axes=axes) 272 | #else: 273 | #b[:] = nfft.fft2(a, axes=axes) 274 | #return b 275 | 276 | #def ifft2(a, b=None, axes=(0, 1)): 277 | #if b is None: 278 | #b = nfft.ifft2(a, axes=axes) 279 | #else: 280 | #b[:] = nfft.ifft2(a, axes=axes) 281 | #return b 282 | 283 | #def rfft2(a, b, axes=(0, 1), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 284 | #b[:] = nfft.rfft2(a, axes=axes, overwrite_input=overwrite_input) 285 | #return b 286 | 287 | #def irfft2(a, b, axes=(0, 1), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 288 | #b[:] = nfft.irfft2(a, axes=axes, overwrite_input=overwrite_input) 289 | #return b 290 | 291 | #def fftn(a, b=None, axes=(0, 1, 2)): 292 | #if b is None: 293 | #b = nfft.fftn(a, axes=axes) 294 | #else: 295 | #b[:] = nfft.fftn(a, axes=axes) 296 | #return b 297 | 298 | #def ifftn(a, b=None, axes=(0, 1, 2)): 299 | #if b is None: 300 | #b = nfft.ifftn(a, axes=axes) 301 | #else: 302 | #b[:] = nfft.ifftn(a, axes=axes) 303 | #return b 304 | 305 | #def rfftn(a, b, axes=(0, 1, 2), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 306 | #b[:] = nfft.rfftn(a, axes=axes, overwrite_input=overwrite_input) 307 | #return b 308 | 309 | #def irfftn(a, b, axes=(0, 1, 2), overwrite_input=False, threads=1, planner_effort="FFTW_MEASURE"): 310 | #b[:] = nfft.irfftn(a, axes=axes, 
overwrite_input=overwrite_input) 311 | #return b 312 | 313 | -------------------------------------------------------------------------------- /mpiFFT4py/slab.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | """Slab decomposition 3 | 4 | This module contains classes for performing FFTs with slab decomposition 5 | of three-dimensional data structures data[Nx, Ny, Nz], where (Nx, Ny, Nz) is 6 | the shape of the input data. With slab decomposition only one of these three 7 | indices is shared, leading to local datastructures on each processor 8 | with shape data[Nx/P, Ny, Nz], where P is the total number of processors. 9 | 10 | classes: 11 | R2C - For real to complex transforms 12 | C2C - For complex to complex transforms 13 | """ 14 | __author__ = "Mikael Mortensen " 15 | __date__ = "2016-02-16" 16 | __copyright__ = "Copyright (C) 2016 " + __author__ 17 | __license__ = "GNU Lesser GPL version 3 or any later version" 18 | 19 | from .serialFFT import * 20 | import numpy as np 21 | from .mpibase import work_arrays, datatypes 22 | from numpy.fft import fftfreq, rfftfreq 23 | from .cython.maths import dealias_filter, transpose_Uc #, transpose_Umpi 24 | from collections import defaultdict 25 | from mpi4py import MPI 26 | 27 | # Using Lisandro Dalcin's code for Alltoallw. 28 | # Note that _subsize and _distribution are only really required for 29 | # general shape meshes. Here we require power two. 30 | 31 | def _subsize(N, size, rank): 32 | return N // size + (N % size > rank) 33 | 34 | def _distribution(N, size): 35 | q = N // size 36 | r = N % size 37 | n = s = i = 0 38 | while i < size: 39 | n = q 40 | s = q * i 41 | if i < r: 42 | n += 1 43 | s += i 44 | else: 45 | s += r 46 | yield n, s 47 | i += 1 48 | 49 | class R2C(object): 50 | """Class for performing FFT in 3D using MPI 51 | 52 | Slab decomposition 53 | 54 | Args: 55 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh 56 | L - NumPy array([Lx, Ly, Lz]) The actual size of the real mesh 57 | comm - The MPI communicator object 58 | precision - "single" or "double" 59 | communication - Method used for communication ('Alltoall', 'Sendrecv_replace', 'Alltoallw') 60 | padsize - Padsize when dealias = 3/2-rule is used 61 | threads - Number of threads used by FFTs 62 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 63 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 64 | 65 | The forward transform is real to complex and the inverse is complex to real 66 | """ 67 | def __init__(self, N, L, comm, precision, 68 | communication="Alltoallw", 69 | padsize=1.5, 70 | threads=1, 71 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 72 | assert len(L) == 3 73 | assert len(N) == 3 74 | self.N = N 75 | self.Nf = N[2]//2+1 # Independent complex wavenumbers in z-direction 76 | self.Nfp = int(padsize*N[2]//2+1) # Independent complex wavenumbers in z-direction for padded array 77 | self.comm = comm 78 | self.float, self.complex, self.mpitype = datatypes(precision) 79 | self.communication = communication 80 | self.num_processes = comm.Get_size() 81 | self.rank = comm.Get_rank() 82 | self.Np = N // self.num_processes 83 | self.L = L.astype(self.float) 84 | self.dealias = np.zeros(0) 85 | self.padsize = padsize 86 | self.threads = threads 87 | self.planner_effort = planner_effort 88 | self.work_arrays = work_arrays() 89 | if not self.num_processes in [2**i for i in range(int(np.log2(N[0]))+1)]: 
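# Example: the power-of-two check above means that with N[0] = 32 the accepted
# values of comm.Get_size() are 1, 2, 4, 8, 16 and 32; any other communicator
# size triggers the IOError raised on the next line.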
90 | raise IOError("Number of cpus must be in ", 91 | [2**i for i in range(int(np.log2(N[0]))+1)]) 92 | self._subarraysA = [] 93 | self._subarraysB = [] 94 | self._counts_displs = 0 95 | self._subarraysA_pad = [] 96 | self._subarraysB_pad = [] 97 | 98 | def real_shape(self): 99 | """The local shape of the real data""" 100 | return (self.Np[0], self.N[1], self.N[2]) 101 | 102 | def complex_shape(self): 103 | """The local shape of the complex data""" 104 | return (self.N[0], self.Np[1], self.Nf) 105 | 106 | def complex_shape_T(self): 107 | """The local transposed shape of the complex data""" 108 | return (self.Np[0], self.N[1], self.Nf) 109 | 110 | def global_real_shape(self): 111 | """Global size of problem in real physical space""" 112 | return (self.N[0], self.N[1], self.N[2]) 113 | 114 | def global_complex_shape(self, padsize=1.): 115 | """Global size of problem in complex wavenumber space""" 116 | return (int(padsize*self.N[0]), int(padsize*self.N[1]), 117 | int(padsize*self.N[2]//2+1)) 118 | 119 | def work_shape(self, dealias): 120 | """Shape of work arrays used in convection with dealiasing. 121 | 122 | Note the different shape whether or not padding is involved. 123 | """ 124 | if dealias == '3/2-rule': 125 | return self.real_shape_padded() 126 | 127 | else: 128 | return self.real_shape() 129 | 130 | def real_local_slice(self, padsize=1): 131 | """Local slice in real space of the input array 132 | 133 | Array can be padded with padsize > 1 134 | """ 135 | return (slice(int(padsize*self.rank*self.Np[0]), 136 | int(padsize*(self.rank+1)*self.Np[0]), 1), 137 | slice(0, int(padsize*self.N[1]), 1), 138 | slice(0, int(padsize*self.N[2]), 1)) 139 | 140 | def complex_local_slice(self): 141 | """Local slice of complex return array""" 142 | return (slice(0, self.N[0], 1), 143 | slice(self.rank*self.Np[1], (self.rank+1)*self.Np[1], 1), 144 | slice(0, self.Nf, 1)) 145 | 146 | def complex_local_wavenumbers(self): 147 | """Returns local wavenumbers of complex space""" 148 | return (fftfreq(self.N[0], 1./self.N[0]).astype(self.float), 149 | fftfreq(self.N[1], 1./self.N[1])[self.complex_local_slice()[1]].astype(self.float), 150 | rfftfreq(self.N[2], 1./self.N[2]).astype(self.float)) 151 | 152 | def get_local_mesh(self): 153 | """Returns the local decomposed physical mesh""" 154 | X = np.ogrid[self.rank*self.Np[0]:(self.rank+1)*self.Np[0], 155 | :self.N[1], :self.N[2]] 156 | X[0] = (X[0]*self.L[0]/self.N[0]).astype(self.float) 157 | X[1] = (X[1]*self.L[1]/self.N[1]).astype(self.float) 158 | X[2] = (X[2]*self.L[2]/self.N[2]).astype(self.float) 159 | X = [np.broadcast_to(x, self.real_shape()) for x in X] 160 | return X 161 | 162 | def get_local_wavenumbermesh(self, scaled=False, broadcast=False, eliminate_highest_freq=False): 163 | """Returns (scaled) local decomposed wavenumbermesh 164 | 165 | If scaled is True, then the wavenumbermesh is scaled with physical mesh 166 | size. This takes care of mapping the physical domain to a computational 167 | cube of size (2pi)**3. 168 | 169 | If eliminate_highest_freq is True, then the Nyquist frequency is set to zero. 
170 | """ 171 | kx, ky, kz = self.complex_local_wavenumbers() 172 | if eliminate_highest_freq: 173 | ky = fftfreq(self.N[1], 1./self.N[1].astype(self.float)) 174 | for i, k in enumerate((kx, ky, kz)): 175 | if self.N[i] % 2 == 0: 176 | k[self.N[i]//2] = 0 177 | ky = ky[self.complex_local_slice()[1]] 178 | 179 | Ks = np.meshgrid(kx, ky, kz, indexing='ij', sparse=True) 180 | for i in range(3): 181 | Ks[i] = Ks[i].astype(self.float) 182 | if scaled: 183 | Lp = 2*np.pi/self.L 184 | for i in range(3): 185 | Ks[i] *= Lp[i] 186 | K = Ks 187 | if broadcast is True: 188 | K = [np.broadcast_to(k, self.complex_shape()) for k in Ks] 189 | return K 190 | 191 | def get_dealias_filter(self): 192 | """Filter for dealiasing nonlinear convection""" 193 | K = self.get_local_wavenumbermesh() 194 | kmax = 2./3.*(self.N//2+1) 195 | dealias = np.array((abs(K[0]) < kmax[0])*(abs(K[1]) < kmax[1])* 196 | (abs(K[2]) < kmax[2]), dtype=np.uint8) 197 | return dealias 198 | 199 | def get_subarrays(self, padsize=1): 200 | """Subarrays for Alltoallw transforms""" 201 | datatype = MPI._typedict[np.dtype(self.complex).char] 202 | _subarraysA = [ 203 | datatype.Create_subarray([int(padsize*self.N[0]), self.Np[1], self.Nf], [l, self.Np[1], self.Nf], [s, 0, 0]).Commit() 204 | for l, s in _distribution(int(padsize*self.N[0]), self.num_processes) 205 | ] 206 | _subarraysB = [ 207 | datatype.Create_subarray([int(padsize*self.Np[0]), self.N[1], self.Nf], [int(padsize*self.Np[0]), l, self.Nf], [0, s, 0]).Commit() 208 | for l, s in _distribution(self.N[1], self.num_processes) 209 | ] 210 | _counts_displs = ([1] * self.num_processes, [0] * self.num_processes) 211 | return _subarraysA, _subarraysB, _counts_displs 212 | 213 | #@profile 214 | def ifftn(self, fu, u, dealias=None): 215 | """ifft in three directions using mpi. 216 | 217 | Need to do ifft in reversed order of fft 218 | 219 | dealias = "3/2-rule" 220 | - Padded transform with 3/2-rule. 
fu is padded with zeros 221 | before transforming to real space of shape real_shape_padded() 222 | - u is of real_shape_padded() 223 | 224 | dealias = "2/3-rule" 225 | - Transform is using 2/3-rule, i.e., frequencies higher than 226 | 2/3*N are set to zero before transforming 227 | - u is of real_shape() 228 | 229 | dealias = None 230 | - Regular transform 231 | - u is of real_shape() 232 | 233 | fu is of complex_shape() 234 | """ 235 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 236 | 237 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 238 | self.dealias = self.get_dealias_filter() 239 | 240 | fu_ = fu 241 | if dealias == '2/3-rule': 242 | fu_ = self.work_arrays[(fu, 0, False)] 243 | fu_[:] = fu 244 | fu_ = dealias_filter(fu_, self.dealias) 245 | #fu_ *= self.dealias 246 | 247 | if self.num_processes == 1: 248 | if not dealias == '3/2-rule': 249 | u = irfftn(fu_, u, axes=(0, 1, 2), threads=self.threads, planner_effort=self.planner_effort['irfftn']) 250 | 251 | else: 252 | assert u.shape == self.real_shape_padded() 253 | 254 | # Scale smallest array with padsize 255 | fu_ = self.work_arrays[(fu, 0, False)] 256 | fu_[:] = fu*self.padsize**3 257 | 258 | # First create padded complex array and then perform irfftn 259 | fu_padded = self.work_arrays[(self.global_complex_shape(padsize=1.5), self.complex, 0)] 260 | fu_padded[:self.N[0]//2, :self.N[1]//2, :self.Nf] = fu_[:self.N[0]//2, :self.N[1]//2] 261 | fu_padded[:self.N[0]//2, -self.N[1]//2:, :self.Nf] = fu_[:self.N[0]//2, self.N[1]//2:] 262 | fu_padded[-self.N[0]//2:, :self.N[1]//2, :self.Nf] = fu_[self.N[0]//2:, :self.N[1]//2] 263 | fu_padded[-self.N[0]//2:, -self.N[1]//2:, :self.Nf] = fu_[self.N[0]//2:, -self.N[1]//2:] 264 | 265 | u[:] = irfftn(fu_padded, overwrite_input=True, 266 | axes=(0, 1, 2), threads=self.threads, 267 | planner_effort=self.planner_effort['irfftn']) 268 | return u 269 | 270 | if not dealias == '3/2-rule': 271 | # Intermediate work arrays required for transform 272 | Uc_hat = self.work_arrays[(self.complex_shape(), self.complex, 0, False)] 273 | 274 | # Do first owned direction 275 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, planner_effort=self.planner_effort['ifft']) 276 | 277 | if self.communication == 'Alltoall': 278 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 279 | 280 | ## Communicate all values 281 | self.comm.Alltoall([Uc_hat, self.mpitype], [Uc_mpi, self.mpitype]) 282 | #Uc_hatT = np.rollaxis(Uc_mpi, 1).reshape(self.complex_shape_T()) 283 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 284 | Uc_hatT = transpose_Uc(Uc_hatT, Uc_mpi, self.num_processes, self.Np[0], self.Np[1], self.Nf) 285 | 286 | #self.comm.Alltoall(MPI.IN_PLACE, [Uc_hat, self.mpitype]) 287 | #Uc_hatT = np.rollaxis(Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf)), 1).reshape(self.complex_shape_T()) 288 | 289 | elif self.communication == 'Sendrecv_replace': 290 | Uc_send = Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf)) 291 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 292 | for i in xrange(self.num_processes): 293 | if not i == self.rank: 294 | self.comm.Sendrecv_replace([Uc_send[i], self.mpitype], i, 0, i, 0) 295 | Uc_hatT[:, i*self.Np[1]:(i+1)*self.Np[1]] = Uc_send[i] 296 | 297 | elif self.communication == 'Alltoallw': 298 | if len(self._subarraysA) == 0: 299 | self._subarraysA, self._subarraysB, self._counts_displs = self.get_subarrays() 
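            # The committed MPI subarray datatypes let this single Alltoallw call carry
            # out the slab transpose directly: data aligned as (N[0], Np[1], Nf) in Uc_hat
            # is redistributed into the transposed layout (Np[0], N[1], Nf) of Uc_hatT
            # below, so no intermediate Uc_mpi buffer and explicit transpose/reshape is
            # needed as in the 'Alltoall' branch above.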
300 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 301 | self.comm.Alltoallw( 302 | [Uc_hat, self._counts_displs, self._subarraysA], 303 | [Uc_hatT, self._counts_displs, self._subarraysB]) 304 | 305 | # Do last two directions 306 | u = irfft2(Uc_hatT, u, overwrite_input=True, axes=(1, 2), 307 | threads=self.threads, 308 | planner_effort=self.planner_effort['irfft2']) 309 | 310 | else: 311 | assert self.num_processes <= self.N[0]//2, "Number of processors cannot be larger than N[0]//2 for 3/2-rule" 312 | 313 | # Intermediate work arrays required for transform 314 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0)] 315 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0, False)] 316 | Upad_hat2 = self.work_arrays[(self.complex_shape_padded_2(), self.complex, 0)] 317 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0)] 318 | 319 | # Expand in x-direction and perform ifft 320 | Upad_hat = R2C.copy_to_padded(fu*self.padsize**3, Upad_hat, self.N, axis=0) 321 | Upad_hat[:] = ifft(Upad_hat, axis=0, threads=self.threads, 322 | planner_effort=self.planner_effort['ifft']) 323 | 324 | if not self.communication == 'Alltoallw': 325 | # Communicate to distribute first dimension (like Fig. 2b but padded in x-dir) 326 | self.comm.Alltoall(MPI.IN_PLACE, [Upad_hat, self.mpitype]) 327 | Upad_hat1[:] = np.rollaxis(Upad_hat.reshape(self.complex_shape_padded_0_I()), 1).reshape(Upad_hat1.shape) 328 | 329 | else: 330 | if len(self._subarraysA_pad) == 0: 331 | self._subarraysA_pad, self._subarraysB_pad, self._counts_displs = self.get_subarrays(padsize=self.padsize) 332 | self.comm.Alltoallw( 333 | [Upad_hat, self._counts_displs, self._subarraysA_pad], 334 | [Upad_hat1, self._counts_displs, self._subarraysB_pad]) 335 | 336 | # Transpose data and pad in y-direction before doing ifft. Now data is padded in x and y 337 | Upad_hat2 = R2C.copy_to_padded(Upad_hat1, Upad_hat2, self.N, axis=1) 338 | Upad_hat2[:] = ifft(Upad_hat2, axis=1, threads=self.threads, 339 | planner_effort=self.planner_effort['ifft']) 340 | 341 | # pad in z-direction and perform final irfft 342 | Upad_hat3 = R2C.copy_to_padded(Upad_hat2, Upad_hat3, self.N, axis=2) 343 | u[:] = irfft(Upad_hat3, overwrite_input=True, axis=2, threads=self.threads, 344 | planner_effort=self.planner_effort['irfft']) 345 | 346 | return u 347 | 348 | #@profile 349 | def fftn(self, u, fu, dealias=None): 350 | """fft in three directions using mpi 351 | 352 | dealias = "3/2-rule" 353 | - Truncated transform with 3/2-rule. 
The transformed fu is truncated 354 | when copied to complex space of complex_shape() 355 | - fu is of complex_shape() 356 | - u is of real_shape_padded() 357 | 358 | dealias = "2/3-rule" or None 359 | - Regular transform 360 | - fu is of complex_shape() 361 | - u is of real_shape() 362 | 363 | """ 364 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 365 | 366 | if self.num_processes == 1: 367 | if not dealias == '3/2-rule': 368 | assert u.shape == self.real_shape() 369 | fu = rfftn(u, fu, axes=(0, 1, 2), threads=self.threads, 370 | planner_effort=self.planner_effort['rfftn']) 371 | 372 | else: 373 | assert u.shape == self.real_shape_padded() 374 | 375 | fu_padded = self.work_arrays[(self.global_complex_shape(padsize=1.5), 376 | self.complex, 0, False)] 377 | fu_padded = rfftn(u, fu_padded, axes=(0, 1, 2), 378 | planner_effort=self.planner_effort['rfftn']) 379 | 380 | # Copy with truncation 381 | fu.fill(0) 382 | fu[:self.N[0]//2+1, :self.N[1]//2+1] = fu_padded[:self.N[0]//2+1, :self.N[1]//2+1, :self.Nf] 383 | fu[:self.N[0]//2+1, self.N[1]//2:] += fu_padded[:self.N[0]//2+1, -self.N[1]//2:, :self.Nf] 384 | fu[self.N[0]//2:, :self.N[1]//2+1] += fu_padded[-self.N[0]//2:, :self.N[1]//2+1, :self.Nf] 385 | fu[self.N[0]//2:, self.N[1]//2:] += fu_padded[-self.N[0]//2:, -self.N[1]//2:, :self.Nf] 386 | fu /= self.padsize**3 387 | 388 | return fu 389 | 390 | if not dealias == '3/2-rule': 391 | 392 | Uc_hat = self.work_arrays[(fu, 0, False)] 393 | 394 | if self.communication == 'Alltoall': 395 | # Intermediate work arrays required for transform 396 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 397 | U_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 398 | 399 | # Do 2 ffts in y-z directions on owned data 400 | Uc_hatT = rfft2(u, Uc_hatT, axes=(1, 2), threads=self.threads, planner_effort=self.planner_effort['rfft2']) 401 | 402 | #Transform data to align with x-direction 403 | U_mpi[:] = np.rollaxis(Uc_hatT.reshape(self.Np[0], self.num_processes, self.Np[1], self.Nf), 1) 404 | 405 | #Communicate all values 406 | self.comm.Alltoall([U_mpi, self.mpitype], [Uc_hat, self.mpitype]) 407 | 408 | ## Transform data to align with x-direction 409 | #U_mpi = transpose_Umpi(U_mpi, Uc_hatT, self.num_processes, self.Np[0], self.Np[1], self.Nf) 410 | 411 | ## Communicate all values 412 | #self.comm.Alltoall([U_mpi, self.mpitype], [fu, self.mpitype]) 413 | 414 | elif self.communication == 'Sendrecv_replace': 415 | # Communicating intermediate result 416 | ft = Uc_hat.transpose(1, 0, 2) 417 | ft = rfft2(u, ft, axes=(1, 2), threads=self.threads, 418 | planner_effort=self.planner_effort['rfft2']) 419 | fu_send = Uc_hat.reshape((self.num_processes, self.Np[1], 420 | self.Np[1], self.Nf)) 421 | for i in xrange(self.num_processes): 422 | if not i == self.rank: 423 | self.comm.Sendrecv_replace([fu_send[i], self.mpitype], i, 0, i, 0) 424 | fu_send[:] = fu_send.transpose(0, 2, 1, 3) 425 | 426 | elif self.communication == 'Alltoallw': 427 | if len(self._subarraysA) == 0: 428 | self._subarraysA, self._subarraysB, self._counts_displs = self.get_subarrays() 429 | 430 | # Intermediate work arrays required for transform 431 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 432 | 433 | # Do 2 ffts in y-z directions on owned data 434 | Uc_hatT = rfft2(u, Uc_hatT, axes=(1, 2), threads=self.threads, 435 | planner_effort=self.planner_effort['rfft2']) 436 | 437 | self.comm.Alltoallw( 438 | [Uc_hatT, 
self._counts_displs, self._subarraysB], 439 | [Uc_hat, self._counts_displs, self._subarraysA]) 440 | 441 | # Do fft for last direction 442 | fu = fft(Uc_hat, fu, overwrite_input=True, axis=0, 443 | threads=self.threads, planner_effort=self.planner_effort['fft']) 444 | 445 | else: 446 | assert self.num_processes <= self.N[0]//2, "Number of processors cannot be larger than N[0]//2 for 3/2-rule" 447 | assert u.shape == self.real_shape_padded() 448 | 449 | # Intermediate work arrays required for transform 450 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)] 451 | Upad_hat0 = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 1, False)] 452 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0)] 453 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)] 454 | 455 | # Do ffts in the padded y and z directions 456 | Upad_hat3 = rfft2(u, Upad_hat3, axes=(1, 2), threads=self.threads, 457 | planner_effort=self.planner_effort['rfft2']) 458 | 459 | # Copy with truncation 460 | Upad_hat1 = R2C.copy_from_padded(Upad_hat3, Upad_hat1, self.N, 1) 461 | 462 | if self.communication == 'Alltoall': 463 | # Transpose and commuincate data 464 | Upad_hat0[:] = np.rollaxis(Upad_hat1.reshape(self.complex_shape_padded_I()), 1).reshape(Upad_hat0.shape) 465 | self.comm.Alltoall(MPI.IN_PLACE, [Upad_hat0, self.mpitype]) 466 | 467 | elif self.communication == 'Alltoallw': 468 | if len(self._subarraysA_pad) == 0: 469 | self._subarraysA_pad, self._subarraysB_pad, self._counts_displs = self.get_subarrays(padsize=self.padsize) 470 | 471 | self.comm.Alltoallw( 472 | [Upad_hat1, self._counts_displs, self._subarraysB_pad], 473 | [Upad_hat0, self._counts_displs, self._subarraysA_pad]) 474 | 475 | # Perform fft of data in x-direction 476 | Upad_hat = fft(Upad_hat0, Upad_hat, axis=0, threads=self.threads, 477 | planner_effort=self.planner_effort['fft']) 478 | 479 | # Truncate to original complex shape 480 | fu.fill(0) 481 | fu[:self.N[0]//2+1] = Upad_hat[:self.N[0]//2+1] 482 | fu[self.N[0]//2:] += Upad_hat[-self.N[0]//2:] 483 | fu /= self.padsize**3 484 | 485 | return fu 486 | 487 | def real_shape_padded(self): 488 | """The local shape of the real data""" 489 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), int(self.padsize*self.N[2])) 490 | 491 | def complex_shape_padded_0(self): 492 | """Padding in x-direction""" 493 | return (int(self.padsize*self.N[0]), self.Np[1], self.Nf) 494 | 495 | def complex_shape_padded_0_I(self): 496 | """Padding in x-direction - reshaped for MPI communications""" 497 | return (self.num_processes, int(self.padsize*self.Np[0]), self.Np[1], self.Nf) 498 | 499 | def complex_shape_padded_1(self): 500 | """Transpose of complex_shape_padded_0""" 501 | return (int(self.padsize*self.Np[0]), self.N[1], self.Nf) 502 | 503 | def complex_shape_padded_2(self): 504 | """Padding in x and y-directions""" 505 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), self.Nf) 506 | 507 | def complex_shape_padded_3(self): 508 | """Padding in all directions. 
509 | ifft of this shape leads to real_shape_padded""" 510 | return (int(self.padsize*self.Np[0]), int(self.padsize*self.N[1]), self.Nfp) 511 | 512 | def complex_shape_padded_I(self): 513 | """A local intermediate shape of the complex data""" 514 | return (int(self.padsize*self.Np[0]), self.num_processes, self.Np[1], self.Nf) 515 | 516 | @staticmethod 517 | def copy_to_padded(fu, fp, N, axis=0): 518 | if axis == 0: 519 | fp[:N[0]//2] = fu[:N[0]//2] 520 | fp[-N[0]//2:] = fu[N[0]//2:] 521 | elif axis == 1: 522 | fp[:, :N[1]//2] = fu[:, :N[1]//2] 523 | fp[:, -N[1]//2:] = fu[:, N[1]//2:] 524 | elif axis == 2: 525 | fp[:, :, :(N[2]//2+1)] = fu[:] 526 | return fp 527 | 528 | @staticmethod 529 | def copy_from_padded(fp, fu, N, axis=0): 530 | if axis == 1: 531 | fu.fill(0) 532 | fu[:, :N[1]//2+1] = fp[:, :N[1]//2+1, :(N[2]//2+1)] 533 | fu[:, N[1]//2:] += fp[:, -N[1]//2:, :(N[2]//2+1)] 534 | elif axis == 2: 535 | fu[:] = fp[:, :, :(N[2]//2+1)] 536 | return fu 537 | 538 | class C2C(R2C): 539 | """Class for performing FFT in 3D using MPI 540 | 541 | Slab decomposition 542 | 543 | Args: 544 | N - NumPy array([Nx, Ny, Nz]) Number of nodes for the real mesh 545 | L - NumPy array([Lx, Ly, Lz]) The actual size of the real mesh 546 | comm - The MPI communicator object 547 | precision - "single" or "double" 548 | communication - Method used for communication ('Alltoall', 'Sendrecv_replace') 549 | padsize - Padsize when dealias = 3/2-rule is used 550 | threads - Number of threads used by FFTs 551 | planner_effort - Planner effort used by FFTs (e.g., "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE") 552 | Give as defaultdict, with keys representing transform (e.g., fft, ifft) 553 | 554 | The transform is complex to complex 555 | """ 556 | def __init__(self, N, L, comm, precision, 557 | communication="Alltoall", 558 | padsize=1.5, 559 | threads=1, 560 | planner_effort=defaultdict(lambda: "FFTW_MEASURE")): 561 | R2C.__init__(self, N, L, comm, precision, 562 | communication=communication, 563 | padsize=padsize, threads=threads, 564 | planner_effort=planner_effort) 565 | # Reuse all shapes from r2c transform R2C simply by resizing the final complex z-dimension: 566 | self.Nf = N[2] 567 | self.Nfp = int(self.padsize*self.N[2]) # Independent complex wavenumbers in z-direction for padded array 568 | 569 | # Rename since there's no real space 570 | self.original_shape_padded = self.real_shape_padded 571 | self.original_shape = self.real_shape 572 | self.transformed_shape = self.complex_shape 573 | self.original_local_slice = self.real_local_slice 574 | self.transformed_local_slice = self.complex_local_slice 575 | self.ks = (fftfreq(N[2])*N[2]).astype(int) 576 | 577 | def global_shape(self, padsize=1.): 578 | """Global size of problem in transformed space""" 579 | return (int(padsize*self.N[0]), int(padsize*self.N[1]), 580 | int(padsize*self.N[2])) 581 | 582 | def transformed_local_wavenumbers(self): 583 | return (fftfreq(self.N[0], 1./self.N[0]), 584 | fftfreq(self.N[1], 1./self.N[1])[self.transformed_local_slice()[1]], 585 | fftfreq(self.N[2], 1./self.N[2])) 586 | 587 | def ifftn(self, fu, u, dealias=None): 588 | """ifft in three directions using mpi. 589 | Need to do ifft in reversed order of fft 590 | 591 | dealias = "3/2-rule" 592 | - Padded transform with 3/2-rule. 
fu is padded with zeros 593 | before transforming to complex space of shape original_shape_padded() 594 | - u is of original_shape_padded() 595 | 596 | dealias = "2/3-rule" 597 | - Transform is using 2/3-rule, i.e., frequencies higher than 598 | 2/3*N are set to zero before transforming 599 | - u is of original_shape() 600 | 601 | dealias = None 602 | - Regular transform 603 | - u is of original_shape() 604 | 605 | fu is of transformed_shape() 606 | """ 607 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 608 | 609 | if dealias == '2/3-rule' and self.dealias.shape == (0,): 610 | self.dealias = self.get_dealias_filter() 611 | 612 | if self.num_processes == 1: 613 | if not dealias == '3/2-rule': 614 | fu_ = fu 615 | if dealias == '2/3-rule': 616 | fu_ = self.work_arrays[(fu, 0, False)] 617 | fu_[:] = fu 618 | fu_ *= self.dealias 619 | 620 | u = ifftn(fu_, u, axes=(0, 1, 2), threads=self.threads, 621 | planner_effort=self.planner_effort['ifftn']) 622 | 623 | else: 624 | assert u.shape == self.original_shape_padded() 625 | 626 | # First create padded complex array and then perform irfftn 627 | fu_padded = self.work_arrays[(u, 0)] 628 | fu_padded[:self.N[0]//2, :self.N[1]//2, self.ks] = fu[:self.N[0]//2, :self.N[1]//2] 629 | fu_padded[:self.N[0]//2, -self.N[1]//2:, self.ks] = fu[:self.N[0]//2, self.N[1]//2:] 630 | fu_padded[-self.N[0]//2:, :self.N[1]//2, self.ks] = fu[self.N[0]//2:, :self.N[1]//2] 631 | fu_padded[-self.N[0]//2:, -self.N[1]//2:, self.ks] = fu[self.N[0]//2:, self.N[1]//2:] 632 | u = ifftn(fu_padded*self.padsize**3, u, overwrite_input=True, 633 | axes=(0, 1, 2), threads=self.threads, 634 | planner_effort=self.planner_effort['ifftn']) 635 | 636 | return u 637 | 638 | if not dealias == '3/2-rule': 639 | fu_ = fu 640 | if dealias == '2/3-rule': 641 | fu_ = self.work_arrays[(fu, 0, False)] 642 | fu_[:] = fu 643 | fu_ *= self.dealias 644 | 645 | # Intermediate work arrays required for transform 646 | Uc_hat = self.work_arrays[(self.complex_shape(), self.complex, 0, False)] 647 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 648 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 649 | 650 | # Do first owned direction 651 | Uc_hat = ifft(fu_, Uc_hat, axis=0, threads=self.threads, 652 | planner_effort=self.planner_effort['ifft']) 653 | 654 | if self.communication == 'Alltoall': 655 | # Communicate all values 656 | self.comm.Alltoall([Uc_hat, self.mpitype], [Uc_mpi, self.mpitype]) 657 | Uc_hatT[:] = np.rollaxis(Uc_mpi, 1).reshape(Uc_hatT.shape) 658 | 659 | else: 660 | Uc_send = Uc_hat.reshape((self.num_processes, self.Np[0], self.Np[1], self.Nf)) 661 | for i in xrange(self.num_processes): 662 | if not i == self.rank: 663 | self.comm.Sendrecv_replace([Uc_send[i], self.mpitype], i, 0, i, 0) 664 | Uc_hatT[:, i*self.Np[1]:(i+1)*self.Np[1]] = Uc_send[i] 665 | 666 | # Do last two directions 667 | u = ifft2(Uc_hatT, u, overwrite_input=True, axes=(1, 2), 668 | threads=self.threads, 669 | planner_effort=self.planner_effort['ifft2']) 670 | 671 | else: 672 | # Intermediate work arrays required for transform 673 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)] 674 | U_mpi = self.work_arrays[(self.complex_shape_padded_0_I(), self.complex, 0, False)] 675 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0, False)] 676 | Upad_hat2 = self.work_arrays[(self.complex_shape_padded_2(), self.complex, 0, False)] 677 | Upad_hat3 = 
self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)] 678 | 679 | # Expand in x-direction and perform ifft 680 | Upad_hat = C2C.copy_to_padded(fu*self.padsize**3, Upad_hat, self.N, axis=0) 681 | Upad_hat[:] = ifft(Upad_hat, axis=0, threads=self.threads, 682 | planner_effort=self.planner_effort['ifft']) 683 | 684 | # Communicate to distribute first dimension (like Fig. 2b but padded in x-dir and z-direction of full size) 685 | self.comm.Alltoall([Upad_hat, self.mpitype], [U_mpi, self.mpitype]) 686 | 687 | # Transpose data and pad in y-direction before doing ifft. Now data is padded in x and y 688 | Upad_hat1[:] = np.rollaxis(U_mpi, 1).reshape(Upad_hat1.shape) 689 | Upad_hat2 = C2C.copy_to_padded(Upad_hat1, Upad_hat2, self.N, axis=1) 690 | Upad_hat2[:] = ifft(Upad_hat2, axis=1, threads=self.threads, 691 | planner_effort=self.planner_effort['ifft']) 692 | 693 | # pad in z-direction and perform final ifft 694 | Upad_hat3 = C2C.copy_to_padded(Upad_hat2, Upad_hat3, self.N, axis=2) 695 | u = ifft(Upad_hat3, u, overwrite_input=True, axis=2, 696 | threads=self.threads, planner_effort=self.planner_effort['ifft']) 697 | 698 | return u 699 | 700 | def fftn(self, u, fu, dealias=None): 701 | """fft in three directions using mpi 702 | 703 | dealias = "3/2-rule" 704 | - Truncated transform with 3/2-rule. The transfored fu is truncated 705 | when copied to complex space of complex_shape() 706 | - fu is of transformed_shape() 707 | - u is of original_shape_padded() 708 | 709 | dealias = "2/3-rule" 710 | - Regular transform 711 | - fu is of transformed_shape() 712 | - u is of original_shape() 713 | 714 | dealias = None 715 | - Regular transform 716 | - fu is of transformed_shape() 717 | - u is of original_shape() 718 | """ 719 | assert dealias in ('3/2-rule', '2/3-rule', 'None', None) 720 | 721 | if self.num_processes == 1: 722 | if not dealias == '3/2-rule': 723 | assert u.shape == self.original_shape() 724 | 725 | fu = fftn(u, fu, axes=(0, 1, 2), threads=self.threads, 726 | planner_effort=self.planner_effort['fftn']) 727 | 728 | else: 729 | assert u.shape == self.original_shape_padded() 730 | 731 | fu_padded = self.work_arrays[(u, 0)] 732 | fu_padded = fftn(u, fu_padded, axes=(0, 1, 2), threads=self.threads, 733 | planner_effort=self.planner_effort['fftn']) 734 | 735 | # Copy with truncation 736 | fu[:self.N[0]//2, :self.N[1]//2] = fu_padded[:self.N[0]//2, :self.N[1]//2, self.ks] 737 | fu[:self.N[0]//2, self.N[1]//2:] = fu_padded[:self.N[0]//2, -self.N[1]//2:, self.ks] 738 | fu[self.N[0]//2:, :self.N[1]//2] = fu_padded[-self.N[0]//2:, :self.N[1]//2, self.ks] 739 | fu[self.N[0]//2:, self.N[1]//2:] = fu_padded[-self.N[0]//2:, -self.N[1]//2:, self.ks] 740 | fu /= self.padsize**3 741 | return fu 742 | 743 | if not dealias == '3/2-rule': 744 | if self.communication == 'Alltoall': 745 | # Intermediate work arrays required for transform 746 | Uc_mpi = self.work_arrays[((self.num_processes, self.Np[0], self.Np[1], self.Nf), self.complex, 0, False)] 747 | Uc_hatT = self.work_arrays[(self.complex_shape_T(), self.complex, 0, False)] 748 | 749 | # Do 2 ffts in y-z directions on owned data 750 | Uc_hatT = fft2(u, Uc_hatT, axes=(1,2), threads=self.threads, planner_effort=self.planner_effort['fft2']) 751 | 752 | # Transform data to align with x-direction 753 | Uc_mpi[:] = np.rollaxis(Uc_hatT.reshape(self.Np[0], self.num_processes, self.Np[1], self.Nf), 1) 754 | 755 | # Communicate all values 756 | self.comm.Alltoall([Uc_mpi, self.mpitype], [fu, self.mpitype]) 757 | 758 | else: 759 | # Communicating 
intermediate result 760 | ft = fu.transpose(1, 0, 2) 761 | ft = fft2(u, ft, axes=(1, 2), threads=self.threads, 762 | planner_effort=self.planner_effort['fft2']) 763 | fu_send = fu.reshape((self.num_processes, self.Np[1], 764 | self.Np[1], self.Nf)) 765 | for i in xrange(self.num_processes): 766 | if not i == self.rank: 767 | self.comm.Sendrecv_replace([fu_send[i], self.mpitype], i, 0, i, 0) 768 | fu_send[:] = fu_send.transpose(0, 2, 1, 3) 769 | 770 | # Do fft for last direction 771 | fu[:] = fft(fu, axis=0, threads=self.threads, 772 | planner_effort=self.planner_effort['fft']) 773 | 774 | else: 775 | # Intermediate work arrays required for transform 776 | Upad_hat = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 0, False)] 777 | Upad_hat0 = self.work_arrays[(self.complex_shape_padded_0(), self.complex, 1, False)] 778 | Upad_hat1 = self.work_arrays[(self.complex_shape_padded_1(), self.complex, 0)] 779 | Upad_hat3 = self.work_arrays[(self.complex_shape_padded_3(), self.complex, 0, False)] 780 | U_mpi = self.work_arrays[(self.complex_shape_padded_0_I(), self.complex, 0, False)] 781 | 782 | # Do ffts in y and z directions 783 | Upad_hat3 = fft2(u, Upad_hat3, axes=(1, 2), threads=self.threads, 784 | planner_effort=self.planner_effort['fft2']) 785 | 786 | # Copy with truncation 787 | Upad_hat1 = C2C.copy_from_padded(Upad_hat3, Upad_hat1, self.N, 1) 788 | 789 | # Transpose and commuincate data 790 | U_mpi[:] = np.rollaxis(Upad_hat1.reshape(self.complex_shape_padded_I()), 1) 791 | self.comm.Alltoall([U_mpi, self.mpitype], [Upad_hat0, self.mpitype]) 792 | 793 | # Perform fft of data in x-direction 794 | Upad_hat = fft(Upad_hat0, Upad_hat, overwrite_input=True, axis=0, threads=self.threads, planner_effort=self.planner_effort['fft']) 795 | 796 | # Truncate to original complex shape 797 | fu[:self.N[0]//2] = Upad_hat[:self.N[0]//2] 798 | fu[self.N[0]//2:] = Upad_hat[-self.N[0]//2:] 799 | fu /= self.padsize**3 800 | 801 | return fu 802 | 803 | @staticmethod 804 | def copy_to_padded(fu, fp, N, axis=0): 805 | if axis == 0: 806 | fp[:N[0]//2] = fu[:N[0]//2] 807 | fp[-N[0]//2:] = fu[N[0]//2:] 808 | elif axis == 1: 809 | fp[:, :N[1]//2] = fu[:, :N[1]//2] 810 | fp[:, -N[1]//2:] = fu[:, N[1]//2:] 811 | elif axis == 2: 812 | fp[:, :, :N[2]//2] = fu[:, :, :N[2]//2] 813 | fp[:, :, -N[2]//2:] = fu[:, :, N[2]//2:] 814 | return fp 815 | 816 | @staticmethod 817 | def copy_from_padded(fp, fu, N, axis=0): 818 | if axis == 1: 819 | fu.fill(0) 820 | fu[:, :N[1]//2+1, :N[2]//2+1] = fp[:, :N[1]//2+1, :N[2]//2+1] 821 | fu[:, :N[1]//2+1, N[2]//2:] += fp[:, :N[1]//2+1, -N[2]//2:] 822 | fu[:, N[1]//2:, :N[2]//2+1] += fp[:, -N[1]//2:, :N[2]//2+1] 823 | fu[:, N[1]//2:, N[2]//2:] += fp[:, -N[1]//2:, -N[2]//2:] 824 | 825 | return fu 826 | 827 | 828 | #def transpose_Uc(Uc_hatT, U_mpi, num_processes, Np0, Np1, Nf): 829 | #for i in xrange(num_processes): 830 | #Uc_hatT[:, i*Np1:(i+1)*Np1] = U_mpi[i] 831 | #return Uc_hatT 832 | 833 | #def transpose_Umpi(U_mpi, Uc_hatT, num_processes, Np0, Np1, Nf): 834 | #for i in xrange(num_processes): 835 | #U_mpi[i] = Uc_hatT[:, i*Np1:(i+1)*Np1] 836 | #return U_mpi 837 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mpi4py 2 | cython 3 | numpy>=1.15 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | 
description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import re 5 | import subprocess 6 | from setuptools import setup, Extension 7 | from setuptools.command.build_ext import build_ext 8 | from numpy import get_include 9 | 10 | cwd = os.path.abspath(os.path.dirname(__file__)) 11 | cdir = os.path.join(cwd, "mpiFFT4py", "cython") 12 | 13 | def has_flag(compiler, flagname): 14 | """Return a boolean indicating whether a flag name is supported on 15 | the specified compiler. 16 | """ 17 | devnull = open(os.devnull, "w") 18 | p = subprocess.Popen([compiler.compiler[0], '-E', '-'] + [flagname], 19 | stdin=subprocess.PIPE, stdout=devnull, stderr=devnull, 20 | shell=True) 21 | p.communicate("") 22 | return True if p.returncode == 0 else False 23 | 24 | class build_ext_subclass(build_ext): 25 | def build_extensions(self): 26 | extra_compile_args = ['-g0'] 27 | for c in ['-w', '-Ofast', '-ffast-math', '-march=native']: 28 | if has_flag(self.compiler, c): 29 | extra_compile_args.append(c) 30 | 31 | for e in self.extensions: 32 | e.extra_compile_args += extra_compile_args 33 | e.include_dirs.extend([get_include()]) 34 | build_ext.build_extensions(self) 35 | 36 | ext = [Extension('mpiFFT4py.cython.maths', 37 | sources=[os.path.join(cdir, "maths.pyx")])] 38 | 39 | def version(): 40 | srcdir = os.path.join(cwd, 'mpiFFT4py') 41 | with open(os.path.join(srcdir, '__init__.py')) as f: 42 | m = re.search(r"__version__\s*=\s*'(.*)'", f.read()) 43 | return m.groups()[0] 44 | 45 | with open("README.rst", "r") as fh: 46 | long_description = fh.read() 47 | 48 | setup(name = "mpiFFT4py", 49 | version = version(), 50 | description = "mpiFFT4py -- Parallel 3D FFT in Python using MPI for Python", 51 | long_description = long_description, 52 | author = "Mikael Mortensen", 53 | author_email = "mikaem@math.uio.no", 54 | url = 'https://github.com/spectralDNS/mpiFFT4py', 55 | classifiers = [ 56 | 'Development Status :: 5 - Production/Stable', 57 | 'Environment :: Console', 58 | 'Intended Audience :: Developers', 59 | 'Intended Audience :: Science/Research', 60 | 'Intended Audience :: Education', 61 | 'Programming Language :: Python', 62 | 'Programming Language :: Python :: 2', 63 | 'Programming Language :: Python :: 3', 64 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', 65 | 'Topic :: Scientific/Engineering :: Mathematics', 66 | 'Topic :: Software Development :: Libraries :: Python Modules', 67 | ], 68 | packages = ["mpiFFT4py", 69 | "mpiFFT4py.serialFFT", 70 | "mpiFFT4py.cython" 71 | ], 72 | package_dir = {"mpiFFT4py": "mpiFFT4py"}, 73 | install_requires=["numpy"], 74 | setup_requires=["numpy>=1.11", 75 | "cython>=0.25", 76 | "setuptools>=18.0"], 77 | ext_modules = ext, 78 | cmdclass = {'build_ext': build_ext_subclass} 79 | ) 80 | -------------------------------------------------------------------------------- /tests/test_FFT.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import string 3 | import numpy as np 4 | from numpy.random import random, randn 5 | from numpy import allclose, empty, zeros, zeros_like, pi, array, int, all, float64 6 | from numpy.fft import fftfreq 7 | from mpi4py import MPI 8 | 9 | from mpiFFT4py.pencil import R2C as Pencil_R2C 10 | from mpiFFT4py.slab import R2C as Slab_R2C 11 | from mpiFFT4py.line import R2C 
as Line_R2C 12 | from mpiFFT4py import rfft2, rfftn, irfftn, irfft2, fftn, ifftn, irfft, ifft 13 | from mpiFFT4py.slab import C2C 14 | 15 | def reset_profile(prof): 16 | prof.code_map = {} 17 | prof.last_time = {} 18 | prof.enable_count = 0 19 | for func in prof.functions: 20 | prof.add_function(func) 21 | 22 | N = 2**5 23 | L = array([2*pi, 2*pi, 2*pi]) 24 | ks = (fftfreq(N)*N).astype(int) 25 | comm = MPI.COMM_WORLD 26 | 27 | if comm.Get_size() >= 4: 28 | params = ("slabas", "slabad", "slabws", "slabwd", 29 | "pencilsys", "pencilsyd", "pencilnys", "pencilnyd", 30 | "pencilsxd", "pencilsxs", "pencilnxd", "pencilnxs", 31 | "pencilaxd", "pencilaxs", "pencilayd", "pencilays") 32 | 33 | else: 34 | params = ("slabas", "slabad", "slabws", "slabwd") 35 | 36 | @pytest.fixture(params=params, scope='module') 37 | 38 | def FFT(request): 39 | prec = {"s": "single", "d":"double"}[request.param[-1]] 40 | if request.param[:3] == "pen": 41 | communication = {"s": "Alltoall", "n": "AlltoallN", "a": "Alltoallw"}[request.param[-3]] 42 | alignment = request.param[-2].upper() 43 | return Pencil_R2C(array([N, 2*N, 4*N]), L, comm, prec, communication=communication, alignment=alignment) 44 | else: 45 | communication = 'Alltoall' if request.param[-2] == 'a' else 'Alltoallw' 46 | return Slab_R2C(array([N, 2*N, 4*N]), L, comm, prec, communication=communication) 47 | 48 | @pytest.fixture(params=("lines", "lined"), scope='module') 49 | def FFT2(request): 50 | prec = {"s": "single", "d":"double"}[request.param[-1]] 51 | return Line_R2C(array([N, 2*N]), L[:-1], comm, prec) 52 | 53 | 54 | @pytest.fixture(params=("slabd", "slabs"), scope='module') 55 | def FFT_C2C(request): 56 | prec = {"s": "single", "d":"double"}[request.param[-1]] 57 | return C2C(array([N, 2*N, 4*N]), L, comm, prec) 58 | 59 | #@profile 60 | def test_FFT(FFT): 61 | N = FFT.N 62 | if FFT.rank == 0: 63 | A = random(N).astype(FFT.float) 64 | if FFT.communication == 'AlltoallN': 65 | C = empty(FFT.global_complex_shape(), dtype=FFT.complex) 66 | C = rfftn(A, C, axes=(0,1,2)) 67 | C[:, :, -1] = 0 # Remove Nyquist frequency 68 | A = irfftn(C, A, axes=(0,1,2)) 69 | B2 = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 70 | B2 = rfftn(A, B2, axes=(0,1,2)) 71 | 72 | else: 73 | A = zeros(N, dtype=FFT.float) 74 | B2 = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 75 | 76 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4) 77 | FFT.comm.Bcast(A, root=0) 78 | FFT.comm.Bcast(B2, root=0) 79 | 80 | a = zeros(FFT.real_shape(), dtype=FFT.float) 81 | c = zeros(FFT.complex_shape(), dtype=FFT.complex) 82 | a[:] = A[FFT.real_local_slice()] 83 | c = FFT.fftn(a, c) 84 | #print abs((c - B2[FFT.complex_local_slice()])/c.max()).max() 85 | assert all(abs((c - B2[FFT.complex_local_slice()])/c.max()) < rtol) 86 | #assert allclose(c, B2[FFT.complex_local_slice()], rtol, atol) 87 | a = FFT.ifftn(c, a) 88 | #print abs((a - A[FFT.real_local_slice()])/a.max()).max() 89 | 90 | assert all(abs((a - A[FFT.real_local_slice()])/a.max()) < rtol) 91 | #assert allclose(a, A[FFT.real_local_slice()], rtol, atol) 92 | 93 | def test_FFT2(FFT2): 94 | N = FFT2.N 95 | if FFT2.rank == 0: 96 | A = random(N).astype(FFT2.float) 97 | 98 | else: 99 | A = zeros(N, dtype=FFT2.float) 100 | 101 | atol, rtol = (1e-10, 1e-8) if FFT2.float is float64 else (5e-7, 1e-4) 102 | FFT2.comm.Bcast(A, root=0) 103 | a = zeros(FFT2.real_shape(), dtype=FFT2.float) 104 | c = zeros(FFT2.complex_shape(), dtype=FFT2.complex) 105 | a[:] = A[FFT2.real_local_slice()] 106 | c = FFT2.fft2(a, c) 107 | B2 = 
zeros(FFT2.global_complex_shape(), dtype=FFT2.complex) 108 | B2 = rfft2(A, B2, axes=(0,1)) 109 | assert allclose(c, B2[FFT2.complex_local_slice()], rtol, atol) 110 | a = FFT2.ifft2(c, a) 111 | assert allclose(a, A[FFT2.real_local_slice()], rtol, atol) 112 | 113 | def test_FFT2_padded(FFT2): 114 | FFT = FFT2 115 | N = FFT.N 116 | prec = "single" if isinstance(FFT.float, np.float32) else "double" 117 | FFT_SELF = Line_R2C(N, FFT.L, MPI.COMM_SELF, prec) 118 | 119 | if FFT.rank == 0: 120 | A = random(N).astype(FFT.float) 121 | C = zeros((FFT.global_complex_shape()), dtype=FFT.complex) 122 | C = FFT_SELF.fft2(A, C) 123 | 124 | # Eliminate Nyquist, otherwise test will fail 125 | C[-N[0]//2] = 0 126 | 127 | A_pad = np.zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 128 | A_pad = FFT_SELF.ifft2(C, A_pad, dealias="3/2-rule") 129 | 130 | else: 131 | C = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 132 | A_pad = zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 133 | 134 | FFT.comm.Bcast(C, root=0) 135 | FFT.comm.Bcast(A_pad, root=0) 136 | 137 | ae = zeros(FFT.real_shape_padded(), dtype=FFT.float) 138 | c = zeros(FFT.complex_shape(), dtype=FFT.complex) 139 | 140 | c[:] = C[FFT.complex_local_slice()] 141 | ae[:] = A_pad[FFT.real_local_slice(padsize=1.5)] 142 | 143 | ap = zeros(FFT.real_shape_padded(), dtype=FFT.float) 144 | cp = zeros(FFT.complex_shape(), dtype=FFT.complex) 145 | ap = FFT.ifft2(c, ap, dealias="3/2-rule") 146 | 147 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4) 148 | 149 | #from IPython import embed; embed() 150 | #print np.linalg.norm(ap-ae) 151 | assert allclose(ap, ae, rtol, atol) 152 | 153 | cp = FFT.fft2(ap, cp, dealias="3/2-rule") 154 | 155 | #print np.linalg.norm(abs((cp-c)/cp.max())) 156 | assert all(abs((cp-c)/cp.max()) < rtol) 157 | 158 | 159 | def test_FFT_padded(FFT): 160 | N = FFT.N 161 | prec = "single" if isinstance(FFT.float, np.float32) else "double" 162 | FFT_SELF = Slab_R2C(FFT.N, L, MPI.COMM_SELF, prec, 163 | communication=FFT.communication) 164 | 165 | if FFT.rank == 0: 166 | A = random(N).astype(FFT.float) 167 | C = zeros((FFT.global_complex_shape()), dtype=FFT.complex) 168 | C = FFT_SELF.fftn(A, C) 169 | 170 | # Eliminate Nyquist, otherwise test will fail 171 | #C[-N[0]//2] = 0 172 | #C[:, -N[1]//2] = 0 173 | if FFT.communication == 'AlltoallN': 174 | C[:, :, -1] = 0 # Remove Nyquist frequency 175 | 176 | A_pad = np.zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 177 | A_pad = FFT_SELF.ifftn(C, A_pad, dealias='3/2-rule') 178 | 179 | else: 180 | C = zeros(FFT.global_complex_shape(), dtype=FFT.complex) 181 | A_pad = zeros(FFT_SELF.real_shape_padded(), dtype=FFT.float) 182 | 183 | FFT.comm.Bcast(C, root=0) 184 | FFT.comm.Bcast(A_pad, root=0) 185 | 186 | ae = zeros(FFT.real_shape_padded(), dtype=FFT.float) 187 | c = zeros(FFT.complex_shape(), dtype=FFT.complex) 188 | 189 | c[:] = C[FFT.complex_local_slice()] 190 | ae[:] = A_pad[FFT.real_local_slice(padsize=1.5)] 191 | 192 | ap = zeros(FFT.real_shape_padded(), dtype=FFT.float) 193 | cp = zeros(FFT.complex_shape(), dtype=FFT.complex) 194 | ap = FFT.ifftn(c, ap, dealias="3/2-rule") 195 | 196 | atol, rtol = (1e-10, 1e-8) if FFT.float is float64 else (5e-7, 1e-4) 197 | 198 | #print np.linalg.norm(ap-ae) 199 | assert allclose(ap, ae, rtol, atol) 200 | 201 | cp = FFT.fftn(ap, cp, dealias="3/2-rule") 202 | 203 | #from IPython import embed; embed() 204 | #print np.linalg.norm(abs((cp-c)/cp.max())) 205 | assert all(abs((cp-c)/cp.max()) < rtol) 206 | 207 | #aa = 
208 |     #aa = FFT.ifftn(cp, aa)
209 | 
210 |     #a3 = A[FFT.real_local_slice()]
211 |     #assert allclose(aa, a3, rtol, atol)
212 | 
213 | def test_FFT_C2C(FFT_C2C):
214 |     """Test both padded and unpadded transforms"""
215 |     FFT = FFT_C2C
216 |     N = FFT.N
217 |     atol, rtol = (1e-8, 1e-8) if FFT.float is float64 else (5e-7, 1e-4)
218 | 
219 |     if FFT.rank == 0:
220 |         # Create a reference solution using only one CPU
221 |         A = (random(N)+random(N)*1j).astype(FFT.complex)
222 |         C = zeros((FFT.global_shape()), dtype=FFT.complex)
223 |         C = fftn(A, C, axes=(0,1,2))
224 | 
225 |         # Copy to array padded with zeros
226 |         Cp = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
227 |         ks = (fftfreq(N[2])*N[2]).astype(int)
228 |         Cp[:N[0]//2, :N[1]//2, ks] = C[:N[0]//2, :N[1]//2]
229 |         Cp[:N[0]//2, -N[1]//2:, ks] = C[:N[0]//2, N[1]//2:]
230 |         Cp[-N[0]//2:, :N[1]//2, ks] = C[N[0]//2:, :N[1]//2]
231 |         Cp[-N[0]//2:, -N[1]//2:, ks] = C[N[0]//2:, N[1]//2:]
232 | 
233 |         # Get transform of padded array
234 |         Ap = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
235 |         Ap = ifftn(Cp*1.5**3, Ap, axes=(0,1,2))
236 | 
237 |     else:
238 |         C = zeros(FFT.global_shape(), dtype=FFT.complex)
239 |         Ap = zeros((3*N[0]//2, 3*N[1]//2, 3*N[2]//2), dtype=FFT.complex)
240 |         A = zeros(N, dtype=FFT.complex)
241 | 
242 |     # For testing broadcast the arrays computed on root to all CPUs
243 |     FFT.comm.Bcast(C, root=0)
244 |     FFT.comm.Bcast(Ap, root=0)
245 |     FFT.comm.Bcast(A, root=0)
246 | 
247 |     # Get the single processor solution on local part of the solution
248 |     ae = zeros(FFT.original_shape_padded(), dtype=FFT.complex)
249 |     ae[:] = Ap[FFT.original_local_slice(padsize=1.5)]
250 |     c = zeros(FFT.transformed_shape(), dtype=FFT.complex)
251 |     c[:] = C[FFT.transformed_local_slice()]
252 | 
253 |     # Perform padded transform with MPI and assert ok
254 |     ap = zeros(FFT.original_shape_padded(), dtype=FFT.complex)
255 |     ap = FFT.ifftn(c, ap, dealias="3/2-rule")
256 |     assert allclose(ap, ae, rtol, atol)
257 | 
258 |     # Perform truncated transform with MPI and assert
259 |     cp = zeros(FFT.transformed_shape(), dtype=FFT.complex)
260 |     cp = FFT.fftn(ap, cp, dealias="3/2-rule")
261 |     assert all(abs(cp-c)/cp.max() < rtol)
262 | 
263 |     # Now without padding
264 |     # Transform back to original
265 |     aa = zeros(FFT.original_shape(), dtype=FFT.complex)
266 |     aa = FFT.ifftn(c, aa)
267 |     # Verify
268 |     a3 = A[FFT.original_local_slice()]
269 |     assert allclose(aa, a3, rtol, atol)
270 |     c2 = zeros(FFT.transformed_shape(), dtype=FFT.complex)
271 |     c2 = FFT.fftn(aa, c2)
272 |     # Verify
273 |     assert all(abs(c2-c)/c2.max() < rtol)
274 |     #assert allclose(c2, c, rtol, atol)
275 | 
276 | #import time
277 | #t0 = time.time()
278 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="Y", communication='Alltoall'))
279 | #t1 = time.time()
280 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI, "double", alignment="X", communication='Alltoall'))
281 | #t2 = time.time()
282 | 
283 | #ty = MPI.COMM_WORLD.reduce(t1-t0, op=MPI.MIN)
284 | #tx = MPI.COMM_WORLD.reduce(t2-t1, op=MPI.MIN)
285 | #if MPI.COMM_WORLD.Get_rank() == 0:
286 | #print "Y: ", ty
287 | #print "X: ", tx
288 | 
289 | #test_FFT(Slab_R2C(array([N, 2*N, 4*N]), L, MPI.COMM_WORLD, "double", communication='Alltoall'))
290 | #test_FFT(Pencil_R2C(array([N, N, N], dtype=int), L, MPI.COMM_WORLD, "double", alignment="Y", communication='Alltoall'))
291 | #test_FFT2(Line_R2C(array([N, N]), L[:-1], MPI, "single"))
292 | #test_FFT2_padded(Line_R2C(array([N, N]), L[:-1], MPI, "double"))
293 | #from collections import defaultdict
294 | #FFT = Slab_R2C(array([N//4, N, N]), L, MPI.COMM_WORLD, "double", communication='Alltoallw', threads=2, planner_effort=defaultdict(lambda: "FFTW_MEASURE"))
295 | #test_FFT_padded(FFT)
296 | #reset_profile(profile)
297 | #test_FFT_padded(FFT)
298 | 
299 | #test_FFT_padded(Pencil_R2C(array([N, N, N], dtype=int), L, MPI, "double", alignment="X", communication='AlltoallN'))
300 | #test_FFT_C2C(C2C(array([N, N, N]), L, MPI, "double"))
301 | 
--------------------------------------------------------------------------------
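The test file above drives the transform classes through their public API: the constructor, real_shape/complex_shape, the in-place fftn/ifftn pair, and the "3/2-rule" dealiasing option. For orientation, here is a minimal usage sketch of the same round-trip pattern, illustrative only and not part of the repository: the script name is hypothetical, the import path for the slab class is assumed to follow the same pattern as the "from mpiFFT4py.slab import C2C" line at the top of the test file, and mpi4py plus a working MPI installation are assumed.

    # slab_demo.py -- illustrative sketch, not part of the repository
    from numpy import array, pi, zeros
    from numpy.random import random
    from mpi4py import MPI
    from mpiFFT4py.slab import R2C as Slab_R2C  # assumed import path (cf. the C2C import above)

    N = array([32, 32, 32])            # global mesh size
    L = array([2*pi, 2*pi, 2*pi])      # physical domain size
    FFT = Slab_R2C(N, L, MPI.COMM_WORLD, "double", communication='Alltoall')

    u = random(FFT.real_shape()).astype(FFT.float)         # this rank's part of the real field
    u_hat = zeros(FFT.complex_shape(), dtype=FFT.complex)  # this rank's part of the spectrum

    u_hat = FFT.fftn(u, u_hat)    # distributed forward real-to-complex transform
    u = FFT.ifftn(u_hat, u)       # inverse transform recovers u up to round-off

    # Dealiased variant exercised by the *_padded tests: transform to/from a 3/2-padded real grid
    u_pad = zeros(FFT.real_shape_padded(), dtype=FFT.float)
    u_pad = FFT.ifftn(u_hat, u_pad, dealias="3/2-rule")
    u_hat = FFT.fftn(u_pad, u_hat, dealias="3/2-rule")

Since the fixtures above only enable the pencil cases when the communicator has at least four ranks, the suite itself is presumably launched under MPI as well, e.g. mpirun -np 4 py.test tests/test_FFT.py, so that both the slab and pencil decompositions are exercised.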